# Prepare Dataset

In [11]:
import csv
import networkx as nx
import numpy as np
from tqdm import tqdm

In [2]:
# prepare dataset

f = open('train.csv', 'r', encoding='utf-8')
reader = csv.reader(f)
trains = []
for line in reader:
    trains.append(line)
f.close()    
print("trains", len(trains))

f = open('node_ingredient.txt', 'r', encoding='utf-8')
reader = csv.reader(f)
nodes = []
for line in reader:
    nodes.append(line)
f.close()    
print("nodes", len(nodes))

f = open('validation_classification_question.csv', 'r', encoding='utf-8')
reader = csv.reader(f)
val_cls_q = []
for line in reader:
    val_cls_q.append(line)
f.close()
print("val_cls_q", len(val_cls_q))

f = open('validation_classification_answer.csv', 'r', encoding='utf-8')
reader = csv.reader(f)
val_cls_a = []
for line in reader:
    val_cls_a.append(line)
f.close()
print("val_cls_a", len(val_cls_a))

f = open('validation_completion_question.csv', 'r', encoding='utf-8')
reader = csv.reader(f)
val_cpt_q = []
for line in reader:
    val_cpt_q.append(line)
f.close()
print("val_cpt_q", len(val_cpt_q))

f = open('validation_completion_answer.csv', 'r', encoding='utf-8')
reader = csv.reader(f)
val_cpt_a = []
for line in reader:
    val_cpt_a.append(line)
f.close()
print("val_cpt_a", len(val_cpt_a))



trains 23547
nodes 6714
val_cls_q 7848
val_cls_a 7848
val_cpt_q 7848
val_cpt_a 7848


In [3]:
# make graph of ingredients
G = nx.Graph()
for i in range(len(nodes)):
    G.add_node(str(i))
    
print(G.number_of_nodes())

for data in trains:
    for i in range(len(data) - 2):
        for j in range(i+1, len(data) - 1):
            if G.has_edge(data[i], data[j]):
                G[data[i]][data[j]]['weight'] += 1
            else:
                G.add_edge(data[i], data[j], weight=1)

print(G.number_of_edges())

6714
355816


# Completion Task 

### Method 1

In [4]:
# completion task : find neighbors of largest weight
# large weight means high probability of being in the same recipe
cpt_answers = []
for data in tqdm(val_cpt_q):
#     print(data)
    weight_dict = {}
    for node in data:
        for adv, w in G.adj[node].items():
            if adv in weight_dict.keys():
                weight_dict[adv] += w['weight']
            else:
                weight_dict[adv] = w['weight']
    for node in data:
        if node in weight_dict.keys():
            del weight_dict[node]
    
    weight_dict = sorted(weight_dict.items(), key=(lambda x: x[1]), reverse=True)
    
#     print(weight_dict)

    cpt_answers.append(weight_dict[0][0])

100%|███████████████████████████████████████| 7848/7848 [01:29<00:00, 87.89it/s]


In [9]:
acc = 0
for i in range(len(cpt_answers)):
    if cpt_answers[i] == val_cpt_a[i][0]:
        acc += 1
print(acc / len(cpt_answers) * 100)

6.167176350662589


### Method 2

In [82]:
# completion task : find neighbors of largest weight (but consider frequency)
# more frequently used ingredient -> not important? (ex. salt)
freq_dict = {}
for data in trains:
    for node in data[0:-1]:
        if node in freq_dict.keys():
            freq_dict[node] += 1
        else:
            freq_dict[node] = 1

for k,v in freq_dict.items():
    print(nodes[int(k)], v)

['pepper'] 2631
['purple onion'] 1134
['grape tomatoes'] 133
['feta cheese crumbles'] 212
['garbanzo beans'] 91
['seasoning'] 83
['black olives'] 135
['romaine lettuce'] 168
['garlic'] 4329
['grilled chicken breasts'] 3
['salt'] 10683
['eggs'] 2007
['cooking oil'] 276
['garlic powder'] 857
['chicken livers'] 40
['green chilies'] 452
['mayonaise'] 460
['soy sauce'] 1928
['yellow onion'] 698
['butter'] 2809
['vegetable oil'] 2594
['wheat'] 17
['water'] 4347
['passata'] 11
['bay leaf'] 486
['cayenne pepper'] 890
['black pepper'] 1580
['boneless chicken skinless thigh'] 214
['cornflour'] 51
['garlic paste'] 157
['milk'] 1323
['chili powder'] 1181
['shallots'] 861
['double cream'] 26
['oil'] 1170
['ground cumin'] 1637
['garam masala'] 557
['onions'] 4685
['natural yogurt'] 10
['lemon juice'] 845
['plain flour'] 87
['sugar'] 3759
['powdered sugar'] 293
['fresh ginger root'] 196
['ground ginger'] 407
['ground cinnamon'] 744
['baking powder'] 999
['vanilla extract'] 782
['skirt steak'] 48
['ch

['shiso'] 12
['caster'] 4
['dried bonito flakes'] 24
['dashi kombu'] 13
['white miso'] 48
['mixed greens'] 35
['branzino fillets'] 1
['barbecue sauce'] 77
['prepared pizza crust'] 3
['Sargento® Traditional Cut Shredded Mozzarella Cheese'] 1
['erythritol'] 1
['asparagus tips'] 4
['artichokes'] 48
['nutmeg'] 146
['fresh chives'] 62
['medium eggs'] 15
['ice water'] 99
['tuna'] 29
['portabello mushroom'] 44
['allspice berries'] 19
['tomatillo salsa'] 7
['ground chicken'] 61
['chipotle'] 15
['white sandwich bread'] 27
['louisiana hot sauce'] 4
['lemon cucumber'] 1
['Himalayan salt'] 9
['arame'] 1
['konbu'] 36
['plain breadcrumbs'] 9
['fresh mushrooms'] 164
['bok choy'] 77
['uncook medium shrimp', ' peel and devein'] 11
['snow peas'] 110
['catfish'] 14
['basil pesto sauce'] 29
['aioli'] 7
['mozzarella balls'] 4
['vegetables'] 125
['sun-dried tomatoes'] 95
['cocoa powder'] 31
['corn chips'] 20
['roasted tomatoes'] 37
['Mexican cheese'] 44
['enchilada sauce'] 173
['pasilla pepper'] 5
['corn'] 

['ditalini pasta'] 9
['littleneck clams'] 38
['cotija'] 90
['frozen sweet corn'] 4
['whole wheat bread'] 12
['apricot nectar'] 1
['whole cloves'] 49
['curry sauce'] 10
['pappardelle pasta'] 5
['toast'] 11
['duck drippings'] 1
['pastry dough'] 19
['stilton'] 6
['fermented black beans'] 28
['softened butter'] 25
['grouper'] 10
['key lime juice'] 19
['pickle relish'] 17
['butter cooking spray'] 7
['sliced ham'] 16
['brandy'] 109
['vanilla beans'] 98
['coffee granules'] 20
['firmly packed light brown sugar'] 29
['lean ground beef'] 226
['shredded zucchini'] 8
['dried pasta'] 10
['diced tomatoes with garlic and onion'] 7
['semisweet chocolate'] 99
['agave nectar'] 54
['dark corn syrup'] 22
['lemon-lime soda'] 8
['frozen strawberries'] 8
['frozen limeade'] 6
['muenster cheese'] 4
['loin pork roast'] 14
['rocket leaves'] 29
['unsweetened apple juice'] 4
['whole allspice'] 16
['bonito flakes'] 23
['small red beans'] 8
['plain chocolate'] 1
['anise extract'] 6
['Poire Williams'] 2
['bartlett pe

['orange soda'] 1
['KRAFT Zesty Italian Dressing'] 2
['fresh corn'] 19
['anchovies'] 32
['chunk light tuna in water'] 5
['gluten'] 11
['soy milk'] 18
['dillweed'] 2
['cacao powder'] 1
['medium shrimp uncook'] 16
['stout'] 12
['japanese rice'] 11
['okra pods'] 17
['baby eggplants'] 4
['cherry gelatin'] 2
['bing cherries'] 4
['crushed pineapples in juice'] 8
['rotelle'] 20
['chilegarlic sauce'] 9
['frozen broccoli'] 10
['creme anglaise'] 3
['green pumpkin seeds'] 6
['nian gao'] 1
['enokitake'] 27
['low sodium worcestershire sauce'] 6
['rib-eye roast'] 3
['haddock fillets'] 4
['Emmenthal'] 6
['Bragg Liquid Aminos'] 2
['shirataki'] 6
['chestnuts'] 11
['marrons'] 1
['liver'] 14
['chopped fresh herbs'] 9
['ackee'] 6
['dried salted codfish'] 14
['halibut'] 18
['dark ale'] 1
['black lentil'] 3
['sweet soy sauce'] 17
['pure vanilla'] 2
['abalone'] 2
['shahi jeera'] 13
['kewra essence'] 2
['mutton'] 14
['soy nuts'] 1
['nuts'] 20
['rainbow trout'] 3
['gold potatoes'] 4
['Mo Qua'] 1
['bonito'] 6
[

['sweet corn kernels'] 6
['brown mustard seeds'] 25
['lemon dressing'] 2
['rosewater'] 2
['lemonade'] 3
['lotus seeds'] 2
['lamb sausage'] 3
['summer savory'] 2
['tallow'] 1
['tzatziki'] 10
['dukkah'] 2
['unsweetened almond milk'] 7
['flaxseed'] 3
['all purpose seasoning'] 1
['cream of shrimp soup'] 2
['pound cake'] 7
['Kahlua Liqueur'] 1
['thin spaghetti'] 4
['white button mushrooms'] 14
['Amaretti Cookies'] 4
['chinese mustard'] 3
['lime leaves'] 27
['holy basil'] 4
['patis'] 3
['chocolate baking bar'] 3
['grit quick'] 5
['links'] 1
['accent'] 2
['rich chicken stock'] 4
['eau de vie'] 2
['pea pods'] 5
['peppermint extract'] 1
['chocolate curls'] 4
['dark crème de cacao'] 1
['gyoza skins'] 5
['crumbled ricotta salata cheese'] 6
['batter'] 6
['grated carrot'] 23
['fresh chile'] 3
['Chianti'] 7
['recipe crumbles'] 3
['celery heart'] 2
['doritos'] 7
['95% lean ground beef'] 2
['taco seasoning reduced sodium'] 5
['baby back ribs'] 11
['cooked pumpkin'] 1
['agar'] 5
['Jell-O Gelatin'] 6
['

['dark chocolate chip'] 4
['jalape'] 7
['whole wheat buns'] 1
['pumpernickel'] 1
['chicken fillets'] 7
['sorghum flour'] 2
['crayfish'] 4
['taco seasoned cheese'] 1
['agar agar flakes'] 1
['campari'] 4
['sweet vermouth'] 9
['veal demi-glace'] 2
['taro leaf'] 1
['cubed pumpkin'] 1
['blue crabs'] 8
['Heinz Tomato Ketchup'] 1
['red rice'] 2
['beef tendons'] 1
['pork and beans'] 5
['crisco'] 6
['dried soba'] 2
['and cook drain pasta ziti'] 1
['red jalapeno peppers'] 2
['gold tequila'] 3
['lamb fillet'] 2
['pomegranate'] 4
['Taco Bell Taco Seasoning Mix'] 4
['lotus roots'] 5
['cabernet sauvignon'] 7
['beef sausage'] 5
['dried tomatoes'] 2
['sugar substitute'] 5
['rye bread'] 7
['dried cornhusks'] 5
['low-fat cream cheese'] 6
['bisquick'] 8
['jambon de bayonne'] 1
['chinese black mushrooms'] 8
['beef bouillon powder'] 1
['chili habanero pepper'] 2
['sparkling mineral water'] 1
['burgundy'] 3
['pinot noir'] 6
['reduced sodium teriyaki sauce'] 1
['gumbo'] 1
['turkey legs'] 4
['sweet peas'] 6
[

['herb cheese'] 1
['yucca root'] 1
['paella rice'] 3
['shishito chile'] 1
['pâté'] 2
['custard dessert mix'] 1
['garlic herb feta'] 3
['mrs. dash seasoning mix'] 1
['char siu'] 3
['red grapefruit'] 1
['Tipo 00 flour'] 1
['king prawns'] 4
['cactus paddles'] 1
['whole wheat fusilli'] 1
['San Marzano tomatoes'] 2
['smoked chicken sausages'] 1
['guacamole seasoning mix'] 1
['sazon goya'] 1
['lingcod'] 1
['pumpernickel bread'] 2
['granular no-calorie sucralose sweetener'] 1
['korean vermicelli'] 1
['meat marinade'] 1
['ragu pizza quick sauc'] 1
['greater galangal'] 1
['dried orange peel'] 4
['regular sugar'] 2
['table syrup'] 1
['grissini'] 3
['Hidden Valley® Original Ranch® Dressing'] 2
['head on shrimp'] 1
['crushed graham crackers'] 3
['fruit juice'] 1
['crushed peppermint candy'] 1
['low-fat pasta sauce'] 3
['vegetable juice cocktail'] 2
['shredded low-fat mozzarella cheese'] 2
['roe'] 2
['beef round'] 3
['sweet gherkin'] 3
['branston pickle'] 1
['peach juice'] 2
['white cannellini bean

['brown sauce'] 1
['cream of potato soup'] 1
['guanabana'] 1
['Knorr Onion Minicubes'] 3
['sack'] 1
['celery powder'] 1
['mellow white miso'] 1
['low fat coleslaw dressing'] 1
['hijiki'] 1
['farofa'] 1
['cannelloni'] 1
['chowchow'] 1
['Hogue Cabernet Sauvignon'] 1
['won ton skins'] 1
['chinese winter melon'] 1
['beef carpaccio'] 1
['whole wheat bread slices'] 1
['ground peanut'] 1
['frozen brussels sprouts'] 1
['homemade beef stock'] 2
['paprika paste'] 1
['gnocchetti sardi'] 1
['Sargento® Artisan Blends® Shredded Parmesan Cheese'] 1
['jerk rub seasoning'] 1
['small pearl tapioca'] 1
['rice mix'] 1
['color food orang'] 3
['non fat chicken stock'] 1
['heirloom squash'] 1
['Bertolli Tomato & Basil Sauce'] 1
['scrod fillets'] 1
['small shells'] 1
['sunchokes'] 2
["Eggland's Best® eggs"] 1
['speck'] 1
['regular sour cream'] 1
['evaporated cane juice'] 1
['pork heart'] 1
['miswa'] 1
['blood'] 1
['brown beech mushrooms'] 1
['framboise liqueur'] 1
['frozen sweetened raspberries'] 1
['fresh ha

In [80]:
cpt_answers = []
for data in tqdm(val_cpt_q[0:10]):
#     print(data)
    weight_dict = {}
    for node in data:
        for adv, w in G.adj[node].items():
            if adv in weight_dict.keys():
#                 print(freq_dict[adv])
                weight_dict[adv] += w['weight'] / (freq_dict[adv] ** 1/2)
            else:
                weight_dict[adv] = w['weight'] / (freq_dict[adv] ** 1/2)
    for node in data:
        if node in weight_dict.keys():
            del weight_dict[node]
    
    weight_dict = sorted(weight_dict.items(), key=(lambda x: x[1]), reverse=True)
    
#     print(weight_dict)

    cpt_answers.append(weight_dict[0][0])

100%|███████████████████████████████████████████| 10/10 [00:00<00:00, 35.43it/s]

[('2944', 6.0), ('1995', 6.0), ('4493', 6.0), ('6147', 6.0), ('2537', 5.0), ('3126', 4.0), ('5947', 4.0), ('1331', 4.0), ('4558', 4.0), ('4837', 4.0), ('388', 4.0), ('1638', 4.0), ('5623', 4.0), ('2619', 4.0), ('2037', 4.0), ('153', 4.0), ('1262', 4.0), ('1647', 4.0), ('6331', 4.0), ('4371', 4.0), ('1829', 4.0), ('5911', 4.0), ('740', 4.0), ('165', 4.0), ('1369', 4.0), ('152', 4.0), ('6706', 4.0), ('6374', 4.0), ('4851', 4.0), ('4265', 4.0), ('4384', 4.0), ('5197', 4.0), ('1728', 4.0), ('3890', 4.0), ('3267', 4.0), ('369', 4.0), ('4169', 4.0), ('4348', 4.0), ('2336', 4.0), ('967', 4.0), ('1227', 4.0), ('3218', 4.0), ('3505', 3.333333333333333), ('2231', 3.2), ('4025', 3.0), ('29', 3.0), ('1418', 3.0), ('4137', 3.0), ('3330', 3.0), ('6262', 3.0), ('1217', 3.0), ('421', 3.0), ('3615', 3.0), ('5031', 3.0), ('2019', 3.0), ('5902', 3.0), ('4258', 3.0), ('808', 2.75), ('76', 2.6666666666666665), ('4154', 2.6666666666666665), ('6487', 2.6666666666666665), ('5951', 2.6666666666666665), ('555',




In [77]:
acc = 0
for i in range(len(cpt_answers)):
    if cpt_answers[i] == val_cpt_ans[i]:
        acc += 1
print(acc / len(cpt_answers) * 100)
print(1 / len(cpt_answers) * 100)

0.0382262996941896
0.012742099898063202


# Classification Task

In [53]:
train_data = {}
for data in trains:
    cuis = data[-1]
    if cuis in train_data.keys():
        train_data[cuis].append(data[0:-1])
    else:
        train_data[cuis] = []
        
print(len(train_data.keys()))

for cuisine in train_data.keys():
    np_ing = np.zeros((len(nodes), len(train_data[cuisine])))
    for i in range(len(train_data[cuisine])):
        for ing in train_data[cuisine][i]:
            np_ing[int(ing)][i] = 1
    train_data[cuisine] = np_ing

for cuisine in train_data.keys():
    print(cuisine, train_data[cuisine].shape[1])

20
greek 713
filipino 451
indian 1747
jamaican 279
spanish 589
italian 4677
mexican 3835
vietnamese 486
thai 902
southern_us 2514
chinese 1598
cajun_creole 919
brazilian 282
french 1542
japanese 839
irish 403
moroccan 495
korean 473
british 484
russian 299


In [20]:
train_matrix = np.zeros((len(nodes), len(trains)))
print(train_matrix.shape)
for i in range(len(trains)):
    for 

(6714, 23547)


In [32]:
def similarity(x1, x2):
    return np.dot(x1, x2)

In [34]:
for i in range(1,11):
    print(similarity(train_data['greek'][:,0], train_data['greek'][:,i]))

0.0
0.0
2.0
1.0
1.0
0.0
0.0
1.0
1.0
2.0


In [67]:
# classification validation
answers = []
for ings in tqdm(val_cls_q[:1000]):
#     print(ings)
    np_q = np.zeros((len(nodes),))
    for ing in ings:
        np_q[int(ing)] = 1
    
    max_sim = 0
    max_cuisine = ''
    for cuisine in train_data.keys():
        sim = 0
        for i in range(train_data[cuisine].shape[1]):
            sim += similarity(np_q, train_data[cuisine][:,i])
        sim  = sim / train_data[cuisine].shape[1]
#         print(cuisine, sim)
        if sim >= max_sim:
            max_sim = sim
            max_cuisine = cuisine
    
    answers.append(max_cuisine)

100%|███████████████████████████████████████| 1000/1000 [19:04<00:00,  1.14s/it]


In [68]:
acc = 0
for i in range(1000):
    if answers[i] == val_cls_a[i][0]:
        acc += 1
print(acc)

344


In [61]:
print(val_cls_a)

[['japanese'], ['french'], ['filipino'], ['brazilian'], ['italian'], ['southern_us'], ['italian'], ['filipino'], ['japanese'], ['vietnamese'], ['spanish'], ['mexican'], ['indian'], ['mexican'], ['southern_us'], ['southern_us'], ['italian'], ['korean'], ['indian'], ['mexican'], ['vietnamese'], ['italian'], ['vietnamese'], ['italian'], ['southern_us'], ['mexican'], ['italian'], ['italian'], ['italian'], ['italian'], ['spanish'], ['southern_us'], ['indian'], ['mexican'], ['jamaican'], ['italian'], ['southern_us'], ['french'], ['italian'], ['japanese'], ['italian'], ['mexican'], ['french'], ['mexican'], ['japanese'], ['jamaican'], ['mexican'], ['vietnamese'], ['japanese'], ['cajun_creole'], ['greek'], ['mexican'], ['southern_us'], ['mexican'], ['mexican'], ['chinese'], ['southern_us'], ['moroccan'], ['mexican'], ['italian'], ['chinese'], ['greek'], ['southern_us'], ['brazilian'], ['japanese'], ['chinese'], ['greek'], ['italian'], ['french'], ['mexican'], ['spanish'], ['greek'], ['mexican']