In [652]:
import pandas as pd
import numpy as np

### # Upload the dataset (should be on server)

In [653]:
data=pd.read_csv('data/data.csv')
data.drop('Unnamed: 0', axis=1, inplace=True)

In [654]:
ingredients=pd.read_csv('data/ingredients_short.csv')
ingredients.drop('Unnamed: 0', axis=1, inplace=True)

In [658]:
ingredients.head()

Unnamed: 0,stem,substitute 1,substitute 2,substitute 3,food type
0,cream,half-and-half,,,
1,flaxseed,,,,
2,farro,,,,
3,applesauce,,,,
4,cranberry,,,,fruit


In [659]:
# Upload the list of non_key ingredients
spices=list(pd.read_csv('data/spices.csv')['spices'])
spices=[x for x in spices if x in data.columns]
garnish=['parsley', 'dried parsley', 'cilantro', 'cilantro leaves', 'dill', 
         'celery leaves', 'chives', 'chocolate chips', 'sesame', 'black sesame seeds', 'sesame seeds']

In [660]:
# separate ingredients from non-ingredients 
non_ingredients=['meal','title','calories','protein','carbs','fats','sodium','cuisine', 'complexity']
non_ingredients.extend(spices)

In [661]:
group_keys=['pasta', 'mold cheese', 'soft cheese', 'brined cheese', 'medium cheese', 'hard cheese', 'cottage cheese', 'dry wine', 
            'liquer', 'white wine', 'red wine']
group=dict(zip(group_keys,
            [list(ingredients[ingredients['substitute 1']==key]['stem']) for key in group_keys]))

In [662]:
group_values=[y for x in group.values() for y in x]

### Inputs
 
grocery: list <br>

calories_max: int <br>
calories_min: int <br>
protein_min: int <br>
meal_type: breakfast, lunch, dinner, dessert, drink <br>
cuisine: 20 cuisines in the list <br>
complexity: easy, medium, hard <br>
n_additional_ingredients: int <br>


In [663]:
# Should be at home
home=['pepper', 'butter', 'olive oil', 'sugar', 'salt', 'water', 'lemon juice', 'dijon mustard', 'black pepper', 'ketchup']
grocery=['apple', 'milk', 'hazelnut', 'cucumber', 'bread', 'tomato', 
         'cod',  'rice', 'garlic', 'chicken', 'egg', 'feta', 'cream cheese', 'pasta']
grocery.extend(home)

In [664]:
# extend grocery for all available substitutes
ingredients_nongroup=[gr for gr in grocery if gr not in group_values]
ingredients_group=list(set([ingredients[ingredients.stem==gr]['substitute 1'].values[0] for gr in grocery if gr in group_values]))


In [665]:
for x in ingredients_nongroup:
    #extend to own substitutes
    idx=ingredients[ingredients['stem']==x].index
    a=[ing for ing in ingredients.loc[idx][['substitute 1', 'substitute 2', 'substitute 3']].values if type(ing)==str]
    if len(a)>0:
        grocery.extend(a)
    #check whether the ingredient in other products substitutes
    idx=ingredients[(ingredients['substitute 1']==x)|(ingredients['substitute 2']==x)|(ingredients['substitute 3']==x)].index
    if len(idx)>0:
        grocery.extend(list(ingredients.loc[idx]['stem']))
        
for x in ingredients_group:
    grocery.extend(group[x])
    

### Function

In [704]:
# quick calculation of additional products
def products_to_add(options, i):
    s=options.loc[i][needed]
    return ', '.join(list(s[s==1].index))

def return_recipes(calories_max=800, 
                   calories_min=500,
                   protein_min=25,
                   meal_type='lunch',
                   cuisine='non_specified',
                   complexity='easy',
                   n_additional_ingredients=4,
                   grocery=grocery):
    if meal_type in ['lunch', 'dinner']: meal_type='lunch/dinner'
    
    #filter basic parameters 
    options=data[(data.meal==meal_type)&
                       (data.calories<calories_max)&
                       (data.calories>calories_min)&
                       (data.protein>protein_min)&
                       (data.complexity==complexity)
                      ]
    if cuisine!='non_specified':
        options=options[options.cuisine==cuisine]
        
    #filter based on grocery
    
    # drop columns with unused ingredients
    ingredients=options.drop(non_ingredients, axis=1).columns
    options.drop([x for x in ingredients if sum(options[x])==0], axis=1, inplace=True)
    
    #update ingredients
    ingredients=options.drop(non_ingredients, axis=1).columns
    
    ## products outside the groccery list   
        
    needed=[x for x in ingredients if x not in grocery]

    # Keep only the recipes if the number of additional key ingredients doesnt exceed 3
    sums=options[needed].sum(axis=1)
    ind=[x for x in sums.index if sums.loc[x]<=n_additional_ingredients]
    options=options.loc[ind]
    
    recommendation=options.loc[ind][['title', 'calories', 'protein', 'carbs', 'fats']]

    recommendation['products to add']=pd.Series([products_to_add(options, i) for i in ind], index=ind)

    recommendation['# of products to add']=pd.Series([len([y for y in recommendation.loc[i]['products to add'].split(",")  
                                                       if y not in non_ingredients]) for i in ind], index=ind)
    
    return recommendation
    

In [715]:
def score_nutrition(x, nutr_values):
    # less protein - penalized, coefficient 2
    # more carbs than recommended - penalized, coefficient 2
    # more fat than recommended - penalized, coefficient 1
    
    penalty=-2*(min(0, nutr_values['protein']-x['protein']))+\
    2*(max(0, nutr_values['carbs']-x['carbs']))+max(0, nutr_values['fats']-x['fats'])
    
    return penalty

In [795]:
from sklearn.metrics.pairwise import cosine_similarity

def score_preference(i, user_score):   
    
    weighted_score=np.mean([cosine_similarity(np.asarray(temp_data.loc[i].reshape(1, -1)), 
                                              np.asarray(temp_data.loc[key]).reshape(1, -1))*user_score[key] 
                            for key in user_score.keys()])
    return weighted_score
    

### Example

In [709]:
### Example of recommended nutrition values (in user's profile)
nutr_values={'protein':30,
'carbs':100,
'fats':30}
nutr_values['calories']=nutr_values['fats']*9+(nutr_values['protein']+nutr_values['carbs'])*4

In [829]:
### Example of certain recipes scored (in user's profile): 5-high, 1-low
### {index: score}
user_score={461:5, 1693:4, 400:5, 189:2, 2704:3, 4867:3, 5470:4, 159:3, 1588:4, 447:3, 26:2, 101:1}

In [723]:
# Calculate all suggested meals after filtering

result=return_recipes(calories_max=calories*1.2, 
                   calories_min=calories*0.8,
                   protein_min=protein*0.8,
                   meal_type='lunch',
                   cuisine='italian',
                   complexity='medium',
                   n_additional_ingredients=4,
                   grocery=grocery)


In [671]:
np.sqrt()

3.0

In [835]:
result['nutrition penalty']=pd.Series([score_nutrition(result.loc[i], nutr_values) for i in result.index], index=result.index)

In [836]:
result.sort_values("nutrition penalty").head(5)

Unnamed: 0,title,calories,protein,carbs,fats,products to add,# of products to add,nutrition penalty,user score
1155,Strozzapreti Carbonara With Radishes,666.0,25.0,141.5,32.0,shallot,1,0.0,0.858941
19046,Lasagne Rolls with Roasted Tomato and Eggplant,698.0,27.0,147.5,41.0,"eggplant, oil",2,0.0,0.996953
1666,Spicy Spinach Linguine with Olive Oil and Garlic,708.0,27.0,150.0,35.0,"spinach, ground red pepper, pecorino cheese, d...",4,0.0,0.865919
1754,Baked Rigatoni alla Norma,930.0,30.0,202.5,56.0,"onion, eggplant, marinara sauce, mozzarella",4,0.0,0.696199
17034,Florentine White Bean Soup with Pasta,676.0,28.0,141.0,34.0,"onion, bean",2,0.0,0.915546


In [803]:
### Create the temp_data dataset upfront
meal_type='lunch' ## in flask application should come from the input field
if meal_type in ['lunch', 'dinner']: 
    meal_type='lunch/dinner'

temp_data=data[data.meal==meal_type]
temp_data=temp_data.drop(non_ingredients, axis=1)

In [830]:
# Create the temp user_score dict upfront
keys=list(filter(lambda x: data.loc[x].meal=="lunch/dinner", user_score.keys()))
temp_user_score=dict(zip(keys, [user_score[key] for key in keys]))

In [833]:
result['user score']=pd.Series([score_preference(i, temp_user_score) for i in result.index], index=result.index)

  import sys


In [837]:
result.sort_values("user score", ascending=False).head(5)

Unnamed: 0,title,calories,protein,carbs,fats,products to add,# of products to add,nutrition penalty,user score
12205,Rosemary and Garlic Lamb Chops,889.0,31.0,191.25,83.0,meat,1,2.0,1.267792
18112,Pork Chops with Vinegar Peppers,675.0,36.0,132.75,57.0,,1,12.0,1.224595
6778,Chicken Muffuletta Salad,668.0,42.0,125.0,50.0,"ice, olives",2,24.0,1.221013
12438,Roast Leg of Lamb with Rosemary and Garlic,862.0,73.0,140.25,61.0,"fat, lemon peel",2,86.0,1.212622
7314,Roast Chicken with Rosemary-Garlic Paste,936.0,78.0,156.0,66.0,"juniper berries, sea salt",2,96.0,1.165979


## Find pictures for the most common types of foods

In [92]:
titles=list(data_short['title'])

In [93]:
titles=[x.replace(',','') for x in titles]

In [94]:
titles[350]

'Steamed Mussels with Sherry Tomatoes and Garlic '

In [95]:
titles=[x.lower().split() for x in titles]

In [96]:
titles=[[y for y in x if nltk.pos_tag([y])[0][1] in ['NN', 'NNS']] for x in titles]

### Extract top unigrams

In [None]:
from nltk import word_tokenize
from nltk.collocations import BigramCollocationFinder

In [None]:
from nltk import word_tokenize 
from nltk.util import ngrams

In [201]:
init_unigrams=[y for x in titles for y in x]

In [203]:
unigrams_count=[(x, init_unigrams.count(x)) for x in set(init_unigrams)]

In [204]:
unigrams_count.sort(key = lambda t: t[1])

In [386]:
unigrams_count_d=dict(zip([x[0] for x in unigrams_count], [x[1] for x in unigrams_count]))

In [388]:
unigrams_count[::-1][:200]

[('sauce', 1680),
 ('salad', 1540),
 ('chicken', 1221),
 ('cream', 869),
 ('cheese', 817),
 ('soup', 617),
 ('tomato', 610),
 ('lemon', 557),
 ('chocolate', 528),
 ('pork', 512),
 ('potato', 461),
 ('cake', 459),
 ('garlic', 436),
 ('shrimp', 434),
 ('rice', 414),
 ('turkey', 414),
 ('pepper', 402),
 ('lamb', 396),
 ('butter', 394),
 ('potatoes', 384),
 ('corn', 383),
 ('orange', 378),
 ('beef', 376),
 ('roast', 370),
 ('pie', 354),
 ('apple', 349),
 ('ginger', 341),
 ('onion', 333),
 ('vinaigrette', 329),
 ('salsa', 320),
 ('bean', 313),
 ('salmon', 311),
 ('bacon', 310),
 ('tomatoes', 309),
 ('bread', 299),
 ('spicy', 299),
 ('sweet', 296),
 ('spinach', 289),
 ('vegetable', 288),
 ('fennel', 284),
 ('onions', 275),
 ('ice', 274),
 ('goat', 269),
 ('pasta', 259),
 ('mustard', 253),
 ('sausage', 249),
 ('tart', 249),
 ('mint', 245),
 ('mushroom', 244),
 ('squash', 241),
 ('beans', 233),
 ('coconut', 227),
 ('lime', 220),
 ('mushrooms', 212),
 ('arugula', 211),
 ('vegetables', 208),
 ('

In [387]:
unigrams_count_d['pasta']

259

### Extract bigrams

In [73]:
from nltk import word_tokenize
from nltk.collocations import BigramCollocationFinder

In [74]:
from nltk import word_tokenize 
from nltk.util import ngrams

In [97]:
init_bigrams=[list(ngrams(x, 2)) for x in titles]

  """Entry point for launching an IPython kernel.


In [98]:
bigrams=[y for x in init_bigrams for y in x]

In [99]:
bigrams_count=[(x, bigrams.count(x)) for x in set(bigrams)]

In [100]:
bigrams_count.sort(key = lambda t: t[1])

In [101]:
bigrams_count=bigrams_count[::-1]

In [767]:
[(x,' '.join(x[0])) for x in bigrams_count[400:500]]

[((('giblet', 'gravy'), 9), 'giblet gravy'),
 ((('salt', 'pepper'), 9), 'salt pepper'),
 ((('snow', 'peas'), 9), 'snow peas'),
 ((('gold', 'potato'), 9), 'gold potato'),
 ((('monterey', 'jack'), 9), 'monterey jack'),
 ((('cider', 'vinaigrette'), 9), 'cider vinaigrette'),
 ((('pork', 'roast'), 9), 'pork roast'),
 ((('salad', 'blue'), 9), 'salad blue'),
 ((('macaroni', 'cheese'), 9), 'macaroni cheese'),
 ((('stuffed', 'eggs'), 9), 'stuffed eggs'),
 ((('beef', 'stock'), 9), 'beef stock'),
 ((('spinach', 'feta'), 9), 'spinach feta'),
 ((('potato', 'purã©e'), 9), 'potato purã©e'),
 ((('romesco', 'sauce'), 9), 'romesco sauce'),
 ((('carrot', 'soup'), 9), 'carrot soup'),
 ((('chocolate', 'torte'), 9), 'chocolate torte'),
 ((('game', 'hen'), 9), 'game hen'),
 ((('chicken', 'tomato'), 9), 'chicken tomato'),
 ((('cranberry', 'orange'), 9), 'cranberry orange'),
 ((('bundt', 'cake'), 9), 'bundt cake'),
 ((('salmon', 'horseradish'), 9), 'salmon horseradish'),
 ((('baby', 'greens'), 9), 'baby greens

### Extract trigrams

In [209]:
init_trigrams=[list(ngrams(x, 3)) for x in titles]

  """Entry point for launching an IPython kernel.


In [210]:
trigrams=[y for x in init_trigrams for y in x]

In [212]:
trigrams_count=[(x, trigrams.count(x)) for x in set(trigrams)]
trigrams_count.sort(key = lambda t: t[1])
trigrams_count=trigrams_count[::-1]

In [213]:
[(x,' '.join(x[0])) for x in trigrams_count[:200]]

[((('sugar', 'snap', 'peas'), 31), 'sugar snap peas'),
 ((('vanilla', 'ice', 'cream'), 28), 'vanilla ice cream'),
 ((('turkey', 'giblet', 'stock'), 20), 'turkey giblet stock'),
 ((('cake', 'cream', 'cheese'), 17), 'cake cream cheese'),
 ((('goat', 'cheese', 'salad'), 16), 'goat cheese salad'),
 ((('spicy', 'tomato', 'sauce'), 15), 'spicy tomato sauce'),
 ((('ice', 'cream', 'cake'), 13), 'ice cream cake'),
 ((('sugar', 'snap', 'pea'), 12), 'sugar snap pea'),
 ((('baby', 'bok', 'choy'), 12), 'baby bok choy'),
 ((('butternut', 'squash', 'soup'), 12), 'butternut squash soup'),
 ((('roast', 'leg', 'lamb'), 11), 'roast leg lamb'),
 ((('lemon', 'ice', 'cream'), 11), 'lemon ice cream'),
 ((('pico', 'de', 'gallo'), 11), 'pico de gallo'),
 ((('roast', 'pork', 'tenderloin'), 10), 'roast pork tenderloin'),
 ((('bittersweet', 'chocolate', 'sauce'), 10), 'bittersweet chocolate sauce'),
 ((('bell', 'pepper', 'sauce'), 10), 'bell pepper sauce'),
 ((('roast', 'pork', 'loin'), 10), 'roast pork loin'),
 

### Write function and create links

In [839]:
import os

In [845]:
b_list=os.listdir("images/bigram")
b_list=[x.split(".")[0] for x in b_list]

In [847]:
u_list=os.listdir("images/unigram")
u_list=[x.split(".")[0] for x in u_list]

In [865]:
def define_image(i):
    a=[x for x in init_bigrams[i] if ' '.join(x) in b_list]
    if len(a)>1:
        a=[(x, bigrams.count(x)) for x in a]
        a=max(a, key = lambda t: t[1])[0]
    elif len(a)==0:
        a=[x for x in titles[i] if x in u_list]
        if len(a)>1:
            a=[(x, init_unigrams.count(x)) for x in a]
            a=max(a, key = lambda t: t[1])[0]
    return a

In [871]:
[x for x in init_bigrams[13] if ' '.join(x) in b_list]

[('beef', 'tenderloin')]

In [872]:
data['image']=pd.Series([define_image(i) for i in data.index])

### Update top bigrams for empty lines

In [877]:
empty=list(filter(lambda x: len(data.loc[x].image)==0, data.index))

In [881]:
bigrams_empty=[init_bigrams[i] for i in empty]

In [882]:
bigrams=[y for x in bigrams_empty for y in x]

bigrams_count=[(x, bigrams.count(x)) for x in set(bigrams)]

bigrams_count.sort(key = lambda t: t[1])

bigrams_count=bigrams_count[::-1]

In [885]:
unigrams_empty=[titles[i] for i in empty]

unigrams=[y for x in unigrams_empty for y in x]

unigrams_count=[(x, unigrams.count(x)) for x in set(unigrams)]

unigrams_count.sort(key = lambda t: t[1])

unigrams_count=unigrams_count[::-1]

In [887]:
unigrams_count[:100]

[('sauce', 983),
 ('cheese', 436),
 ('chocolate', 328),
 ('cream', 327),
 ('lemon', 301),
 ('tomato', 290),
 ('shrimp', 275),
 ('rice', 267),
 ('butter', 235),
 ('pie', 231),
 ('garlic', 216),
 ('salsa', 215),
 ('pepper', 197),
 ('tart', 196),
 ('ginger', 189),
 ('orange', 187),
 ('apple', 165),
 ('bread', 160),
 ('vinaigrette', 160),
 ('bacon', 160),
 ('corn', 150),
 ('spinach', 150),
 ('onion', 149),
 ('onions', 145),
 ('beans', 145),
 ('syrup', 136),
 ('tomatoes', 135),
 ('spicy', 134),
 ('mint', 130),
 ('cranberry', 129),
 ('lime', 129),
 ('goat', 129),
 ('coconut', 126),
 ('eggs', 119),
 ('fruit', 116),
 ('vegetables', 114),
 ('yogurt', 111),
 ('mustard', 110),
 ('mushrooms', 110),
 ('raspberry', 109),
 ('pear', 108),
 ('chutney', 107),
 ('vegetable', 106),
 ('almond', 106),
 ('eggplant', 102),
 ('fennel', 102),
 ('relish', 100),
 ('strawberry', 100),
 ('ham', 98),
 ('mushroom', 97),
 ('bell', 97),
 ('asparagus', 95),
 ('steak', 94),
 ('zucchini', 94),
 ('dip', 92),
 ('peppers', 8

In [886]:
[(x,' '.join(x[0])) for x in unigrams_count[:100]]

[(('sauce', 983), 's a u c e'),
 (('cheese', 436), 'c h e e s e'),
 (('chocolate', 328), 'c h o c o l a t e'),
 (('cream', 327), 'c r e a m'),
 (('lemon', 301), 'l e m o n'),
 (('tomato', 290), 't o m a t o'),
 (('shrimp', 275), 's h r i m p'),
 (('rice', 267), 'r i c e'),
 (('butter', 235), 'b u t t e r'),
 (('pie', 231), 'p i e'),
 (('garlic', 216), 'g a r l i c'),
 (('salsa', 215), 's a l s a'),
 (('pepper', 197), 'p e p p e r'),
 (('tart', 196), 't a r t'),
 (('ginger', 189), 'g i n g e r'),
 (('orange', 187), 'o r a n g e'),
 (('apple', 165), 'a p p l e'),
 (('bread', 160), 'b r e a d'),
 (('vinaigrette', 160), 'v i n a i g r e t t e'),
 (('bacon', 160), 'b a c o n'),
 (('corn', 150), 'c o r n'),
 (('spinach', 150), 's p i n a c h'),
 (('onion', 149), 'o n i o n'),
 (('onions', 145), 'o n i o n s'),
 (('beans', 145), 'b e a n s'),
 (('syrup', 136), 's y r u p'),
 (('tomatoes', 135), 't o m a t o e s'),
 (('spicy', 134), 's p i c y'),
 (('mint', 130), 'm i n t'),
 (('cranberry', 12

In [878]:
len(empty)

10592

In [874]:
data['image']

0                     [turkey]
1                           []
2                         soup
3                           []
4                           []
5                           []
6         [(vegetable, salad)]
7                           []
8                       [beef]
9            [(potato, salad)]
10                          []
11            [(noodle, soup)]
12                      [cake]
13        [(beef, tenderloin)]
14                          []
15                      [soup]
16                          []
17          [(pork, shoulder)]
18                          []
19           [(potato, salad)]
20       [(watercress, salad)]
21                   [chicken]
22           [(fish, fillets)]
23                          []
24                          []
25                          []
26              [(egg, salad)]
27                          []
28           [(potato, salad)]
29                          []
                 ...          
20081                [mussels]
20082   

In [None]:

init_unigrams=[y for x in titles for y in x]