In [652]:
import pandas as pd
import numpy as np

### # Upload the dataset (should be on server)

In [962]:
data=pd.read_csv('data/data.csv')
data.drop('Unnamed: 0', axis=1, inplace=True)

In [953]:
ingredients=pd.read_csv('data/ingredients_short.csv')
ingredients.drop('Unnamed: 0', axis=1, inplace=True)

In [658]:
ingredients.head()

Unnamed: 0,stem,substitute 1,substitute 2,substitute 3,food type
0,cream,half-and-half,,,
1,flaxseed,,,,
2,farro,,,,
3,applesauce,,,,
4,cranberry,,,,fruit


In [659]:
# Upload the list of non_key ingredients
spices=list(pd.read_csv('data/spices.csv')['spices'])
spices=[x for x in spices if x in data.columns]
garnish=['parsley', 'dried parsley', 'cilantro', 'cilantro leaves', 'dill', 
         'celery leaves', 'chives', 'chocolate chips', 'sesame', 'black sesame seeds', 'sesame seeds']

In [660]:
# separate ingredients from non-ingredients 
non_ingredients=['meal','title','calories','protein','carbs','fats','sodium','cuisine', 'complexity']
non_ingredients.extend(spices)

In [661]:
group_keys=['pasta', 'mold cheese', 'soft cheese', 'brined cheese', 'medium cheese', 'hard cheese', 'cottage cheese', 'dry wine', 
            'liquer', 'white wine', 'red wine']
group=dict(zip(group_keys,
            [list(ingredients[ingredients['substitute 1']==key]['stem']) for key in group_keys]))

In [662]:
group_values=[y for x in group.values() for y in x]

### Inputs
 
grocery: list <br>

calories_max: int <br>
calories_min: int <br>
protein_min: int <br>
meal_type: breakfast, lunch, dinner, dessert, drink <br>
cuisine: 20 cuisines in the list <br>
complexity: easy, medium, hard <br>
n_additional_ingredients: int <br>


In [663]:
# Should be at home
home=['pepper', 'butter', 'olive oil', 'sugar', 'salt', 'water', 'lemon juice', 'dijon mustard', 'black pepper', 'ketchup']
grocery=['apple', 'milk', 'hazelnut', 'cucumber', 'bread', 'tomato', 
         'cod',  'rice', 'garlic', 'chicken', 'egg', 'feta', 'cream cheese', 'pasta']
grocery.extend(home)

In [664]:
# extend grocery for all available substitutes
ingredients_nongroup=[gr for gr in grocery if gr not in group_values]
ingredients_group=list(set([ingredients[ingredients.stem==gr]['substitute 1'].values[0] for gr in grocery if gr in group_values]))


In [665]:
for x in ingredients_nongroup:
    #extend to own substitutes
    idx=ingredients[ingredients['stem']==x].index
    a=[ing for ing in ingredients.loc[idx][['substitute 1', 'substitute 2', 'substitute 3']].values if type(ing)==str]
    if len(a)>0:
        grocery.extend(a)
    #check whether the ingredient in other products substitutes
    idx=ingredients[(ingredients['substitute 1']==x)|(ingredients['substitute 2']==x)|(ingredients['substitute 3']==x)].index
    if len(idx)>0:
        grocery.extend(list(ingredients.loc[idx]['stem']))
        
for x in ingredients_group:
    grocery.extend(group[x])
    

### Function

In [704]:
# quick calculation of additional products
def products_to_add(options, i):
    s=options.loc[i][needed]
    return ', '.join(list(s[s==1].index))

def return_recipes(calories_max=800, 
                   calories_min=500,
                   protein_min=25,
                   meal_type='lunch',
                   cuisine='non_specified',
                   complexity='easy',
                   n_additional_ingredients=4,
                   grocery=grocery):
    if meal_type in ['lunch', 'dinner']: meal_type='lunch/dinner'
    
    #filter basic parameters 
    options=data[(data.meal==meal_type)&
                       (data.calories<calories_max)&
                       (data.calories>calories_min)&
                       (data.protein>protein_min)&
                       (data.complexity==complexity)
                      ]
    if cuisine!='non_specified':
        options=options[options.cuisine==cuisine]
        
    #filter based on grocery
    
    # drop columns with unused ingredients
    ingredients=options.drop(non_ingredients, axis=1).columns
    options.drop([x for x in ingredients if sum(options[x])==0], axis=1, inplace=True)
    
    #update ingredients
    ingredients=options.drop(non_ingredients, axis=1).columns
    
    ## products outside the groccery list   
        
    needed=[x for x in ingredients if x not in grocery]

    # Keep only the recipes if the number of additional key ingredients doesnt exceed 3
    sums=options[needed].sum(axis=1)
    ind=[x for x in sums.index if sums.loc[x]<=n_additional_ingredients]
    options=options.loc[ind]
    
    recommendation=options.loc[ind][['title', 'calories', 'protein', 'carbs', 'fats']]

    recommendation['products to add']=pd.Series([products_to_add(options, i) for i in ind], index=ind)

    recommendation['# of products to add']=pd.Series([len([y for y in recommendation.loc[i]['products to add'].split(",")  
                                                       if y not in non_ingredients]) for i in ind], index=ind)
    
    return recommendation
    

In [715]:
def score_nutrition(x, nutr_values):
    # less protein - penalized, coefficient 2
    # more carbs than recommended - penalized, coefficient 2
    # more fat than recommended - penalized, coefficient 1
    
    penalty=-2*(min(0, nutr_values['protein']-x['protein']))+\
    2*(max(0, nutr_values['carbs']-x['carbs']))+max(0, nutr_values['fats']-x['fats'])
    
    return penalty

In [795]:
from sklearn.metrics.pairwise import cosine_similarity

def score_preference(i, user_score):   
    
    weighted_score=np.mean([cosine_similarity(np.asarray(temp_data.loc[i].reshape(1, -1)), 
                                              np.asarray(temp_data.loc[key]).reshape(1, -1))*user_score[key] 
                            for key in user_score.keys()])
    return weighted_score
    

### Example

In [709]:
### Example of recommended nutrition values (in user's profile)
nutr_values={'protein':30,
'carbs':100,
'fats':30}
nutr_values['calories']=nutr_values['fats']*9+(nutr_values['protein']+nutr_values['carbs'])*4

In [829]:
### Example of certain recipes scored (in user's profile): 5-high, 1-low
### {index: score}
user_score={461:5, 1693:4, 400:5, 189:2, 2704:3, 4867:3, 5470:4, 159:3, 1588:4, 447:3, 26:2, 101:1}

In [723]:
# Calculate all suggested meals after filtering

result=return_recipes(calories_max=calories*1.2, 
                   calories_min=calories*0.8,
                   protein_min=protein*0.8,
                   meal_type='lunch',
                   cuisine='italian',
                   complexity='medium',
                   n_additional_ingredients=4,
                   grocery=grocery)


In [671]:
np.sqrt()

3.0

In [835]:
result['nutrition penalty']=pd.Series([score_nutrition(result.loc[i], nutr_values) for i in result.index], index=result.index)

In [836]:
result.sort_values("nutrition penalty").head(5)

Unnamed: 0,title,calories,protein,carbs,fats,products to add,# of products to add,nutrition penalty,user score
1155,Strozzapreti Carbonara With Radishes,666.0,25.0,141.5,32.0,shallot,1,0.0,0.858941
19046,Lasagne Rolls with Roasted Tomato and Eggplant,698.0,27.0,147.5,41.0,"eggplant, oil",2,0.0,0.996953
1666,Spicy Spinach Linguine with Olive Oil and Garlic,708.0,27.0,150.0,35.0,"spinach, ground red pepper, pecorino cheese, d...",4,0.0,0.865919
1754,Baked Rigatoni alla Norma,930.0,30.0,202.5,56.0,"onion, eggplant, marinara sauce, mozzarella",4,0.0,0.696199
17034,Florentine White Bean Soup with Pasta,676.0,28.0,141.0,34.0,"onion, bean",2,0.0,0.915546


In [803]:
### Create the temp_data dataset upfront
meal_type='lunch' ## in flask application should come from the input field
if meal_type in ['lunch', 'dinner']: 
    meal_type='lunch/dinner'

temp_data=data[data.meal==meal_type]
temp_data=temp_data.drop(non_ingredients, axis=1)

In [830]:
# Create the temp user_score dict upfront
keys=list(filter(lambda x: data.loc[x].meal=="lunch/dinner", user_score.keys()))
temp_user_score=dict(zip(keys, [user_score[key] for key in keys]))

In [833]:
result['user score']=pd.Series([score_preference(i, temp_user_score) for i in result.index], index=result.index)

  import sys


In [837]:
result.sort_values("user score", ascending=False).head(5)

Unnamed: 0,title,calories,protein,carbs,fats,products to add,# of products to add,nutrition penalty,user score
12205,Rosemary and Garlic Lamb Chops,889.0,31.0,191.25,83.0,meat,1,2.0,1.267792
18112,Pork Chops with Vinegar Peppers,675.0,36.0,132.75,57.0,,1,12.0,1.224595
6778,Chicken Muffuletta Salad,668.0,42.0,125.0,50.0,"ice, olives",2,24.0,1.221013
12438,Roast Leg of Lamb with Rosemary and Garlic,862.0,73.0,140.25,61.0,"fat, lemon peel",2,86.0,1.212622
7314,Roast Chicken with Rosemary-Garlic Paste,936.0,78.0,156.0,66.0,"juniper berries, sea salt",2,96.0,1.165979


## Find pictures for the most common types of foods

In [1030]:
titles=list(data['title'])

In [1031]:
titles=[x.replace(',','') for x in titles]

In [1032]:
titles=[x.lower().split() for x in titles]

### Extract top unigrams

In [1035]:
from nltk import word_tokenize
from nltk.collocations import BigramCollocationFinder

In [1036]:
from nltk import word_tokenize 
from nltk.util import ngrams

In [1037]:
init_unigrams=[y for x in titles for y in x]

In [1038]:
unigrams_count=[(x, init_unigrams.count(x)) for x in set(init_unigrams)]

In [1039]:
unigrams_count.sort(key = lambda t: t[1])

In [1040]:
unigrams_count_d=dict(zip([x[0] for x in unigrams_count], [x[1] for x in unigrams_count]))

In [1041]:
unigrams_count[::-1][:200]

[('with', 5933),
 ('and', 5307),
 ('sauce', 1189),
 ('salad', 1095),
 ('chicken', 872),
 ('cream', 655),
 ('cheese', 609),
 ('grilled', 479),
 ('soup', 466),
 ('roasted', 445),
 ('red', 437),
 ('lemon', 423),
 ('tomato', 420),
 ('chocolate', 393),
 ('pork', 376),
 ('cake', 374),
 ('potato', 333),
 ('green', 319),
 ('shrimp', 314),
 ('garlic', 304),
 ('lamb', 300),
 ('butter', 296),
 ('orange', 280),
 ('pepper', 276),
 ('rice', 272),
 ('potatoes', 272),
 ('beef', 270),
 ('roast', 269),
 ('in', 264),
 ('pie', 263),
 ('apple', 257),
 ('onion', 250),
 ('dressing', 244),
 ('bacon', 244),
 ('turkey', 243),
 ('corn', 236),
 ('salmon', 227),
 ('tomatoes', 224),
 ('ginger', 220),
 ('vinaigrette', 215),
 ('tart', 213),
 ('bean', 213),
 ('fennel', 211),
 ('goat', 211),
 ('spicy', 210),
 ('spinach', 202),
 ('sweet', 201),
 ('onions', 199),
 ('spiced', 198),
 ('white', 197),
 ('of', 193),
 ('mustard', 190),
 ('fresh', 187),
 ('mushroom', 187),
 ('ice', 184),
 ('baked', 182),
 ('pasta', 180),
 ('sau

### Extract bigrams

In [1042]:
from nltk import word_tokenize
from nltk.collocations import BigramCollocationFinder

In [1043]:
from nltk import word_tokenize 
from nltk.util import ngrams

In [1044]:
init_bigrams=[list(ngrams(x, 2)) for x in titles]

In [1045]:
bigrams=[y for x in init_bigrams for y in x]

In [1046]:
bigrams_count=[(x, bigrams.count(x)) for x in set(bigrams)]

In [1047]:
bigrams_count.sort(key = lambda t: t[1])

In [1048]:
bigrams_count=bigrams_count[::-1]

### Extract trigrams

In [1057]:
init_trigrams=[list(ngrams(x, 3)) for x in titles]

  """Entry point for launching an IPython kernel.


In [1058]:
trigrams=[y for x in init_trigrams for y in x]

In [1059]:
trigrams_count=[(x, trigrams.count(x)) for x in set(trigrams)]
trigrams_count.sort(key = lambda t: t[1])
trigrams_count=trigrams_count[::-1]

In [1060]:
[(x,' '.join(x[0])) for x in trigrams_count[:200]]

[((('and', 'goat', 'cheese'), 71), 'and goat cheese'),
 ((('pork', 'chops', 'with'), 59), 'pork chops with'),
 ((('of', 'lamb', 'with'), 55), 'of lamb with'),
 ((('goat', 'cheese', 'and'), 54), 'goat cheese and'),
 ((('with', 'goat', 'cheese'), 48), 'with goat cheese'),
 ((('red', 'bell', 'pepper'), 48), 'red bell pepper'),
 ((('leg', 'of', 'lamb'), 39), 'leg of lamb'),
 ((('lamb', 'chops', 'with'), 39), 'lamb chops with'),
 ((('pork', 'tenderloin', 'with'), 38), 'pork tenderloin with'),
 ((('with', 'lemon', 'and'), 38), 'with lemon and'),
 ((('roast', 'turkey', 'with'), 38), 'roast turkey with'),
 ((('roast', 'chicken', 'with'), 38), 'roast chicken with'),
 ((('potato', 'salad', 'with'), 36), 'potato salad with'),
 ((('cream', 'cheese', 'frosting'), 35), 'cream cheese frosting'),
 ((('rack', 'of', 'lamb'), 34), 'rack of lamb'),
 ((('with', 'garlic', 'and'), 31), 'with garlic and'),
 ((('and', 'pine', 'nuts'), 30), 'and pine nuts'),
 ((('and', 'bell', 'pepper'), 30), 'and bell pepper')

### Write function and create links

In [839]:
import os

In [1191]:
b_list=os.listdir("images/bigram")
b_list=[x.split(".")[0] for x in b_list]

In [1192]:
u_list=os.listdir("images/unigram")
u_list=[x.split(".")[0] for x in u_list]

In [1193]:
def define_image(i):
    a=[x for x in init_bigrams[i] if ' '.join(x) in b_list]
    if len(a)>1:
        a=[(x, bigrams.count(x)) for x in a]
        a=min(a, key = lambda t: t[1])[0]
    elif len(a)==0:
        a=[x for x in titles[i] if x in u_list]
        if len(a)>1:
            a=[(x, init_unigrams.count(x)) for x in a]
            a=min(a, key = lambda t: t[1])[0]
    if (type(a)==list)&(len(a)>0):
        a=a[0]
    return a

In [1194]:
data['image']=pd.Series([define_image(i) for i in data.index])

In [1163]:
def return_link(x):
    if type(x)==str:
        return str('images/unigram/'+x+".jpg")
    elif type(x)==tuple:
        return str('images/bigram/'+' '.join(x)+".jpg")

In [1197]:
data['image_link']=pd.Series([return_link(x) for x in list(data.image)])

In [1172]:
data['titles_l']=pd.Series([x.lower() for x in data['title']])

In [1190]:
data[['titles_l','image_link']][900:950]

Unnamed: 0,titles_l,image_link
900,ceviche acapulqueño,
901,easy slow-cooker pot roast,images/bigram/pot roast.jpg
902,jerk pork,images/unigram/pork.jpg
903,fudgy chocolate-raspberry bars,
904,sweet potato and molasses muffins,images/unigram/muffins.jpg
905,sweet chocolate glaze,
906,roasted sweet potatoes and onions with rosemar...,images/unigram/potatoes.jpg
907,charred onion and fennel soup,images/unigram/soup.jpg
908,strawberry ice cream,images/bigram/ice cream.jpg
909,venetian liver and onions,


In [1198]:
data.drop('titles_l', axis=1, inplace=True)

In [1199]:
data.to_csv('data/data.csv', encoding='UTF-8')

### Update top bigrams for empty lines

In [1195]:
empty=list(filter(lambda x: len(data.loc[x].image)==0, data.index))

In [1196]:
len(empty)

2647

In [1106]:
bigrams_empty=[init_bigrams[i] for i in empty]

In [1107]:
bigrams=[y for x in bigrams_empty for y in x]

bigrams_count=[(x, bigrams.count(x)) for x in set(bigrams)]

bigrams_count.sort(key = lambda t: t[1])

bigrams_count=bigrams_count[::-1]

In [1108]:
unigrams_empty=[titles[i] for i in empty]

unigrams=[y for x in unigrams_empty for y in x]

unigrams_count=[(x, unigrams.count(x)) for x in set(unigrams)]

unigrams_count.sort(key = lambda t: t[1])

unigrams_count=unigrams_count[::-1]

In [1109]:
len(init_bigrams)

13841

In [1110]:
[(' '.join(x[0]), [(i, data.title[i].lower()) for i in empty if x[0] in init_bigrams[i]][:20]) for x in bigrams_count[:50]]

[('and mint',
  [(1121, 'baby carrot crudites with green onion and mint dip '),
   (1881, 'melon and mint tabbouleh '),
   (2618, 'oranges and pineapple with orange-flower water and mint '),
   (2740, 'blended vodka daiquiris with lime and mint '),
   (3469, 'watermelon with parmesan and mint '),
   (4502, 'quinoa with corn, scallions, and mint '),
   (5383, 'chickpea, garlic, and mint topping '),
   (5525, 'crab, mango, and mint nori rolls '),
   (7609, 'citrus "jell-o" with honey and mint '),
   (7916, 'iced vodka with cucumber, lemon, and mint '),
   (9008, 'flan with pink grapefruit and mint syrup '),
   (10134, 'frozen peaches with strawberries and mint '),
   (10915, 'grilled artichokes with olive oil, lemon, and mint '),
   (11000, 'melon and blueberry coupe with white wine, vanilla and mint '),
   (11495, 'bow ties with peas, lemon, and mint '),
   (11653, 'artichoke bottoms braised in olive oil with garlic and mint '),
   (11896, 'limoncello and mint sparklers '),
   (12331,
 

In [1115]:
[(x, list(data.loc[empty]['meal']).count(x)) for x in set(data.loc[empty]['meal'])]

[('lunch/dinner', 1783),
 ('breakfast', 339),
 ('drink', 698),
 ('dessert', 567),
 ('snack', 16)]

In [1114]:
set(data.loc[empty]['meal'])

{'breakfast', 'dessert', 'drink', 'lunch/dinner', 'snack'}