In [1]:
import pandas as pd
import numpy as np
import gensim.downloader
import re
from gensim.models import word2vec, phrases
from gensim.parsing.preprocessing import remove_stopwords, strip_punctuation, strip_numeric,\
                    strip_non_alphanum, strip_multiple_whitespaces, strip_short

from textblob import TextBlob, Word
import collections
from numpy import dot
from numpy.linalg import norm

## Import & Clean Data

In [2]:
recipe_df = pd.read_csv('../../data/kaggle_recipes/RAW_recipes.csv')

In [3]:
recipe_df.head()

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients
0,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"['60-minutes-or-less', 'time-to-make', 'course...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,"['winter squash', 'mexican seasoning', 'mixed ...",7
1,a bit different breakfast pizza,31490,30,26278,2002-06-17,"['30-minutes-or-less', 'time-to-make', 'course...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",9,"['preheat oven to 425 degrees f', 'press dough...",this recipe calls for the crust to be prebaked...,"['prepared pizza crust', 'sausage patty', 'egg...",6
2,all in the kitchen chili,112140,130,196586,2005-02-25,"['time-to-make', 'course', 'preparation', 'mai...","[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]",6,"['brown ground beef in large pot', 'add choppe...",this modified version of 'mom's' chili was a h...,"['ground beef', 'yellow onions', 'diced tomato...",13
3,alouette potatoes,59389,45,68585,2003-04-14,"['60-minutes-or-less', 'time-to-make', 'course...","[368.1, 17.0, 10.0, 2.0, 14.0, 8.0, 20.0]",11,['place potatoes in a large pot of lightly sal...,"this is a super easy, great tasting, make ahea...","['spreadable cheese with garlic and herbs', 'n...",11
4,amish tomato ketchup for canning,44061,190,41706,2002-10-25,"['weeknight', 'time-to-make', 'course', 'main-...","[352.9, 1.0, 337.0, 23.0, 3.0, 0.0, 28.0]",5,['mix all ingredients& boil for 2 1 / 2 hours ...,my dh's amish mother raised him on this recipe...,"['tomato juice', 'apple cider vinegar', 'sugar...",8


In [4]:
#Extracting only Calories info from nutrition column
def nutrition_clean(i):
    i=i.strip("[]")
    i=i.split(", ")
    i = i[0]
    return i
    
recipe_df["calories_kcal"] = recipe_df["nutrition"].apply(nutrition_clean).astype("float64")

In [5]:
recipe_df.dtypes

name               object
id                  int64
minutes             int64
contributor_id      int64
submitted          object
tags               object
nutrition          object
n_steps             int64
steps              object
description        object
ingredients        object
n_ingredients       int64
calories_kcal     float64
dtype: object

In [6]:
#ingredients, and tags and steps are in string format, need to convert strings to list
def string_to_list(string):
    st = string.strip("[]").split(", ")
    s = [i.strip("''") for i in st]
    return s

In [7]:
recipe_df["tags_list"] = recipe_df["tags"].apply(string_to_list)

In [8]:
recipe_df.head(2)

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients,calories_kcal,tags_list
0,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"['60-minutes-or-less', 'time-to-make', 'course...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,"['winter squash', 'mexican seasoning', 'mixed ...",7,51.5,"[60-minutes-or-less, time-to-make, course, mai..."
1,a bit different breakfast pizza,31490,30,26278,2002-06-17,"['30-minutes-or-less', 'time-to-make', 'course...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",9,"['preheat oven to 425 degrees f', 'press dough...",this recipe calls for the crust to be prebaked...,"['prepared pizza crust', 'sausage patty', 'egg...",6,173.4,"[30-minutes-or-less, time-to-make, course, mai..."


In [9]:
tag_list = []
for x in recipe_df["tags_list"]:
    for y in x:
        tag_list.append(y)

In [10]:
tag_list

['60-minutes-or-less',
 'time-to-make',
 'course',
 'main-ingredient',
 'cuisine',
 'preparation',
 'occasion',
 'north-american',
 'side-dishes',
 'vegetables',
 'mexican',
 'easy',
 'fall',
 'holiday-event',
 'vegetarian',
 'winter',
 'dietary',
 'christmas',
 'seasonal',
 'squash',
 '30-minutes-or-less',
 'time-to-make',
 'course',
 'main-ingredient',
 'cuisine',
 'preparation',
 'occasion',
 'north-american',
 'breakfast',
 'main-dish',
 'pork',
 'american',
 'oven',
 'easy',
 'kid-friendly',
 'pizza',
 'dietary',
 'northeastern-united-states',
 'meat',
 'equipment',
 'time-to-make',
 'course',
 'preparation',
 'main-dish',
 'chili',
 'crock-pot-slow-cooker',
 'dietary',
 'equipment',
 '4-hours-or-less',
 '60-minutes-or-less',
 'time-to-make',
 'course',
 'main-ingredient',
 'preparation',
 'occasion',
 'side-dishes',
 'eggs-dairy',
 'potatoes',
 'vegetables',
 'oven',
 'easy',
 'dinner-party',
 'holiday-event',
 'easter',
 'cheese',
 'stove-top',
 'dietary',
 'christmas',
 'new-ye

## Analyze distribution of ingredients & tags

In [11]:
tag_df = pd.DataFrame(tag_list ,columns = {"tags"})
tag_df["n_tags"] = 1
tag_df.groupby("tags").count().sort_values("n_tags", ascending = False)

Unnamed: 0_level_0,n_tags
tags,Unnamed: 1_level_1
preparation,230546
time-to-make,225326
course,218148
main-ingredient,170446
dietary,165091
...,...
marinara-sauce,1
middle-eastern-main-dish,1
breakfast-casseroles,1
high-in-something-diabetic-friendly,1


In [12]:
recipe_df["ingredients_list"] = recipe_df["ingredients"].apply(string_to_list)

In [13]:
ingred_list =[]
for x in recipe_df["ingredients_list"]:
    for y in x:
        ingred_list.append(y)

In [14]:
ingred_df = pd.DataFrame(ingred_list, columns=["ingredients"])
ingred_df["n_ingred"] = 1
ingred_order = ingred_df.groupby("ingredients").sum().sort_values("n_ingred", ascending=False)
ingred_order

Unnamed: 0_level_0,n_ingred
ingredients,Unnamed: 1_level_1
salt,85746
butter,54975
sugar,44535
onion,39065
water,34914
...,...
fat free reduced-sugar cherry yogurt,1
fat free raspberry pecan salad dressing,1
reduced-fat beef hot dog,1
reduced-fat beef hot dogs,1


In [15]:
ingred_order[ingred_order['n_ingred']>1000]

Unnamed: 0_level_0,n_ingred
ingredients,Unnamed: 1_level_1
salt,85746
butter,54975
sugar,44535
onion,39065
water,34914
...,...
salmon fillets,1024
pork chops,1024
red bell peppers,1015
elbow macaroni,1007


In [16]:
recipe_df["tags_list"] = recipe_df["tags"].apply(string_to_list)

In [17]:
recipe_df.dtypes

name                 object
id                    int64
minutes               int64
contributor_id        int64
submitted            object
tags                 object
nutrition            object
n_steps               int64
steps                object
description          object
ingredients          object
n_ingredients         int64
calories_kcal       float64
tags_list            object
ingredients_list     object
dtype: object

In [18]:
recipe_df.head(1)

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients,calories_kcal,tags_list,ingredients_list
0,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"['60-minutes-or-less', 'time-to-make', 'course...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,"['winter squash', 'mexican seasoning', 'mixed ...",7,51.5,"[60-minutes-or-less, time-to-make, course, mai...","[winter squash, mexican seasoning, mixed spice..."


## Clean ingredients column

In [121]:
def clean_words(list_o_words):
    
    output_list = []
    for word in list_o_words:

        ing = remove_stopwords(word)
        ing = strip_numeric(ing)
        ing = re.sub(r'\(.*oz.\)|(®)|(.*ed)|(.*ly)|boneless|skinless|chunks|fresh|large|cook drain|green|frozen|ground','',ing).strip()
        ing = strip_short(ing,2)
        ing = strip_multiple_whitespaces(ing)
        ing = strip_punctuation(ing)
        ing = strip_non_alphanum(ing)
        if word != 'asparagus':
            ing = (" ".join(TextBlob(ing).words.singularize()))
        output_list.append(ing)
            
    return output_list

In [122]:
rdf_test = recipe_df[:10]

In [123]:
recipe_df['ingredients_cleaned'] = recipe_df['ingredients_list'].apply(clean_words)

In [124]:
recipe_df.head()

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients,calories_kcal,tags_list,ingredients_list,ingredients_cleaned
0,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"['60-minutes-or-less', 'time-to-make', 'course...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,"['winter squash', 'mexican seasoning', 'mixed ...",7,51.5,"[60-minutes-or-less, time-to-make, course, mai...","[winter squash, mexican seasoning, mixed spice...","[winter squash, mexican seasoning, spice, hone..."
1,a bit different breakfast pizza,31490,30,26278,2002-06-17,"['30-minutes-or-less', 'time-to-make', 'course...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",9,"['preheat oven to 425 degrees f', 'press dough...",this recipe calls for the crust to be prebaked...,"['prepared pizza crust', 'sausage patty', 'egg...",6,173.4,"[30-minutes-or-less, time-to-make, course, mai...","[prepared pizza crust, sausage patty, eggs, mi...","[pizza crust, sausage patty, egg, milk, salt p..."
2,all in the kitchen chili,112140,130,196586,2005-02-25,"['time-to-make', 'course', 'preparation', 'mai...","[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]",6,"['brown ground beef in large pot', 'add choppe...",this modified version of 'mom's' chili was a h...,"['ground beef', 'yellow onions', 'diced tomato...",13,269.8,"[time-to-make, course, preparation, main-dish,...","[ground beef, yellow onions, diced tomatoes, t...","[beef, yellow onion, tomato, tomato paste, tom..."
3,alouette potatoes,59389,45,68585,2003-04-14,"['60-minutes-or-less', 'time-to-make', 'course...","[368.1, 17.0, 10.0, 2.0, 14.0, 8.0, 20.0]",11,['place potatoes in a large pot of lightly sal...,"this is a super easy, great tasting, make ahea...","['spreadable cheese with garlic and herbs', 'n...",11,368.1,"[60-minutes-or-less, time-to-make, course, mai...","[spreadable cheese with garlic and herbs, new ...","[spreadable cheese garlic herb, new potato, sh..."
4,amish tomato ketchup for canning,44061,190,41706,2002-10-25,"['weeknight', 'time-to-make', 'course', 'main-...","[352.9, 1.0, 337.0, 23.0, 3.0, 0.0, 28.0]",5,['mix all ingredients& boil for 2 1 / 2 hours ...,my dh's amish mother raised him on this recipe...,"['tomato juice', 'apple cider vinegar', 'sugar...",8,352.9,"[weeknight, time-to-make, course, main-ingredi...","[tomato juice, apple cider vinegar, sugar, sal...","[tomato juice, apple cider vinegar, sugar, sal..."


In [125]:
list_ing = []

for index, row in recipe_df.iterrows():
    list_ing.append(row['ingredients_cleaned'])


## Make word2vec embeddings

In [51]:
model = word2vec.Word2Vec(list_ing, \
            vector_size=300, min_count=0, \
            window=10)

In [53]:
similar_words = {search_term: [item[0] for item in model.wv.most_similar([search_term], topn=5)]
                  for search_term in ['egg','mango','bread', 'rice']}
similar_words

{'egg': ['egg white',
  'extra egg',
  'egg yolk',
  'egg substitute',
  'chocolate mint kiss'],
 'mango': ['papaya',
  'jicama',
  'caribbean jerk marinade',
  'kiwi fruit',
  'grapefruit'],
 'bread': ['white bread',
  'wheat bread',
  'french bread',
  'toast',
  'stale bread'],
 'rice': ['long grain rice',
  'white rice',
  'long grain white rice',
  'basmatus rice',
  'noodle']}

In [54]:
def ingredients_vector(list_ingredients):
    final_vector = np.zeros(300)
    
    for ingredient in list_ingredients:
        final_vector += model.wv[ingredient]
    
    return final_vector

In [56]:
recipe_vector_dict = {}

In [95]:
recipe_df_dict = recipe_df.to_dict(orient='index')

In [99]:
for index, value in recipe_df_dict.items():
    print(recipe_df_dict[index]['name'], recipe_df_dict[index]['ingredients_cleaned'])
    recipe_df_dict[index]['vector'] = ingredients_vector(recipe_df_dict[index]['ingredients_cleaned'])

arriba   baked winter squash mexican style ['winter squash', 'mexican seasoning', 'spice', 'honey', 'butter', 'olive oil', 'salt']
a bit different  breakfast pizza ['pizza crust', 'sausage patty', 'egg', 'milk', 'salt pepper', 'cheese']
all in the kitchen  chili ['beef', 'yellow onion', 'tomato', 'tomato paste', 'tomato soup', 'rotel tomato', 'kidney bean', 'water', 'chili powder', 'cumin', 'salt', 'lettuce', 'dar cheese']
alouette  potatoes ['spreadable cheese garlic herb', 'new potato', 'shallot', 'parsley', 'tarragon', 'olive oil', 'wine vinegar', 'salt', 'pepper', 'bell pepper', 'yellow bell pepper']
amish  tomato ketchup  for canning ['tomato juice', 'apple cider vinegar', 'sugar', 'salt', 'pepper', 'clove oil', 'cinnamon oil', 'dry mustard']
apple a day  milk shake ['milk', 'vanilla ice cream', 'apple juice concentrate', 'apple']
aww  marinated olives ['', 'olife', 'ripe olife', 'garlic', 'peppercorn', 'orange rind', 'orange juice', 'chile', 'extra virgin olive oil']
backyard sty

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



spiced beer batter ['plain flmy', 'salt', 'curry powder', 'cumin', 'chili', 'olive oil', 'beer', 'egg white']
spiced beer jelly ['dark beer', 'granny smith apple', 'water', 'sugar', 'lemon', 'juice', 'orange', 'zest', 'cinnamon stick', 'cardamom']
spiced beetroot ['beetroot', 'cayenne pepper', 'paprika', 'cider vinegar', 'honey', 'black pepper', 'greek yogurt']
spiced beets ['beet', 'vinegar', 'sugar', 'black pepper', 'bay leaf', 'clove']
spiced beets  refrigerater pickles ['beet', 'yellow onion', 'apple cider vinegar', 'water', 'sugar', 'brown sugar', 'cinnamon stick', 'clove', 'allspice berry', 'pickling spice', 'salt']
spiced black bean soup ['onion', 'bell pepper', 'roma tomato', 'celery', 'garlic clove', 'ginger', 'olive oil', 'black bean', 'vegetable broth', 'oregano', 'cumin', 'celery salt', 'cilantro', 'thyme', 'allspice', 'nutmeg', 'clove', 'nonfat smy cream', 'cilantro']
spiced black beans ['black bean', 'bacon', 'garlic clove', 'onion', 'bell pepper', 'jalapeno', 'celery', '

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [97]:
recipe_df_dict

{0: {'name': 'arriba   baked winter squash mexican style',
  'id': 137739,
  'minutes': 55,
  'contributor_id': 47892,
  'submitted': '2005-09-16',
  'tags': "['60-minutes-or-less', 'time-to-make', 'course', 'main-ingredient', 'cuisine', 'preparation', 'occasion', 'north-american', 'side-dishes', 'vegetables', 'mexican', 'easy', 'fall', 'holiday-event', 'vegetarian', 'winter', 'dietary', 'christmas', 'seasonal', 'squash']",
  'nutrition': '[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]',
  'n_steps': 11,
  'steps': "['make a choice and proceed with recipe', 'depending on size of squash , cut into half or fourths', 'remove seeds', 'for spicy squash , drizzle olive oil or melted butter over each cut squash piece', 'season with mexican seasoning mix ii', 'for sweet squash , drizzle melted honey , butter , grated piloncillo over each cut squash piece', 'season with sweet mexican spice mix', 'bake at 350 degrees , again depending on size , for 40 minutes up to an hour , until a fork can easily pierc

## Return best recipe based on cosine similarity

In [58]:
def np_cos_sim(a,b):
    return dot(a, b)/(norm(a)*norm(b))

In [107]:
def return_best_recipe(list_ing):
    iv = ingredients_vector(list_ing)
    top_score = -2
    top_recipe = None
    for key, value in recipe_df_dict.items():
        if np_cos_sim(iv,recipe_df_dict[key]['vector']) > top_score:
            top_recipe = recipe_df_dict[key]
            top_score = np_cos_sim(iv,recipe_df_dict[key]['vector'])
#             print(key,np_cos_sim(iv,value))

    return top_recipe

In [118]:
return_best_recipe(["onion", "garlic", "tomato", "celery", "carrot", "potato", "zucchini", "bell pepper",\
                    "ginger", "mushroom" ,"cucumber", "cabbage", "lettuce", "spinach", "leek",\
                   "pumpkin", "asparagu", "eggplant", "broccoli", "cauliflower","lemon", "orange", "apple", "banana", "pineapple", "cranberry", "strawberry", \
                    "peach", "avocado", "kiwi", "peach", "lime"])['ingredients_cleaned']

['cucumber', 'granny smith apple', 'carrot', 'parsley', 'lemon', 'gingerroot']