In [1]:
import pandas as pd
import gensim
from gensim.models import Word2Vec
import re
import numpy as np
import statistics
import math

### Ingredient Similarity
### Reading file recipes_in into a dataframe 'recipes_in'

In [29]:
# Load the cleansed recipe table produced by the data-cleansing step and
# preview its first five rows.
recipes_in = pd.read_parquet(
    path=r'../01_DataCleansing/recipes_in.parquet',
)
recipes_in.head(5)

Unnamed: 0_level_0,nutrition,minutes,techniques,cuisine,meal_of_day,ingredients
recipe_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
137739,"{'calories': 51.5, 'carbohydrates': 4.0, 'prot...",55,"{'bake': 1, 'barbecue': 0, 'blanch': 0, 'blend...",north-american,side-dishes,"[winter squash, mexican seasoning, mixed spice..."
31490,"{'calories': 173.4, 'carbohydrates': 1.0, 'pro...",30,"{'bake': 1, 'barbecue': 0, 'blanch': 0, 'blend...",north-american,breakfast,"[prepared pizza crust, sausage patty, eggs, mi..."
112140,"{'calories': 269.8, 'carbohydrates': 5.0, 'pro...",130,"{'bake': 0, 'barbecue': 0, 'blanch': 0, 'blend...",NO CUISINE,main-dish,"[ground beef, yellow onions, diced tomatoes, t..."
59389,"{'calories': 368.1, 'carbohydrates': 20.0, 'pr...",45,"{'bake': 1, 'barbecue': 0, 'blanch': 0, 'blend...",NO CUISINE,side-dishes,"[spreadable cheese with garlic and herbs, new ..."
44061,"{'calories': 352.9, 'carbohydrates': 28.0, 'pr...",190,"{'bake': 0, 'barbecue': 0, 'blanch': 0, 'blend...",north-american,NO MEAL,"[tomato juice, apple cider vinegar, sugar, sal..."


### Create the corpus required to train the gensim Word2Vec model
The corpus is a list of ingredient lists, where each ingredient list comes from one recipe.

In [30]:
# Build the Word2Vec training corpus: one "sentence" per recipe, where each
# token is a complete ingredient name (e.g. 'olive oil'), not a single word.
# Iterating the 'ingredients' column directly avoids constructing a full row
# Series for every recipe, which a positional iloc[i] lookup does on each pass.
ingredient_list = [
    recipe_ingredients.tolist()
    for recipe_ingredients in recipes_in['ingredients']
]
ingredient_list[0:5]

[['winter squash',
  'mexican seasoning',
  'mixed spice',
  'honey',
  'butter',
  'olive oil',
  'salt'],
 ['prepared pizza crust',
  'sausage patty',
  'eggs',
  'milk',
  'salt and pepper',
  'cheese'],
 ['ground beef',
  'yellow onions',
  'diced tomatoes',
  'tomato paste',
  'tomato soup',
  'rotel tomatoes',
  'kidney beans',
  'water',
  'chili powder',
  'ground cumin',
  'salt',
  'lettuce',
  'cheddar cheese'],
 ['spreadable cheese with garlic and herbs',
  'new potatoes',
  'shallots',
  'parsley',
  'tarragon',
  'olive oil',
  'red wine vinegar',
  'salt',
  'pepper',
  'red bell pepper',
  'yellow bell pepper'],
 ['tomato juice',
  'apple cider vinegar',
  'sugar',
  'salt',
  'pepper',
  'clove oil',
  'cinnamon oil',
  'dry mustard']]

### Number of Ingredients We are Working with

In [31]:
# Vocabulary: every distinct ingredient name that occurs in any recipe.
ingredients = list({name for recipe in ingredient_list for name in recipe})
len(ingredients)

14942

### Train Word2Vec Models

Use default parameters, except `min_count=1` so that every ingredient (even one that appears only once) gets a vector.
<br>Reference: https://towardsdatascience.com/nlp-101-word2vec-skip-gram-and-cbow-93512ee24314

In [32]:
# Train on the per-recipe ingredient lists. min_count=1 keeps even ingredients
# that occur once, so every ingredient in the corpus can be looked up later
# when recipes are vectorized.
model = Word2Vec(sentences=ingredient_list, min_count=1)


### Now Each Ingredient is a Vector

In [33]:
# Look up the learned embedding with the subscript operator rather than
# calling the __getitem__ dunder directly; the returned vector is the same.
model.wv['tomato paste']

array([ 1.3962319 ,  0.4271814 ,  1.5748521 ,  0.399139  , -0.15688665,
        1.5771635 , -0.6919115 , -0.0695715 , -0.41118005,  0.6774158 ,
       -0.21325423,  0.18818071, -0.94735336,  1.8496097 ,  1.9725869 ,
       -0.7694102 ,  0.4631382 , -0.01829285, -0.38158754, -0.17754999,
       -1.2521286 ,  1.8483313 ,  1.3243692 ,  1.3027529 ,  1.8216466 ,
       -0.23206396,  1.8037609 , -1.7476158 ,  1.1567531 , -2.022955  ,
        0.6580762 ,  0.28219718,  0.9629183 , -0.25779623,  0.48896566,
        0.48033178, -0.92856336,  0.9239151 , -1.1036108 , -0.05262694,
       -0.33574107, -2.0250132 ,  0.35639846, -1.2506629 , -1.1147768 ,
       -0.5971193 , -0.15951352,  0.05871995, -0.5625914 ,  1.1142678 ,
        1.5630797 ,  0.9732683 ,  0.11251692,  0.25580555, -1.0224637 ,
        1.0075767 ,  1.6258405 ,  1.122648  , -1.4554864 , -0.22417465,
       -0.12667611, -0.0978438 ,  0.5684083 ,  0.49465257, -0.6075554 ,
        1.8511976 , -0.831534  , -0.67877173,  1.4002752 ,  1.85

### A Few Tests for Model Outputs for Most Similar Ingredients

In [34]:
# Sanity check: the ten nearest neighbours of 'tomato paste' in the embedding space.
display(model.wv.most_similar(positive=['tomato paste'], topn=10))

[('tomato puree', 0.7994508147239685),
 ('tomato sauce', 0.7294666171073914),
 ('crushed tomatoes', 0.6976363062858582),
 ('tomato juice', 0.6956126689910889),
 ('whole tomatoes', 0.6946349740028381),
 ('dry red wine', 0.6840712428092957),
 ('bay leaves', 0.6802037954330444),
 ('red wine', 0.672786295413971),
 ('tomatoes with juice', 0.6624387502670288),
 ('canned tomatoes', 0.6620414853096008)]

In [35]:
# Sanity check: the ten nearest neighbours of 'ground beef' in the embedding space.
display(model.wv.most_similar(positive=['ground beef'], topn=10))


[('lean ground beef', 0.9439941048622131),
 ('ground chuck', 0.8643509149551392),
 ('extra lean ground beef', 0.8101382851600647),
 ('hamburger', 0.7973396182060242),
 ('ground round', 0.7913963794708252),
 ('ground turkey', 0.766654908657074),
 ('ground meat', 0.6845386028289795),
 ('lean ground turkey', 0.6574926972389221),
 ('90% lean ground beef', 0.6495521664619446),
 ('lean hamburger', 0.6449505090713501)]

In [36]:
# Sanity check: the ten nearest neighbours of 'olive oil' in the embedding space.
display(model.wv.most_similar(positive=['olive oil'], topn=10))


[('extra virgin olive oil', 0.9226544499397278),
 ('virgin olive oil', 0.7579464912414551),
 ('light olive oil', 0.7155978679656982),
 ('lite olive oil', 0.6425748467445374),
 ('vegetable oil', 0.6206772923469543),
 ('olive oil flavored cooking spray', 0.6138953566551208),
 ('canola oil', 0.6095079779624939),
 ('chili pepper flakes', 0.575196385383606),
 ('oil', 0.5733540654182434),
 ('oregano leaves', 0.5602754950523376)]

In [37]:
# Sanity check: the ten nearest neighbours of 'eggs' in the embedding space.
display(model.wv.most_similar(positive=['eggs'], topn=10))


[('egg', 0.9236786961555481),
 ('extra large eggs', 0.5855139493942261),
 ('egg whites', 0.5508499145507812),
 ('egg substitute', 0.5169443488121033),
 ('egg yolk', 0.49542543292045593),
 ('egg yolks', 0.48913753032684326),
 ('egg white', 0.46426424384117126),
 ('extra-large eggs', 0.43051624298095703),
 ('beef medallions', 0.41059550642967224),
 ('red food coloring paste', 0.38984766602516174)]

### Vectorize ingredient list for each recipe
Add column 'recipe_ingredients_vector' to DataFrame 'recipes_in' by averaging all ingredient vectors from each recipe.

In [38]:

def list_to_vec_model(l):
    """Return the mean Word2Vec embedding of one recipe's ingredients.

    Parameters
    ----------
    l : sequence of str
        Ingredient names of a single recipe. Each name is looked up in the
        notebook-level ``model``.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the ingredient vectors of ``l``.

    Raises
    ------
    ValueError
        If ``l`` is empty: there is nothing to average. (Stacking zero
        vectors raised the same exception type before, but with an opaque
        message.)
    """
    if len(l) == 0:
        raise ValueError("cannot vectorize a recipe with an empty ingredient list")
    # Indexing with a list of keys (rather than a single str) stacks one row
    # per ingredient; averaging over axis 0 collapses them into one vector.
    return model.wv[list(l)].mean(axis=0)


# Only the 'ingredients' column is needed, so map over that column instead of
# applying row-wise over the whole frame (which builds a Series per row).
recipes_in['recipe_ingredients_vector'] = recipes_in['ingredients'].map(list_to_vec_model)

recipes_in.head()

Unnamed: 0_level_0,nutrition,minutes,techniques,cuisine,meal_of_day,ingredients,recipe_ingredients_vector
recipe_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
137739,"{'calories': 51.5, 'carbohydrates': 4.0, 'prot...",55,"{'bake': 1, 'barbecue': 0, 'blanch': 0, 'blend...",north-american,side-dishes,"[winter squash, mexican seasoning, mixed spice...","[-0.19628285, 0.3473978, 0.07327713, -0.008137..."
31490,"{'calories': 173.4, 'carbohydrates': 1.0, 'pro...",30,"{'bake': 1, 'barbecue': 0, 'blanch': 0, 'blend...",north-american,breakfast,"[prepared pizza crust, sausage patty, eggs, mi...","[0.8475206, -0.17422628, 0.78840834, 0.3105595..."
112140,"{'calories': 269.8, 'carbohydrates': 5.0, 'pro...",130,"{'bake': 0, 'barbecue': 0, 'blanch': 0, 'blend...",NO CUISINE,main-dish,"[ground beef, yellow onions, diced tomatoes, t...","[0.81354225, 1.0795594, 1.3617136, -0.60430837..."
59389,"{'calories': 368.1, 'carbohydrates': 20.0, 'pr...",45,"{'bake': 1, 'barbecue': 0, 'blanch': 0, 'blend...",NO CUISINE,side-dishes,"[spreadable cheese with garlic and herbs, new ...","[0.2621753, -0.11066092, 0.4143465, -0.2925097..."
44061,"{'calories': 352.9, 'carbohydrates': 28.0, 'pr...",190,"{'bake': 0, 'barbecue': 0, 'blanch': 0, 'blend...",north-american,NO MEAL,"[tomato juice, apple cider vinegar, sugar, sal...","[0.0114387795, -0.2737496, 0.55514616, -0.3837..."


### Get a Dataframe with Recipe id and Ingredient List Vector

In [39]:
# Keep only the vector column and turn the recipe_id index into an ordinary
# column, giving a two-column (recipe_id, recipe_ingredients_vector) frame.
recipes_ingredient_vec = (
    recipes_in['recipe_ingredients_vector']
    .to_frame()
    .reset_index()
)
recipes_ingredient_vec.head(5)

Unnamed: 0,recipe_id,recipe_ingredients_vector
0,137739,"[-0.19628285, 0.3473978, 0.07327713, -0.008137..."
1,31490,"[0.8475206, -0.17422628, 0.78840834, 0.3105595..."
2,112140,"[0.81354225, 1.0795594, 1.3617136, -0.60430837..."
3,59389,"[0.2621753, -0.11066092, 0.4143465, -0.2925097..."
4,44061,"[0.0114387795, -0.2737496, 0.55514616, -0.3837..."


In [40]:
# Row count of the vector table (one row per recipe).
recipes_ingredient_vec.shape[0]

231637

In [41]:
# Persist the (recipe_id, recipe_ingredients_vector) table as parquet.
# The relative path is resolved against the notebook's working directory.
# NOTE(review): presumably consumed by the recipe exploration tool — confirm
# the target directory exists before running.
recipes_ingredient_vec.to_parquet('../05_RecipeExplorationTool/data/recipes_ingredient_vec.parquet')