In [1]:
import pandas as pd
import gensim
from gensim.models import Word2Vec
import re
import numpy as np
import statistics
import math

### Ingredient Similarity
### Reading file recipes_in into a dataframe 'recipes_in'

In [2]:
# Load the cleansed recipe table produced by the data-cleansing step and preview it.
recipes_in = pd.read_parquet(path=r'../01_DataCleansing/recipes_in.parquet')
recipes_in.head(n=5)

Unnamed: 0_level_0,nutrition,minutes,techniques,cuisine,meal_of_day,ingredients
recipe_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
137739,"{'calories': 51.5, 'carbohydrates': 4.0, 'prot...",55,"{'Bake': 1, 'Barbecue': 0, 'Blanch': 0, 'Blend...",North American,Side Dishes,"[winter squash, mexican seasoning, mixed spice..."
31490,"{'calories': 173.4, 'carbohydrates': 1.0, 'pro...",30,"{'Bake': 1, 'Barbecue': 0, 'Blanch': 0, 'Blend...",North American,Breakfast,"[prepared pizza crust, sausage patty, eggs, mi..."
112140,"{'calories': 269.8, 'carbohydrates': 5.0, 'pro...",130,"{'Bake': 0, 'Barbecue': 0, 'Blanch': 0, 'Blend...",Uncategorized,Main Dish,"[ground beef, yellow onions, diced tomatoes, t..."
59389,"{'calories': 368.1, 'carbohydrates': 20.0, 'pr...",45,"{'Bake': 1, 'Barbecue': 0, 'Blanch': 0, 'Blend...",Uncategorized,Side Dishes,"[spreadable cheese with garlic and herbs, new ..."
44061,"{'calories': 352.9, 'carbohydrates': 28.0, 'pr...",190,"{'Bake': 0, 'Barbecue': 0, 'Blanch': 0, 'Blend...",North American,NO MEAL,"[tomato juice, apple cider vinegar, sugar, sal..."


### Create the corpus required to train the gensim Word2Vec model
The corpus is a list of ingredient lists; each inner list holds the ingredients of one recipe.

In [3]:
# Build the training corpus: one plain Python list of ingredient names per recipe.
# Iterating the column directly avoids materialising a row Series for every
# recipe, which is what `recipes_in.iloc[i]` did on each loop iteration.
ingredient_list = [
    recipe_ingredients.tolist()
    for recipe_ingredients in recipes_in['ingredients']
]
ingredient_list[0:5]

[['winter squash',
  'mexican seasoning',
  'mixed spice',
  'honey',
  'butter',
  'olive oil',
  'salt'],
 ['prepared pizza crust',
  'sausage patty',
  'eggs',
  'milk',
  'salt and pepper',
  'cheese'],
 ['ground beef',
  'yellow onions',
  'diced tomatoes',
  'tomato paste',
  'tomato soup',
  'rotel tomatoes',
  'kidney beans',
  'water',
  'chili powder',
  'ground cumin',
  'salt',
  'lettuce',
  'cheddar cheese'],
 ['spreadable cheese with garlic and herbs',
  'new potatoes',
  'shallots',
  'parsley',
  'tarragon',
  'olive oil',
  'red wine vinegar',
  'salt',
  'pepper',
  'red bell pepper',
  'yellow bell pepper'],
 ['tomato juice',
  'apple cider vinegar',
  'sugar',
  'salt',
  'pepper',
  'clove oil',
  'cinnamon oil',
  'dry mustard']]

### Number of Ingredients We are Working with

In [4]:
# Flatten the corpus into its distinct ingredient names and count them.
ingredients = list({
    name
    for recipe_ingredients in ingredient_list
    for name in recipe_ingredients
})
len(ingredients)

14942

### Train Word2Vec Models

Use Default Parameters
<br>Reference: https://towardsdatascience.com/nlp-101-word2vec-skip-gram-and-cbow-93512ee24314

In [5]:
# Train on the per-recipe ingredient lists; every other hyperparameter keeps
# the gensim default. min_count=1 so that no ingredient is dropped from the
# vocabulary for being rare (later cells look up every ingredient).
model = Word2Vec(sentences=ingredient_list, min_count=1)


### Now Each Ingredient is a Vector

In [6]:
# Look up the learned embedding of a single ingredient with the subscript
# operator rather than calling the `__getitem__` dunder directly.
model.wv['tomato paste']

array([-0.9149145 ,  1.123325  , -1.0936317 ,  0.35871062,  0.12787175,
       -1.1499809 ,  0.26543984, -0.14168608,  1.6377338 ,  0.9014223 ,
       -0.14462878,  2.210522  , -0.29816937, -3.0175092 , -0.28193074,
       -0.03931889, -3.1425076 , -0.41869506,  1.9348537 ,  0.6651366 ,
       -1.0720613 , -0.74189544, -0.284265  ,  2.0057163 ,  1.1976069 ,
        0.8758734 ,  1.2095399 , -0.1881468 ,  0.4635824 , -1.8249965 ,
       -0.2559701 ,  1.6428765 ,  1.2894908 ,  2.6693702 ,  0.34306175,
       -1.7052001 , -1.043298  , -0.05284423,  1.5147152 , -1.5847232 ,
        0.12595949,  0.97451633, -0.43317333,  0.17339042, -0.2127125 ,
        1.0654296 , -0.6217671 , -0.49158177, -0.73933244,  0.585873  ,
       -0.46522814, -0.5471861 , -0.9437526 ,  1.741153  ,  1.5496726 ,
       -0.2501186 ,  0.11984954,  0.35340428,  0.13787517,  1.1541123 ,
       -1.6020988 ,  0.33208135, -0.04793662, -2.374738  , -0.05949667,
        0.10574434,  1.3349599 , -1.3285564 , -0.5479753 ,  0.05

### A Few Tests for Model Outputs for Most Similar Ingredients

In [7]:
# Ten nearest ingredients to 'tomato paste' as (ingredient, similarity) pairs.
display(model.wv.most_similar('tomato paste', topn=10))

[('tomato puree', 0.7866252064704895),
 ('tomato sauce', 0.6950498223304749),
 ('tomato juice', 0.6798321604728699),
 ('bay leaves', 0.6718400716781616),
 ('crushed tomatoes', 0.6695051789283752),
 ('whole tomatoes', 0.6572368144989014),
 ('dried brown lentils', 0.6538439393043518),
 ('dry red wine', 0.640514612197876),
 ('tomatoes with juice', 0.635966956615448),
 ('bay leaf', 0.6343404054641724)]

In [8]:
# Ten nearest ingredients to 'ground beef' as (ingredient, similarity) pairs.
display(model.wv.most_similar('ground beef', topn=10))


[('lean ground beef', 0.9391839504241943),
 ('ground chuck', 0.8461577296257019),
 ('hamburger', 0.8221970796585083),
 ('ground round', 0.7940616607666016),
 ('extra lean ground beef', 0.7911683320999146),
 ('ground meat', 0.7581408619880676),
 ('ground turkey', 0.753473699092865),
 ('lean hamburger', 0.6712562441825867),
 ('ground sirloin', 0.6615191102027893),
 ('ground venison', 0.6538377404212952)]

In [9]:
# Ten nearest ingredients to 'olive oil' as (ingredient, similarity) pairs.
display(model.wv.most_similar('olive oil', topn=10))


[('extra virgin olive oil', 0.9052597284317017),
 ('virgin olive oil', 0.752072811126709),
 ('light olive oil', 0.7234945297241211),
 ('lite olive oil', 0.6731634140014648),
 ('vegetable oil', 0.6121304035186768),
 ('canola oil', 0.5828148126602173),
 ('oil', 0.5787796378135681),
 ('dried red pepper flakes', 0.5720276832580566),
 ('chili pepper flakes', 0.566417396068573),
 ('olive oil flavored cooking spray', 0.5576046109199524)]

In [10]:
# Ten nearest ingredients to 'eggs' as (ingredient, similarity) pairs.
display(model.wv.most_similar('eggs', topn=10))


[('egg', 0.9105204939842224),
 ('extra large eggs', 0.5839084982872009),
 ('egg substitute', 0.534137487411499),
 ('egg whites', 0.5213854908943176),
 ('egg yolk', 0.5125996470451355),
 ('egg white', 0.4849655330181122),
 ('egg yolks', 0.4775627851486206),
 ('extra-large eggs', 0.3986189067363739),
 ('oranges, juice and rind of', 0.39736953377723694),
 ('egg beaters egg substitute', 0.3731057047843933)]

### Vectorize ingredient list for each recipe
Add the column 'recipe_ingredients_vector' to the DataFrame 'recipes_in' by averaging all ingredient vectors of each recipe.

In [11]:

def list_to_vec_model(l):
    """Return the mean Word2Vec vector of one recipe's ingredients.

    Parameters
    ----------
    l : sequence of str
        Ingredient names of a single recipe. Each name is looked up in the
        module-level ``model``.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``model.wv.vector_size`` holding the element-wise
        average of the ingredient vectors. An empty ``l`` yields an all-zero
        vector instead of raising.

    Raises
    ------
    KeyError
        Expected from the vector lookup when an ingredient is not in the
        model vocabulary (behaviour of the underlying keyed-vector store).
    """
    # Use len() rather than truthiness: `l` may be a NumPy array, whose truth
    # value is ambiguous for more than one element.
    if len(l) == 0:
        return np.zeros(model.wv.vector_size, dtype=np.float32)
    # Indexing with a list of keys stacks one row per ingredient; averaging
    # over axis 0 collapses the rows into a single recipe vector.
    return model.wv[list(l)].mean(axis=0)
# Vectorise each recipe from its ingredient list. Applying on the single
# 'ingredients' column avoids the row-wise DataFrame.apply(axis=1), which
# constructs a full row Series per recipe only to read one field from it.
recipes_in['recipe_ingredients_vector'] = recipes_in['ingredients'].apply(list_to_vec_model)



# Preview the first rows to confirm the new vector column is present.
recipes_in.head(n=5)

Unnamed: 0_level_0,nutrition,minutes,techniques,cuisine,meal_of_day,ingredients,recipe_ingredients_vector
recipe_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
137739,"{'calories': 51.5, 'carbohydrates': 4.0, 'prot...",55,"{'Bake': 1, 'Barbecue': 0, 'Blanch': 0, 'Blend...",North American,Side Dishes,"[winter squash, mexican seasoning, mixed spice...","[-0.21430667, 0.059886776, -0.31777206, 0.3828..."
31490,"{'calories': 173.4, 'carbohydrates': 1.0, 'pro...",30,"{'Bake': 1, 'Barbecue': 0, 'Blanch': 0, 'Blend...",North American,Breakfast,"[prepared pizza crust, sausage patty, eggs, mi...","[-0.2475955, -0.2873861, 0.19562733, -0.690071..."
112140,"{'calories': 269.8, 'carbohydrates': 5.0, 'pro...",130,"{'Bake': 0, 'Barbecue': 0, 'Blanch': 0, 'Blend...",Uncategorized,Main Dish,"[ground beef, yellow onions, diced tomatoes, t...","[-0.98470646, 0.14299391, -0.5030059, -0.49472..."
59389,"{'calories': 368.1, 'carbohydrates': 20.0, 'pr...",45,"{'Bake': 1, 'Barbecue': 0, 'Blanch': 0, 'Blend...",Uncategorized,Side Dishes,"[spreadable cheese with garlic and herbs, new ...","[-0.13308103, -0.22797266, -0.31254694, -1.042..."
44061,"{'calories': 352.9, 'carbohydrates': 28.0, 'pr...",190,"{'Bake': 0, 'Barbecue': 0, 'Blanch': 0, 'Blend...",North American,NO MEAL,"[tomato juice, apple cider vinegar, sugar, sal...","[0.000708051, -0.058615696, -0.6788341, -0.414..."


### Get a Dataframe with Recipe id and Ingredient List Vector

In [12]:
# Keep only the recipe vectors and turn the recipe_id index into an ordinary
# column, giving a two-column (recipe_id, recipe_ingredients_vector) frame.
recipes_ingredient_vec = recipes_in[['recipe_ingredients_vector']].reset_index()
recipes_ingredient_vec.head(n=5)

Unnamed: 0,recipe_id,recipe_ingredients_vector
0,137739,"[-0.21430667, 0.059886776, -0.31777206, 0.3828..."
1,31490,"[-0.2475955, -0.2873861, 0.19562733, -0.690071..."
2,112140,"[-0.98470646, 0.14299391, -0.5030059, -0.49472..."
3,59389,"[-0.13308103, -0.22797266, -0.31254694, -1.042..."
4,44061,"[0.000708051, -0.058615696, -0.6788341, -0.414..."


In [13]:
# Number of recipes that received an ingredient vector.
recipes_ingredient_vec.shape[0]

231637