In [90]:
import pandas as pd
import gensim
from gensim.models import Word2Vec
import re
import numpy as np
import statistics
import math
import itertools

In [91]:
recipes_in = pd.read_parquet('../01_DataCleansing/recipes_in.parquet')
users_in = pd.read_csv('../01_DataCleansing/users_in.csv')

### Define Function for Cosine Similarity Calculation

In [92]:
def cosine_sim(List1, List2):
    return List1.dot(List2)/ (np.linalg.norm(List1) * np.linalg.norm(List2))

### Create the corpus required to train the genism word2vec model

In [93]:
ingredient_list=[]
for i in range(0,len(recipes_in)):
    ingredient_list.append(recipes_in.iloc[i].loc['ingredients'].tolist())

### Train the Word2Vec Model with Parameters Used in the Algorithm

In [94]:
model = Word2Vec(ingredient_list, min_count=1)

### Ingredient Similarity Validation from the Word2Vec Model

In [95]:
display(model.wv.most_similar('tomato paste'))
display(model.wv.most_similar('ground beef'))
display(model.wv.most_similar('olive oil'))
display(model.wv.most_similar('eggs'))

[('tomato puree', 0.8048397302627563),
 ('tomato sauce', 0.7564524412155151),
 ('crushed tomatoes', 0.7190336585044861),
 ('whole tomatoes', 0.7148817777633667),
 ('bay leaves', 0.7098484039306641),
 ('canned tomatoes', 0.7092584371566772),
 ('tomato juice', 0.7071650624275208),
 ('dry red wine', 0.6920954585075378),
 ('bay leaf', 0.6816862225532532),
 ('whole canned tomatoes', 0.6762807965278625)]

[('lean ground beef', 0.9390567541122437),
 ('ground chuck', 0.8555924296379089),
 ('ground round', 0.8065312504768372),
 ('extra lean ground beef', 0.8043294548988342),
 ('hamburger', 0.8000902533531189),
 ('ground turkey', 0.7617867588996887),
 ('ground meat', 0.6804905533790588),
 ('lean ground turkey', 0.6656901836395264),
 ('80% lean ground beef', 0.6641635298728943),
 ('ground venison', 0.6418049335479736)]

[('extra virgin olive oil', 0.9176726341247559),
 ('virgin olive oil', 0.7595925331115723),
 ('light olive oil', 0.7354088425636292),
 ('lite olive oil', 0.723577082157135),
 ('vegetable oil', 0.625278115272522),
 ('olive oil flavored cooking spray', 0.6245958209037781),
 ('canola oil', 0.6084417104721069),
 ('dried red pepper flakes', 0.5884072184562683),
 ('oil', 0.5689454078674316),
 ('dried black turtle beans', 0.5634503364562988)]

[('egg', 0.9097540974617004),
 ('extra large eggs', 0.5710679888725281),
 ('egg whites', 0.5516806244850159),
 ('egg substitute', 0.5352007746696472),
 ('egg yolk', 0.519622802734375),
 ('egg white', 0.4978804886341095),
 ('egg yolks', 0.4796869158744812),
 ('extra-large eggs', 0.4300239682197571),
 ('canistel', 0.39016205072402954),
 ('egg beaters egg substitute', 0.38578641414642334)]

### Vectorize ingredient list for each recipe
Add column 'recipe_ingredients_vector'by averaging all ingredient vectors from each recipe.

In [96]:
recipes=recipes_in.copy()
def list_to_vec_model(l):
    return model.wv.__getitem__(x for x in l).sum(axis=0)/len(l)   
recipes['recipe_ingredients_vector']=recipes.apply(lambda x: list_to_vec_model(x['ingredients']), axis=1)
recipes

Unnamed: 0_level_0,nutrition,minutes,techniques,cuisine,meal_of_day,ingredients,recipe_ingredients_vector
recipe_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
137739,"{'calories': 51.5, 'carbohydrates': 4.0, 'prot...",55,"{'bake': 1, 'barbecue': 0, 'blanch': 0, 'blend...",north-american,side-dishes,"[winter squash, mexican seasoning, mixed spice...","[-0.13884355, -0.7336089, 0.17222415, -0.13168..."
31490,"{'calories': 173.4, 'carbohydrates': 1.0, 'pro...",30,"{'bake': 1, 'barbecue': 0, 'blanch': 0, 'blend...",north-american,breakfast,"[prepared pizza crust, sausage patty, eggs, mi...","[-0.29892376, 0.22885656, -0.119868435, -0.599..."
112140,"{'calories': 269.8, 'carbohydrates': 5.0, 'pro...",130,"{'bake': 0, 'barbecue': 0, 'blanch': 0, 'blend...",NO CUISINE,main-dish,"[ground beef, yellow onions, diced tomatoes, t...","[-0.71829444, -0.22238098, 1.0307469, 0.212352..."
59389,"{'calories': 368.1, 'carbohydrates': 20.0, 'pr...",45,"{'bake': 1, 'barbecue': 0, 'blanch': 0, 'blend...",NO CUISINE,side-dishes,"[spreadable cheese with garlic and herbs, new ...","[0.0975564, -0.2747319, 0.3460284, 0.028355127..."
44061,"{'calories': 352.9, 'carbohydrates': 28.0, 'pr...",190,"{'bake': 0, 'barbecue': 0, 'blanch': 0, 'blend...",north-american,NO MEAL,"[tomato juice, apple cider vinegar, sugar, sal...","[-0.30732197, -0.33622223, -0.40343505, -0.040..."
...,...,...,...,...,...,...,...
486161,"{'calories': 415.2, 'carbohydrates': 15.0, 'pr...",60,"{'bake': 0, 'barbecue': 0, 'blanch': 0, 'blend...",north-american,NO MEAL,"[celery, onion, green sweet pepper, garlic clo...","[-0.2052051, -0.23893109, 0.37392014, 0.043180..."
493372,"{'calories': 14.8, 'carbohydrates': 1.0, 'prot...",5,"{'bake': 0, 'barbecue': 0, 'blanch': 0, 'blend...",NO CUISINE,NO MEAL,"[paprika, salt, garlic powder, onion powder, d...","[0.18415414, -0.46574455, 0.12295501, -0.24088..."
308080,"{'calories': 59.2, 'carbohydrates': 0.0, 'prot...",40,"{'bake': 0, 'barbecue': 0, 'blanch': 0, 'blend...",NO CUISINE,appetizers,"[hard-cooked eggs, mayonnaise, dijon mustard, ...","[-0.3699473, -0.16714454, -0.2248142, 0.158965..."
298512,"{'calories': 188.0, 'carbohydrates': 9.0, 'pro...",29,"{'bake': 1, 'barbecue': 0, 'blanch': 0, 'blend...",NO CUISINE,desserts,"[butter, eagle brand condensed milk, light bro...","[-0.30778033, 0.0313038, -0.7040587, -0.483502..."


### Get a Dataframe with Recipe id, Ingredient List and Ingredient List Vector

In [97]:
recipes_ingredient_vec=recipes[['ingredients','recipe_ingredients_vector']].reset_index()
recipes_ingredient_vec.head()

Unnamed: 0,recipe_id,ingredients,recipe_ingredients_vector
0,137739,"[winter squash, mexican seasoning, mixed spice...","[-0.13884355, -0.7336089, 0.17222415, -0.13168..."
1,31490,"[prepared pizza crust, sausage patty, eggs, mi...","[-0.29892376, 0.22885656, -0.119868435, -0.599..."
2,112140,"[ground beef, yellow onions, diced tomatoes, t...","[-0.71829444, -0.22238098, 1.0307469, 0.212352..."
3,59389,"[spreadable cheese with garlic and herbs, new ...","[0.0975564, -0.2747319, 0.3460284, 0.028355127..."
4,44061,"[tomato juice, apple cider vinegar, sugar, sal...","[-0.30732197, -0.33622223, -0.40343505, -0.040..."


### Set Test Users As Those Who Rate At Least 500 Recipes as Non Zero

In [98]:
users_in_exclude0=users_in.copy()
users_in_exclude0=users_in_exclude0[users_in_exclude0['ratings']!=0]
users=list(set(users_in_exclude0['user_id']))
len(users)

24961

In [99]:
user_recipe_count=users_in_exclude0.groupby(['user_id']).count()
test_users=user_recipe_count[user_recipe_count['recipe_id']>=500].reset_index()
test_users=list(test_users['user_id'])
len(test_users)

193

In [100]:
for u in test_users:
    input_user_id=u
    idx = np.where((users_in['user_id']==input_user_id) & (users_in['ratings']>=4))
    live_user_recipes=users_in.copy()
    live_user_recipes=live_user_recipes.loc[idx][['user_id','recipe_id']] 
    live_user_ingredient=pd.merge(live_user_recipes,recipes_ingredient_vec,how="inner",on='recipe_id')
    #Rank recipe similarity by calculating cosine similarity between each recipe and user's ingredient preference
    user_pref=live_user_ingredient['recipe_ingredients_vector'].to_numpy().sum(axis=0)
    ingredient_sim=recipes_ingredient_vec.copy()
    ingredient_sim['user_ingredient_pref']=ingredient_sim.apply(lambda x:user_pref, axis=1)
    ingredient_sim['ingredient_sim']=ingredient_sim.apply(lambda x: cosine_sim(x.user_ingredient_pref, x.recipe_ingredients_vector), axis=1)
    ingredient_rank=ingredient_sim.sort_values(by='ingredient_sim',ascending=False)
    
    #Get all ingredients that show up in the top-rated recipes by this user into a list
    user_pref_in=[i for x in list(live_user_ingredient['ingredients']) for i in x]
    
    #Get all ingredients that show up in the top 20 recipes from the similarity ranking into a list
    high_rank=ingredient_rank[0:20]
    high_rank_in=[i for x in list(high_rank['ingredients']) for i in x]
    
    #Calculate the similarity scores for each pair of recipes showing up from the above 2 lists and sum up as high_score
    temp_high = list(itertools.product(high_rank_in, user_pref_in))
    high_score=0
    for t in temp_high:
        high_score += cosine_sim(model.wv.__getitem__(t[0]),model.wv.__getitem__(t[1]))
    
    #Get all ingredients that show up in the bottowm 20 recipes from the similarity ranking into a list
    low_rank=ingredient_rank[-21:-1]
    low_rank_in=[i for x in list(low_rank['ingredients']) for i in x]
    
    #Calculate the similarity scores for each pair of recipes showing up in the bottowm 20 recipes from the similarity ranking and all ingredients that show up in the top-rated recipes by this user
    # Sum up the scores as low_score
    temp_low = list(itertools.product(low_rank_in, user_pref_in))
    low_score=0
    for t in temp_low:
        low_score += cosine_sim(model.wv.__getitem__(t[0]),model.wv.__getitem__(t[1]))
    
    #Print the average of high_score and low_score for each test user
    print((high_score/len(temp_high),low_score/len(temp_low)))
    

(0.09225152735255573, -0.018318897926582416)
(0.09168175751369509, -0.01721999065107429)
(0.09043556094172742, -0.0151979628753456)
(0.09151165312091282, -0.01409542798858299)
(0.09489244536244211, -0.030191852196440667)
(0.09310263912949922, -0.013965827920261882)
(0.0921908982209716, -0.023691058692349113)
(0.0950449793516636, -0.024210110112993007)
(0.09218468545728639, -0.017544410332536075)
(0.09650439194626864, -0.030501880687173882)
(0.09055240345180939, -0.020652937506410667)
(0.09241878223038381, -0.02197409327834735)
(0.09443059883690796, -0.008913279415738365)
(0.0915364235070126, -0.017060338490450182)
(0.09317555821885035, -0.027064569040973534)
(0.08957460688233689, -0.009539909841483379)
(0.09413281515928434, -0.02784361924615751)
(0.09426688775447616, -0.019268361652142917)
(0.09141801465545116, -0.010733021978640032)
(0.09029459792623024, -0.010073057055928777)
(0.09250571575814918, -0.020729092936851472)
(0.09030867323321154, -0.02259099101536749)
(0.09370941327379237

The average score of ingredient to ingredient similarity is much higher between ingredients showing up in user's top-rated recipes and those in top 20 ranking recipes than between ingredients showing up in user's top-rated recipes and those in bottom 20 ranking recipes