In [1]:
import pandas as pd
import gensim
from gensim.models import Word2Vec
import re
import numpy as np
import statistics
import math
import itertools

In [2]:
recipes_in = pd.read_parquet('../01_DataCleansing/recipes_in.parquet')
users_in = pd.read_csv('../01_DataCleansing/users_in.csv')

### Define Function for Cosine Similarity Calculation

In [3]:
def cosine_sim(List1, List2):
    return List1.dot(List2)/ (np.linalg.norm(List1) * np.linalg.norm(List2))

### Create the corpus required to train the genism word2vec model

In [4]:
ingredient_list=[]
for i in range(0,len(recipes_in)):
    ingredient_list.append(recipes_in.iloc[i].loc['ingredients'].tolist())

### Train the Word2Vec Model with Parameters Used in the Algorithm

In [5]:
model = Word2Vec(ingredient_list, min_count=1)

### Ingredient Similarity Validation from the Word2Vec Model

In [6]:
display(model.wv.most_similar('tomato paste'))
display(model.wv.most_similar('ground beef'))
display(model.wv.most_similar('olive oil'))
display(model.wv.most_similar('eggs'))

[('tomato puree', 0.7689598202705383),
 ('tomato sauce', 0.6704436540603638),
 ('crushed tomatoes', 0.6690917015075684),
 ('whole tomatoes', 0.6635255217552185),
 ('tomato juice', 0.6633947491645813),
 ('bay leaves', 0.6569827198982239),
 ('red wine', 0.6488566994667053),
 ('dry red wine', 0.6406188011169434),
 ('tomatoes with juice', 0.6372570991516113),
 ('dried chili pepper flakes', 0.6194062232971191)]

[('lean ground beef', 0.9392921924591064),
 ('ground chuck', 0.8433729410171509),
 ('hamburger', 0.7979741096496582),
 ('extra lean ground beef', 0.7952334880828857),
 ('ground round', 0.7843409180641174),
 ('ground turkey', 0.754817008972168),
 ('ground meat', 0.7473284602165222),
 ('ground sirloin', 0.64708012342453),
 ('ground venison', 0.638462245464325),
 ('90% lean ground beef', 0.6384150981903076)]

[('extra virgin olive oil', 0.9020969867706299),
 ('virgin olive oil', 0.7491848468780518),
 ('light olive oil', 0.7039201855659485),
 ('lite olive oil', 0.6744064688682556),
 ('vegetable oil', 0.6077904105186462),
 ('canola oil', 0.6065067052841187),
 ('dried chili pepper flakes', 0.5814028382301331),
 ('chili pepper flakes', 0.5796778798103333),
 ('spanish olive oil', 0.5781193971633911),
 ('olive oil flavored cooking spray', 0.5743055939674377)]

[('egg', 0.9116581678390503),
 ('extra large eggs', 0.5568637847900391),
 ('egg whites', 0.5317972302436829),
 ('egg yolk', 0.5168436765670776),
 ('egg substitute', 0.5098385214805603),
 ('egg yolks', 0.4603763818740845),
 ('egg white', 0.4577726125717163),
 ('extra-large eggs', 0.41717207431793213),
 ('all-bran cereal with raisins', 0.3838845491409302),
 ('egg beaters egg substitute', 0.3824930489063263)]

### Vectorize ingredient list for each recipe
Add column 'recipe_ingredients_vector'by averaging all ingredient vectors from each recipe.

In [7]:
recipes=recipes_in.copy()
def list_to_vec_model(l):
    return model.wv.__getitem__(x for x in l).sum(axis=0)/len(l)   
recipes['recipe_ingredients_vector']=recipes.apply(lambda x: list_to_vec_model(x['ingredients']), axis=1)
recipes

Unnamed: 0_level_0,nutrition,minutes,techniques,cuisine,meal_of_day,ingredients,recipe_ingredients_vector
recipe_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
137739,"{'calories': 51.5, 'carbohydrates': 4.0, 'prot...",55,"{'Bake': 1, 'Barbecue': 0, 'Blanch': 0, 'Blend...",North American,Side Dishes,"[winter squash, mexican seasoning, mixed spice...","[-0.18798865, -0.08859058, -0.42880353, 0.0475..."
31490,"{'calories': 173.4, 'carbohydrates': 1.0, 'pro...",30,"{'Bake': 1, 'Barbecue': 0, 'Blanch': 0, 'Blend...",North American,Breakfast,"[prepared pizza crust, sausage patty, eggs, mi...","[-0.50021666, -0.013074194, 0.61354846, -0.557..."
112140,"{'calories': 269.8, 'carbohydrates': 5.0, 'pro...",130,"{'Bake': 0, 'Barbecue': 0, 'Blanch': 0, 'Blend...",Uncategorized,Main Dish,"[ground beef, yellow onions, diced tomatoes, t...","[-0.78042513, -0.06335425, -0.16580707, -0.683..."
59389,"{'calories': 368.1, 'carbohydrates': 20.0, 'pr...",45,"{'Bake': 1, 'Barbecue': 0, 'Blanch': 0, 'Blend...",Uncategorized,Side Dishes,"[spreadable cheese with garlic and herbs, new ...","[0.26123694, -0.295517, -0.35570833, -1.08916,..."
44061,"{'calories': 352.9, 'carbohydrates': 28.0, 'pr...",190,"{'Bake': 0, 'Barbecue': 0, 'Blanch': 0, 'Blend...",North American,NO MEAL,"[tomato juice, apple cider vinegar, sugar, sal...","[0.03214314, -0.11727638, -0.46814972, -0.2997..."
...,...,...,...,...,...,...,...
486161,"{'calories': 415.2, 'carbohydrates': 15.0, 'pr...",60,"{'Bake': 0, 'Barbecue': 0, 'Blanch': 0, 'Blend...",North American,NO MEAL,"[celery, onion, green sweet pepper, garlic clo...","[-0.21480156, -0.19607411, -0.48717177, -0.670..."
493372,"{'calories': 14.8, 'carbohydrates': 1.0, 'prot...",5,"{'Bake': 0, 'Barbecue': 0, 'Blanch': 0, 'Blend...",Uncategorized,NO MEAL,"[paprika, salt, garlic powder, onion powder, d...","[-0.36401704, -0.40349442, 0.15369682, -0.7406..."
308080,"{'calories': 59.2, 'carbohydrates': 0.0, 'prot...",40,"{'Bake': 0, 'Barbecue': 0, 'Blanch': 0, 'Blend...",Uncategorized,Appetizers,"[hard-cooked eggs, mayonnaise, dijon mustard, ...","[0.12334852, -0.33342174, 0.06355281, -1.05657..."
298512,"{'calories': 188.0, 'carbohydrates': 9.0, 'pro...",29,"{'Bake': 1, 'Barbecue': 0, 'Blanch': 0, 'Blend...",Uncategorized,Desserts,"[butter, eagle brand condensed milk, light bro...","[0.06909981, 0.2974269, -0.59410584, -0.052630..."


### Get a Dataframe with Recipe id, Ingredient List and Ingredient List Vector

In [8]:
recipes_ingredient_vec=recipes[['ingredients','recipe_ingredients_vector']].reset_index()
recipes_ingredient_vec.head()

Unnamed: 0,recipe_id,ingredients,recipe_ingredients_vector
0,137739,"[winter squash, mexican seasoning, mixed spice...","[-0.18798865, -0.08859058, -0.42880353, 0.0475..."
1,31490,"[prepared pizza crust, sausage patty, eggs, mi...","[-0.50021666, -0.013074194, 0.61354846, -0.557..."
2,112140,"[ground beef, yellow onions, diced tomatoes, t...","[-0.78042513, -0.06335425, -0.16580707, -0.683..."
3,59389,"[spreadable cheese with garlic and herbs, new ...","[0.26123694, -0.295517, -0.35570833, -1.08916,..."
4,44061,"[tomato juice, apple cider vinegar, sugar, sal...","[0.03214314, -0.11727638, -0.46814972, -0.2997..."


### Set Test Users As Those Who Rate At Least 500 Recipes as Non Zero

In [9]:
users_in_exclude0=users_in.copy()
users_in_exclude0=users_in_exclude0[users_in_exclude0['ratings']!=0]
users=list(set(users_in_exclude0['user_id']))
len(users)

24961

In [10]:
user_recipe_count=users_in_exclude0.groupby(['user_id']).count()
test_users=user_recipe_count[user_recipe_count['recipe_id']>=500].reset_index()
test_users=list(test_users['user_id'])
len(test_users)

193

In [None]:
for u in test_users:
    input_user_id=u
    idx = np.where((users_in['user_id']==input_user_id) & (users_in['ratings']>=4))
    live_user_recipes=users_in.copy()
    live_user_recipes=live_user_recipes.loc[idx][['user_id','recipe_id']] 
    live_user_ingredient=pd.merge(live_user_recipes,recipes_ingredient_vec,how="inner",on='recipe_id')
    #Rank recipe similarity by calculating cosine similarity between each recipe and user's ingredient preference
    user_pref=live_user_ingredient['recipe_ingredients_vector'].to_numpy().sum(axis=0)
    ingredient_sim=recipes_ingredient_vec.copy()
    ingredient_sim['user_ingredient_pref']=ingredient_sim.apply(lambda x:user_pref, axis=1)
    ingredient_sim['ingredient_sim']=ingredient_sim.apply(lambda x: cosine_sim(x.user_ingredient_pref, x.recipe_ingredients_vector), axis=1)
    ingredient_rank=ingredient_sim.sort_values(by='ingredient_sim',ascending=False)
    
    #Get all ingredients that show up in the top-rated recipes by this user into a list
    user_pref_in=[i for x in list(live_user_ingredient['ingredients']) for i in x]
    
    #Get all ingredients that show up in the top 20 recipes from the similarity ranking into a list
    high_rank=ingredient_rank[0:20]
    high_rank_in=[i for x in list(high_rank['ingredients']) for i in x]
    
    #Calculate the similarity scores for each pair of recipes showing up from the above 2 lists and sum up as high_score
    temp_high = list(itertools.product(high_rank_in, user_pref_in))
    high_score=0
    for t in temp_high:
        high_score += cosine_sim(model.wv.__getitem__(t[0]),model.wv.__getitem__(t[1]))
    
    #Get all ingredients that show up in the bottowm 20 recipes from the similarity ranking into a list
    low_rank=ingredient_rank[-21:-1]
    low_rank_in=[i for x in list(low_rank['ingredients']) for i in x]
    
    #Calculate the similarity scores for each pair of recipes showing up in the bottowm 20 recipes from the similarity ranking and all ingredients that show up in the top-rated recipes by this user
    # Sum up the scores as low_score
    temp_low = list(itertools.product(low_rank_in, user_pref_in))
    low_score=0
    for t in temp_low:
        low_score += cosine_sim(model.wv.__getitem__(t[0]),model.wv.__getitem__(t[1]))
    
    #Print the average of high_score and low_score for each test user
    print((high_score/len(temp_high),low_score/len(temp_low)))
    

(0.09085094535947513, -0.02056315952339822)


The average score of ingredient to ingredient similarity is much higher between ingredients showing up in user's top-rated recipes and those in top 20 ranking recipes than between ingredients showing up in user's top-rated recipes and those in bottom 20 ranking recipes