# Word2Vec Model

In [372]:
import numpy as np
import pandas as pd
import re
import warnings

warnings.filterwarnings("ignore")

In [373]:
# Load the preprocessed recipe table (one row per recipe; the `ingredients`
# column holds a list of ingredient tokens, as the output below shows).
# NOTE: unpickling can execute arbitrary code — only load trusted files.
data = pd.read_pickle("processed_cookbook.pkl")
data

Unnamed: 0,recipe_title,recipe_url,ingredients,num_steps,total_time,prep_time,cook_time,custom_time,calories,carbohydrates,...,cholesterol,sodium,potassium,fiber,sugar,vitamin_a,vitamin_c,calcium,iron,serving_size
0,Easy Tonkotsu Ramen Recipe,https://www.justonecookbook.com/easy-tonkotsu-...,"[mirin, ginger, chashu, water, niboshi_boiled,...",35.0,380.0,60.0,130.0,60.0,,,...,,,,,,,,,,
1,Pan-Fried Curry Chicken Video,https://www.justonecookbook.com/pan-fried-curr...,"[kewpie_mayonnaise, oil_neutral, allpurpose_fl...",15.0,40.0,10.0,10.0,,290.0,10.0,...,76.0,989.0,441.0,0.5,4.0,46.0,1.0,11.0,1.0,
2,Homemade Udon Noodles Video,https://www.justonecookbook.com/udon-noodles/,"[salt, starch_potato, allpurpose_flour, water]",36.0,120.0,60.0,,150.0,361.0,76.0,...,,198.0,106.0,3.0,1.0,,,21.0,5.0,
3,Tomato Egg Vermicelli Soup Video,https://www.justonecookbook.com/tomato-egg-ver...,"[toasted_sesame_oil, salt, white_pepper, tomat...",10.0,30.0,5.0,10.0,,123.0,16.0,...,93.0,299.0,215.0,2.0,3.0,737.0,11.0,79.0,2.0,
4,Butter Ponzu Beef Video,https://www.justonecookbook.com/butter-ponzu-b...,"[oil_neutral, pepper_black, komatsuna, garlic,...",13.0,40.0,10.0,10.0,,386.0,8.0,...,84.0,751.0,684.0,2.0,1.0,194.0,17.0,90.0,3.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6669,Scallion Ginger Shrimp Recipe Redux,https://thewoksoflife.com/scallion-ginger-shri...,"[scallion, white_pepper, ginger, oil_peanut, s...",4.0,15.0,10.0,5.0,,191.0,2.0,...,286.0,1043.0,145.0,1.0,1.0,120.0,7.2,173.0,2.6,
6670,Classic Peanut Butter Cake,https://thewoksoflife.com/classic-peanut-butte...,"[oil, butter_peanut, cream_cheese, baking, but...",4.0,60.0,30.0,30.0,,517.0,53.0,...,61.0,418.0,346.0,2.0,28.0,225.0,,97.0,2.2,
6671,Frozen White Peach Mango Margaritas,https://thewoksoflife.com/frozen-white-peach-m...,"[peach, ice, wedge_lime, lime, mango, tequila,...",3.0,10.0,10.0,,,206.0,30.0,...,,593.0,234.0,2.0,28.0,691.0,23.0,9.0,1.0,
6672,Cantonese Chicken Salted Fish Fried Rice,https://thewoksoflife.com/cantonese-chicken-sa...,"[breast_chicken, oil, scallion, rice, onion, l...",3.0,30.0,20.0,10.0,,,,...,,,,,,,,,,


In [374]:
from gensim.models import Word2Vec
# Skip-gram works better for infrequent words and lower data size
# CBOW works better for frequent words and larger data size
# Each recipe's ingredient list is fed to the model as one "sentence".
# sg=0 selects CBOW (see the gensim Word2Vec documentation).
# window=11 matches the rounded-up median ingredient-list length computed
# further down in this notebook.
# NOTE(review): epochs=1000 makes this cell expensive to re-run; the trained
# model is saved to "word2vec.model" below, so it could be reloaded instead.
# NOTE(review): multi-worker training is typically not bit-for-bit
# reproducible — confirm whether deterministic embeddings are required.
w2v_model = Word2Vec(data.ingredients, vector_size=100, window=11, sg=0, epochs=1000)

In [401]:
# Persist the trained model to disk under "word2vec.model".
# NOTE(review): this cell's execution count (401) is higher than the cells
# that follow it, i.e. it was run out of order relative to the notebook layout.
w2v_model.save("word2vec.model")


In [375]:
# Window-size heuristic: the rounded-up median number of ingredients per recipe.
# NOTE(review): this comment previously said "half of median length", but the
# code applies no halving — confirm which of the two is intended.
# Each `ingredient` in the loop is one recipe's full ingredient list, so
# len(...) is that recipe's ingredient count.
ingredient_length = np.array([len(ingredient) for ingredient in data.ingredients])
np.ceil(np.median(ingredient_length))

11.0

In [376]:
# Sanity-check the embedding: list the nearest neighbours of "oil" with their
# similarity scores (other oils are expected to rank highest).
w2v_model.wv.most_similar("oil")

[('oil_vegetable', 0.6746701598167419),
 ('oil_neutral', 0.5632079839706421),
 ('oil_peanut', 0.5547826290130615),
 ('oil_canola', 0.4721050262451172),
 ('olive_oil', 0.4080338478088379),
 ('sodium_low_chicken_stock', 0.3009519577026367),
 ('passata_tomato', 0.27367496490478516),
 ('dark_soy', 0.24730858206748962),
 ('wine_chinese', 0.24586676061153412),
 ('oil_sesame', 0.23991228640079498)]

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def get_recipe_vector_mean(ingredients: list, model: Word2Vec) -> np.ndarray:
    """
    Embed a recipe as the mean of its ingredients' word vectors.

    Parameters
    ----------
    ingredients : list
        Ingredient tokens of a single recipe.
    model : Word2Vec
        Trained model; only ``model.wv`` (token lookup / membership) and
        ``model.vector_size`` are used.

    Returns
    -------
    np.ndarray
        1-D vector: the element-wise mean of the vectors of every ingredient
        found in ``model.wv``. Ingredients missing from the vocabulary are
        skipped. If no ingredient is known (or the list is empty), a zero
        vector of length ``model.vector_size`` is returned.
    """
    known_vectors = [model.wv[ing] for ing in ingredients if ing in model.wv]

    if not known_vectors:
        # Plain zeros: the previous `np.zeros(...) * -1` only produced
        # negative zeros, which compare equal to 0 and add nothing.
        return np.zeros(model.vector_size)

    return np.mean(known_vectors, axis=0)
    

In [378]:
# Embed every recipe as the mean of its ingredient vectors (one row per recipe).
recipes_vector_mean = np.array(
    [get_recipe_vector_mean(recipe, w2v_model) for recipe in data.ingredients]
)

# Sanity check: indices of the five recipes most similar to recipe 0.
# Recipe 0 itself should rank first.
query_vector = recipes_vector_mean[0].reshape(1, -1)
sims = cosine_similarity(query_vector, recipes_vector_mean)[0]
sims.argsort()[::-1][:5]

array([  0, 193, 236, 194, 162])

In [379]:
# Stack the per-recipe mean embeddings into one array (one row per recipe).
recipes_vector_mean = np.array(
    [get_recipe_vector_mean(recipe, w2v_model) for recipe in data.ingredients]
)

class User:
    """
    A user's taste profile, held as a vector in the recipe-embedding space.

    Liking a recipe moves the profile towards that recipe's vector; disliking
    moves it away (by a smaller step). Recommendations are the not-yet-shown
    recipes whose vectors are most cosine-similar to the profile.

    Parameters
    ----------
    vector_dim : int
        Dimensionality of the recipe vectors.
    recipe_vectors : array-like of shape (n_recipes, vector_dim), optional
        Recipe embeddings to recommend from. When omitted, the notebook-level
        ``recipes_vector_mean`` array is looked up at call time.
    """

    def __init__(self, vector_dim, recipe_vectors=None):
        self.vec = np.zeros(vector_dim)   # preference vector; all zeros = no feedback yet
        self.dislike_step = 0.3           # weight applied to a disliked recipe
        self.like_step = 1                # weight applied to a liked recipe
        self.exclude_idx = set()          # recipe indices that must not be recommended again
        self.recipe_vectors = recipe_vectors

    def like(self, recipe_vector_mean):
        """Move the preference vector towards ``recipe_vector_mean``."""
        self.vec += self.like_step * recipe_vector_mean

    def dislike(self, recipe_vector_mean):
        """Move the preference vector away from ``recipe_vector_mean``."""
        self.vec -= self.dislike_step * recipe_vector_mean

    def recommend(self, n):
        """
        Return up to ``n`` recipe indices, never including ``exclude_idx``.

        With no feedback yet (all-zero preference vector) the indices are a
        random sample without replacement; otherwise they are ordered from
        most to least cosine-similar to the preference vector. Fewer than
        ``n`` indices (possibly none) are returned when not enough unseen
        recipes remain.
        """
        if self.recipe_vectors is None:
            vectors = recipes_vector_mean
        else:
            vectors = self.recipe_vectors
        vectors = np.asarray(vectors)

        # Filter excluded recipes up front instead of overwriting their score
        # with -1: -1 is a legitimate cosine value, and the random cold-start
        # path previously ignored the exclusion set altogether.
        available = np.ones(len(vectors), dtype=bool)
        if self.exclude_idx:
            available[list(self.exclude_idx)] = False
        candidates = np.flatnonzero(available)

        n = min(n, len(candidates))
        if n <= 0:
            return np.array([], dtype=int)

        if not self.vec.any():
            return np.random.choice(candidates, n, replace=False)

        sims = cosine_similarity(self.vec.reshape(1, -1), vectors[candidates])[0]
        return candidates[sims.argsort()[::-1][:n]]
    
# Fresh user profile: the preference vector starts as zeros with the same
# dimensionality as the word-embedding vectors.
test_user = User(w2v_model.vector_size)

In [380]:
# Cold start: while test_user's preference vector is still all zeros,
# recommend() returns a randomly chosen recipe index.
idx = test_user.recommend(1)
print(idx)
# Display the raw mean embedding of recipe 0 (independent of `idx` above).
recipes_vector_mean[0]
# print(data.recipe_url.iloc[idx[0]])


[4842]


array([-0.10023388,  0.16819969, -0.40811539,  0.13804659, -0.71784955,
        0.68280339, -0.41562891, -0.86221683, -0.21616371,  1.23246622,
        0.19887307,  0.74750191,  1.05205739, -1.2177763 ,  0.25120279,
        0.51293486,  1.6982584 , -0.50825351,  0.79143286, -0.65848428,
       -0.89923817, -0.05031337,  0.02640142, -1.16703999,  0.18478982,
        0.81570214, -0.91748208,  0.46568438, -0.1856329 ,  1.17636359,
       -1.00252652,  0.14327186,  0.55919969,  3.12238359, -0.77545834,
       -0.16121835, -0.98320872,  0.13223547,  0.23196132,  0.9598186 ,
        1.39291549,  0.08231154,  3.09048486, -1.07112348,  0.39819682,
       -0.44351569, -1.05881405,  0.2907325 ,  0.64811867, -0.35993516,
       -1.77843213,  2.50097108,  0.21221894,  0.00828209,  0.18296818,
        0.39630279,  0.18050738,  1.26421928, -0.03191545,  1.57921517,
       -0.7248807 ,  0.98646837,  0.72815192,  0.73370814,  0.03950532,
        0.25977531, -0.30149865, -0.17246841,  0.67797232, -0.99

In [386]:
from time import sleep

# Interactive feedback loop: show one recommended recipe URL at a time, ask
# the user for a like/dislike, update the profile, and never show it again.
VALID_RESPONSES = {"0", "1", "-1"}

while True:
    rec_idx = test_user.recommend(1)
    if len(rec_idx) == 0:
        print("No more recipes to recommend.", flush=True)
        break

    rec_url = data.recipe_url.iloc[rec_idx[0]]
    print(rec_url, flush=True)

    # Re-prompt until the answer is recognised, so a typo is not silently
    # recorded as a dislike.
    user_input = input("0 for dislike, 1 for like, -1 to quit: ").strip()
    while user_input not in VALID_RESPONSES:
        user_input = input("0 for dislike, 1 for like, -1 to quit: ").strip()

    # Handle quit before any feedback is applied: previously "-1" fell through
    # to the dislike branch and only exited on the next iteration.
    if user_input == "-1":
        print("Exiting the recommendation system.", flush=True)
        break

    if user_input == "1":
        test_user.like(recipes_vector_mean[rec_idx[0]])
        print("Liked the recipe!", flush=True)
    else:
        test_user.dislike(recipes_vector_mean[rec_idx[0]])
        print("Disliked the recipe!", flush=True)

    test_user.exclude_idx.add(rec_idx[0])

https://thewoksoflife.com/homemade-chinese-egg-noodles/
Liked the recipe!
https://thewoksoflife.com/liangpi-noodles/
Liked the recipe!
https://thewoksoflife.com/nai-wong-bao-custard-buns/
Liked the recipe!
https://thewoksoflife.com/milk-bread-croissants-recipe/
Disliked the recipe!
https://thewoksoflife.com/flaky-apple-pie-recipe/
Disliked the recipe!
https://thewoksoflife.com/hong-kong-egg-tarts/
Liked the recipe!
https://natashaskitchen.com/sweet-cherry-filled-buns-vatrushka-recipe/
Liked the recipe!
https://www.recipetineats.com/brioche/
Disliked the recipe!
https://thewoksoflife.com/peach-squares/
Disliked the recipe!
https://thewoksoflife.com/lotus-leaf-buns/
Liked the recipe!
https://thewoksoflife.com/steamed-red-bean-buns/
Disliked the recipe!
https://thewoksoflife.com/chinese-sausage-buns/
Liked the recipe!
https://thewoksoflife.com/chinese-pineapple-buns-pork/
Disliked the recipe!
https://natashaskitchen.com/snickerdoodle-cookies/
Disliked the recipe!
https://natashaskitchen.c

In [None]:
# data.iloc[test_user.recommend(100), :]

# data.iloc[cosine_similarity(test_user.vec.reshape(1, -1), recipes_vector_mean).argsort()[0][::-1], :]

['asdasd', 'asd', 'asd', 'a', 'sd', 'a', 'asdasd', 'asd']

In [88]:
# Scratch check: append three extra values to recipe 0's embedding vector.
# NOTE(review): presumably a trial of concatenating non-embedding features
# onto the vector; the meaning of 100, 100, 0 is not shown here — confirm.
np.hstack([recipes_vector_mean[0], [100, 100, 0]])

array([-1.91354714e-02,  6.63986132e-02,  8.90939236e-02, ...,
        1.00000000e+02,  1.00000000e+02,  0.00000000e+00])

In [5]:
# Element-wise mean of the "salt" and "water" embeddings — the same averaging
# that get_recipe_vector_mean applies to a recipe's ingredients.
np.mean([w2v_model.wv[k] for k in ["salt", "water"]], axis=0)

array([-0.27168563, -0.16897143,  0.3787686 , -0.09206532,  0.04078965,
       -0.06412343,  0.10212671, -0.23964828, -0.08686227, -0.20235255,
        0.29980344, -0.255956  , -0.06663971,  0.24358888,  0.19246778,
       -0.12094   , -0.0811328 , -0.02435989, -0.1146047 , -0.16485226,
       -0.03452671, -0.0789252 ,  0.1704962 ,  0.0194522 ,  0.24522936,
       -0.0017973 , -0.24600455, -0.11081199, -0.06806016, -0.0466217 ,
       -0.33850163,  0.10324778,  0.00628475, -0.31563273,  0.01918057,
        0.22137383, -0.08368203, -0.12495574,  0.12044365, -0.20289102,
        0.2751795 , -0.2860985 , -0.09349979,  0.00259076,  0.11370799,
       -0.2269685 ,  0.10482226,  0.15076938, -0.14638446, -0.21089286,
       -0.21453089, -0.02883867, -0.03782435,  0.11072452,  0.00501031,
        0.0309107 ,  0.06699391,  0.10127933,  0.07713623,  0.15280007,
        0.12606956,  0.05258124, -0.20845813, -0.24534018, -0.26487112,
        0.19250645,  0.10080535, -0.06604837, -0.03239607, -0.13