# Word2Vec Model

In [13]:
import re
import warnings
from pathlib import Path

import numpy as np
import pandas as pd

# Silence every warning for the rest of the session: no category or module
# filter is given, so warnings raised by any library used below are hidden too.
warnings.filterwarnings("ignore")

In [87]:
# Scratch check: build a small set, then rebuild a set from a NumPy array
# made of its elements.
test = set()
for value in (1, 2, 3):
    test.add(value)

set(np.array(list(test)))

{1, 2, 3}

In [14]:
# Load the preprocessed cookbook. Later cells access `data.ingredients`
# (iterated recipe by recipe) and `data.recipe_url`.
# NOTE(review): unpickling can execute arbitrary code — only load this file
# when it comes from a trusted source.
data = pd.read_pickle("processed_cookbook.pkl")

In [15]:
from gensim.models import Word2Vec
# Rule of thumb for choosing the training algorithm:
#   - skip-gram: better for infrequent words and smaller datasets
#   - CBOW:      better for frequent words and larger datasets
# `sg=0` selects CBOW here.
w2v_model = Word2Vec(
    sentences=data.ingredients,
    vector_size=100,
    window=11,
    sg=0,
    epochs=1000,
)

In [16]:
# Reuse the persisted model when it exists so every run works with the same
# embeddings. Otherwise persist the model trained in the previous cell, so a
# fresh checkout survives "Restart & Run All" instead of failing on a missing
# file.
MODEL_PATH = Path("word2vec.model")
if MODEL_PATH.exists():
    w2v_model = Word2Vec.load(str(MODEL_PATH))
else:
    w2v_model.save(str(MODEL_PATH))


In [17]:
# Window-size heuristic: the median number of ingredients per recipe,
# rounded up, is what the Word2Vec `window` is set to.
ingredient_length = np.array(list(map(len, data.ingredients)))
np.ceil(np.median(ingredient_length))

11.0

In [18]:
# Sanity check of the embeddings: list the tokens closest to "oil" together
# with their similarity scores.
w2v_model.wv.most_similar("oil")

[('vegetable_oil', 0.6726031303405762),
 ('neutral_oil', 0.5508599281311035),
 ('peanut_oil', 0.4571745693683624),
 ('olive_oil', 0.4171428978443146),
 ('canola_oil', 0.3822813630104065),
 ('lowsodium_chicken_stock', 0.30548906326293945),
 ('chinese_wine', 0.2943076491355896),
 ('chili_oil', 0.27140793204307556),
 ('scallion', 0.261827290058136),
 ('tomato_passata', 0.26169365644454956)]

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def get_recipe_vector_mean(ingredients: list, model: "Word2Vec") -> np.ndarray:
    """
    Embed a recipe as the mean of its ingredients' Word2Vec vectors.

    Parameters
    ----------
    ingredients : list
        Ingredient tokens of a single recipe. Tokens that are not present in
        ``model.wv`` are skipped.
    model : Word2Vec
        Model providing the token lookup ``model.wv`` and ``model.vector_size``.

    Returns
    -------
    np.ndarray
        Element-wise mean of the known ingredient vectors, or an all-zero
        vector of length ``model.vector_size`` when no ingredient is known
        (including an empty ingredient list).
    """
    known_vectors = [model.wv[ing] for ing in ingredients if ing in model.wv]
    if not known_vectors:
        # Plain zeros: multiplying zeros by -1 only produced negative zeros.
        return np.zeros(model.vector_size)
    return np.mean(known_vectors, axis=0)
    

In [65]:
# recipes_vector_mean = [get_recipe_vector_mean(recipe, w2v_model) for recipe in data.ingredients]
# # get_recipe_vector_mean(["blueberries", "salt", "oil", "egg", "flour", "apple"], w2v_model)
# target_recipe_mean = get_recipe_vector_mean(["salmon", "cheese", "broccoli",
#                                              "garlid_bread", "fries"], w2v_model)
# get_top_n_similarities(target_recipe_mean, recipes_vector_mean, 5)

# Embed every recipe as the mean of its ingredient vectors and stack the
# results into a single array.
recipes_vector_mean = np.array(
    [get_recipe_vector_mean(recipe, w2v_model) for recipe in data.ingredients]
)

# Rank all recipes by cosine similarity to the first recipe, most similar first.
sims = cosine_similarity(
    recipes_vector_mean[0].reshape(1, -1),
    recipes_vector_mean,
)[0]
np.argsort(sims)[::-1]

array([   0,  189,  232, ..., 1285, 1588, 6148])

In [None]:
# Recipe embeddings used by `User.recommend`: one mean ingredient vector per
# recipe, stacked into a single array.
recipes_vector_mean = np.array(
    [get_recipe_vector_mean(recipe, w2v_model) for recipe in data.ingredients]
)

class User:
    """
    Preference profile that is nudged by like/dislike feedback on recipes.

    The profile is a single vector in the recipe-embedding space. Liked
    recipes pull it towards their vector, disliked recipes push it away with
    a smaller step. Recommendations are the recipes whose vectors have the
    highest cosine similarity to the profile.

    Parameters
    ----------
    vector_dim : int
        Dimensionality of the recipe vectors.
    recipe_vectors : array-like, optional
        Matrix with one recipe vector per row to recommend from. When omitted,
        the module-level ``recipes_vector_mean`` is used at recommendation time.
    """

    def __init__(self, vector_dim, recipe_vectors=None):
        # Preference vector; all zeros until the first feedback is recorded.
        self.vec = np.zeros(vector_dim)
        # Dislikes move the profile less than likes do.
        self.dislike_step = 0.3
        self.like_step = 1
        # Row indices of recipes that must not be recommended again.
        self.exclude_idx = set()
        self.recipe_vectors = recipe_vectors

    def _recipe_matrix(self):
        """Return the recipe matrix to recommend from (explicit or global)."""
        if self.recipe_vectors is None:
            return recipes_vector_mean
        return self.recipe_vectors

    def like(self, recipe_vector_mean):
        """Move the preference vector towards ``recipe_vector_mean``."""
        self.vec += self.like_step * recipe_vector_mean

    def dislike(self, recipe_vector_mean):
        """Move the preference vector away from ``recipe_vector_mean``."""
        self.vec -= self.dislike_step * recipe_vector_mean

    def recommend(self, n):
        """
        Return the indices of up to ``n`` recommended recipes.

        With an all-zero preference vector the recipes are drawn at random;
        otherwise they are ranked by cosine similarity to the preference
        vector, best first. Recipes listed in ``exclude_idx`` are never
        returned, so fewer than ``n`` indices come back when not enough
        candidates remain.
        """
        vectors = self._recipe_matrix()
        excluded = np.array(sorted(self.exclude_idx), dtype=int)

        if not self.vec.any():
            # Cold start: sample only among recipes that were not shown yet.
            available = np.setdiff1d(np.arange(len(vectors)), excluded)
            if available.size == 0:
                return available
            return np.random.choice(available, min(n, available.size), replace=False)

        sims = cosine_similarity(self.vec.reshape(1, -1), vectors)[0]

        if excluded.size:
            # -1 is a valid cosine value, so it cannot serve as a sentinel:
            # -inf sorts excluded recipes strictly behind every real candidate.
            sims[excluded] = -np.inf

        ranked = sims.argsort()[::-1]
        # Excluded recipes occupy the tail of the ranking; cut them off.
        n_available = max(len(sims) - excluded.size, 0)
        return ranked[:min(n, n_available)]
    
# Fresh user profile whose preference vector has the same dimensionality as
# the Word2Vec embeddings.
test_user = User(w2v_model.vector_size)

In [74]:
# Smoke test: request a single recommendation and print the returned index array.
idx = test_user.recommend(1)
print(idx)
# Bare expression in the middle of a cell: it is evaluated but not displayed.
recipes_vector_mean[0]
# print(data.recipe_url.iloc[idx[0]])
# Last expression of the cell, so this is the value shown as the cell output.
w2v_model.vector_size

[2404]


100

In [386]:
from time import sleep

# Interactive feedback loop: show one recommendation at a time and update the
# user's preference vector from the answer.
while True:
    rec_idx = test_user.recommend(1)
    if len(rec_idx) == 0:
        print("No more recipes to recommend.", flush=True)
        break

    rec_pos = int(rec_idx[0])
    print(data.recipe_url.iloc[rec_pos], flush=True)

    user_input = input("0 for dislike, 1 for like, -1 to quit: ").strip()

    # Quit before any feedback is recorded; otherwise the quit command itself
    # would be counted as a dislike of the recipe currently shown.
    if user_input == "-1":
        print("Exiting the recommendation system.", flush=True)
        break

    if user_input == "1":
        test_user.like(recipes_vector_mean[rec_pos])
        print("Liked the recipe!", flush=True)
    elif user_input == "0":
        test_user.dislike(recipes_vector_mean[rec_pos])
        print("Disliked the recipe!", flush=True)
    else:
        # Typos must not silently count as a dislike: ask again without
        # recording feedback or hiding the recipe.
        print("Unrecognized input, please enter 0, 1 or -1.", flush=True)
        continue

    # Never show a recipe again once feedback has been given on it.
    test_user.exclude_idx.add(rec_pos)

https://thewoksoflife.com/homemade-chinese-egg-noodles/
Liked the recipe!
https://thewoksoflife.com/liangpi-noodles/
Liked the recipe!
https://thewoksoflife.com/nai-wong-bao-custard-buns/
Liked the recipe!
https://thewoksoflife.com/milk-bread-croissants-recipe/
Disliked the recipe!
https://thewoksoflife.com/flaky-apple-pie-recipe/
Disliked the recipe!
https://thewoksoflife.com/hong-kong-egg-tarts/
Liked the recipe!
https://natashaskitchen.com/sweet-cherry-filled-buns-vatrushka-recipe/
Liked the recipe!
https://www.recipetineats.com/brioche/
Disliked the recipe!
https://thewoksoflife.com/peach-squares/
Disliked the recipe!
https://thewoksoflife.com/lotus-leaf-buns/
Liked the recipe!
https://thewoksoflife.com/steamed-red-bean-buns/
Disliked the recipe!
https://thewoksoflife.com/chinese-sausage-buns/
Liked the recipe!
https://thewoksoflife.com/chinese-pineapple-buns-pork/
Disliked the recipe!
https://natashaskitchen.com/snickerdoodle-cookies/
Disliked the recipe!
https://natashaskitchen.c

In [None]:
# data.iloc[test_user.recommend(100), :]

# data.iloc[cosine_similarity(test_user.vec.reshape(1, -1), recipes_vector_mean).argsort()[0][::-1], :]

['asdasd', 'asd', 'asd', 'a', 'sd', 'a', 'asdasd', 'asd']

In [88]:
# Scratch: append three extra values to the first recipe's vector by
# horizontally stacking them onto it.
np.hstack([recipes_vector_mean[0], [100, 100, 0]])

array([-1.91354714e-02,  6.63986132e-02,  8.90939236e-02, ...,
        1.00000000e+02,  1.00000000e+02,  0.00000000e+00])

In [5]:
# Average the embeddings of "salt" and "water" by hand. Unlike
# get_recipe_vector_mean, there is no vocabulary check before the lookup.
np.mean([w2v_model.wv[k] for k in ["salt", "water"]], axis=0)

array([-0.27168563, -0.16897143,  0.3787686 , -0.09206532,  0.04078965,
       -0.06412343,  0.10212671, -0.23964828, -0.08686227, -0.20235255,
        0.29980344, -0.255956  , -0.06663971,  0.24358888,  0.19246778,
       -0.12094   , -0.0811328 , -0.02435989, -0.1146047 , -0.16485226,
       -0.03452671, -0.0789252 ,  0.1704962 ,  0.0194522 ,  0.24522936,
       -0.0017973 , -0.24600455, -0.11081199, -0.06806016, -0.0466217 ,
       -0.33850163,  0.10324778,  0.00628475, -0.31563273,  0.01918057,
        0.22137383, -0.08368203, -0.12495574,  0.12044365, -0.20289102,
        0.2751795 , -0.2860985 , -0.09349979,  0.00259076,  0.11370799,
       -0.2269685 ,  0.10482226,  0.15076938, -0.14638446, -0.21089286,
       -0.21453089, -0.02883867, -0.03782435,  0.11072452,  0.00501031,
        0.0309107 ,  0.06699391,  0.10127933,  0.07713623,  0.15280007,
        0.12606956,  0.05258124, -0.20845813, -0.24534018, -0.26487112,
        0.19250645,  0.10080535, -0.06604837, -0.03239607, -0.13