# Word2Vec Model

In [3]:
# Import required libraries
import numpy as np
import pandas as pd
import re

from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
import mlflow

import json
import requests
import boto3

import streamlit as st

import warnings
warnings.filterwarnings("ignore")

In [4]:
"""
Run dependencies:
    mlflow server --host 127.0.0.1 --port 5000 
    python word2vec_service.py 
    streamlit run streamlit_web.py
""";

## Random Testing

In [11]:
def process_recipe_vector_means(data: pd.DataFrame, _model: Word2Vec) -> np.ndarray:
    """Embed every recipe as the mean of its ingredients' Word2Vec vectors.

    Args:
        data: Frame with an ``ingredients`` column; each entry is an iterable
            of ingredient tokens.
        _model: Model object providing ``wv`` (token lookup / membership) and
            ``vector_size``.

    Returns:
        Array of shape ``(len(data), _model.vector_size)``, one row per recipe
        in the order of ``data``. A recipe with no in-vocabulary ingredient
        gets an all-zero row.
    """
    recipe_vector_means = []

    for recipe_ingredients in data.ingredients:
        # Look tokens up in the model that was passed in, not in a global `model`.
        embedding_vec = [_model.wv[ing] for ing in recipe_ingredients if ing in _model.wv]
        mean_vec = np.mean(embedding_vec, axis=0) if embedding_vec else np.zeros(_model.vector_size)
        recipe_vector_means.append(mean_vec)

    if not recipe_vector_means:
        # Keep the result 2-D so callers can still read .shape[1] on an empty frame.
        return np.empty((0, _model.vector_size))
    return np.array(recipe_vector_means)

def connect_database():
    """Return a handle to the DynamoDB table used by the app.

    Credentials, region and table name are all read from the ``s3`` section
    of the Streamlit secrets.
    """
    aws = st.secrets.s3
    resource = boto3.resource(
        "dynamodb",
        aws_access_key_id=aws.AWS_ACCESS_KEY_ID,
        aws_secret_access_key=aws.AWS_SECRET_ACCESS_KEY,
        region_name=aws.AWS_DEFAULT_REGION,
    )
    return resource.Table(aws.DB_NAME)

# Load the preprocessed recipes and the trained embedding model from the working directory.
# NOTE(review): unpickling can execute code contained in the file; only load trusted pickles.
data = pd.read_pickle("processed_cookbook.pkl")
model = Word2Vec.load("word2vec.model")
# Handle to the DynamoDB table that stores per-user feedback.
table = connect_database()
# One mean ingredient vector per recipe, in the same row order as `data`.
recipe_vector_means = process_recipe_vector_means(data, model)

# NOTE(review): this repeats the connect_database definition that already appears
# earlier in this cell. The later definition replaces the earlier one at run time,
# so one of the two copies can be removed.
def connect_database():
    """Connect to the DynamoDB database.

    Builds a boto3 DynamoDB resource from the credentials and region stored in
    the ``s3`` section of the Streamlit secrets and returns the table named by
    ``DB_NAME`` in that same section.
    """
    dynamodb = boto3.resource("dynamodb",
                            aws_access_key_id=st.secrets.s3.AWS_ACCESS_KEY_ID,
                            aws_secret_access_key=st.secrets.s3.AWS_SECRET_ACCESS_KEY,
                            region_name=st.secrets.s3.AWS_DEFAULT_REGION)
    table = dynamodb.Table(st.secrets.s3.DB_NAME)
    return table

table = connect_database()

user_id = 110833230122006731136
n = 10

# Get user configuration from DynamoDB
user_config = table.get_item(Key={"user_id": int(user_id)})["Item"]
# Treat a missing feedback map as empty instead of failing on None.keys()
liked_idx = set(map(int, (user_config.get("liked_idx") or {}).keys()))
disliked_idx = set(map(int, (user_config.get("disliked_idx") or {}).keys()))

exclude_indices = liked_idx.union(disliked_idx)

if not exclude_indices:
    # No feedback yet: fall back to a uniform random sample and stop there
    rand = np.random.choice(len(recipe_vector_means), n, replace=False)
    print(rand)
else:
    # User vector = sum of liked recipe vectors minus sum of disliked recipe vectors
    user_vec = (
        np.sum(recipe_vector_means[list(liked_idx)], axis=0)
        - np.sum(recipe_vector_means[list(disliked_idx)], axis=0)
    )
    sims = cosine_similarity(user_vec.reshape(1, -1), recipe_vector_means)[0]

    # Keep ids next to their scores so filtering and sorting cannot misalign them
    recipe_similarity = pd.DataFrame({
        "id": data.id,
        "similarity": sims
    })

    # Drop recipes the user has already rated, then rank best-first
    recipe_similarity = recipe_similarity[~recipe_similarity["id"].isin(exclude_indices)]
    recipe_similarity = recipe_similarity.sort_values(by="similarity", ascending=False)

    # Take the recipe ids of the top rows. An argsort over the already filtered and
    # sorted frame would return row positions, not recipe ids.
    top_ids = recipe_similarity["id"].to_numpy()[:n]

    # Shuffle the top candidates; the sample size is capped at what is actually left
    rand = np.random.choice(top_ids, len(top_ids), replace=False)

# np.random.choice(arg_sorted_sims[:n], n, replace=False)


In [12]:
# Fetch the raw DynamoDB item for the test user and display it as stored
user_id = 110833230122006731136
table_test = table.get_item(Key={"user_id": int(user_id)})["Item"]

table_test

{'disliked_idx': {'191': '2025-07-22 22:03:20',
  '2130': '2025-07-22 22:03:24',
  '572': '2025-07-22 22:03:38',
  '3207': '2025-07-22 22:03:21',
  '1216': '2025-07-22 22:09:55',
  '1149': '2025-07-22 22:09:54',
  '1258': '2025-07-22 22:03:16',
  '322': '2025-07-22 22:03:39',
  '1104': '2025-07-22 22:06:46',
  '1246': '2025-07-22 22:03:34',
  '3457': '2025-07-22 22:07:03',
  '2774': '2025-07-22 22:03:22',
  '2466': '2025-07-22 22:06:53',
  '524': '2025-07-22 22:03:23',
  '3763': '2025-07-22 22:03:14',
  '4589': '2025-07-22 22:03:31',
  '3985': '2025-07-22 22:03:35'},
 'user_id': Decimal('110833230122006731136'),
 'liked_idx': {'4914': '2025-07-22 22:03:36',
  '3504': '2025-07-22 22:03:28',
  '2118': '2025-07-22 22:03:32',
  '2755': '2025-07-22 22:07:50',
  '4025': '2025-07-22 22:03:27',
  '2244': '2025-07-22 22:08:10',
  '4720': '2025-07-22 22:03:26',
  '284': '2025-07-22 22:03:15',
  '6161': '2025-07-22 22:03:18',
  '274': '2025-07-22 22:03:29'}}

In [138]:
# Show only the liked-recipe map of the fetched item (keyed by recipe index string)
table_test.get("liked_idx")

{'5717': '2025-07-17'}

In [153]:
from datetime import datetime

# Snapshot of the current feedback sets plus a timestamp. Date and time are
# separated by a space so the value matches the "YYYY-MM-DD HH:MM:SS" strings
# stored per recipe in the DynamoDB item, instead of running the two together.
test_map = {"liked_idx": list(map(int, liked_idx)),
            "disliked_idx": list(map(int, disliked_idx)),
            "date": datetime.now().strftime("%Y-%m-%d %H:%M:%S")}
test_map


{'liked_idx': [2128, 277, 5509],
 'disliked_idx': [2819, 195, 3719, 2378, 2092, 916],
 'date': '2025-07-1722:41:21'}

In [7]:
# Collect the rated recipe keys exactly as stored in the item (no int conversion here)
liked_idx = {*table_test.get("liked_idx").keys()}
disliked_idx = {*table_test.get("disliked_idx").keys()}

# Everything the user has already rated, liked or disliked
exclude_indices = liked_idx | disliked_idx


In [8]:
# Display the combined set of already-rated recipe keys
exclude_indices

{'1106', '1758', '2227', '425', '5869', '5948', '795'}

In [78]:
# Query the recommendation endpoint on localhost (the service must already be running).
# The timeout keeps the cell from hanging forever if the server accepts the
# connection but never answers; raise_for_status surfaces HTTP errors before decoding.
req = requests.get("http://localhost:8000/recommend/110833230122006731136/8", timeout=10)
req.raise_for_status()
# Decoded twice: req.json() yields a string that is itself JSON-encoded
np.asarray(json.loads(req.json()))

ConnectionError: HTTPConnectionPool(host='localhost', port=8000): Max retries exceeded with url: /recommend/110833230122006731136/8 (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x30b9230e0>: Failed to establish a new connection: [Errno 61] Connection refused'))

## Model Training

In [None]:
# Skip-gram works better for infrequent words and smaller datasets
# CBOW works better for frequent words and larger datasets

# TODO: move these hyperparameters into a .yaml config file
params = {
    "vector_size": 100,  # dimensionality of each ingredient embedding
    "window": 11,        # matches the rounded-up median recipe length computed under "More Testing"
    "sg": 0,             # gensim: 0 selects CBOW, 1 would select skip-gram
    "epochs": 1000
}

model_path = "word2vec.model"
recipe_path = "processed_cookbook.pkl"

# Training
# NOTE(review): unpickling can execute code contained in the file; only load trusted pickles.
data = pd.read_pickle(recipe_path)

# Training call is disabled; `params` is therefore unused in this cell as written.
# w2v_model = Word2Vec(data.ingredients, **params)
w2v_model = Word2Vec.load(model_path) # Just for testing: reuse the saved model instead of retraining

### Wrapping the Word2Vec Model in an MLflow Pyfunc

The next cell writes the model class to `./models/word2vec.py` so that MLflow can load it through its Models From Code workflow.

In [2]:
%%writefile ./models/word2vec.py 
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
import mlflow
import json

class Word2VecModel(mlflow.pyfunc.PythonModel):
    """MLflow pyfunc wrapper that recommends recipes from Word2Vec ingredient embeddings.

    Each recipe is represented by the mean of its ingredients' vectors. A user is
    represented by a weighted sum of liked recipe vectors minus a weighted sum of
    disliked ones, and unrated recipes are ranked by cosine similarity to that
    user vector.
    """

    # Relative influence of liked vs. disliked recipes on the user vector
    LIKED_WEIGHT = 0.8
    DISLIKED_WEIGHT = 0.3

    def __init__(self, n_recs=10):
        """Store the number of recommendations to return per prediction."""
        self.n_recs = n_recs
        self.model = None
        # Both are populated by load_context
        self.data = None
        self.recipe_vector_means = None

    def load_context(self, context):
        """Load the Word2Vec model and recipe table, then embed every recipe.

        args:
            context: MLflow context whose ``artifacts`` mapping provides
                ``model_path`` and ``recipe_path``.
        """
        self.model = Word2Vec.load(context.artifacts["model_path"])
        # NOTE(review): unpickling can execute code from the file; the artifact must be trusted.
        self.data = pd.read_pickle(context.artifacts["recipe_path"])

        # Calculate recipe vector means
        vector_size = self.model.vector_size
        recipe_vector_means = []
        for recipe_ingredients in self.data.ingredients:
            embedding_vec = [self.model.wv[ing] for ing in recipe_ingredients if ing in self.model.wv]
            # Recipes with no in-vocabulary ingredient get a zero vector
            mean_vec = np.mean(embedding_vec, axis=0) if embedding_vec else np.zeros(vector_size)
            recipe_vector_means.append(mean_vec)

        if recipe_vector_means:
            self.recipe_vector_means = np.array(recipe_vector_means)
        else:
            # Keep the matrix 2-D even when the recipe table is empty
            self.recipe_vector_means = np.empty((0, vector_size))

    def predict(self, context, model_input: list[list[int]]) -> str:
        """
        Predicts the top N recommended recipes based on user preferences.

        Liked and disliked values are matched against the ``id`` column of the
        recipe table, the same column used for exclusion and for the returned
        values. Ids that are not present in the table are ignored.

        args:
            context: Unused; present to satisfy the pyfunc interface.
            model_input: A list containing two collections:
                - The first contains ids of liked recipes.
                - The second contains ids of disliked recipes.
        returns:
            A JSON string containing the ids of the recommended recipes, in
            random order. Fewer than ``n_recs`` ids are returned when fewer
            unrated recipes are available.
        """
        liked_idx = set(model_input[0])
        disliked_idx = set(model_input[1])
        exclude_indices = liked_idx.union(disliked_idx)

        recipe_ids = self.data.id

        # Nothing to rank or sample from
        if len(recipe_ids) == 0:
            return json.dumps([])

        # If there are no liked or disliked recipes, return random recipe ids
        if not exclude_indices:
            n_random = min(self.n_recs, len(recipe_ids))
            rand = np.random.choice(recipe_ids.to_numpy(), n_random, replace=False)
            return json.dumps(rand.tolist())

        # Select rows by id so the user vector, the exclusion filter and the
        # returned values all refer to the same identifier
        liked_rows = recipe_ids.isin(liked_idx).to_numpy()
        disliked_rows = recipe_ids.isin(disliked_idx).to_numpy()
        user_vec = (
            self.LIKED_WEIGHT * self.recipe_vector_means[liked_rows].sum(axis=0)
            - self.DISLIKED_WEIGHT * self.recipe_vector_means[disliked_rows].sum(axis=0)
        )

        # Calculate cosine similarity between user vector and recipe vectors
        sims = cosine_similarity(user_vec.reshape(1, -1), self.recipe_vector_means)[0]

        # Keep ids next to their scores so filtering and sorting cannot misalign them
        recipe_similarity = pd.DataFrame({
            "id": recipe_ids,
            "similarity": sims
        })

        # Exclude already-rated recipes, then rank best-first
        recipe_similarity = recipe_similarity[~recipe_similarity.id.isin(exclude_indices)]
        recipe_similarity = recipe_similarity.sort_values(by="similarity", ascending=False)
        top_ids = recipe_similarity.id.to_numpy()[:self.n_recs]

        # Shuffle the top candidates; sampling exactly len(top_ids) avoids the
        # ValueError raised when fewer than n_recs candidates remain
        rand = np.random.choice(top_ids, len(top_ids), replace=False)
        return json.dumps(rand.tolist())

# Config
# Number of recommendations the registered model instance is constructed with
n_recs = 10
# Register the model instance for this file with MLflow
mlflow.models.set_model(Word2VecModel(n_recs))

Overwriting ./models/word2vec.py


## More Testing

In [24]:
# Sanity check: the ten tokens closest to "salt" in the embedding space, with their scores
w2v_model.wv.most_similar("salt", topn=10)

[('salt_black_pepper', 0.22019974887371063),
 ('water', 0.21628636121749878),
 ('red_chili', 0.2155621200799942),
 ('snow_pea', 0.20578241348266602),
 ('onion', 0.20268560945987701),
 ('spicy_bean', 0.2002008855342865),
 ('carrot', 0.1976989507675171),
 ('salt_plus', 0.18816617131233215),
 ('garlic', 0.17746944725513458),
 ('worcestershire', 0.17670156061649323)]

In [17]:
# Window size is taken from the median number of ingredients per recipe, rounded up
ingredient_length = np.array(list(map(len, data.ingredients)))
np.ceil(np.median(ingredient_length))

11.0

In [9]:
# Reload the processed recipes and display every column of the row at position 2575
data = pd.read_pickle("processed_cookbook.pkl")
data.iloc[2575]

id                                                                  2575
recipe_title                      Creamy Fall Soup In Acorn Squash Bowls
recipe_url             https://minimalistbaker.com/creamy-fall-soup-i...
ingredients            [acorn_squash, melted_coconut_oil, maple_syrup...
num_steps                                                            8.0
total_time                                                          90.0
prep_time                                                           15.0
cook_time                                                           75.0
custom_time                                                          0.0
calories                                                           285.0
carbohydrates                                                       51.7
protein                                                              4.3
fat                                                                  9.1
saturated_fat                                      