# Model Training - FastText

In [1]:
# Import required libraries
import numpy as np
import pandas as pd

from gensim.models import FastText
from sklearn.metrics.pairwise import cosine_similarity
import mlflow

import json
import requests
import boto3

import streamlit as st

import warnings
warnings.filterwarnings("ignore")

In [15]:
import yaml

def load_params():
    """Read the project configuration from ``../params.yaml``.

    Returns:
        The parsed YAML document, exactly as produced by ``yaml.safe_load``.
    """
    with open("../params.yaml") as params_file:
        return yaml.safe_load(params_file)

params = load_params()

## Sandbox

In [3]:
def connect_database():
    """Open the DynamoDB table used by the app.

    Credentials, region and table name are all read from the ``s3`` section
    of the Streamlit secrets.

    Returns:
        The ``Table`` resource named by ``st.secrets.s3.DB_NAME``.
    """
    aws_secrets = st.secrets.s3
    resource = boto3.resource(
        "dynamodb",
        aws_access_key_id=aws_secrets.AWS_ACCESS_KEY_ID,
        aws_secret_access_key=aws_secrets.AWS_SECRET_ACCESS_KEY,
        region_name=aws_secrets.AWS_DEFAULT_REGION,
    )
    return resource.Table(aws_secrets.DB_NAME)

# Load the recipe table and the trained FastText model from the paths configured in
# params.yaml, then open the DynamoDB table.
# NOTE(review): `pd.read_pickle` unpickles arbitrary objects; only load files from a trusted source.
data = pd.read_pickle(params["model_pipeline"]["recipe_path"])
model = FastText.load(params["model_pipeline"]["model_path"])
table = connect_database()

In [4]:
# IDF (Inverse Document Frequency)
# Count in a single pass how many recipes contain each ingredient, instead of
# rescanning every recipe once per vocabulary word.
n_recipes = len(data.ingredients)
doc_freq = {}
for recipe_ingredients in data.ingredients:
    for ingredient in set(recipe_ingredients):  # set(): count each recipe at most once
        doc_freq[ingredient] = doc_freq.get(ingredient, 0) + 1

idf = {}
vector_embedding = {}
for i, vocab in enumerate(model.wv.index_to_key):
    n_recipe_contains_vocab = doc_freq.get(vocab, 0)
    if n_recipe_contains_vocab == 0:
        # The word is in the model vocabulary but in no recipe: its IDF would divide
        # by zero and poison the embedding with inf/NaN, so leave it out.
        continue
    idf[vocab] = np.log(n_recipes / n_recipe_contains_vocab)

    # Apply the IDF weight to the (L2-normalized) FastText embedding.
    raw_vec = model.wv.vectors[i]
    weighted_vec = raw_vec / np.linalg.norm(raw_vec) * idf[vocab]

    # Normalize the combined embedding again. An ingredient present in every recipe has
    # IDF 0 and therefore a zero vector, which cannot be normalized; skip it instead of
    # storing NaNs.
    # NOTE(review): re-normalizing cancels any positive IDF factor, so the stored vector
    # equals the unit-length raw embedding. Confirm whether IDF was meant to affect scoring.
    weighted_norm = np.linalg.norm(weighted_vec)
    if not np.isfinite(weighted_norm) or weighted_norm == 0:
        continue
    vector_embedding[vocab] = weighted_vec / weighted_norm

idf

{'salt': 0.3928653931060427,
 'garlic': 0.9728863994977894,
 'water': 1.2474342884510103,
 'olive_oil': 1.3923734071962728,
 'black_pepper': 1.400105404479599,
 'sugar': 1.7065040189406901,
 'egg': 1.7908029890020982,
 'soy': 1.8923467061332238,
 'onion': 2.013707563137491,
 'ginger': 2.071498688165899,
 'vanilla_extract': 2.1601224114682016,
 'lemon': 2.1754537222149906,
 'unsalted_butter': 2.2039456780092967,
 'carrot': 2.3909022373074,
 'allpurpose_flour': 2.4993647329632305,
 'parsley': 2.522941463527227,
 'baking': 2.5289235352047745,
 'maple_syrup': 2.5349416075303375,
 'scallion': 2.571831036216051,
 'sesame_oil': 2.571831036216051,
 'milk': 2.6079667425452273,
 'pepper': 2.6298494537947352,
 'cinnamon': 2.717665659801652,
 'white_pepper': 2.7470795450079453,
 'cilantro': 2.759595352939776,
 'vegetable_oil': 2.7646458687258444,
 'oil': 2.7876942641621354,
 'cornstarch': 2.830028627988696,
 'lime': 2.846377765990226,
 'neutral_oil': 2.8518874218011954,
 'shaoxing_wine': 2.8999872

In [6]:
np.linalg.norm(vector_embedding.get("salt"))

1.0

In [21]:
user_id = 110833230122006731136
n=10
text = "salt, pepper, garlic, noodle"
query = [ingredient.strip() for ingredient in text.split(",")]

# Weights used to blend the two similarity signals, and how many rows to display.
# NOTE(review): `n` above is not used by this cell; the table shows the top 5 recipes.
W_COSINE = 0.2
W_MAXSIM = 0.8
TOP_K = 5

# Report query words that have no embedding; they are ignored below.
for word in query:
    if word not in vector_embedding:
        print(f"'{word}' not in vocabulary.")

# Get the vectors for the search query (one row per known ingredient).
query_vec = np.array([vector_embedding[word] for word in query if word in vector_embedding])
if query_vec.size == 0:
    # Without this guard the matrix product below fails with an opaque shape error.
    raise ValueError("None of the query ingredients are in the vocabulary.")

# One pass over the recipes computes both signals:
#   * max_sim: symmetric average of the best (non-negative) token-to-token similarities
#   * mean_ingredients_vec: mean embedding of the recipe's known ingredients
# Results are collected in lists; growing an array with np.append is quadratic.
max_sim = []
mean_ingredients_vec = []
for recipe_ingredients in data.ingredients:
    known_vecs = [vector_embedding[ing] for ing in recipe_ingredients if ing in vector_embedding]

    if not known_vecs:
        # No known ingredient: the recipe scores 0 on both signals.
        max_sim.append(0.0)
        mean_ingredients_vec.append(np.zeros(model.vector_size))
        continue

    embedding_vec = np.array(known_vecs)
    tokens_sim = (query_vec @ embedding_vec.T).clip(min=0.0)
    a_best = tokens_sim.max(axis=1)  # best recipe match for each query ingredient
    b_best = tokens_sim.max(axis=0)  # best query match for each recipe ingredient
    max_sim.append(0.5 * (a_best.mean() + b_best.mean()))
    mean_ingredients_vec.append(embedding_vec.mean(axis=0))

max_sim = np.array(max_sim, dtype=float)
mean_ingredients_vec = np.array(mean_ingredients_vec)

# Similarity between the mean query vector and every recipe's mean vector.
mean_query_vec = query_vec.mean(axis=0)
cosine_sim_matrix = mean_query_vec.reshape(1, -1) @ mean_ingredients_vec.T
score = W_COSINE * cosine_sim_matrix[0] + W_MAXSIM * max_sim

# Highest-scoring recipes first.
rec_idx = np.argsort(score)[-TOP_K:][::-1]
data.iloc[rec_idx]


Unnamed: 0,id,recipe_title,recipe_url,ingredients,num_steps,total_time,prep_time,cook_time,custom_time,calories,...,sodium,potassium,fiber,sugar,vitamin_a,vitamin_c,calcium,iron,serving_size,image_url
2369,2369,Garlic,https://minimalistbaker.com/how-to-roast-garlic/,"[garlic, oliveoil, salt]",9.0,65.0,5.0,60.0,0.0,164.0,...,78.0,120.0,0.6,0.3,0.0,13.2,50.0,0.5,1.0,https://minimalistbaker.com/wp-content/uploads...
4766,4766,Garlic Sauted Spinach,https://www.recipetineats.com/garlic-sauteed-s...,"[bunch_spinach, olive_oil, garlic, salt, pepper]",6.0,9.0,5.0,4.0,0.0,100.0,...,409.0,837.0,3.0,0.0,14065.0,43.1,154.0,4.1,0.0,https://www.recipetineats.com/tachyon/2018/12/...
4049,4049,Pork Skewers,https://natashaskitchen.com/pork-skewers-shash...,"[pork_sirloin, dry_red_wine_merlot, olive_oil,...",8.0,255.0,240.0,15.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,https://natashaskitchen.com/wp-content/uploads...
3771,3771,Breaded Baked Chicken Drumsticks,https://natashaskitchen.com/breaded-baked-chic...,"[chicken_drumstick, salt, pepper, dijon_mustar...",6.0,68.0,10.0,58.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,https://natashaskitchen.com/wp-content/uploads...
368,368,Reverse Sear Steak On Traeger With Smoked Garlic,https://www.justonecookbook.com/reverse-sear-s...,"[tenderloin_steak, salt, black_pepper, neutral...",18.0,150.0,10.0,65.0,60.0,392.0,...,500.0,854.0,1.0,1.0,6.0,3.0,71.0,4.0,0.0,https://www.justonecookbook.com/wp-content/upl...


In [12]:
user_id = 110833230122006731136
# DynamoDB leaves the "Item" key out of the response when no row matches the key,
# so fall back to an empty record instead of raising KeyError for unknown users.
response = table.get_item(Key={"user_id": int(user_id)})
table_test = response.get("Item", {})

table_test

{'disliked_idx': {'191': '2025-07-22 22:03:20',
  '2130': '2025-07-22 22:03:24',
  '572': '2025-07-22 22:03:38',
  '3207': '2025-07-22 22:03:21',
  '1216': '2025-07-22 22:09:55',
  '1149': '2025-07-22 22:09:54',
  '1258': '2025-07-22 22:03:16',
  '322': '2025-07-22 22:03:39',
  '1104': '2025-07-22 22:06:46',
  '1246': '2025-07-22 22:03:34',
  '3457': '2025-07-22 22:07:03',
  '2774': '2025-07-22 22:03:22',
  '2466': '2025-07-22 22:06:53',
  '524': '2025-07-22 22:03:23',
  '3763': '2025-07-22 22:03:14',
  '4589': '2025-07-22 22:03:31',
  '3985': '2025-07-22 22:03:35'},
 'user_id': Decimal('110833230122006731136'),
 'liked_idx': {'4914': '2025-07-22 22:03:36',
  '3504': '2025-07-22 22:03:28',
  '2118': '2025-07-22 22:03:32',
  '2755': '2025-07-22 22:07:50',
  '4025': '2025-07-22 22:03:27',
  '2244': '2025-07-22 22:08:10',
  '4720': '2025-07-22 22:03:26',
  '284': '2025-07-22 22:03:15',
  '6161': '2025-07-22 22:03:18',
  '274': '2025-07-22 22:03:29'}}

In [None]:
table_test.get("liked_idx")

{'5717': '2025-07-17'}

In [None]:
# from datetime import datetime
# test_map = {"liked_idx": list(map(int, list(liked_idx))), 
#             "disliked_idx": list(map(int, list(disliked_idx))),
#             "date": datetime.now().strftime("%Y-%m-%d" + "%H:%M:%S")}
# test_map


{'liked_idx': [2128, 277, 5509],
 'disliked_idx': [2819, 195, 3719, 2378, 2092, 916],
 'date': '2025-07-1722:41:21'}

In [None]:
# liked_idx = set(table_test.get("liked_idx").keys())
# disliked_idx = set(table_test.get("disliked_idx").keys())

# exclude_indices = set(liked_idx).union(disliked_idx)


In [8]:
# NOTE(review): `exclude_indices` is only assigned in commented-out code in the visible
# notebook, so this cell raises NameError on a fresh kernel (Restart & Run All).
exclude_indices

{'1106', '1758', '2227', '425', '5869', '5948', '795'}

## Model Training

Write the model class to a Python file (`../models/fast_text.py`) so that it can be imported and logged with MLflow (Models From Code).

In [27]:
%%writefile ../models/fast_text.py 
import numpy as np
import pandas as pd
from gensim.models import FastText
from sklearn.metrics.pairwise import cosine_similarity
import mlflow
import json

class RecipeFastText(mlflow.pyfunc.PythonModel):
    """MLflow pyfunc model that ranks recipes against a list of query ingredients.

    Args:
        params: Scoring settings providing ``w_cosine``, ``w_maxsim`` and ``n_recs``.
            May be a mapping (for example the dict parsed from ``params.yaml``) or an
            object exposing those names as attributes.
    """

    def __init__(self, params):
        self.params = params

    def _param(self, name):
        """Return scoring setting ``name`` from ``self.params`` (attribute or mapping key)."""
        try:
            return getattr(self.params, name)
        except AttributeError:
            # Plain dicts (e.g. from yaml.safe_load) have no such attribute.
            return self.params[name]

    def load_context(self, context):
        """Load the FastText model and recipe table, then precompute recipe embeddings.

        Populates ``self.model``, ``self.data``, ``self.idf``, ``self.ing_embeddings``
        and ``self.recipe_vector_means``.
        """
        self.model = FastText.load(context.artifacts["model_path"])
        # NOTE(review): read_pickle unpickles arbitrary objects; only load trusted artifacts.
        self.data = pd.read_pickle(context.artifacts["recipe_path"])

        # Document frequency: number of recipes containing each ingredient, counted in a
        # single pass instead of rescanning every recipe once per vocabulary word.
        n_recipes = len(self.data.ingredients)
        doc_freq = {}
        for recipe_ingredients in self.data.ingredients:
            for ing in set(recipe_ingredients):
                doc_freq[ing] = doc_freq.get(ing, 0) + 1

        # Apply IDF (Inverse Document Frequency)
        self.idf = {}
        self.ing_embeddings = {}

        for i, vocab in enumerate(self.model.wv.index_to_key):
            n_recipe_contains_vocab = doc_freq.get(vocab, 0)
            if n_recipe_contains_vocab == 0:
                # Word is in the model vocabulary but in no recipe: its IDF would divide
                # by zero and yield inf/NaN embeddings, so treat it as unknown.
                continue
            self.idf[vocab] = np.log(n_recipes / n_recipe_contains_vocab)

            # Apply the IDF weight to the FastText embedding, then L2-normalize.
            # NOTE(review): the normalization cancels any positive IDF factor, so the
            # stored vector equals the unit-length raw embedding. Confirm the intent.
            weighted_vec = self.model.wv.vectors[i] * self.idf[vocab]
            weighted_norm = np.linalg.norm(weighted_vec)
            if not np.isfinite(weighted_norm) or weighted_norm == 0:
                # IDF of 0 (ingredient in every recipe) gives a zero vector; skip it
                # rather than storing NaNs.
                continue
            self.ing_embeddings[vocab] = weighted_vec / weighted_norm

        # Per-recipe embedding matrices and their means are query-independent, so build
        # them once here instead of on every call to get_score.
        self._recipe_embeddings = []
        recipe_vector_means = []
        for recipe_ingredients in self.data.ingredients:
            known_vecs = [self.ing_embeddings[ing] for ing in recipe_ingredients if ing in self.ing_embeddings]
            if known_vecs:
                recipe_matrix = np.array(known_vecs)
                self._recipe_embeddings.append(recipe_matrix)
                recipe_vector_means.append(recipe_matrix.mean(axis=0))
            else:
                # No known ingredient: zero vector, i.e. zero similarity to any query.
                self._recipe_embeddings.append(None)
                recipe_vector_means.append(np.zeros(self.model.vector_size))

        self.recipe_vector_means = np.array(recipe_vector_means)

    def get_score(self, query):
        """Calculates the similarity score of every recipe for a given query.

        Args:
            query: Iterable of ingredient names; names without an embedding are ignored.

        Returns:
            A 1-D array with one score per recipe, or an empty array when none of the
            query ingredients is known.
        """
        # Get the vectors for the search query
        query_vec = np.array([self.ing_embeddings[word] for word in query if word in self.ing_embeddings])
        if query_vec.size == 0:
            # Previously a JSON string was returned here, which predict() then passed
            # to np.argsort instead of producing an empty result.
            return np.array([])

        # For each recipe: sum over query ingredients of the best similarity to any
        # ingredient of the recipe. Recipes without known ingredients keep a score of 0.
        max_sim = np.zeros(len(self._recipe_embeddings))
        for idx, recipe_embeddings in enumerate(self._recipe_embeddings):
            if recipe_embeddings is None:
                continue
            tokens_sim = query_vec @ recipe_embeddings.T
            max_sim[idx] = tokens_sim.max(axis=1).sum()

        mean_query_vec = query_vec.mean(axis=0)
        cosine_sim_matrix = mean_query_vec.reshape(1, -1) @ self.recipe_vector_means.T
        score = self._param("w_cosine") * cosine_sim_matrix[0] + self._param("w_maxsim") * max_sim
        return score

    def predict(self, query: list[str]) -> str:
        """Predicts the top N similar recipes based on a search query.

        Returns:
            A JSON array of recipe row positions, best match first (``"[]"`` when no
            query ingredient is known). N is the ``n_recs`` setting.
        """
        score = self.get_score(query)
        if score.size == 0:
            return json.dumps([])

        # Get the top N most similar recipe indices
        top_n_indices = np.argsort(score)[::-1][:self._param("n_recs")]
        return json.dumps(top_n_indices.tolist())

Overwriting ../models/fast_text.py


In [35]:
# fasttext_model = FastText(data.ingredients, **params["model_pipeline"]["fast_text"])
# fasttext_model.save(params["model_pipeline"]["model_path"])
fasttext_model = FastText.load(params["model_pipeline"]["model_path"])

In [38]:
import cloudpickle
cloudpickle.__version__

'3.1.1'

In [None]:
import sys
sys.path.insert(0, '../')  # make the `models` package written by the %%writefile cell importable
from models.fast_text import RecipeFastText

mlflow.set_tracking_uri("http://127.0.0.1:8080")
mlflow.set_experiment("model_training")

with mlflow.start_run():
    mlflow.log_params(params["model_pipeline"])

    # Log model
    # NOTE(review): "model_ngram_path" is not read by RecipeFastText.load_context; presumably
    # it ships the n-gram vectors file that FastText.load expects next to the model. Confirm.
    model_info = mlflow.pyfunc.log_model(
        name="embedding_model",
        python_model=RecipeFastText(params["model_pipeline"]["model_scoring"]), # "./models/recipe_embedding_model.py"
        artifacts={"model_path": params["model_pipeline"]["model_path"],
                   "model_ngram_path": params["model_pipeline"]["model_ngram_path"],
                   "recipe_path": params["model_pipeline"]["recipe_path"]},
        pip_requirements=["gensim==4.3.3",
                          f"cloudpickle=={cloudpickle.__version__}"]
    )

    # Register model (use the public `model_uri` property, not the private `_model_uri`)
    mlflow.register_model(
        model_uri=model_info.model_uri,
        name="fasttext_model"
    )

Downloading artifacts: 100%|██████████| 1/1 [00:00<00:00, 3109.19it/s] 
Downloading artifacts: 100%|██████████| 1/1 [00:00<00:00,  4.10it/s]
Downloading artifacts: 100%|██████████| 1/1 [00:00<00:00, 340.72it/s]
Registered model 'fasttext_model' already exists. Creating a new version of this model...
2025/08/17 22:02:24 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: fasttext_model, version 5


🏃 View run hilarious-stoat-775 at: http://127.0.0.1:8080/#/experiments/186008785936151337/runs/230987aef6494790b48f0248d2825495
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/186008785936151337


Created version '5' of model 'fasttext_model'.


In [48]:
model_info.model_uri

'models:/m-4d653801ec8f4028808b158fb3002793'

In [None]:
model = mlflow.pyfunc.load_model(model_info.model_uri)
# set proper loading location

Downloading artifacts:  78%|███████▊  | 7/9 [00:30<00:08,  4.42s/it]   


MlflowException: The following failures occurred while downloading one or more artifacts from http://127.0.0.1:8080/api/2.0/mlflow-artifacts/artifacts/186008785936151337/models/m-4d653801ec8f4028808b158fb3002793/artifacts:
##### File python_model.pkl #####
("Connection broken: InvalidChunkLength(got length b'HTTP/1.1 500 Internal Server Error\\r\\n', 0 bytes read)", InvalidChunkLength(got length b'HTTP/1.1 500 Internal Server Error\r\n', 0 bytes read))
##### File artifacts/fasttext.model.wv.vectors_ngrams.npy #####
("Connection broken: InvalidChunkLength(got length b'HTTP/1.1 500 Internal Server Error\\r\\n', 0 bytes read)", InvalidChunkLength(got length b'HTTP/1.1 500 Internal Server Error\r\n', 0 bytes read))

## Comparison Check

In [397]:
fasttext_model.wv.most_similar("chicken", topn=10)

[('chicken_wing', 0.8833999037742615),
 ('chicken_tender', 0.8739950060844421),
 ('chicken_thigh', 0.856991171836853),
 ('chicken_breast', 0.8426862359046936),
 ('chicken_mince', 0.8309381604194641),
 ('chicken_tenderloin', 0.8227993845939636),
 ('chicken_drumstick', 0.8012790083885193),
 ('rotisserie_chicken', 0.7952107191085815),
 ('chicken_broth_stock', 0.7417898774147034),
 ('chicken_broth', 0.7386066913604736)]

In [378]:
# NOTE(review): `w2v_model` is not defined in any visible cell of this notebook, so this
# cell raises NameError on a fresh kernel; load or train the Word2Vec model first.
w2v_model.wv.most_similar("milk", topn=10)

[('buttermilk', 0.4348703920841217),
 ('sheet_puff_pastry', 0.33785754442214966),
 ('almond_milk', 0.32518696784973145),
 ('lowfat_buttermilk', 0.316470205783844),
 ('unsalted_butter', 0.309580534696579),
 ('ricotta_cheese', 0.30522701144218445),
 ('unsweetened_almond_milk', 0.3016698658466339),
 ('dark_chocolate_chip', 0.29976528882980347),
 ('apricot_jam', 0.2976725995540619),
 ('granny_smith_apple', 0.29501983523368835)]

In [17]:
# Window size candidate: the median number of ingredients per recipe, rounded up.
ingredient_length = np.array(list(map(len, data.ingredients)))
median_length = np.median(ingredient_length)
np.ceil(median_length)

11.0

In [9]:
# NOTE(review): this rebinds `data` from a hard-coded pickle path instead of
# params["model_pipeline"]["recipe_path"], so cells re-run afterwards see this frame.
# `pd.read_pickle` unpickles arbitrary objects; only load trusted files.
data = pd.read_pickle("processed_cookbook.pkl")
# Show every column of the row at position 2575.
data.iloc[2575, :]

id                                                                  2575
recipe_title                      Creamy Fall Soup In Acorn Squash Bowls
recipe_url             https://minimalistbaker.com/creamy-fall-soup-i...
ingredients            [acorn_squash, melted_coconut_oil, maple_syrup...
num_steps                                                            8.0
total_time                                                          90.0
prep_time                                                           15.0
cook_time                                                           75.0
custom_time                                                          0.0
calories                                                           285.0
carbohydrates                                                       51.7
protein                                                              4.3
fat                                                                  9.1
saturated_fat                                      