# Model Training - FastText

In [188]:
# Import required libraries
import numpy as np
import pandas as pd
import os 
from gensim.models import FastText
from sklearn.metrics.pairwise import cosine_similarity
import mlflow

import json
import requests
import boto3

import streamlit as st

import warnings
warnings.filterwarnings("ignore")

In [181]:
import yaml

def load_params():
    with open("../params.yaml") as f:
        params = yaml.safe_load(f)
    return params

params = load_params()

In [182]:
def connect_database():
    """Connect to the DynamoDB database."""
    dynamodb = boto3.resource("dynamodb",
                            aws_access_key_id=st.secrets.s3.AWS_ACCESS_KEY_ID,
                            aws_secret_access_key=st.secrets.s3.AWS_SECRET_ACCESS_KEY,
                            region_name=st.secrets.s3.AWS_DEFAULT_REGION)
    table = dynamodb.Table(st.secrets.s3.DB_NAME)
    return table

data = pd.read_pickle(params["model_pipeline"]["recipe_path"])
table = connect_database()

## Model Training

In [183]:
# fasttext_model = FastText(data.ingredients, **params["model_pipeline"]["fast_text"])
# fasttext_model.save(params["model_pipeline"]["model_path"])
fasttext_model = FastText.load(params["model_pipeline"]["model_path"])

In [184]:
# Quick sanity check
fasttext_model.wv.most_similar("peanut butter", topn=10)

[('peanut_butter', 0.8854959607124329),
 ('smooth_peanut_butter', 0.855765163898468),
 ('creamy_peanut_butter', 0.7880486249923706),
 ('salted_creamy_peanut_butter', 0.7332251667976379),
 ('creamy_salted_peanut_butter', 0.7154029607772827),
 ('nut_butter', 0.6878952383995056),
 ('salted_natural_peanut_butter', 0.6861165761947632),
 ('cocoa_butter', 0.6729941964149475),
 ('butter', 0.6723080277442932),
 ('natural_salted_peanut_butter', 0.653947651386261)]

### IDF (Inverse Document Frequency)

In [5]:
idf = {}
vector_embedding = {}
for i, vocab in enumerate(fasttext_model.wv.index_to_key):
    # Apply IDF
    n_recipe_contains_vocab = data.ingredients.apply(lambda x: vocab in x).sum()
    idf[vocab] = np.log(len(data.ingredients) / n_recipe_contains_vocab)

    # Apply IDF weight to Word2Vec embeddings
    vector_embedding[vocab] = fasttext_model.wv.vectors[i]/np.linalg.norm(fasttext_model.wv.vectors[i]) * idf[vocab] # Normalize w2v embedding first
    vector_embedding[vocab] = vector_embedding[vocab] / np.linalg.norm(vector_embedding[vocab])  # Normalize combined embedding again

idf

{'salt': 0.3928653931060427,
 'garlic': 0.9728863994977894,
 'water': 1.2474342884510103,
 'olive_oil': 1.3923734071962728,
 'black_pepper': 1.400105404479599,
 'sugar': 1.7065040189406901,
 'egg': 1.7908029890020982,
 'soy': 1.8923467061332238,
 'onion': 2.013707563137491,
 'ginger': 2.071498688165899,
 'vanilla_extract': 2.1601224114682016,
 'lemon': 2.1754537222149906,
 'unsalted_butter': 2.2039456780092967,
 'carrot': 2.3909022373074,
 'allpurpose_flour': 2.4993647329632305,
 'parsley': 2.522941463527227,
 'baking': 2.5289235352047745,
 'maple_syrup': 2.5349416075303375,
 'scallion': 2.571831036216051,
 'sesame_oil': 2.571831036216051,
 'milk': 2.6079667425452273,
 'pepper': 2.6298494537947352,
 'cinnamon': 2.717665659801652,
 'white_pepper': 2.7470795450079453,
 'cilantro': 2.759595352939776,
 'vegetable_oil': 2.7646458687258444,
 'oil': 2.7876942641621354,
 'cornstarch': 2.830028627988696,
 'lime': 2.846377765990226,
 'neutral_oil': 2.8518874218011954,
 'shaoxing_wine': 2.8999872

In [6]:
# Check if it's normalized
np.linalg.norm(vector_embedding.get("salt"))

1.0

### Model Scoring System

In [23]:
model.predict(["salt"])

[array([-2.6550920e+00, -2.8948134e-01, -1.3038169e+00, -1.2672992e+00,
        -1.1699896e-01, -2.4807813e+00, -5.7276672e-01, -2.0239155e+00,
        -3.6649773e+00, -2.4176674e+00, -2.5293906e+00,  1.3403758e+00,
         7.7194236e-02,  5.2182901e-01, -9.9768186e-01, -2.1142140e-01,
         2.2082214e+00, -2.7719632e-01,  1.8602798e+00,  2.0711403e+00,
         8.0471212e-01,  1.6247444e+00, -3.2902260e+00,  9.0726924e-01,
         1.5361571e+00, -7.8738576e-01, -2.5842564e+00,  1.7833999e+00,
        -2.7365606e+00, -2.5798705e+00, -2.3482225e+00, -9.8127532e-01,
        -1.1024963e+00,  1.2330235e+00, -1.2888752e+00, -1.0355765e+00,
        -1.5144287e+00, -9.3993258e-01,  4.2902217e+00,  8.4660459e-01,
         2.4824442e-01,  5.2841846e-02, -7.7579081e-01,  1.3014902e+00,
        -2.3578355e+00,  8.9016682e-01, -1.4454038e+00, -8.0663610e-01,
         1.3427621e+00, -1.5135692e+00,  2.1927497e+00,  1.4676518e+00,
        -1.7562844e+00,  1.6300368e+00, -2.5004225e+00,  2.20193

In [140]:
# Prepare ingredient vectors
ingredient_vectors = []
for ingredients in data.ingredients:
    # Get the vectors for the recipe ingredients and normalize them
    embedding_vecs = [fasttext_model.wv.get_vector(ing) for ing in ingredients]
    unit_vecs = np.array([vec/np.linalg.norm(vec) for vec in embedding_vecs])
    ingredient_vectors.append(unit_vecs)

ingredient_vector_means = np.array([np.mean(ing_vector, axis=0) for ing_vector in ingredient_vectors])

In [141]:
# Prepare title vectors
title_vectors = []
for title in data.recipe_title:
    # Get the vector for the recipe title and normalize it
    title_split = title.split()
    embedding_vecs = [fasttext_model.wv.get_vector(title) for title in title_split]
    unit_vecs = np.array([vec/np.linalg.norm(vec) for vec in embedding_vecs])
    title_vectors.append(unit_vecs)

title_vector_means = np.array([np.mean(ing_vector, axis=0) for ing_vector in title_vectors])

In [132]:
def normalization(arr):
    return (arr - arr.min()) / (arr.max() - arr.min())

In [154]:
# Prediction scoring system
user_id = 110833230122006731136
n=10
text = "chicken thighs, rice, peanut butter, almond milk, kecap manis, soy sauce"
query = [ingredient.strip() for ingredient in text.split(",") if ingredient]
print("query:", query)

# Get the vectors for the search query and normalize them
query_vecs = [fasttext_model.wv.get_vector(input) for input in query]
unit_query_vec = np.array([vec/np.linalg.norm(vec) for vec in query_vecs])

# Calculate MaxSim for each recipe in the data
# Might be slow for large amount of data
ingredient_max_sim = np.array([])
for ing_vector in ingredient_vectors:
    tokens_sim = unit_query_vec @ ing_vector.T

    a_best = tokens_sim.max(axis=1)
    b_best = tokens_sim.max(axis=0)
    
    score = 0.5 * (a_best.mean() + b_best.mean())
    ingredient_max_sim = np.append(ingredient_max_sim, score)

# Calculate cosine similarity between the mean query vector and the mean ingredients vector
mean_query_vec = unit_query_vec.mean(axis=0)
ingredient_sim = mean_query_vec.reshape(1, -1) @ ingredient_vector_means.T
title_sim = mean_query_vec.reshape(1, -1) @ title_vector_means.T

score = (params["model_pipeline"]["model_scoring"]["w_cosine"] * normalization(ingredient_sim[0]) + 
            params["model_pipeline"]["model_scoring"]["w_maxsim"] * normalization(ingredient_max_sim) + 
            params["model_pipeline"]["model_scoring"]["w_title"] * normalization(title_sim[0]))

rec_idx = np.argsort(score)[-params["model_pipeline"]["model_scoring"]["n_recs"]:][::-1]
# rec_idx = [4557, 5297, 3977, 6109, 366, 3572, 6262, 314]
data.iloc[rec_idx]

query: ['chicken thighs', 'rice', 'peanut butter', 'almond milk', 'kecap manis', 'soy sauce']


Unnamed: 0,id,recipe_title,recipe_url,ingredients,num_steps,total_time,prep_time,cook_time,custom_time,calories,...,sodium,potassium,fiber,sugar,vitamin_a,vitamin_c,calcium,iron,serving_size,image_url
5297,5297,Rice Flour,https://thewoksoflife.com/how-to-make-rice-flour/,[rice],2.0,5.0,5.0,0.0,0.0,293.0,...,0.0,61.0,2.0,0.1,0.0,0.0,8.0,0.3,0.5,https://thewoksoflife.com/wp-content/uploads/2...
5558,5558,Chicken Thighs,https://thewoksoflife.com/how-to-debone-chicke...,[chicken_thigh],4.0,10.0,10.0,0.0,0.0,211.0,...,74.0,198.0,0.0,0.0,75.0,0.0,8.0,1.0,0.0,https://thewoksoflife.com/wp-content/uploads/2...
4557,4557,Rice S Outrageously Delicious Rice,https://www.recipetineats.com/rice-recipes-ric...,"[one_pot_greek_chicken, lemon_rice, mexican_be...",7.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,https://www.recipetineats.com/tachyon/2020/03/...
5484,5484,Down A Chicken,https://thewoksoflife.com/how-to-break-down-a-...,[chicken],9.0,10.0,10.0,0.0,0.0,409.0,...,133.0,360.0,0.0,0.0,267.0,3.0,21.0,2.0,0.0,https://thewoksoflife.com/wp-content/uploads/2...
157,157,Quick And Tasty Fried Rice S For Weeknight Meals,https://www.justonecookbook.com/easy-fried-ric...,"[fried_rice, garlic_fried_rice, salmon_fried_r...",4.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,https://www.justonecookbook.com/wp-content/upl...
4725,4725,Velveting Chicken The Chinese Restaurant Secret,https://www.recipetineats.com/velveting-chicke...,"[chicken_breast_slice_size_piece, baking_soda]",6.0,25.0,5.0,0.0,20.0,95.0,...,411.0,308.0,0.0,0.0,25.0,1.0,4.0,0.3,0.0,https://www.recipetineats.com/tachyon/2019/03/...
4334,4334,Pad See Ew,https://www.recipetineats.com/thai-stir-fried-...,"[wide_rice_stick_noodle, dark_soy, oyster, soy...",11.0,18.0,8.0,10.0,0.0,510.0,...,406.0,169.0,1.6,2.9,9600.0,75.1,40.0,1.4,260.0,https://www.recipetineats.com/tachyon/2016/03/...
4974,4974,Spicy Thai Chicken Stir Fry,https://www.recipetineats.com/spicy-thai-chick...,"[soy, dark_soy, fish, oyster, sugar, oil, thai...",6.0,15.0,7.0,8.0,0.0,427.0,...,791.0,0.0,1.0,3.0,0.0,0.0,0.0,0.0,230.0,https://www.recipetineats.com/tachyon/2016/11/...
4546,4546,Vietnamese Rice Paper Rolls,https://www.recipetineats.com/vietnamese-rice-...,"[sheet_″_round_rice_paper, shrimp, vermicelli_...",10.0,20.0,20.0,0.0,0.0,135.0,...,201.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,93.0,https://www.recipetineats.com/tachyon/2014/07/...
4635,4635,Thai Basil Chicken,https://www.recipetineats.com/thai-basil-chick...,"[chicken_thigh, green_onion, thai_basil_leaf, ...",6.0,15.0,10.0,5.0,0.0,360.0,...,588.0,231.0,1.0,2.0,148.0,2.0,14.0,1.0,160.0,https://www.recipetineats.com/tachyon/2017/03/...


In [251]:
normalization(ingredient_sim[0])

array([0.5517847 , 0.5666874 , 0.5185082 , ..., 0.44919264, 0.7029632 ,
       0.36308682], dtype=float32)

In [153]:
ingredient_sim

array([[ 0.07017788,  0.0765854 ,  0.05587044, ...,  0.02606776,
         0.13517804, -0.01095399]], dtype=float32)

### DynamoDB Data Revtrieval

In [10]:
# user_id = 110833230122006731136
# table_test = table.get_item(Key={"user_id": int(user_id)})["Item"]

# table_test

In [11]:
# from datetime import datetime
# test_map = {"liked_idx": list(map(int, list(liked_idx))), 
#             "disliked_idx": list(map(int, list(disliked_idx))),
#             "date": datetime.now().strftime("%Y-%m-%d" + "%H:%M:%S")}
# test_map


## Wrap Model with MLFlow Python Model

Models from Code (Add set model) or Load as Class (Restart kernel after every overwrite)

In [260]:
class RecipeFastText(mlflow.pyfunc.PythonModel):
    def __init__(self, params):
        self.params = params

    def load_context(self, context):
        self.model = FastText.load(context.artifacts["model_path"])
        self.data = pd.read_pickle(context.artifacts["data_path"])

        # Prepare ingredient vectors
        self.ingredient_vectors = []
        for ingredients in self.data.ingredients:
            # Get the vectors for the recipe ingredients and normalize them
            embedding_vecs = [self.model.wv.get_vector(ing) for ing in ingredients]
            unit_vecs = np.array([vec/np.linalg.norm(vec) for vec in embedding_vecs])
            self.ingredient_vectors.append(unit_vecs)

        self.ingredient_vector_means = np.array([np.mean(ing_vector, axis=0) for ing_vector in self.ingredient_vectors])

        # Prepare title vectors
        self.title_vectors = []
        for title in self.data.recipe_title:
            # Get the vector for the recipe title and normalize it
            title_split = title.split()
            embedding_vecs = [self.model.wv.get_vector(title) for title in title_split]
            unit_vecs = np.array([vec/np.linalg.norm(vec) for vec in embedding_vecs])
            self.title_vectors.append(unit_vecs)

        self.title_vector_means = np.array([np.mean(ing_vector, axis=0) for ing_vector in self.title_vectors])

    def normalization(self, arr):
        return (arr - arr.min()) / (arr.max() - arr.min())

    def predict(self, model_input: list[str]) -> list[float]:
        """Predicts the model vector based on the input string."""
        
        # Get the vectors for the search query and normalize them
        query_vecs = [self.model.wv.get_vector(input) for input in model_input]
        unit_query_vec = np.array([vec/np.linalg.norm(vec) for vec in query_vecs])

        # Calculate MaxSim for each recipe in the data
        # Might be slow for large amount of data
        ingredient_max_sim = np.array([])
        for ing_vector in self.ingredient_vectors:
            tokens_sim = unit_query_vec @ ing_vector.T

            a_best = tokens_sim.max(axis=1)
            b_best = tokens_sim.max(axis=0)
            
            score = 0.5 * (a_best.mean() + b_best.mean())
            ingredient_max_sim = np.append(ingredient_max_sim, score)

        # Calculate cosine similarity between the mean query vector and the mean ingredients vector
        mean_query_vec = unit_query_vec.mean(axis=0)
        ingredient_sim = mean_query_vec.reshape(1, -1) @ self.ingredient_vector_means.T
        title_sim = mean_query_vec.reshape(1, -1) @ self.title_vector_means.T
        
        score = (self.params["model_scoring"]["w_cosine"] * self.normalization(ingredient_sim[0]) + 
                self.params["model_scoring"]["w_maxsim"] * self.normalization(ingredient_max_sim) + 
                self.params["model_scoring"]["w_title"] * self.normalization(title_sim[0]))

        rec_idx = np.argsort(score)[::-1]
        return rec_idx

### Log Model to MLFlow

In [265]:
mlflow.set_tracking_uri("http://127.0.0.1:8080")
mlflow.set_experiment("FastText-Model")

2025/08/24 21:02:54 INFO mlflow.tracking.fluent: Experiment with name 'FastText-Model' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/198067954999234058', creation_time=1756094574349, experiment_id='198067954999234058', last_update_time=1756094574349, lifecycle_stage='active', name='FastText-Model', tags={}>

In [None]:
# Load params
params = load_params()

with mlflow.start_run():
    # Load data
    data = pd.read_pickle(params["model_pipeline"]["recipe_path"])

    # Load fasttext model
    fasttext_model = RecipeFastText(params["model_pipeline"])

    # Log parameters
    mlflow.log_params(params["model_pipeline"])

    # Log model
    model_info = mlflow.pyfunc.log_model(
        name="fasttext_model",
        python_model=fasttext_model,
        artifacts={"model_path": params["model_pipeline"]["model_path"],
                   "model_ngram_path": params["model_pipeline"]["model_ngram_path"],
                   "data_path": params["model_pipeline"]["recipe_path"]},
        pip_requirements=["gensim==4.3.3"]
    )

Downloading artifacts: 100%|██████████| 1/1 [00:00<00:00, 365.61it/s]
Downloading artifacts: 100%|██████████| 1/1 [00:00<00:00,  3.35it/s]
Downloading artifacts: 100%|██████████| 1/1 [00:00<00:00, 404.35it/s] 


🏃 View run calm-sponge-68 at: http://127.0.0.1:8080/#/experiments/198067954999234058/runs/4d89f3fa03ff47c185848a7004ba0c35
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/198067954999234058


In [267]:
local_uri = os.path.join("../mlflow/mlartifacts", *model_info.artifact_path.split("/")[1:])

metadata = {
    "server_uri": model_info.model_uri,
    "local_uri": local_uri,
    "run_id": model_info.run_id,
    "time_logged": model_info.utc_time_created,
    "model_params": params["model_pipeline"]
}

with open("../experiment_metadata.json", "w") as f:
    f.write(json.dumps(metadata, indent=4))

In [264]:
with open("../experiment_metadata.json", "r") as f:
    metadata = json.load(f)

model = mlflow.pyfunc.load_model(metadata["local_uri"])

In [220]:
rec_idx = model.predict(["peanut butter", "jelly", "bread"])  # Example usage
data.iloc[rec_idx]

Unnamed: 0,id,recipe_title,recipe_url,ingredients,num_steps,total_time,prep_time,cook_time,custom_time,calories,...,sodium,potassium,fiber,sugar,vitamin_a,vitamin_c,calcium,iron,serving_size,image_url
4679,4679,Grilled Garlic Bread,https://www.recipetineats.com/grilled-garlic-b...,"[bread_loaf, stick_salted_butter, garlic, pars...",9.0,15.0,5.0,10.0,0.0,331.0,...,495.0,127.0,2.0,4.0,575.0,1.3,100.0,2.4,0.0,https://www.recipetineats.com/tachyon/2019/06/...
4211,4211,My Grilled Cheese Sandwich,https://www.recipetineats.com/grilled-cheese-s...,"[sourdough_bread, salted_butter, vintage_chedd...",6.0,12.0,5.0,7.0,0.0,807.0,...,1427.0,212.0,3.0,6.0,1292.0,0.0,629.0,5.0,0.0,https://www.recipetineats.com/tachyon/2023/07/...
5598,5598,Chinese Hot Pot At Home,https://thewoksoflife.com/chinese-hot-pot-at-h...,"[pot_soup_baseor_stock, baby_bok_choy, napa_ca...",3.0,40.0,40.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,https://thewoksoflife.com/wp-content/uploads/2...
6122,6122,Elvis French Toast,https://thewoksoflife.com/elvis-french-toast/,"[egg, milk, half, cinnamon, peanut_butter, cha...",7.0,35.0,15.0,20.0,0.0,605.0,...,546.0,614.0,5.0,13.0,546.0,5.0,129.0,3.0,0.0,https://thewoksoflife.com/wp-content/uploads/2...
5708,5708,Bacon In A Wok After School Sandwiches,https://thewoksoflife.com/bacon-in-a-wok-sandw...,"[bacon, crusty_bread]",6.0,10.0,0.0,10.0,0.0,645.0,...,1094.0,295.0,3.0,3.0,0.0,0.0,56.0,5.0,0.0,https://thewoksoflife.com/wp-content/uploads/2...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1836,1836,Vegan Buttermilk,https://minimalistbaker.com/how-to-make-vegan-...,"[unsweetened_plain_dairyfree_milk, lemon_cider...",2.0,5.0,5.0,0.0,0.0,39.0,...,173.0,177.0,0.5,2.3,0.0,6.0,443.0,0.7,1.0,https://minimalistbaker.com/wp-content/uploads...
2959,2959,Chai Ginger Ice Cream Sandwiches,https://minimalistbaker.com/chai-ginger-ice-cr...,"[vegan_chai_ice_cream, vegan_glutenfree_ginger...",5.0,70.0,60.0,10.0,0.0,417.0,...,235.0,0.0,0.0,21.0,0.0,0.0,0.0,0.0,1.0,https://minimalistbaker.com/wp-content/uploads...
1493,1493,Almond Flour,https://www.loveandlemons.com/almond-flour/,[slivered_almond],3.0,5.0,5.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,https://cdn.loveandlemons.com/wp-content/uploa...
2267,2267,Almond Meal,https://minimalistbaker.com/how-to-make-almond...,[almond],3.0,5.0,5.0,0.0,0.0,207.0,...,0.0,262.0,4.5,1.6,0.0,0.0,0.0,0.0,1.0,https://minimalistbaker.com/wp-content/uploads...


## Model Register

In [22]:
with mlflow.start_run():
    mlflow.register_model(
        model_uri=model_info._model_uri,
        name="fasttext_model"
    )

Successfully registered model 'fasttext_model'.
2025/08/23 17:04:34 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: fasttext_model, version 1


🏃 View run rumbling-snail-782 at: http://127.0.0.1:8080/#/experiments/329658568731902423/runs/769ba51e2ca54d7080dd69d61e90b4bd
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/329658568731902423


Created version '1' of model 'fasttext_model'.


In [23]:
model = mlflow.pyfunc.load_model("models:/fasttext_model/latest")

Downloading artifacts: 100%|██████████| 12/12 [01:15<00:00,  6.25s/it] 
