# Model Training - FastText

In [None]:
# Import required libraries
import json
import os
from collections import Counter
from time import time

import mlflow
import numpy as np
import pandas as pd
import yaml
from gensim.models import FastText
from sklearn.metrics.pairwise import cosine_similarity

import warnings
warnings.filterwarnings("ignore")

In [276]:
import yaml

def load_params(path="../params.yaml"):
    """Load the pipeline configuration from a YAML file.

    Args:
        path: Location of the YAML parameter file. Defaults to
            ``"../params.yaml"``, the path this notebook has always read.

    Returns:
        The parsed YAML document, exactly as returned by ``yaml.safe_load``.
    """
    # Explicit encoding so parsing does not depend on the platform's locale default.
    with open(path, encoding="utf-8") as f:
        return yaml.safe_load(f)

# Parse params.yaml once; later cells index into this mapping
# (e.g. params["model_pipeline"], params["model_service"]).
params = load_params()

In [277]:
# Load the recipe table. The configured path is resolved one directory above the notebook ("../").
# NOTE(review): unpickling can execute arbitrary code — only load pickle files produced by this project.
data = pd.read_pickle("../" + params["model_pipeline"]["recipe_path"])

## Model Training

In [278]:
# Reuse the saved FastText model when it exists; otherwise train it on the recipe
# ingredient lists with the configured hyper-parameters and save it, so the notebook
# also runs end to end on a checkout that does not have the model file yet.
model_file = "../" + params["model_pipeline"]["model_path"]
if os.path.exists(model_file):
    fasttext_model = FastText.load(model_file)
else:
    fasttext_model = FastText(data.ingredients, **params["model_pipeline"]["fast_text"])
    fasttext_model.save(model_file)

In [279]:
# Quick sanity check: list the 10 vocabulary tokens whose embeddings are closest to "chicken",
# each paired with its similarity score.
fasttext_model.wv.most_similar("chicken", topn=10)

[('chicken_fajitas', 0.9779865145683289),
 ('chicken_fat', 0.9776473641395569),
 ('chicken_nacho', 0.9735757112503052),
 ('lbs/_chicken', 0.97292160987854),
 ('chicken_nanban', 0.9725943803787231),
 ('chicken_foot', 0.9708802700042725),
 ('chicken_mignons', 0.9700337052345276),
 ('chicken_leg', 0.9676940441131592),
 ('chicken_back', 0.9674406051635742),
 ('chicken_chashu', 0.9617377519607544)]

### IDF (Inverse Document Frequency)

In [306]:
from time import time

In [None]:
from collections import Counter
# Count how often each ingredient token occurs across all recipes
# (explode() flattens the per-recipe ingredient lists into one long Series).
ingredients_count = Counter(data.ingredients.explode())
# Time a single lookup in the prebuilt Counter; the looked-up value itself is discarded.
t0 = time()
ingredients_count.get("salt")
# Alternative pandas-based ways to get the same kind of count, kept for comparison:
# data.ingredients.explode().isin(["salt"]).sum()
# data.ingredients.apply(lambda x: "salt" in x).sum()
print(f"Time taken: {time() - t0:.4f} seconds")

Time taken: 0.0001 seconds


In [350]:
# Document frequency: the number of recipes that contain each ingredient.
# Ingredients are de-duplicated per recipe (set) so a recipe that lists the same
# ingredient twice still counts once; counting raw occurrences would overstate
# the frequency and could even push the IDF below zero.
ingredients_count = Counter(
    ingredient
    for recipe_ingredients in data.ingredients
    for ingredient in set(recipe_ingredients)
)
data_length = len(data)

# IDF for every token in the FastText vocabulary, in vocabulary order.
# The tiny epsilon only guards against division by zero for a vocabulary token
# that does not occur in `data`.
idf = {
    vocab: np.log(data_length / (ingredients_count.get(vocab, 0) + 1e-9))
    for vocab in fasttext_model.wv.index_to_key
}

idf

{'salt': 0.43756323278446185,
 'garlic': 1.085610536358484,
 'water': 1.3433086441537483,
 'sugar': 1.5919917083637571,
 'soy_sauce': 1.6608262141840013,
 'black_pepper': 1.6979272435469352,
 'egg': 1.7484856093407433,
 'ginger': 1.874513430255987,
 'olive_oil': 1.9424564328800327,
 'vanilla_extract': 2.1806656847698176,
 'onion': 2.2918129960106466,
 'pepper': 2.370740254600195,
 'unsalted_butter': 2.3804490687271405,
 'lemon': 2.3984971385988136,
 'shaoxing_wine': 2.405141681317471,
 'baking_powder': 2.405141681317471,
 'milk': 2.4151920171709556,
 'cornstarch': 2.4424974678611764,
 'sesame_oil': 2.4599499178123723,
 'carrot': 2.5405796522123336,
 'maple_syrup': 2.5874632381110922,
 'scallion': 2.613807213450641,
 'vegetable_oil': 2.642975948041237,
 'butter': 2.66434428244689,
 'cinnamon': 2.6665064454513807,
 'white_pepper': 2.7085015664878913,
 'green_onion': 2.717571923457835,
 'brown_sugar': 2.7244290931839563,
 'all-purpose_flour': 2.7737928163021763,
 'baking_soda': 2.81310991

### Ingredient Vectors

In [281]:
# Build, for every recipe, the list of its ingredient embeddings:
# each one is the unit-normalised FastText vector scaled by the ingredient's IDF
# (weight 1 when the ingredient has no IDF entry).
ingredient_vectors = [
    [fasttext_model.wv.get_vector(name, norm=True) * idf.get(name, 1) for name in recipe_ingredients]
    for recipe_ingredients in data.ingredients
]

# Collapse each recipe to the average of its weighted ingredient embeddings.
ingredient_vector_means = np.array([np.mean(recipe_vecs, axis=0) for recipe_vecs in ingredient_vectors])

### Title Vectors

In [282]:
# Embed every whitespace-separated word of each recipe title
# (unit-normalised FastText vectors, no IDF weighting).
title_vectors = [
    [fasttext_model.wv.get_vector(word, norm=True) for word in recipe_title.split()]
    for recipe_title in data.recipe_title
]

# Represent each title by the average of its word embeddings.
title_vector_means = np.array([np.mean(word_vecs, axis=0) for word_vecs in title_vectors])

### Query Processing

In [283]:
# Query processing with IDF weighting
text = "Rice, duck"  # Example input

# Split on commas and drop entries that are empty once surrounding whitespace is removed
# (filtering before stripping would let "  " through as an empty term).
query = [term.strip() for term in text.split(",") if term.strip()]
print("query:", query)

# Get the normalized vectors for the search query
query_vecs = np.array([fasttext_model.wv.get_vector(term, norm=True) for term in query])

# Borrow an IDF for each query term from its most similar vocabulary entries:
# the weighted sum of their IDFs, one configured weight per neighbour.
idf_top_weights = params["model_pipeline"]["model_scoring"]["idf_top_weights"]

query_idfs = []
# The loop variable is deliberately not called `query`, so the list of query
# strings built above is still intact after this cell has run.
for query_vec in query_vecs:
    similar_ings = [
        ing for ing, _ in fasttext_model.wv.similar_by_vector(query_vec, topn=len(idf_top_weights))
    ]
    ing_idf = np.sum([weight * idf.get(similar_ing, 1) for similar_ing, weight in zip(similar_ings, idf_top_weights)])
    query_idfs.append(ing_idf)

# Apply IDF
query_vecs = np.array([(query_vec * query_idf) for query_vec, query_idf in zip(query_vecs, query_idfs)])

query: ['Rice', 'duck']


### Model Scoring System

In [None]:
t0 = time()

# Prediction scoring system
# Symmetric MaxSim per recipe: average the best match of every query term among the
# recipe's ingredients and the best match of every ingredient among the query terms.
# Might be slow for large amount of data (one Python-level iteration per recipe).
# Scores are collected in a list: np.append would copy the whole array on every iteration.
maxsim_scores = []
for ing_vector in ingredient_vectors:
    ing_vector = np.array(ing_vector)
    # Rows: query terms, columns: recipe ingredients.
    tokens_sim = np.array([fasttext_model.wv.cosine_similarities(vec, ing_vector) for vec in query_vecs])

    a_best = tokens_sim.max(axis=1)  # best ingredient for each query term
    b_best = tokens_sim.max(axis=0)  # best query term for each ingredient

    maxsim_scores.append(0.5 * (a_best.mean() + b_best.mean()))

# float64 matches the dtype the previous np.append-based accumulation produced.
ingredient_max_sim = np.array(maxsim_scores, dtype=np.float64)

# Calculate cosine similarity between the mean query vector and the mean ingredient / title vectors
mean_query_vec = query_vecs.mean(axis=0)
ingredient_sim = fasttext_model.wv.cosine_similarities(mean_query_vec, ingredient_vector_means)
title_sim = fasttext_model.wv.cosine_similarities(mean_query_vec, title_vector_means)

# Final score: configured weighted sum of the three similarity signals.
scoring_weights = params["model_pipeline"]["model_scoring"]
score = (scoring_weights["w_cosine"] * ingredient_sim +
         scoring_weights["w_maxsim"] * ingredient_max_sim +
         scoring_weights["w_title"] * title_sim)

# Positions of the n_recs highest scores, best first.
rec_idx = np.argsort(score)[-params["model_service"]["n_recs"]:][::-1]
print("Time taken for scoring:", time() - t0)
data.iloc[rec_idx]

Time taken for scoring: 0.17101621627807617


Unnamed: 0,recipe_title,recipe_url,ingredients,img_url,num_steps,prep_time,cook_time,total_time,serving_size,calories,...,sugar,calcium,iron,custom_time,vitamin_c,vitamin_a,trans_fat,polyunsaturated_fat,monounsaturated_fat,num_ingredients
2046,Steamed Pork With Rice Powder,https://thewoksoflife.com/steamed-pork-rice-po...,"[pork_belly, ginger, shaoxing_wine, fermented_...",https://thewoksoflife.com/wp-content/uploads/2...,5.0,90.0,90.0,180.0,0.0,625.0,...,1.0,127.0,2.2,0.0,4.1,75.0,0.0,0.0,,15
1890,Pork Belly Mushroom Rice Bowl,https://thewoksoflife.com/mushroom-rice-bowl-p...,"[shiitake_mushroom, pork_belly, oil, ginger, s...",https://thewoksoflife.com/wp-content/uploads/2...,5.0,10.0,30.0,40.0,0.0,612.0,...,5.0,21.0,1.2,0.0,0.6,30.0,0.0,0.0,,11
1731,Lazy Sticky Rice Dumplings,https://thewoksoflife.com/lazy-sticky-rice-dum...,"[sticky_rice, bamboo_leaf, pork_belly, soy_sau...",https://thewoksoflife.com/wp-content/uploads/2...,9.0,360.0,90.0,450.0,0.0,675.0,...,1.0,30.0,3.0,0.0,1.0,68.0,0.0,0.0,,12
5045,Brown Rice,https://www.recipetineats.com/how-to-cook-brow...,"[brown_rice_–_grain_grain/basmati, water, shor...",https://www.recipetineats.com/tachyon/2020/09/...,10.0,1.0,30.0,41.0,0.0,229.0,...,0.0,26.0,1.0,10.0,0.0,0.0,0.0,0.0,,5
2377,Hong Kong Style Clay Pot Rice Bowl,https://thewoksoflife.com/hong-kong-style-clay...,"[grain_rice, water, -inch_piece_cured_pork_bel...",https://thewoksoflife.com/wp-content/uploads/2...,4.0,60.0,15.0,75.0,0.0,667.0,...,1.0,43.0,1.0,0.0,1.0,65.0,1.0,3.0,11.0,11
5066,Jasmine Rice,https://www.recipetineats.com/how-to-cook-jasm...,"[jasmine_rice, water]",https://www.recipetineats.com/tachyon/2020/06/...,7.0,1.0,12.0,23.0,0.0,169.0,...,1.0,13.0,1.0,10.0,0.0,0.0,0.0,0.0,,2
2012,Chinese Spiced Braised Beef Shank,https://thewoksoflife.com/braised-beef-shank/,"[cinnamon_stick, black_cardamom_pod, sichuan_p...",https://thewoksoflife.com/wp-content/uploads/2...,6.0,10.0,120.0,130.0,0.0,200.0,...,3.0,74.0,3.8,0.0,2.5,80.0,0.0,0.0,,20
2344,Sticky Rice Hashbrowns,https://thewoksoflife.com/rice-hashbrowns/,"[rice, short_grain_white_rice, water, pork, so...",https://thewoksoflife.com/wp-content/uploads/2...,7.0,120.0,45.0,165.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,13
1925,One Pot Rice Cooker Rice With Dace Fish,https://thewoksoflife.com/dace-fish-black-bean...,"[rice, dace_fish_salted_black_bean, carrot, pe...",https://thewoksoflife.com/wp-content/uploads/2...,4.0,5.0,25.0,30.0,0.0,551.0,...,2.0,36.0,1.1,0.0,8.7,2840.0,0.0,0.0,,9
2359,Dim Sum Sticky Rice Lotus Leaf Wraps,https://thewoksoflife.com/dim-sum-sticky-rice-...,"[soy_sauce, white_pepper, oyster_sauce, five-s...",https://thewoksoflife.com/wp-content/uploads/2...,5.0,135.0,135.0,270.0,0.0,384.0,...,2.0,37.0,2.5,0.0,6.8,310.0,0.0,0.0,,15


## Wrap Model with MLFlow Python Model

There are two ways to log the wrapper: *Models from Code* (requires adding a `set_model` call) or logging the class instance directly (restart the kernel after every overwrite).

In [None]:
class RecipeFastText(mlflow.pyfunc.PythonModel):
    """MLflow pyfunc wrapper that scores every recipe against an ingredient query.

    ``load_context`` loads the FastText model and the recipe table from the logged
    artifacts and precomputes the IDF table plus the ingredient and title embeddings.
    ``predict`` combines three similarity signals (mean-ingredient cosine, symmetric
    MaxSim over ingredients, mean-title cosine) using the configured weights.
    """

    def __init__(self, params):
        """Store the pipeline parameters.

        Args:
            params: Mapping with a ``"model_scoring"`` entry that provides
                ``idf_top_weights``, ``w_cosine``, ``w_maxsim`` and ``w_title``.
        """
        self.params = params

    def load_context(self, context):
        """Load the artifacts and precompute everything ``predict`` needs.

        Args:
            context: MLflow context whose ``artifacts`` mapping holds ``"model_path"``
                (saved FastText model) and ``"data_path"`` (pickled recipe table).
        """
        self.model = FastText.load(context.artifacts["model_path"])
        # NOTE(review): unpickling can execute arbitrary code — only load trusted artifacts.
        self.data = pd.read_pickle(context.artifacts["data_path"])

        # Prepare IDF from document frequencies: ingredients are de-duplicated per
        # recipe so a recipe that lists an ingredient twice still counts once.
        ingredients_count = Counter(
            ingredient
            for recipe_ingredients in self.data.ingredients
            for ingredient in set(recipe_ingredients)
        )
        data_length = len(self.data)
        # The epsilon only guards against division by zero for vocabulary tokens
        # that do not occur in the recipe table.
        self.idf = {
            vocab: np.log(data_length / (ingredients_count.get(vocab, 0) + 1e-9))
            for vocab in self.model.wv.index_to_key
        }

        # Prepare ingredient vectors: unit-normalised embeddings scaled by IDF
        # (weight 1 when an ingredient has no IDF entry), one list per recipe.
        self.ingredient_vectors = [
            [self.model.wv.get_vector(ing, norm=True) * self.idf.get(ing, 1) for ing in recipe_ingredients]
            for recipe_ingredients in self.data.ingredients
        ]
        self.ingredient_vector_means = np.array(
            [np.mean(recipe_vecs, axis=0) for recipe_vecs in self.ingredient_vectors]
        )

        # Prepare title vectors: unit-normalised embeddings of each whitespace-separated
        # title word (no IDF weighting), one list per recipe.
        self.title_vectors = [
            [self.model.wv.get_vector(word, norm=True) for word in recipe_title.split()]
            for recipe_title in self.data.recipe_title
        ]
        self.title_vector_means = np.array(
            [np.mean(word_vecs, axis=0) for word_vecs in self.title_vectors]
        )

    def predict(self, model_input: list[str]) -> list[float]:
        """Predicts the recipe similarity score based on the input strings.

        Args:
            model_input: Query terms (e.g. ingredient names), one string per term.

        Returns:
            One score per row of the recipe table, in row order.
            NOTE(review): the value is a NumPy array although the annotation says
            ``list[float]``; the annotation is left unchanged — confirm how MLflow uses it.

        Raises:
            ValueError: If ``model_input`` is empty.
        """
        # Fail early with a clear message; an empty query previously surfaced as a
        # NumPy axis error deep inside the MaxSim loop.
        if len(model_input) == 0:
            raise ValueError("model_input must contain at least one query term")

        # Get the vectors for the search query and normalize them
        query_vecs = np.array([self.model.wv.get_vector(term, norm=True) for term in model_input])

        # Borrow an IDF for each query term from its most similar vocabulary entries:
        # the weighted sum of their IDFs, one configured weight per neighbour.
        idf_top_weights = self.params["model_scoring"]["idf_top_weights"]

        query_idfs = []
        for query_vec in query_vecs:
            similar_ings = [
                ing for ing, _ in self.model.wv.similar_by_vector(query_vec, topn=len(idf_top_weights))
            ]
            ing_idf = np.sum([weight * self.idf.get(similar_ing, 1) for similar_ing, weight in zip(similar_ings, idf_top_weights)])
            query_idfs.append(ing_idf)

        # Apply IDF
        query_vecs = np.array([(query_vec * query_idf) for query_vec, query_idf in zip(query_vecs, query_idfs)])

        # Calculate symmetric MaxSim for each recipe in the data.
        # Might be slow for large amount of data (one Python-level iteration per recipe).
        # Scores are collected in a list: np.append would copy the whole array each time.
        maxsim_scores = []
        for ing_vector in self.ingredient_vectors:
            ing_vector = np.array(ing_vector)
            # Rows: query terms, columns: recipe ingredients.
            tokens_sim = np.array([self.model.wv.cosine_similarities(vec, ing_vector) for vec in query_vecs])

            a_best = tokens_sim.max(axis=1)  # best ingredient for each query term
            b_best = tokens_sim.max(axis=0)  # best query term for each ingredient

            maxsim_scores.append(0.5 * (a_best.mean() + b_best.mean()))

        # float64 matches the dtype the previous np.append-based accumulation produced.
        ingredient_max_sim = np.array(maxsim_scores, dtype=np.float64)

        # Calculate cosine similarity between the mean query vector and the mean ingredient / title vectors
        mean_query_vec = query_vecs.mean(axis=0)
        ingredient_sim = self.model.wv.cosine_similarities(mean_query_vec, self.ingredient_vector_means)
        title_sim = self.model.wv.cosine_similarities(mean_query_vec, self.title_vector_means)

        # Final score: configured weighted sum of the three similarity signals.
        scoring_weights = self.params["model_scoring"]
        score = (scoring_weights["w_cosine"] * ingredient_sim +
                 scoring_weights["w_maxsim"] * ingredient_max_sim +
                 scoring_weights["w_title"] * title_sim)

        return score

### Log Model to MLFlow

In [25]:
# Point the MLflow client at the tracking server on localhost:8080, then select the
# experiment that the run below is logged under. The URI is set first so the
# experiment call is made against that server.
mlflow.set_tracking_uri("http://127.0.0.1:8080")
mlflow.set_experiment("FastText-Model")

<Experiment: artifact_location='mlflow-artifacts:/395968226849530716', creation_time=1758176343047, experiment_id='395968226849530716', last_update_time=1758176343047, lifecycle_stage='active', name='FastText-Model', tags={}>

In [None]:
# Load params
params = load_params()

with mlflow.start_run():
    # Load data
    data = pd.read_pickle("../" + params["model_pipeline"]["recipe_path"])

    # Wrap the scoring logic in the pyfunc model. A separate name is used so that
    # `fasttext_model` keeps referring to the gensim model that earlier cells rely on.
    recipe_model = RecipeFastText(params["model_pipeline"])

    # Log params
    mlflow.log_params(params["model_pipeline"]["fast_text"])
    mlflow.log_params(params["model_pipeline"]["model_scoring"])

    # Log model together with the FastText model file and the recipe table it needs at load time
    model_info = mlflow.pyfunc.log_model(
        name="fasttext_model",
        python_model=recipe_model,
        artifacts={"model_path": "../" + params["model_pipeline"]["model_path"],
                   "data_path": "../" + params["model_pipeline"]["recipe_path"]},
        pip_requirements=["gensim==4.3.3"]
    )

    # Log local URI: the artifact path with its first "/"-separated segment dropped,
    # re-rooted under ../mlflow/mlartifacts.
    # NOTE(review): presumably this mirrors the tracking server's artifact directory — verify.
    local_uri = os.path.join("../mlflow/mlartifacts", *model_info.artifact_path.split("/")[1:])
    mlflow.log_params({"local_uri": local_uri})

    # Log metadata
    metadata = {
        "server_uri": model_info.model_uri,
        "local_uri": local_uri,
        "run_id": model_info.run_id,
        "time_logged": model_info.utc_time_created,
        "model_params": params["model_pipeline"]
    }

    # Save metadata locally, then attach the same file to the run
    with open("../training_metadata.json", "w") as f:
        json.dump(metadata, f, indent=4)

    mlflow.log_artifact("../training_metadata.json")


Downloading artifacts: 100%|██████████| 1/1 [00:00<00:00, 2646.25it/s] 
Downloading artifacts: 100%|██████████| 1/1 [00:00<00:00, 368.05it/s]


🏃 View run unequaled-auk-279 at: http://127.0.0.1:8080/#/experiments/395968226849530716/runs/b51bd046c0d5455d872f746bf44c0d06
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/395968226849530716


### Quick Sanity Check

In [None]:
# Read back the metadata written by the logging cell ...
with open("../training_metadata.json", "r") as metadata_file:
    metadata = json.loads(metadata_file.read())

# ... and load the logged pyfunc model from the local artifact location it records.
model = mlflow.pyfunc.load_model(model_uri=metadata["local_uri"])

Downloading artifacts: 100%|██████████| 8/8 [00:03<00:00,  2.41it/s]


In [33]:
# Score every recipe against an example query
sim_scores = model.predict(["bread"])

# Show the top recommendations, highest score first. The count comes from the
# configuration (the same setting the scoring cell uses) rather than a hard-coded 10.
n_recs = params["model_service"]["n_recs"]
rec_idx = np.argsort(sim_scores)[::-1][:n_recs]
data.iloc[rec_idx]

Unnamed: 0,recipe_title,recipe_url,ingredients,img_url,num_steps,prep_time,cook_time,total_time,serving_size,calories,...,sugar,calcium,iron,custom_time,vitamin_c,vitamin_a,trans_fat,polyunsaturated_fat,monounsaturated_fat,num_ingredients
3488,Breadcrumbs,https://minimalistbaker.com/how-to-make-breadc...,"[torn/cubed_bread, italian_herb_seasoning, gar...",https://minimalistbaker.com/wp-content/uploads...,7.0,5.0,20.0,25.0,1.0,114.0,...,2.4,62.0,1.5,0.0,0.0,0.0,0.0,0.7,0.3,4
5167,Grilled Garlic Bread,https://www.recipetineats.com/grilled-garlic-b...,"[bread_loaf, /_stick_salted_butter, garlic, pa...",https://www.recipetineats.com/tachyon/2019/06/...,9.0,5.0,10.0,15.0,0.0,331.0,...,4.0,100.0,2.4,0.0,1.3,575.0,0.0,0.0,,4
5250,Christmas Leftovers Bread Bowl Sandwich,https://www.recipetineats.com/bread-bowl-sandw...,"[round/oval_bread_loaf, ham_turkey, meat_slice...",https://www.recipetineats.com/tachyon/2018/12/...,10.0,15.0,30.0,45.0,0.0,574.0,...,2.0,226.0,4.4,0.0,5.4,2160.0,0.0,0.0,,8
5813,Garlic Bread,https://www.loveandlemons.com/garlic-bread/,"[unsalted_butter, parsley/chives, garlic, salt...",https://cdn.loveandlemons.com/wp-content/uploa...,4.0,10.0,10.0,20.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,8
5232,Quick Cheesy Garlic Bread,https://www.recipetineats.com/quick-cheesy-gar...,"[bread_loaf_approx_/″, unsalted_butter, salt, ...",https://www.recipetineats.com/tachyon/2019/01/...,7.0,5.0,25.0,30.0,0.0,267.0,...,0.0,117.0,1.5,0.0,0.2,400.0,0.0,0.0,,6
683,Melted Brie And Bacon On Grilled Flatbread,https://dailydishrecipes.com/melted-brie-bacon...,"[bread, brie_cheese, bacon, rom_tomato, season...",https://dailydishrecipes.com/wp-content/upload...,3.0,5.0,7.0,12.0,0.0,478.0,...,2.0,164.0,1.0,0.0,8.0,1030.0,0.1,4.0,14.0,6
5670,No Washing Up Ham Egg Cheese Bread Bowls,https://www.recipetineats.com/no-washing-up-ha...,"[bread_roll, slice_ham, egg, mozzarella_cheese...",https://www.recipetineats.com/tachyon/2014/09/...,8.0,5.0,15.0,20.0,194.0,415.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,5
6368,Thanksgiving Sandwich,https://www.spoonforkbacon.com/thanksgiving-tu...,"[bread, mayonnaise, cranberry_sauce, roasted_t...",https://www.spoonforkbacon.com/wp-content/uplo...,4.0,10.0,0.0,10.0,0.0,1030.0,...,63.0,179.0,6.0,0.0,6.0,2607.0,1.0,18.0,11.0,7
1145,Ciabatta Bread Bacon Pizza,https://dailydishrecipes.com/ciabatta-bread-ba...,"[ciabatta_bread, mozzarella, basil, garlic, ol...",https://dailydishrecipes.com/wp-content/upload...,8.0,5.0,10.0,17.0,0.0,425.0,...,1.0,296.0,0.5,2.0,1.0,550.0,0.03,2.0,10.0,6
672,Brie And Raspberry Panini With Hazelnut Spread,https://dailydishrecipes.com/brie-and-raspberr...,"[sourdough_bread, butter, chocolate-hazelnut_s...",https://dailydishrecipes.com/wp-content/upload...,2.0,5.0,5.0,10.0,0.0,598.0,...,17.0,143.0,6.0,0.0,2.0,346.0,0.2,2.0,4.0,5
