# Model Evaluation

In [3]:
# Import required libraries
import numpy as np
import pandas as pd
# from models.embedding_model import EmbeddingModel

# from sklearn.metrics.pairwise import cosine_similarity
import mlflow

import json
import requests
import boto3

import streamlit as st

import warnings
warnings.filterwarnings("ignore")

### Logging

In [3]:
def connect_database():
    """Return a handle to the DynamoDB table used by the app.

    The AWS key pair, region and table name are all read from the ``s3``
    section of the Streamlit secrets.

    Returns:
        The ``Table`` object produced by the boto3 DynamoDB resource for the
        table named in ``st.secrets.s3.DB_NAME``.
    """
    aws_secrets = st.secrets.s3
    resource = boto3.resource(
        "dynamodb",
        aws_access_key_id=aws_secrets.AWS_ACCESS_KEY_ID,
        aws_secret_access_key=aws_secrets.AWS_SECRET_ACCESS_KEY,
        region_name=aws_secrets.AWS_DEFAULT_REGION,
    )
    return resource.Table(aws_secrets.DB_NAME)

# Pull the stored feedback record of the test user.
table = connect_database()
user_id = 110833230122006731136
user_item = table.get_item(Key={"user_id": user_id})["Item"]

# The keys of the liked/disliked maps are recipe ids stored as strings;
# convert them to ints for the model.
liked_idx = [int(recipe_id) for recipe_id in user_item["liked_idx"]]
disliked_idx = [int(recipe_id) for recipe_id in user_item["disliked_idx"]]

In [None]:
# Number of recommendations the wrapped model is configured to return.
n_recs = 30

# Local files bundled with the logged model as MLflow artifacts.
fasttext_path = "fasttext.model"
word2vec_path = "word2vec.model"
recipe_path = "processed_cookbook.pkl"
artifact_paths = {
    "fasttext_path": fasttext_path,
    "word2vec_path": word2vec_path,
    "recipe_path": recipe_path,
}

# NOTE(review): the captured output of this cell links to port 5000, not 8080 —
# confirm which tracking server is the intended one.
mlflow.set_tracking_uri("http://127.0.0.1:8080")
mlflow.set_experiment("model_validation")

with mlflow.start_run():
    # NOTE(review): EmbeddingModel has no active import in this notebook (the
    # import in the first cell is commented out), so this cell raises NameError
    # on a fresh kernel — restore the import from wherever the class now lives
    # (the original comment pointed at "./models/EmbeddingModel.py").
    embedding_model = EmbeddingModel(
        n_recs=n_recs,
        like_step=0.8,
        dislike_step=1.0,
        model="FastText",
    )

    # Log the wrapper as a pyfunc model together with its artifacts.
    model_info = mlflow.pyfunc.log_model(
        name="word2vec_model_search",
        python_model=embedding_model,
        artifacts=artifact_paths,
        pip_requirements=["gensim==4.3.3"],
    )

    # Load the logged model back right away so the cells below can use it.
    loaded_model = mlflow.pyfunc.load_model(model_uri=model_info.model_uri)

    # TODO: log hyper-parameters (mlflow.log_params) and validation metrics on
    # this run. An earlier commented-out draft built like/dislike frames from
    # user_item, called loaded_model.predict on all but the last five rows of
    # each, and logged precision_k / recall_k via mlflow.log_metric. The same
    # steps currently live, without metric logging, in the validation cells
    # further down.



Downloading artifacts: 100%|██████████| 1/1 [00:00<00:00, 3214.03it/s] 
Downloading artifacts: 100%|██████████| 1/1 [00:00<00:00, 1027.76it/s]
Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 40.84it/s]   


🏃 View run angry-chimp-486 at: http://127.0.0.1:5000/#/experiments/484303514770949339/runs/08f0ee683da94d4f8cc444a0d6cdf5b0
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/484303514770949339


### Validation

In [3]:
# Reload the logged pyfunc model for the ad-hoc checks below.
# Uses the public `model_uri` accessor (as the other load_model calls in this
# notebook do) instead of reaching into the private `_model_uri` attribute.
model = mlflow.pyfunc.load_model(model_info.model_uri)

Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 35.71it/s]   


In [4]:
# Ad-hoc check: turn a comma-separated ingredient string into a list of
# whitespace-trimmed terms and display what the model returns for it.
text = "honey, salt, pepper, chicken, beef, fish"
query = list(map(str.strip, text.split(",")))
model.predict(query)

'[6042, 223, 232, 336, 5624, 5058, 5522, 6111, 5102, 4576, 6254, 5024, 4833, 3720, 4120, 4249, 4814, 4928, 6252, 3609, 4670, 6157, 5931, 518, 4461, 4430, 6027, 572, 4711, 5094]'

In [31]:
def connect_database():
    """Open the app's DynamoDB table and return it.

    NOTE(review): this repeats the ``connect_database`` definition from the
    Logging section of this notebook; one of the two could be removed.

    Returns:
        The boto3 ``Table`` for ``st.secrets.s3.DB_NAME``, created with the key
        pair and region stored in the ``s3`` section of the Streamlit secrets.
    """
    secrets = st.secrets.s3
    connection_kwargs = {
        "aws_access_key_id": secrets.AWS_ACCESS_KEY_ID,
        "aws_secret_access_key": secrets.AWS_SECRET_ACCESS_KEY,
        "region_name": secrets.AWS_DEFAULT_REGION,
    }
    dynamodb = boto3.resource("dynamodb", **connection_kwargs)
    return dynamodb.Table(secrets.DB_NAME)

# Fetch the test user's stored feedback and request recommendations from the
# model logged earlier in this notebook.
table = connect_database()
user_id = 110833230122006731136  # dedicated test account
user_item = table.get_item(Key={"user_id": user_id})["Item"]

# Map keys are recipe ids stored as strings; the model is given ints.
liked_idx = [int(recipe_id) for recipe_id in user_item["liked_idx"]]
disliked_idx = [int(recipe_id) for recipe_id in user_item["disliked_idx"]]

loaded_model = mlflow.pyfunc.load_model(model_uri=model_info.model_uri)
# Input layout is [liked ids, disliked ids].
predicted = loaded_model.predict([liked_idx, disliked_idx])

Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 89.99it/s] 


In [None]:
# Build a (recipe_id, date) frame from the user's liked map, newest first.
liked_df = (
    pd.DataFrame.from_dict(user_item["liked_idx"], orient="index")
    .reset_index()
    .rename(columns={"index": "recipe_id", 0: "date"})
    .assign(
        date=lambda frame: pd.to_datetime(frame["date"]),
        recipe_id=lambda frame: frame["recipe_id"].astype(int),
    )
    .sort_values(by="date", ascending=False)
    .reset_index(drop=True)
)

# Everything except the last five rows (the five oldest likes after the
# descending sort) is what gets passed to the model.
like_test = liked_df.iloc[:-5]
like_test

Unnamed: 0,recipe_id,date
0,1955,2025-08-11 17:47:31
1,4381,2025-08-11 17:47:25
2,4102,2025-07-22 22:25:17
3,5601,2025-07-22 22:25:12
4,995,2025-07-22 22:25:09
5,6081,2025-07-22 22:25:05
6,3220,2025-07-22 22:25:03
7,4005,2025-07-22 22:24:59
8,1076,2025-07-22 22:24:55
9,149,2025-07-22 22:24:50


In [35]:
# Build a (recipe_id, date) frame from the user's disliked map, newest first.
disliked_df = (
    pd.DataFrame.from_dict(user_item["disliked_idx"], orient="index")
    .reset_index()
    .rename(columns={"index": "recipe_id", 0: "date"})
    .assign(
        date=lambda frame: pd.to_datetime(frame["date"]),
        recipe_id=lambda frame: frame["recipe_id"].astype(int),
    )
    .sort_values(by="date", ascending=False)
    .reset_index(drop=True)
)

# Everything except the last five rows (the five oldest dislikes after the
# descending sort) is what gets passed to the model.
dislike_test = disliked_df.iloc[:-5]
dislike_test

Unnamed: 0,recipe_id,date
0,3934,2025-08-11 17:47:27
1,4006,2025-07-22 22:25:16
2,6228,2025-07-22 22:25:15
3,219,2025-07-22 22:25:14
4,6025,2025-07-22 22:25:13
5,1053,2025-07-22 22:25:08
6,5978,2025-07-22 22:25:07
7,930,2025-07-22 22:25:06
8,3193,2025-07-22 22:25:02
9,762,2025-07-22 22:25:01


In [12]:
# Ask the model for recommendations from the like/dislike subsets. The model's
# response is a JSON string, so decode it into an array of recipe ids.
model_input = [
    like_test["recipe_id"].tolist(),
    dislike_test["recipe_id"].tolist(),
]
prediction = np.asarray(json.loads(loaded_model.predict(model_input)))
prediction

array([4930, 3895, 3411, 5562, 5703, 3169, 3126, 3179, 5794, 5595, 3627,
       6131, 5585,  583,   90, 5301, 3400, 3149, 1496, 4241,  569, 3214,
       5892, 5499, 5685, 3830, 3211,  735, 5575,  756])

#### Recall@K (K = 10)

Fraction of the user's liked recipes that appear among the top K recommendations.

In [8]:
# Recall@K: share of the user's liked recipes found in the top-K recommendations.
# The original counted hits over every returned recommendation (30 here), which
# does not match the K = 10 stated for this metric.
k = 10
# assumes `prediction` is ordered best-first — TODO confirm against the model
top_k = prediction[:k]
hits_at_k = np.isin(top_k, liked_df["recipe_id"].to_numpy()).sum()

# NOTE(review): liked_df also contains the rows that were fed to the model
# (like_test). If the model never re-recommends its input recipes, only the
# five held-out likes can ever be hit — consider scoring against those only.
recall_k = hits_at_k / len(liked_df) if len(liked_df) else 0.0
recall_k

0.023809523809523808

#### Precision@K (K = 10)

Fraction of the top K recommendations that the user has liked.

In [9]:
# Precision@K: share of the top-K recommendations that the user has liked.
# The original counted hits across all returned recommendations (30 here) but
# divided by 10, so the value could exceed 1; only the first K are scored now.
k = 10
# assumes `prediction` is ordered best-first — TODO confirm against the model
top_k = prediction[:k]
hits_at_k = np.isin(top_k, liked_df["recipe_id"].to_numpy()).sum()
precision_k = hits_at_k / k
precision_k

0.1

### Registering

In [5]:
# Register model if validation is successful (RUN THIS TO UPDATE THE STREAMLIT MODEL)
# NOTE(review): the surrounding start_run only produces an otherwise empty run
# (see the extra run listed in this cell's output); register_model does not
# appear to need an active run — confirm and consider dropping the wrapper.
with mlflow.start_run():
    mlflow.register_model(
        # Public accessor instead of the private `_model_uri` attribute; this is
        # the same URI the load_model calls in this notebook use.
        model_uri=model_info.model_uri,
        name="word2vec_model",
    )

Registered model 'word2vec_model' already exists. Creating a new version of this model...
2025/08/17 16:07:33 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: word2vec_model, version 12


🏃 View run charming-cub-981 at: http://127.0.0.1:5000/#/experiments/484303514770949339/runs/9a87b537f6314c6db73922d31945c24f
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/484303514770949339


Created version '12' of model 'word2vec_model'.


### API Call Tests

In [14]:
# Smoke-test the recommendation endpoint of the locally running API.
params = {"query": ["rice", "beef", "meat"]}
# `params=` sends the ingredients in the URL query string (one `query` entry per
# list item), not in the request body. The timeout stops the cell from hanging
# forever when the local server is down or unresponsive.
response = requests.post(
    "http://localhost:8000/recommend/",
    params=params,
    timeout=30,
)
response.text

'"[5297, 4557, 157, 161, 33, 4726, 5797, 274, 13, 618, 140, 14, 896, 251, 45, 40, 5068, 6047, 6073, 55, 206, 4709, 637, 231, 3863, 412, 5598, 3571, 5957, 307]"'