# Model Evaluation

In [1]:
# Import required libraries
import numpy as np
import pandas as pd
import mlflow
import json

import warnings
warnings.filterwarnings("ignore")

## Preparation

In [2]:
import yaml

def load_params():
    with open("../params.yaml") as f:
        params = yaml.safe_load(f)
    return params

# Load params
params = load_params()

# Load recipes data
data = pd.read_pickle("../" + params["model_pipeline"]["recipe_path"])

In [3]:
# Load current model metadata
with open("../training_metadata.json", "r") as f:
    metadata = json.load(f)

metadata["model_params"]

{'model_path': 'models/fasttext.model',
 'model_ngram_path': 'models/fasttext.model.wv.vectors_ngrams.npy',
 'recipe_path': 'data/processed_cookbook.pkl',
 'fast_text': {'vector_size': 100,
  'window': 8,
  'sg': 0,
  'epochs': 100,
  'hs': 1,
  'seed': 1234,
  'min_count': 1},
 'model_scoring': {'w_title': 0.2,
  'w_cosine': 0.2,
  'w_maxsim': 0.6,
  'idf_top_weights': [0.8, 0.1, 0.1]}}

In [5]:
mlflow.set_tracking_uri("http://127.0.0.1:8080")
mlflow.set_experiment("FastText-Model")

# Load mlflow model
model = mlflow.pyfunc.load_model("../" + metadata["local_uri"])

## Ground Truth Similarity Scores

Based only on ingredients as we want to make sure it includes all the ingredients in the query and not too much other ingredient

In [935]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_ings  = TfidfVectorizer(analyzer="char_wb", ngram_range=(3,5), norm="l2")
X_ings  = tfidf_ings.fit_transform(data.ingredients.apply(lambda x: ", ".join(x)))   

In [936]:
from sklearn.metrics.pairwise import cosine_similarity
import random
random.seed(42)

q_id = random.randint(0, len(data.ingredients))
q_ings = X_ings[q_id]
true_scores = cosine_similarity(q_ings, X_ings)[0]
true_scores

array([0.03605603, 0.06025482, 0.09246626, ..., 0.00964642, 0.08279631,
       0.00443774])

## Model Prediction Scores

In [937]:
pred_scores = model.predict(data.ingredients.iloc[q_id])
pred_scores

array([0.39872128, 0.36949491, 0.39284569, ..., 0.18476525, 0.45841201,
       0.06926141])

## Predictive Metrics

### Regression Metrics

In [938]:
rmse = np.sqrt(np.mean((true_scores - pred_scores)**2))
mse = np.mean((true_scores - pred_scores)**2)
mae = np.mean(np.abs(true_scores - pred_scores))

print("RMSE:", rmse)
print("MSE:", mse)
print("MAE:", mae)

RMSE: 0.2612618074672088
MSE: 0.06825773204103287
MAE: 0.23503921615713305


### Soft Precision, Recall, F1

In [939]:
k=10
recommended_idx = np.argsort(pred_scores)[::-1][:30]
true_preds = true_scores[recommended_idx]
true_preds[:k]

array([1.        , 0.14232544, 0.0710299 , 0.126302  , 0.2394034 ,
       0.12145707, 0.06825931, 0.08254903, 0.06712469, 0.06515801])

In [940]:
precision_k = true_preds[:k].mean() # Precision@K
recall_k = true_preds[:k].sum()/true_preds.sum() # Recall@K
f1_k = (2 * precision_k * recall_k) / (precision_k + recall_k) # F1@K

print("Precision@K:", precision_k)
print("Recall@K:", recall_k)
print("F1@K:", f1_k)

Precision@K: 0.19836088555085438
Recall@K: 0.56162399110598
F1@K: 0.29317486609063437


## Ranking Metrics

In [941]:
random.seed(42)

q_id = random.randint(0, len(data.ingredients))
q_ings = X_ings[q_id]
true_scores = cosine_similarity(q_ings, X_ings)[0]
true_scores

array([0.03605603, 0.06025482, 0.09246626, ..., 0.00964642, 0.08279631,
       0.00443774])

### NDCG (Normalized Discounted Cumulative Gain)

In [942]:
sorted_recs = sorted(true_scores, reverse=True)
dcg, idcg = 0, 0
for i in range(len(true_scores)):
    dcg += true_scores[i]/np.log2((i+1)+1)
    idcg += sorted_recs[i]/np.log2((i+1)+1)

idcg += 1e-10  # Avoid dividing by 0
ndcg = dcg/idcg
print(f"NDCG: {ndcg}")

NDCG: 0.8098711508388237


### AP (Average Precision) --> MAP (Mean Average Precision)

In [943]:
relevant_preds = true_preds[1:] > 0.5
relevant_count = np.cumsum(relevant_preds)[relevant_preds == 1]
relevant_idx = np.arange(1, len(relevant_preds) + 1)[relevant_preds == 1]
ap = np.nanmean((relevant_count / relevant_idx))
print(f"AP: {ap}")

AP: nan


### RR (Reciprocal Rank) --> MRR (Mean Reciprocal Rank)

In [944]:
first_relevant_idx = relevant_idx[0] if len(relevant_idx) > 0 else 0
rr = 1/first_relevant_idx if first_relevant_idx else 0
print(f"RR: {rr}")

RR: 0


## Repeat to ensure consistency

Source: https://www.evidentlyai.com/ranking-metrics/evaluating-recommender-systems#mrr

In [945]:
random.seed(42)

k = 10
relevant_thresh = 0.5
scores = {"RMSE": [], "MSE": [], "MAE": [], f"Precision_{k}": [], f"Recall_{k}": [], f"F1_{k}": [], "NDCG": [], "MAP": [], "MRR": [], "Hit-Rate": []}

for i in range(10):
    # Get true scores for current query
    q_id = random.randint(0, len(data.ingredients))
    q_ings = X_ings[q_id]
    true_scores = cosine_similarity(q_ings, X_ings)[0]

    # Get predicted scores for current query
    pred_scores = model.predict(data.ingredients.iloc[q_id])

    # Get top recommendations and its true similarity
    recommended_idx = np.argsort(pred_scores)[::-1][:params["model_service"]["n_recs"]]
    true_preds = true_scores[recommended_idx]

    # Calculate predictive metrics
    ## RMSE
    rmse = np.sqrt(np.mean((true_scores - pred_scores)**2))
    scores["RMSE"].append(rmse)

    ## MSE
    mse = np.mean((true_scores - pred_scores)**2)
    scores["MSE"].append(mse)

    ## MAE
    mae = np.mean(np.abs(true_scores - pred_scores))
    scores["MAE"].append(mae)

    ## Precision@K
    precision_k = true_preds[:k].mean()
    scores[f"Precision_{k}"].append(precision_k)

    ## Recall@K
    recall_k = true_preds[:k].sum()/true_preds.sum() 
    scores[f"Recall_{k}"].append(recall_k)

    ## F1@K
    f1_k = (2 * precision_k * recall_k) / (precision_k + recall_k) 
    scores[f"F1_{k}"].append(f1_k)

    # Calculate ranking metrics
    ## NDCG
    sorted_recs = sorted(true_scores, reverse=True)
    dcg, idcg = 0, 0
    for i in range(len(true_scores)):
        dcg += true_scores[i]/np.log2((i+1)+1)
        idcg += sorted_recs[i]/np.log2((i+1)+1)

    idcg += 1e-10  # Avoid dividing by 0
    scores["NDCG"].append(dcg/idcg)

    ## MAP
    relevant_preds = true_preds[1:] > relevant_thresh
    relevant_count = np.cumsum(relevant_preds)[relevant_preds == 1]
    relevant_idx = np.arange(1, len(relevant_preds) + 1)[relevant_preds == 1]
    ap = np.nanmean((relevant_count / relevant_idx))
    scores["MAP"].append(ap if not np.isnan(ap) else 0)

    ## MRR
    first_relevant_idx = relevant_idx[0] if len(relevant_idx) > 0 else 0
    rr = 1/first_relevant_idx if first_relevant_idx else 0
    scores["MRR"].append(rr)

    ## Hit-Rate
    hit = 1 if True in relevant_preds[:k] else 0
    scores["Hit-Rate"].append(hit)

for metric in scores:
    print(f"{metric:>15}:", np.mean(scores[metric]))

           RMSE: 0.22691767167952398
            MSE: 0.053161452762294795
            MAE: 0.1940248615680723
   Precision_10: 0.3394571886911364
      Recall_10: 0.44860077437530793
          F1_10: 0.3664164538518096
           NDCG: 0.7961161608436464
            MAP: 0.19081783956783954
            MRR: 0.2166666666666667
       Hit-Rate: 0.3


## Update Metadata

In [961]:
metric_scores = {}
for metric in scores:
    metric_scores[metric] = np.mean(scores[metric])

validation_metadata = {**metadata, "metrics": metric_scores}

# Save validation model metadata
with open("../validation_metadata.json", "w") as f:
    f.write(json.dumps(validation_metadata, indent=4))

## Log to MLFlow

In [947]:
# Log metrics to MLflow
mlflow.set_tracking_uri("http://127.0.0.1:8080")
mlflow.set_experiment("FastText-Model")

with mlflow.start_run(run_id=metadata["run_id"]):
    mlflow.log_metrics(metric_scores)

🏃 View run serious-stoat-300 at: http://127.0.0.1:8080/#/experiments/300877920446874715/runs/32cc39d3e1e24799886476ad91037e36
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/300877920446874715
