In [1]:
import os
import polars as pl
import numpy as np

import sys

project_root = os.path.abspath(os.path.join(os.getcwd(), '..')) 
sys.path.append(project_root)

from models.baseline_model import BaselineModel
from models.content_based_model import ContentBasedModel
from models.collaborative_memory_model import CollaborativeMemoryModel
from models.collaborative_model_based import CollaborativeModelBased
from models.session_based_model import SessionBasedModel

In [2]:
def load_data(train_path="../data/processed/train_data.csv", jokes_path="../data/processed/jokes_with_clusters.parquet"):
    """
    Load training data and joke metadata.

    Args:
        train_path (str): Path to training data.
        jokes_path (str): Path to joke metadata with clusters.

    Returns:
        train_df (pl.DataFrame): Training set.
        jokes_df (pl.DataFrame): Jokes metadata.
    """
    print(f"📂 Loading training data from {train_path}...")
    train_df = pl.read_csv(train_path)
    print(f"✅ Training data loaded successfully! Shape: {train_df.shape}")
    
    print(f"📂 Loading joke data from {jokes_path}...")
    jokes_df = pl.read_parquet(jokes_path)
    print(f"✅ Joke data loaded successfully! Shape: {jokes_df.shape}")
    
    return train_df, jokes_df

In [3]:
def train_and_save_baseline(train_df: pl.DataFrame, items_df: pl.DataFrame, save_path: str = "../models/baseline_model.parquet"):
    """
    Trains the BaselineModel and saves it as a Parquet file.

    Args:
        train_df (pl.DataFrame): The training data containing 'userId', 'jokeId', 'rating', and other columns.
        items_df (pl.DataFrame): The joke metadata containing at least the 'jokeId' column.
        save_path (str): The path where the trained model should be saved.
    """
    print("\n🚀 Training Baseline Model...")

    baseline_model = BaselineModel(ratings=train_df, items=items_df)
    baseline_model.train()
    print("✅ Baseline Model training completed!")

    baseline_model.save_model(save_path)
    print(f"💾 Baseline Model saved successfully at {save_path}")

In [4]:
def train_and_save_content_based(jokes_df: pl.DataFrame, save_path: str = "../models/content_based/"):
    """
    🚀 Trains the Content-Based Model and saves the similarity matrix and joke IDs.

    Args:
        jokes_df (pl.DataFrame): The joke features DataFrame, containing 'jokeId', 'embeddings', and other features.
        save_path (str): The directory where the trained model files will be saved.
    """
    print("\n🔹 Training Content-Based Model...")

    # Ensure 'embeddings' column exists
    if 'embeddings' not in jokes_df.columns:
        raise ValueError("❌ The DataFrame must contain an 'embeddings' column with precomputed joke embeddings.")

    # Extract joke IDs and embeddings from the DataFrame
    joke_ids = jokes_df["jokeId"].to_list()
    embeddings = np.vstack(jokes_df["embeddings"].to_list())

    print(f"📦 Loaded {len(joke_ids)} jokes and their embeddings of shape {embeddings.shape}.")

    # Step 1: Train the Content-Based Recommender Model
    print("🚀 Training Content-Based Recommender...")
    content_model = ContentBasedModel()
    content_model.train(embeddings, joke_ids)
    print(f"✅ Content-Based model trained successfully.")

    # Step 2: Save the trained model
    os.makedirs(save_path, exist_ok=True)
    print(f"💾 Saving Content-Based model to {save_path}...")
    content_model.save_model(save_path)
    print(f"✅ Content-Based model saved successfully at {save_path}")

In [5]:
def train_and_save_collaborative_memory(train_df, save_path="../models/collaborative_memory_model.onnx"):
    """
    🚀 Train and save the Collaborative Memory-Based Model.
    
    Args:
        train_df (pl.DataFrame): Training data containing ['userId', 'jokeId', 'rating'].
        save_path (str): Path where the model will be saved.
    """
    print("\n🔹 Training Collaborative Memory-Based Model...")

    # Step 1: Initialize the model
    print("⚙️ Initializing the Collaborative Memory-Based Model...")
    memory_model = CollaborativeMemoryModel()
    
    # Step 2: Train the model
    print("🚀 Training Collaborative Memory-Based Model...")
    memory_model.train(train_df)
    print("✅ Collaborative Memory-Based Model trained successfully.")
    
    # Step 3: Save the trained model
    print(f"💾 Saving Collaborative Memory-Based model to {save_path}...")
    memory_model.save_model(save_path)
    print(f"✅ Collaborative Memory-Based model saved successfully at {save_path}")

In [6]:
def train_and_save_collaborative_model_based(train_df, save_path="../models/collaborative_model_based.npz"):
    """
    🚀 Train and save the Collaborative Model-Based Model.
    
    Args:
        train_df (pl.DataFrame): Training data containing ['userId', 'jokeId', 'rating'].
        save_path (str): Path where the model will be saved.
    """
    print("\n🔹 Training Collaborative Model-Based Model...")

    # Step 1: Initialize the model
    print("⚙️ Initializing the Collaborative Model-Based Recommender...")
    model_based = CollaborativeModelBased(factors=64, iterations=20, regularization=0.1)
    
    # Step 2: Train the model
    print("🚀 Training Collaborative Filtering Model...")
    model_based.train(train_df)
    print("✅ Collaborative Filtering Model trained successfully.")
    
    # Step 3: Save the trained model
    print(f"💾 Saving Collaborative Model-Based model to {save_path}...")
    model_based.save_model(save_path)
    print(f"✅ Collaborative Model-Based model saved successfully at {save_path}")

In [7]:
def train_and_save_session_based(train_df, 
                                 gru_path="../models/session_gru.pth", 
                                 cooccurrence_path="../models/session_cooccurrence.npy"):
    """
    Train and save the Session-Based Model.
    """
    print("\n🚀 Training Session-Based Model...")

    # Initialize the model
    session_model = SessionBasedModel()
    
    # Train the model (includes both GRU and co-occurrence)
    session_model.train(train_df)
    
    # Save the trained model
    session_model.save_model(model_path=gru_path, co_occurrence_path=cooccurrence_path)
    
    print(f"✅ Session-Based model (GRU + Co-occurrence) saved at {gru_path} and {cooccurrence_path}")

### Load the training and joke data

In [8]:
train_df, jokes_df = load_data()

📂 Loading training data from ../data/processed/train_data.csv...
✅ Training data loaded successfully! Shape: (323433, 3)
📂 Loading joke data from ../data/processed/jokes_with_clusters.parquet...
✅ Joke data loaded successfully! Shape: (150, 9)


### Train and Save the models

In [9]:
train_and_save_baseline(train_df, jokes_df, save_path="../models/baseline_model.parquet")


🚀 Training Baseline Model...
🚀 Training Baseline Model...
📊 Filtered valid users for training...
✅ Top 100 Jokes calculated successfully. Total jokes: 100
✅ Baseline Model training completed!
💾 Saving model to ../models/baseline_model.parquet...
✅ Model saved successfully at ../models/baseline_model.parquet
💾 Baseline Model saved successfully at ../models/baseline_model.parquet


In [10]:
train_and_save_content_based(jokes_df, save_path="../models/content_based")


🔹 Training Content-Based Model...
📦 Loaded 150 jokes and their embeddings of shape (150, 384).
🚀 Training Content-Based Recommender...
🔍 Calculating cosine similarity matrix...
✅ Cosine similarity matrix shape: (150, 150)
✅ Content-Based model trained successfully.
💾 Saving Content-Based model to ../models/content_based...
✅ Model saved at ../models/content_based
✅ Content-Based model saved successfully at ../models/content_based


In [11]:
train_and_save_collaborative_memory(train_df, save_path="../models/collaborative_memory_model.pkl")


🔹 Training Collaborative Memory-Based Model...
⚙️ Initializing the Collaborative Memory-Based Model...
🚀 Training Collaborative Memory-Based Model...
🔹 Extracting user and item indices...
🔹 Converting userId and jokeId to categorical indices...
🔹 Creating the sparse ratings matrix...
🔹 Calculating user-user similarity matrix...
✅ Similarity matrix of shape (13095, 13095) calculated successfully.
✅ Collaborative Memory-Based Model trained successfully.
💾 Saving Collaborative Memory-Based model to ../models/collaborative_memory_model.pkl...
💾 Saving CollaborativeMemoryModel to ../models/collaborative_memory_model.pkl...
✅ Collaborative Memory-Based model saved successfully at ../models/collaborative_memory_model.pkl
✅ Collaborative Memory-Based model saved successfully at ../models/collaborative_memory_model.pkl


In [12]:
train_and_save_collaborative_model_based(train_df, save_path="../models/collaborative_model_based.npz")


🔹 Training Collaborative Model-Based Model...
⚙️ Initializing the Collaborative Model-Based Recommender...


  check_blas_config()


🚀 Training Collaborative Filtering Model...
🔹 Starting training for Collaborative Filtering Model...
📈 Training ALS model with 13095 users and 133 items...


100%|██████████| 20/20 [00:06<00:00,  2.88it/s]

✅ Training complete!
✅ Collaborative Filtering Model trained successfully.
💾 Saving Collaborative Model-Based model to ../models/collaborative_model_based.npz...
✅ Model saved at ../models/collaborative_model_based.npz
✅ Collaborative Model-Based model saved successfully at ../models/collaborative_model_based.npz





In [13]:
train_and_save_session_based(train_df)


🚀 Training Session-Based Model...

🔹 Extracting user sessions for GRU training...
🔹 Extracting sessions from training data...
✅ Extracted 13095 sessions from training data.
📦 Extracted 13095 sessions for training.
🚀 Training GRU4Rec with 151 items...
🧮 Epoch 1/5, Loss: 17854.6833
🧮 Epoch 2/5, Loss: 13653.0654
🧮 Epoch 3/5, Loss: 11768.9861
🧮 Epoch 4/5, Loss: 10441.6738
🧮 Epoch 5/5, Loss: 9777.4353
✅ GRU4Rec model training complete.
✅ GRU4Rec model saved at ../models/session_gru.pth
⚠️ No co-occurrence matrix to save.
✅ Session-Based model (GRU + Co-occurrence) saved at ../models/session_gru.pth and ../models/session_cooccurrence.npy


__📘 Note on High Loss Values__
>The high loss values observed during training are due to the absence of a timestamp column in the dataset. Without timestamps, the natural order of user interactions is unknown, leading to randomly ordered sessions. This disrupts the sequential nature of GRU-based models like GRU4Rec, causing the model to predict on incorrect targets, which inflates the loss values. A potential solution is to introduce synthetic timestamps or sort user interactions logically to preserve the session’s temporal structure.