# 03: Baseline Model Training

This notebook trains the baseline models:
1. GMF (Generalized Matrix Factorization)
2. MLP (Multi-Layer Perceptron)
3. NeuMF (Neural Matrix Factorization)

Each trained model is then evaluated on the held-out test set, and the results are compared.

In [None]:
# Imports
import pickle
import random
import sys

# Make the project root importable before any `src.*` import below.
sys.path.append('..')

import numpy as np
import pandas as pd
import torch
from torch.utils.tensorboard import SummaryWriter

from src.config import config
from src.models.gmf import GMF
from src.models.mlp import MLP
from src.models.neumf import NeuMF
from src.train import train_model
from src.evaluate import evaluate_model
from src.negative_sampling import build_user_history

# Set device
# NOTE(review): `_set_device` is a private method of the project config; it
# presumably stores the chosen device in `config.train.DEVICE`, which is read
# on the next line — confirm there is no public equivalent to call instead.
config._set_device()
print(f"Using device: {config.train.DEVICE}")

# Fix the random seeds so that Restart Kernel -> Run All gives repeatable
# results. The training cells below pass `num_negatives` to `train_model`, so
# sampling is involved; Python, NumPy and PyTorch RNGs are all seeded because
# it is not visible from here which of them the project code draws from.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

## Load Data

In [None]:
# Load the train / validation / test splits from the pickle files named in the
# project config.
train_df = pd.read_pickle(config.paths.train_path)
val_df = pd.read_pickle(config.paths.val_path)
test_df = pd.read_pickle(config.paths.test_path)

# Load the mappings dictionary; only its size entries are used in this cell.
# SECURITY NOTE: unpickling can execute arbitrary code embedded in the file.
# Only load artifacts that were produced by this project's own pipeline.
# (`pickle` is imported with the other modules in the imports cell.)
with open(config.paths.mappings_path, 'rb') as f:
    mappings = pickle.load(f)

# Dataset cardinalities; num_users / num_items are passed to the model
# constructors in the training cells.
num_users = mappings['num_users']
num_items = mappings['num_items']
num_genres = mappings['num_genres']

print(f"Dataset: {num_users:,} users, {num_items:,} items, {num_genres} genres")
print(f"Train: {len(train_df):,} ratings")
print(f"Val: {len(val_df):,} ratings")
print(f"Test: {len(test_df):,} ratings")

## Prepare Data for Training

In [None]:
# Pull the user-id and item-id columns out of each split, one pair per split.
train_users, train_items = train_df['userId'].values, train_df['movieId'].values
val_users, val_items = val_df['userId'].values, val_df['movieId'].values
test_users, test_items = test_df['userId'].values, test_df['movieId'].values

# History derived from the training interactions only; it is handed to
# `evaluate_model` in the evaluation cell.
# NOTE(review): validation interactions are not part of this history —
# confirm that is intended when scoring the test split.
user_history = build_user_history(train_users, train_items)

print("Data prepared for training!")

## Train GMF

In [None]:
# ---- GMF baseline: build the model, report its size, then fit it ----
rule = "=" * 60
print("\n" + rule)
print("TRAINING GMF")
print(rule)

# Instantiate GMF with the embedding / hidden sizes taken from the config.
gmf_model = GMF(
    num_users=num_users,
    num_items=num_items,
    embedding_dim=config.model.USER_EMBEDDING_DIM,
    hidden_dim=config.model.GMF_HIDDEN_DIM,
)

print(f"GMF parameters: {gmf_model.count_parameters():,}")

# Collect the fit arguments up front so the `train_model` call stays short.
# Checkpoints go to the shared trained-models directory; `log_dir` is the
# "gmf" subfolder of the TensorBoard log directory.
gmf_fit_kwargs = dict(
    train_users=train_users,
    train_items=train_items,
    val_data={
        'users': val_users,
        'items': val_items,
    },
    num_items=num_items,
    num_epochs=config.train.NUM_EPOCHS,
    batch_size=config.train.BATCH_SIZE,
    learning_rate=config.train.LEARNING_RATE,
    weight_decay=config.train.WEIGHT_DECAY,
    num_negatives=config.train.NUM_NEGATIVES,
    device=config.train.DEVICE,
    save_dir=config.paths.TRAINED_MODELS_DIR,
    early_stopping_patience=config.train.EARLY_STOPPING_PATIENCE,
    log_dir=config.paths.TENSORBOARD_LOG_DIR + "/gmf",
)

# Fit the model; whatever `train_model` returns is kept as `gmf_history`.
gmf_history = train_model(model=gmf_model, **gmf_fit_kwargs)

## Train MLP

In [None]:
# ---- MLP baseline: build the model, report its size, then fit it ----
rule = "=" * 60
print("\n" + rule)
print("TRAINING MLP")
print(rule)

# Instantiate the MLP with embedding size, layer widths and dropout from the config.
mlp_model = MLP(
    num_users=num_users,
    num_items=num_items,
    embedding_dim=config.model.USER_EMBEDDING_DIM,
    hidden_dims=config.model.MLP_HIDDEN_DIMS,
    dropout=config.model.MLP_DROPOUT,
)

print(f"MLP parameters: {mlp_model.count_parameters():,}")

# Collect the fit arguments up front so the `train_model` call stays short.
# Checkpoints go to the shared trained-models directory; `log_dir` is the
# "mlp" subfolder of the TensorBoard log directory.
mlp_fit_kwargs = dict(
    train_users=train_users,
    train_items=train_items,
    val_data={
        'users': val_users,
        'items': val_items,
    },
    num_items=num_items,
    num_epochs=config.train.NUM_EPOCHS,
    batch_size=config.train.BATCH_SIZE,
    learning_rate=config.train.LEARNING_RATE,
    weight_decay=config.train.WEIGHT_DECAY,
    num_negatives=config.train.NUM_NEGATIVES,
    device=config.train.DEVICE,
    save_dir=config.paths.TRAINED_MODELS_DIR,
    early_stopping_patience=config.train.EARLY_STOPPING_PATIENCE,
    log_dir=config.paths.TENSORBOARD_LOG_DIR + "/mlp",
)

# Fit the model; whatever `train_model` returns is kept as `mlp_history`.
mlp_history = train_model(model=mlp_model, **mlp_fit_kwargs)

## Train NeuMF

In [None]:
# ---- NeuMF baseline: build the model, report its size, then fit it ----
rule = "=" * 60
print("\n" + rule)
print("TRAINING NeuMF")
print(rule)

# Instantiate NeuMF; it takes the GMF hidden size, the MLP layer widths and
# dropout, plus a fusion size, all read from the config.
neumf_model = NeuMF(
    num_users=num_users,
    num_items=num_items,
    embedding_dim=config.model.USER_EMBEDDING_DIM,
    gmf_hidden_dim=config.model.GMF_HIDDEN_DIM,
    mlp_hidden_dims=config.model.MLP_HIDDEN_DIMS,
    mlp_dropout=config.model.MLP_DROPOUT,
    fusion_dim=config.model.NEUMF_FUSION_DIM,
)

print(f"NeuMF parameters: {neumf_model.count_parameters():,}")

# Collect the fit arguments up front so the `train_model` call stays short.
# Checkpoints go to the shared trained-models directory; `log_dir` is the
# "neumf" subfolder of the TensorBoard log directory.
neumf_fit_kwargs = dict(
    train_users=train_users,
    train_items=train_items,
    val_data={
        'users': val_users,
        'items': val_items,
    },
    num_items=num_items,
    num_epochs=config.train.NUM_EPOCHS,
    batch_size=config.train.BATCH_SIZE,
    learning_rate=config.train.LEARNING_RATE,
    weight_decay=config.train.WEIGHT_DECAY,
    num_negatives=config.train.NUM_NEGATIVES,
    device=config.train.DEVICE,
    save_dir=config.paths.TRAINED_MODELS_DIR,
    early_stopping_patience=config.train.EARLY_STOPPING_PATIENCE,
    log_dir=config.paths.TENSORBOARD_LOG_DIR + "/neumf",
)

# Fit the model; whatever `train_model` returns is kept as `neumf_history`.
neumf_history = train_model(model=neumf_model, **neumf_fit_kwargs)

## Evaluate on Test Set

In [None]:
# Restore one "<Model>_best.pt" checkpoint per baseline from the trained-models
# directory (presumably written there by `train_model` — confirm the naming).
# NOTE(review): only num_users / num_items are passed to `load`, whereas the
# training cells built the models with explicit embedding / hidden sizes from
# the config. Confirm that `load` restores, or defaults to, the same
# architecture; otherwise the saved weights will not fit.
models = {
    'GMF': GMF.load(f"{config.paths.TRAINED_MODELS_DIR}/GMF_best.pt", GMF, num_users=num_users, num_items=num_items),
    'MLP': MLP.load(f"{config.paths.TRAINED_MODELS_DIR}/MLP_best.pt", MLP, num_users=num_users, num_items=num_items),
    'NeuMF': NeuMF.load(f"{config.paths.TRAINED_MODELS_DIR}/NeuMF_best.pt", NeuMF, num_users=num_users, num_items=num_items),
}

# Arguments that are identical for every model: the test interactions, the
# cut-offs to report, the device, the catalogue size and the user history.
eval_kwargs = dict(
    users=test_users,
    items=test_items,
    k_values=config.eval.K_VALUES,
    device=config.train.DEVICE,
    num_items=num_items,
    user_history=user_history,
)

# Score each model in turn and keep its metrics under the model's name.
results = {}
for name, model in models.items():
    print(f"\nEvaluating {name}...")
    results[name] = metrics = evaluate_model(model=model, **eval_kwargs)
    # The @10 keys are hard-coded here, so 10 must be among config.eval.K_VALUES
    # — TODO confirm.
    print(f"  HR@10: {metrics['hr@10']:.4f}")
    print(f"  NDCG@10: {metrics['ndcg@10']:.4f}")
    print(f"  AUC: {metrics['auc']:.4f}")

## Results Summary

In [None]:
# Build the comparison table by transposing the frame created from `results`,
# so that each model name becomes a row label.
divider = "=" * 60
results_df = pd.DataFrame(results).transpose()

# Banner, table, then a closing banner — each item on its own line.
print("\n" + divider, "BASELINE MODEL COMPARISON", divider, results_df, sep="\n")

print("\n" + divider, "BASELINE TRAINING COMPLETE!", divider, sep="\n")