# Recommender System - Baseline Model

In [31]:
import os
import math
import random
import numpy as np
import pandas as pd
import csv
from collections import defaultdict
from tqdm import tqdm

os.environ["KAGGLEHUB_CACHE"] = "D:/Python Projects/recommendation_system/recsys/data"

import kagglehub
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder

In [32]:
SEED = 42
def set_seed(seed=SEED):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

set_seed(SEED)
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print("DEVICE:", DEVICE)

DEVICE: cuda


## Loading the dataset
The dataset is Douban Cross Domain dataset from Kaggle.

In [50]:
DOUBAN_DATASET = r"fengzhujoey/douban-datasetratingreviewside-information"
DOUBAN_DOMAIN = "moviereviews"

def load_douban_reviews(domain:str, max_items:int=100000) -> pd.DataFrame:
    path = kagglehub.dataset_download(DOUBAN_DATASET)
    df_raw = pd.read_csv(f"{path}/douban_dataset(text information)/{DOUBAN_DOMAIN}_cleaned.txt",
                         sep="\t",
                         quoting=csv.QUOTE_MINIMAL,
                         dtype=str,
                         keep_default_na=False)

    df_raw.columns = [c.strip().strip('"').strip('"') for c in df_raw.columns]

    # Coerce rating to float (invalid/missing -> 0.0)
    ratings = pd.to_numeric(df_raw["rating"], errors="coerce").fillna(0.0).astype(float)

    # Parse time to POSIX seconds (int)
    timestamp = pd.to_datetime(df_raw["time"], errors="coerce", utc=True)
    timestamp = timestamp.fillna(pd.Timestamp("1970-01-01", tz="UTC"))
    timestamp_s = (timestamp.astype(int) // 10**9).astype(int)

    out = pd.DataFrame({
        "user": df_raw["user_id"].astype(int),
        "item": df_raw["movie_id"].astype(int),
        "rating": ratings,
        "domain": domain,
        "timestamp": timestamp_s
    })

    if max_items:
        out = out.copy()
        out = out.sample(min(max_items, len(out))).reset_index(drop=True)

    return out

df = load_douban_reviews(DOUBAN_DOMAIN, max_items=None)
print(f"Loaded {len(df)} rows from {DOUBAN_DOMAIN} domain.")

Loaded 1278401 rows from moviereviews domain.


## Preprocessing the dataset
- Make it implicit by considering all ratings as positive interactions.
- Filter out users and items with less than 5 interactions.
- Create a mapping of user and item IDs to indices.

In [34]:
# Make implicit dataset, filter users/items with less than 5 interactions, and encode user/item IDs
def preprocess_dataset(df, min_user_interactions=5, min_item_interactions=5):
    # Make it implicit
    df["label"] = 1.0

    # Filter users and items with less than 5 interactions
    user_counts = df["user"].value_counts()
    item_counts = df["item"].value_counts()

    valid_users = user_counts[user_counts >= min_user_interactions].index
    valid_items = item_counts[item_counts >= min_item_interactions].index

    df = df[df["user"].isin(valid_users) & df["item"].isin(valid_items)].copy()
    print("After interactions filtering:", len(df), "rows,", df["user"].nunique(), "users,", df["item"].nunique(), "items")

    user_enc = LabelEncoder()
    item_enc = LabelEncoder()

    df["user_id"] = user_enc.fit_transform(df["user"])
    df["item_id"] = item_enc.fit_transform(df["item"])

    return df

filtered_df = preprocess_dataset(df, min_user_interactions=20, min_item_interactions=20)

After interactions filtering: 1208553 rows, 2590 users, 14938 items


In [46]:
def calculate_data_sparsity(df):
    num_users = df["user"].nunique()
    num_items = df["item"].nunique()
    num_interactions = len(df)
    density = num_interactions / (num_users * num_items)
    sparsity = 1 - density

    print(f"Number of users: {num_users}")
    print(f"Number of items: {num_items}")
    print(f"Number of interactions: {num_interactions}")
    print(f"-" * 30)
    print(f"Interaction Matrix Density: {density:.4f}")
    print(f"Interaction Matrix Sparsity: {sparsity:.4f}")

calculate_data_sparsity(filtered_df)

Number of users: 2590
Number of items: 14938
Number of interactions: 1208553
------------------------------
Interaction Matrix Density: 0.0312
Interaction Matrix Sparsity: 0.9688


In [35]:
# Define Leave-One-Out (LOO) split
def loo_split(df):
    df = df.sort_values(["user_id", "timestamp"])
    train_rows, val_rows, test_rows = [], [], []
    for uid, group in df.groupby("user_id", sort=False):
        g = group.sort_values("timestamp")
        if len(g) < 5:
            train_rows.append(g)
            continue
        test_rows.append(g.iloc[[-1]])  # Last interaction as test
        val_rows.append(g.iloc[[-2]])    # Second last interaction as validation
        train_rows.append(g.iloc[:-2])    # All but last two as training

    train_df = pd.concat(train_rows, ignore_index=True)
    val_df = pd.concat(val_rows, ignore_index=True)
    test_df = pd.concat(test_rows, ignore_index=True)

    return train_df, val_df, test_df

train_df, val_df, test_df = loo_split(filtered_df)
print(f"Train/Validation/Test split: {len(train_df)}, {len(val_df)}, {len(test_df)}")

Train/Validation/Test split: 1203373, 2590, 2590


## Building the dataset with negative sampling

In [36]:
NUM_USERS = filtered_df["user_id"].max() + 1
NUM_ITEMS = filtered_df["item_id"].max() + 1

In [37]:
# Rank the single positive against k negative samples for evaluation
pos_items_by_user = defaultdict(set)
for u, i in zip(train_df["user_id"].values, train_df["item_id"].values):
    pos_items_by_user[u].add(i)

def sample_eval_negatives(eval_df, num_items, pos_by_user, n_neg=99, seed=42):
    rng = np.random.default_rng(seed)
    neg_dict = {}
    for u, pos_i in zip(eval_df["user_id"].values, eval_df["item_id"].values):
        user_pos = pos_by_user[u].copy()
        user_pos.add(pos_i)
        candidates = []
        while len(candidates) < n_neg:
            cand = rng.integers(0, num_items)
            if cand not in user_pos:
                candidates.append(int(cand))
                user_pos.add(cand)
        neg_dict[u] = candidates
    return neg_dict

val_negatives = sample_eval_negatives(val_df, NUM_ITEMS, pos_items_by_user)
test_negatives = sample_eval_negatives(test_df, NUM_ITEMS, pos_items_by_user)
print(f"Sampled {len(val_negatives)} validation users and {len(test_negatives)} test users with {len(next(iter(val_negatives.values())))} negatives each.")

Sampled 2590 validation users and 2590 test users with 99 negatives each.


In [38]:
# Training dataset with k negatives per positive
class ImplicitTrainingDataset(Dataset):
    def __init__(self, train_df, num_items, pos_by_user, neg_k=4):
        self.pos_pairs = train_df[["user_id", "item_id"]].values.astype(np.int64)
        self.num_pos = len(self.pos_pairs)
        self.neg_k = neg_k
        self.num_items = num_items
        self.pos_by_user = pos_by_user
        self.length = self.num_pos * (1 + self.neg_k)

    def __len__(self):
        return self.length

    def __getitem__(self, idx):
        pos_idx = idx // (self.neg_k + 1)
        is_pos = (idx % (self.neg_k + 1) == 0)
        u, i_pos = self.pos_pairs[pos_idx]
        if is_pos:
            return int(u), int(i_pos), 1.0

        while True:
            j = random.randint(0, self.num_items - 1)
            if j not in self.pos_by_user[u]:
                return int(u), int(j), 0.0

train_dataset = ImplicitTrainingDataset(train_df,
                                  num_items=NUM_ITEMS,
                                  pos_by_user=pos_items_by_user,
                                  neg_k=4)
train_loader = DataLoader(train_dataset,
                          batch_size=1024,
                          shuffle=True)

print(f"Training dataset created with {len(train_dataset)} samples.")

Training dataset created with 6016865 samples.


## Utility functions for evaluation & ranking metrics
- For each val user, build candidate set = `{positive} U {k negatives}`
- Score, sort, compute HR@K, NDCG@K, Precision@K, Recall@K

In [39]:
# @torch.no_grad()
# def compute_val_loss(model, eval_df, loss_fn, pos_items_by_user, num_items, neg_k=4, device="cpu"):
#     model.eval()
#     losses = []
#     for u, i_pos in zip(eval_df["user_id"].values, eval_df["item_d"].values):
#         # Positives
#         u_t = torch.tensor([u], dtype=torch.long, device=device)
#         i_t = torch.tensor([i_pos], dtype=torch.long, device=device)
#         logit_pos = model(u_t, i_t)
#         loss_pos = loss_fn(logit_pos, torch.ones_like(logit_pos))
#         losses.append(loss_pos.item())
#
#         # Negatives
#         taken = pos_items_by_user[u].copy()
#         taken.add(int(i_pos))
#         n_added = 0
#         while n_added < neg_k:
#             j = random.randint(0, num_items - 1)
#             if j not in taken:
#                 uj = torch.tensor([u], dtype=torch.long, device=device)
#                 jj = torch.tensor([j], dtype=torch.long, device=device)
#                 logit_neg = model(uj, jj)
#                 loss_neg = loss_fn(logit_neg, torch.zeros_like(logit_neg))
#                 losses.append(loss_neg.item())
#                 taken.add(j)
#                 n_added += 1
#
#     return float(np.mean(losses))
#
# def evaluate_ranking(eval_df, n_neg, seed=42):
#     rng = np.random.default_rng(seed)
#     neg_dict = {}
#

In [40]:
# Validation loss and ranking metrics
@torch.no_grad()
def evaluate_ranking(model, eval_df, neg_dict, k=10, device="cpu"):
    model.eval()
    hits, ndcgs, precisions, recalls = [], [], [], []
    rng = np.random.default_rng(42)

    for u, pos_i in zip(eval_df["user_id"].values, eval_df["item_id"].values):
        candidates = [int(pos_i)] + neg_dict[u]
        users = torch.tensor([u] * len(candidates), dtype=torch.long, device=device)
        items = torch.tensor(candidates, dtype=torch.long, device=device)

        scores = model(users, items).detach().cpu().numpy()

        # Tiny noise to break ties fairly (same user → same noise stream)
        user_rng = np.random.default_rng(rng.integers(0, 2**31) ^ u)
        scores = scores + 1e-12 * user_rng.standard_normal(scores.shape)
        rank = np.argsort(-scores).tolist().index(0) + 1
        # rank = 1 + int(np.sum(scores[1:] > scores[0]))

        hit = 1.0 if rank <= k else 0.0
        hits.append(hit)
        ndcg = (1.0 / math.log2(rank + 1)) if rank <= k else 0.0
        ndcgs.append(ndcg)
        precisions.append(hit / k)
        recalls.append(hit)

    return {
        "HR@K": float(np.mean(hits)),
        "NDCG@K": float(np.mean(ndcgs)),
        "Precision@K": float(np.mean(precisions)),
        "Recall@K": float(np.mean(recalls))
    }

@torch.no_grad()
def compute_val_loss(model, eval_df, loss_fn, pos_items_by_user, num_items, neg_k=4, device="cpu"):
    model.eval()
    losses = []
    for u, i_pos in zip(eval_df["user_id"].values, eval_df["item_id"].values):
        # Positives
        u_t = torch.tensor([u], dtype=torch.long, device=device)
        i_t = torch.tensor([i_pos], dtype=torch.long, device=device)
        logit_pos = model(u_t, i_t)
        loss_pos = loss_fn(logit_pos, torch.ones_like(logit_pos))
        losses.append(loss_pos.item())

        # Negatives
        taken = pos_items_by_user[u].copy()
        taken.add(int(i_pos))
        n_added = 0
        while n_added < neg_k:
            j = random.randint(0, num_items - 1)
            if j not in taken:
                jj = torch.tensor([j], dtype=torch.long, device=device)
                logit_neg = model(u_t, jj)
                loss_neg = loss_fn(logit_neg, torch.zeros_like(logit_neg))
                losses.append(loss_neg.item())
                taken.add(j)
                n_added += 1

    return float(np.mean(losses))

## Building baseline model

In [41]:
# # Simple Matrix Factorization model with biases (dot product)
# class MF(nn.Module):
#     def __init__(self, num_users, num_items, embed_dim=64):
#         super().__init__()
#         self.user_emb = nn.Embedding(num_users, embed_dim)
#         self.item_emb = nn.Embedding(num_items, embed_dim)
#         nn.init.normal_(self.user_emb.weight, std=0.01)
#         nn.init.normal_(self.item_emb.weight, std=0.01)
#
#     def forward(self, users, items):
#         u = self.user_emb(users)
#         v = self.item_emb(items)
#         output = (u * v).sum(dim=1, keepdim=True)
#         return output.squeeze()

In [42]:
class ImprovedMatrixFactorization(nn.Module):
    """Improved MF model with dropout and better initialization"""
    def __init__(self, n_users, n_items, embedding_dim, dropout=0.2):
        super().__init__()
        self.user_embedding = nn.Embedding(n_users, embedding_dim)
        self.item_embedding = nn.Embedding(n_items, embedding_dim)
        self.user_bias = nn.Embedding(n_users, 1)
        self.item_bias = nn.Embedding(n_items, 1)
        self.global_bias = nn.Parameter(torch.zeros(1))
        self.dropout = nn.Dropout(dropout)

        # Better initialization
        nn.init.xavier_uniform_(self.user_embedding.weight)
        nn.init.xavier_uniform_(self.item_embedding.weight)
        nn.init.zeros_(self.user_bias.weight)
        nn.init.zeros_(self.item_bias.weight)

    def forward(self, user_ids, item_ids):
        user_embeds = self.dropout(self.user_embedding(user_ids))
        item_embeds = self.dropout(self.item_embedding(item_ids))

        dot_product = (user_embeds * item_embeds).sum(dim=1, keepdim=True)
        output = dot_product + self.user_bias(user_ids) + self.item_bias(item_ids) + self.global_bias

        return output.squeeze()

    def predict_all_items(self, user_id):
        """Predict scores for all items for a given user"""
        user_tensor = torch.tensor([user_id], dtype=torch.long).to(next(self.parameters()).device)
        all_items = torch.arange(self.item_embedding.num_embeddings).to(next(self.parameters()).device)

        user_embed = self.user_embedding(user_tensor)
        item_embeds = self.item_embedding(all_items)

        scores = torch.matmul(user_embed, item_embeds.T).squeeze()
        scores += self.user_bias(user_tensor).squeeze()
        scores += self.item_bias(all_items).squeeze()
        scores += self.global_bias

        return torch.sigmoid(scores)

## Train loop

In [43]:
EPOCHS = 30
K = 10 # Cutoff for ranking metrics

model = ImprovedMatrixFactorization(n_users=NUM_USERS, n_items=NUM_ITEMS, embedding_dim=64).to(DEVICE)
# model = MF(num_users=NUM_USERS, num_items=NUM_ITEMS, embed_dim=32).to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=5e-3)
loss_fn = nn.BCEWithLogitsLoss()

best_ndcg = 0.0
best_epoch = 0

for epoch in range(0, EPOCHS):
    model.train()
    running_loss = 0.0
    n_batches = 0

    for users, items, labels in tqdm(train_loader, desc=f"Epoch {epoch + 1}/{EPOCHS}"):
        users = users.to(DEVICE)
        items = items.to(DEVICE)
        labels = labels.to(DEVICE)

        optimizer.zero_grad()
        logits = model(users, items)
        loss = loss_fn(logits, labels)

        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        n_batches += 1

    train_loss = running_loss / max(n_batches, 1)
    val_loss = compute_val_loss(model, val_df, loss_fn, pos_items_by_user, NUM_ITEMS, neg_k=4, device=DEVICE)
    metrics = evaluate_ranking(model, val_df, val_negatives, k=K, device=DEVICE)

    improvement_msg = ""
    cur_ndcg = metrics["NDCG@K"]
    if cur_ndcg > best_ndcg:
        improvement_msg = f"   <--- New best NDCG@{K}: {cur_ndcg:.4f}"
        best_ndcg = cur_ndcg
        best_epoch = epoch + 1

    print(f"Epoch {epoch + 1}/{EPOCHS} - "
          f"Train Loss: {train_loss:.4f}, "
          f"Val Loss: {val_loss:.4f}, "
          f"HR@{K}: {metrics['HR@K']:.4f}, "
          f"NDCG@{K}: {metrics['NDCG@K']:.4f}, "
          f"Precision@{K}: {metrics['Precision@K']:.4f}, "
          f"Recall@{K}: {metrics['Recall@K']:.4f}, "
          f"{improvement_msg}")

print("\n" + "="*50)
print("Training Complete!")
print(f"Best performance was at Epoch {best_epoch} with NDCG@{K}: {best_ndcg:.4f}")
print("="*50)

Epoch 1/30: 100%|██████████| 5876/5876 [00:45<00:00, 128.97it/s]


Epoch 1/30 - Train Loss: 0.5165, Val Loss: 0.4991, HR@10: 0.5429, NDCG@10: 0.3398, Precision@10: 0.0543, Recall@10: 0.5429,    <--- New best NDCG@10: 0.3398


Epoch 2/30: 100%|██████████| 5876/5876 [00:44<00:00, 131.81it/s]


Epoch 2/30 - Train Loss: 0.4994, Val Loss: 0.4992, HR@10: 0.5629, NDCG@10: 0.3227, Precision@10: 0.0563, Recall@10: 0.5629, 


Epoch 3/30: 100%|██████████| 5876/5876 [00:44<00:00, 131.50it/s]


Epoch 3/30 - Train Loss: 0.4994, Val Loss: 0.4991, HR@10: 0.5556, NDCG@10: 0.3405, Precision@10: 0.0556, Recall@10: 0.5556,    <--- New best NDCG@10: 0.3405


Epoch 4/30:  17%|█▋        | 975/5876 [00:07<00:37, 130.05it/s]


KeyboardInterrupt: 