# Recommendation on the MovieLens dataset
This notebook demonstrates the use of multiple recommendation algorithms on the MovieLens-1M dataset.

Algorithms used include:
- Sequenial recommendation algorithms, **SASRec** and **BERT4Rec**
- Matrix factorization techniques (in progress, not included yet)

Experiment:
For each user, we will...
- hold out the latest movie by them. 
- train a model based on all the preceding movies
- return the top 100 movies predicted by the model to see if the held-out movie was included

***NOTE***  
*We only care about movies that were watched and **rated highly (>= 4)**. Anything else is ignored!*

## Imports

In [1]:
import pandas as pd
import numpy as np
from collections import defaultdict
import random

import torch
import torch.nn as nn
from torch.utils.data import DataLoader

from sasrec import SASRec, SASRecDataset
from bert4rec import BERT4Rec, BERT4RecDataset

## Load and prepare data
***NOTE***  
*It is assumed that that MovieLens-1M dataset has already been downloaded and placed next to this notebook in a folder named `ml-1m`.*

This is what'll happen below:
- After loading the data, sort each user's sequence of movie ratings chronologically.
- If a user's sequence is less than 3 movie ratings long, use all movie ratings for training. Otherwise, use all but the last two ratings for training, second-to-last rating for validation and the last rating for testing.
- After that, we'll sample "negative" examples to use for training alongside the actual movies that were selected/rated by the user.

### Preprocess MovieLens 1M

#### Ratings

In [2]:
# Load ratings data
ratings = pd.read_csv(
    "ml-1m/ratings.dat",
    sep="::",
    engine="python",
    names=["user", "item", "rating", "timestamp"]
)

# Keep ratings >= 4
ratings = ratings[ratings["rating"] >= 4]

# Map to consecutive IDs 
# (some users/movies disappeared from ratings df after previous step)
user2id = {u: i+1 for i, u in enumerate(ratings["user"].unique())}
item2id = {m: i+1 for i, m in enumerate(ratings["item"].unique())}
ratings["user"] = ratings["user"].map(user2id)
ratings["item"] = ratings["item"].map(item2id)

n_users = len(user2id)
n_items = len(item2id)

# Build user sequences
user_sequences = defaultdict(list)
for row in ratings.itertuples(index=False):
    user_sequences[row.user].append((row.item, row.timestamp))

# Sort by time
for u in user_sequences:
    user_sequences[u] = [x[0] for x in sorted(user_sequences[u], key=lambda x: x[1])]

# Leave-one-out split
train_seqs, valid_seqs, test_seqs = {}, {}, {}
for u, items in user_sequences.items():
    if len(items) < 3:
        train_seqs[u] = items
        valid_seqs[u], test_seqs[u] = [], []
    else:
        train_seqs[u] = items[:-2]
        valid_seqs[u] = [items[-2]]
        test_seqs[u] = [items[-1]]

#### Movie Details

In [3]:
# Load movies for item names
movies = pd.read_csv(
    "ml-1m/movies.dat",
    sep="::",
    engine="python",
    names=["item", "title", "genres"], encoding='latin-1'
)
movies["item"] = movies["item"].map(item2id)
itemid2name = dict(zip(movies["item"], movies["title"]))

## Train & Predict

#### Setup

In [4]:
# GPU support
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [5]:
# Max sequence length for sequential recommenders
MAX_SEQ_LEN = 50

### Training (SASRec)

In [6]:
# Prepare training set
train_dataset = SASRecDataset(user_sequences, n_items, max_len=MAX_SEQ_LEN, num_negatives=10)
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)

# Initialize model and optimizer
model = SASRec(n_items, hidden_dim=64, max_len=MAX_SEQ_LEN).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

for epoch in range(20):  # small demo run
    model.train()
    total_loss = 0
    for seq, pos, neg in train_loader:
        seq, pos, neg = seq.to(device), pos.to(device), neg.to(device)

        seq_repr = model(seq)
        pos_emb = model.item_emb(pos)
        neg_emb = model.item_emb(neg)

        pos_score = (seq_repr * pos_emb).sum(dim=-1)
        neg_score = (seq_repr * neg_emb).sum(dim=-1)

        loss = -torch.mean(torch.log(torch.sigmoid(pos_score - neg_score)))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss/len(train_loader):.4f}")

# Keep model for later evaluation
sasrec_model = model


Epoch 1, Loss: 4.1632
Epoch 2, Loss: 3.3145
Epoch 3, Loss: 3.1876
Epoch 4, Loss: 2.5853
Epoch 5, Loss: 2.3459
Epoch 6, Loss: 2.1365
Epoch 7, Loss: 1.8776
Epoch 8, Loss: 1.7370
Epoch 9, Loss: 1.5095
Epoch 10, Loss: 1.3991
Epoch 11, Loss: 1.2741
Epoch 12, Loss: 1.0858
Epoch 13, Loss: 0.9553
Epoch 14, Loss: 0.8929
Epoch 15, Loss: 0.7823
Epoch 16, Loss: 0.7402
Epoch 17, Loss: 0.7076
Epoch 18, Loss: 0.6953
Epoch 19, Loss: 0.6708
Epoch 20, Loss: 0.6683


### Training (BERT4Rec)

In [7]:
# Prepare training set
train_dataset = BERT4RecDataset(train_seqs, n_items)
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)

# Initialize model, optimizer and loss function
model = BERT4Rec(n_items, hidden_dim=64, max_len=MAX_SEQ_LEN).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss(ignore_index=-100)

for epoch in range(20):  # small demo
    model.train()
    total_loss = 0
    for masked_seq, labels in train_loader:
        masked_seq, labels = masked_seq.to(device), labels.to(device)
        outputs = model(masked_seq)  # [B, L, H]
        logits = outputs @ model.item_emb.weight.T  # [B, L, n_items+2]

        loss = criterion(logits.view(-1, n_items+2), labels.view(-1))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}: Loss {total_loss/len(train_loader):.4f}")

# Keep model for later evaluation
bert4rec_model = model

Epoch 1: Loss 23.1741
Epoch 2: Loss 15.8258
Epoch 3: Loss 12.7880
Epoch 4: Loss 11.0311
Epoch 5: Loss 9.8299
Epoch 6: Loss 9.0530
Epoch 7: Loss 8.5721
Epoch 8: Loss 8.2285
Epoch 9: Loss 8.0283
Epoch 10: Loss 7.8777
Epoch 11: Loss 7.7294
Epoch 12: Loss 7.6659
Epoch 13: Loss 7.5835
Epoch 14: Loss 7.5338
Epoch 15: Loss 7.4816
Epoch 16: Loss 7.4594
Epoch 17: Loss 7.4258
Epoch 18: Loss 7.3971
Epoch 19: Loss 7.3863
Epoch 20: Loss 7.3700


### Predict & Evaluate (Hit@K, NDCG@K)

#### Get Rank for a Single Example

In [8]:
def get_rank(model, u, seed=42):
    """
    Deterministic rank computation for a single user u.
    Seeds Python/NumPy/torch RNGs and forces deterministic cuDNN behavior
    so repeated calls return the same rank.
    """
    # seed RNGs
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    # make CUDA deterministic (may slow down)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    true_item = test_seqs[u][0]                                    # true next item for this user
    seq = train_seqs[u]                                            # this user's previous movies
    candidates = [x for x in range(1, n_items+1) if x not in seq]  # candidates = all movies - user's previous movies

    # let true item be the first candidate for easier indexing later
    candidates.remove(true_item)
    candidates = [true_item] + candidates

    scores = model.predict(seq, candidates, device).cpu().detach().numpy().flatten()  # score all candidates
    
    ranked = np.argsort(-scores)          # rank candidates by score (highest score first)
    rank_of_true = list(ranked).index(0)  # 0 = index of true item in candidates
    
    return rank_of_true

In [9]:
get_rank(sasrec_model, 112)

577

In [10]:
get_rank(bert4rec_model, 224)

243

#### Evaluate Models on Test Set

In [11]:
def evaluate_model(model, K=10):
    model.eval()
    hits, ndcgs = [], []

    with torch.no_grad():
        for u in test_seqs:
            # ignore users with no test items
            if len(test_seqs[u]) == 0:
                continue

            # get rank of the true item
            rank_of_true = get_rank(model, u)

            # Metrics
            if rank_of_true < K:
                hits.append(1)
                ndcgs.append(1 / np.log2(rank_of_true + 2))
            else:
                hits.append(0)
                ndcgs.append(0)

    hit_rate = np.mean(hits)
    ndcg     = np.mean(ndcgs)
    print(f"Hit@{K}: {hit_rate:.4f}, NDCG@{K}: {ndcg:.4f}\n")

    return hit_rate, ndcg

In [12]:
hit_rate, ndcg = evaluate_model(sasrec_model, K=100)

  output = torch._nested_tensor_from_mask(


Hit@100: 0.0908, NDCG@100: 0.0206



In [13]:
hit_rate, ndcg = evaluate_model(bert4rec_model, K=100)

Hit@100: 0.2328, NDCG@100: 0.0559

