# BERT4Rec on MovieLens
This notebook demonstrates the use of the sequenial recommendation algorithm, **BERT4Rec**, to predict the next movie for a particular user.

## Imports

In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from collections import defaultdict
import random

## Load and prepare data
***NOTE***  
It is assumed that that MovieLens-1M dataset has already been downloaded and placed next to this notebook in a folder named `ml-1m`.

This is what'll happen below:
- After loading the data, sort each user's sequence of movie ratings chronologically.
- If a user's sequence is less than 2 movie ratings long, ignore. Otherwise, use all but the last rating for training, and use the last rating for testing.

In [2]:
# =========================
# 1) Load and preprocess MovieLens
# =========================
MAX_SEQ_LEN = 50

# Load ratings
ratings = pd.read_csv(
    "ml-1m/ratings.dat",
    sep="::",
    engine="python",
    names=["user", "item", "rating", "timestamp"]
)

# Map users and items to ids
user2id = {u: i+1 for i, u in enumerate(ratings["user"].unique())}
item2id = {m: i+1 for i, m in enumerate(ratings["item"].unique())}
id2item = {v: k for k, v in item2id.items()}

ratings["user"] = ratings["user"].map(user2id)
ratings["item"] = ratings["item"].map(item2id)

n_users = len(user2id)
n_items = len(item2id)

# Load movies for item names
movies = pd.read_csv(
    "ml-1m/movies.dat",
    sep="::",
    engine="python",
    names=["item", "title", "genres"], encoding='latin-1'
)
movies["item"] = movies["item"].map(item2id)
itemid2name = dict(zip(movies["item"], movies["title"]))

In [3]:
# =========================
# 2) Build user sequences
# =========================
user_seqs = defaultdict(list)
for u, i, t in ratings[["user", "item", "timestamp"]].itertuples(index=False):
    user_seqs[u].append((t, i))

for u in user_seqs:
    user_seqs[u] = [x for _, x in sorted(user_seqs[u])]

train_seqs, test_seqs = {}, {}
for u, seq in user_seqs.items():
    if len(seq) < 2:
        continue
    train_seqs[u] = seq[:-1]
    test_seqs[u] = [seq[-1]]

In [4]:
# =========================
# 3) Dataset for BERT4Rec (masked modeling)
# =========================
MASK_ID = n_items + 1  # special mask token

class BERT4RecDataset(Dataset):
    def __init__(self, user_seqs, max_len=MAX_SEQ_LEN, mask_prob=0.15):
        self.user_seqs = user_seqs
        self.users = list(user_seqs.keys())
        self.max_len = max_len
        self.mask_prob = mask_prob

    def __len__(self):
        return len(self.users)

    def __getitem__(self, idx):
        u = self.users[idx]
        seq = self.user_seqs[u][-self.max_len:]
        seq = [0]*(self.max_len - len(seq)) + seq

        seq = torch.tensor(seq, dtype=torch.long)

        # Masking
        masked_seq = seq.clone()
        labels = torch.full_like(seq, -100)  # ignore index
        prob = torch.rand(seq.size())
        mask = (prob < self.mask_prob) & (seq != 0)
        masked_seq[mask] = MASK_ID
        labels[mask] = seq[mask]

        return masked_seq, labels

## BERT4Rec model

In [5]:
# =========================
# 4) BERT4Rec model
# =========================
class BERT4Rec(nn.Module):
    def __init__(self, n_items, hidden_dim=64, max_len=50,
                 num_layers=2, num_heads=2, dropout=0.2):
        super().__init__()
        self.item_emb = nn.Embedding(n_items+2, hidden_dim, padding_idx=0)  # +1 for MASK
        self.pos_emb = nn.Embedding(max_len, hidden_dim)

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=hidden_dim,
            nhead=num_heads,
            dim_feedforward=hidden_dim*4,
            dropout=dropout,
            activation="gelu",
            batch_first=True
        )
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.norm = nn.LayerNorm(hidden_dim)

        self.hidden_dim = hidden_dim
        self.max_len = max_len
        self.n_items = n_items

    def forward(self, seq):
        B, L = seq.shape
        pos_ids = torch.arange(L, device=seq.device).unsqueeze(0).expand(B, L)
        x = self.item_emb(seq) + self.pos_emb(pos_ids)

        x = self.encoder(x)
        x = self.norm(x)
        return x

    def predict(self, seq, candidates):
        x = self.forward(seq)  # [B, L, H]
        last_hidden = x[:, -1, :]  # [B, H]
        cand_emb = self.item_emb(candidates)  # [B, C, H]
        scores = torch.bmm(cand_emb, last_hidden.unsqueeze(-1)).squeeze(-1)  # [B, C]
        return scores

## Train & Predict

In [6]:
# =========================
# 5) Training loop
# =========================
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BERT4Rec(n_items, hidden_dim=64, max_len=MAX_SEQ_LEN).to(device)

train_dataset = BERT4RecDataset(train_seqs)
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)

criterion = nn.CrossEntropyLoss(ignore_index=-100)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

for epoch in range(20):  # small demo
    model.train()
    total_loss = 0
    for masked_seq, labels in train_loader:
        masked_seq, labels = masked_seq.to(device), labels.to(device)
        outputs = model(masked_seq)  # [B, L, H]
        logits = outputs @ model.item_emb.weight.T  # [B, L, n_items+2]

        loss = criterion(logits.view(-1, n_items+2), labels.view(-1))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}: Loss {total_loss/len(train_loader):.4f}")

Epoch 1: Loss 22.5920
Epoch 2: Loss 16.0852
Epoch 3: Loss 13.2235
Epoch 4: Loss 11.3859
Epoch 5: Loss 10.1377
Epoch 6: Loss 9.3468
Epoch 7: Loss 8.8279
Epoch 8: Loss 8.4794
Epoch 9: Loss 8.2370
Epoch 10: Loss 8.0493
Epoch 11: Loss 7.9462
Epoch 12: Loss 7.8436
Epoch 13: Loss 7.7658
Epoch 14: Loss 7.7183
Epoch 15: Loss 7.6657
Epoch 16: Loss 7.6474
Epoch 17: Loss 7.6018
Epoch 18: Loss 7.5964
Epoch 19: Loss 7.5768
Epoch 20: Loss 7.5515


In [7]:
# =========================
# 6) Evaluation (Hit@K, NDCG@K)
# =========================
def evaluate_model(model, train_seqs, test_seqs, n_items, itemid2name, K=10, num_neg=100):
    model.eval()
    hits, ndcgs = [], []
    example_outputs = []

    with torch.no_grad():
        for u in test_seqs:
            if len(test_seqs[u]) == 0:
                continue

            true_item = test_seqs[u][0]
            seq = train_seqs[u][-MAX_SEQ_LEN:]
            seq_padded = [0]*(MAX_SEQ_LEN - len(seq)) + seq
            seq_padded[-1] = MASK_ID  # mask last position

            seq_tensor = torch.tensor([seq_padded], dtype=torch.long).to(device)

            # Candidate set: true + negatives
            candidates = [true_item] + random.sample(range(1, n_items+1), num_neg)
            candidates_tensor = torch.tensor(candidates, dtype=torch.long).unsqueeze(0).to(device)

            scores = model.predict(seq_tensor, candidates_tensor).cpu().numpy().flatten()
            ranked = np.argsort(-scores)
            rank_of_true = list(ranked).index(0)

            # Metrics
            if rank_of_true < K:
                hits.append(1)
                ndcgs.append(1 / np.log2(rank_of_true + 2))
            else:
                hits.append(0)
                ndcgs.append(0)

            if len(example_outputs) < 5:
                topk_idx = ranked[:K]
                topk_items = [candidates[i] for i in topk_idx]
                topk_names = [itemid2name[it] for it in topk_items if it in itemid2name]
                example_outputs.append({
                    "true_item": itemid2name.get(true_item, str(true_item)),
                    "topk": topk_names
                })

    hit_rate = np.mean(hits)
    ndcg = np.mean(ndcgs)

    print(f"Hit@{K}: {hit_rate:.4f}, NDCG@{K}: {ndcg:.4f}\n")
    print("Sample recommendations:")
    for ex in example_outputs:
        print("True item:", ex["true_item"])
        print("Top-K predictions:", ex["topk"])
        print("---")

    return hit_rate, ndcg

In [8]:
# Run evaluation
hit, ndcg = evaluate_model(model, train_seqs, test_seqs, n_items, itemid2name, K=10, num_neg=100)

Hit@10: 0.3598, NDCG@10: 0.1828

Sample recommendations:
True item: Pocahontas (1995)
Top-K predictions: ['Arachnophobia (1990)', 'Seven (Se7en) (1995)', 'Sleepless in Seattle (1993)', 'Batman Returns (1992)', 'Hoop Dreams (1994)', 'Tao of Steve, The (2000)', 'Outbreak (1995)', "Pee-wee's Big Adventure (1985)", 'Basic Instinct (1992)', 'Jackie Brown (1997)']
---
True item: Armageddon (1998)
Top-K predictions: ['Toy Story 2 (1999)', 'When Harry Met Sally... (1989)', 'Armageddon (1998)', 'Shanghai Noon (2000)', 'Seven (Se7en) (1995)', 'Blues Brothers, The (1980)', 'Grease (1978)', 'From Dusk Till Dawn (1996)', 'Star Trek IV: The Voyage Home (1986)', 'Trading Places (1983)']
---
True item: Little Mermaid, The (1989)
Top-K predictions: ['Fight Club (1999)', 'Breakfast Club, The (1985)', 'Hunt for Red October, The (1990)', 'Little Mermaid, The (1989)', 'Get Shorty (1995)', 'Birds, The (1963)', 'Sense and Sensibility (1995)', 'Swingers (1996)', 'Midnight Cowboy (1969)', 'Conspiracy Theory (1