Author: David Zelenay

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px


# Preprocessing

In [2]:
ratings_df = pd.read_parquet("../data/parquet/ratings.parquet")
ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000095 entries, 0 to 25000094
Data columns (total 4 columns):
 #   Column     Dtype         
---  ------     -----         
 0   userId     int64         
 1   movieId    int64         
 2   rating     float64       
 3   timestamp  datetime64[ns]
dtypes: datetime64[ns](1), float64(1), int64(2)
memory usage: 762.9 MB


In [3]:
movies_df = pd.read_parquet("../data/parquet/movies.parquet")
movies_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62423 entries, 0 to 62422
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  62423 non-null  int64 
 1   title    62423 non-null  object
 2   genres   62423 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.4+ MB


# Prepare User Sequences
To train BERT4Rec, we need to convert the ratings data into sequences of movie IDs for each user, ordered by timestamp. Each sequence represents the user's interaction history.

In [4]:
# Create user sequences as tuples of (rating, movieId), ordered by timestamp
user_sequences = ratings_df.sort_values(['userId', 'timestamp']).groupby('userId')[['rating', 'movieId']].apply(lambda x: list(zip(x['rating'], x['movieId'])))
user_sequences = user_sequences.tolist()
print(f"Number of users: {len(user_sequences)}")
print(f"Example sequence: {user_sequences[0][:10]}")

Number of users: 162541
Example sequence: [(4.0, 5952), (2.5, 2012), (2.5, 2011), (4.0, 1653), (4.0, 1250), (3.5, 6539), (4.0, 6377), (4.0, 3448), (4.0, 1088), (3.5, 899)]


## Install and Import Required Packages
We'll use PyTorch and HuggingFace Transformers for the BERT4Rec model. If not already installed, run the following cell.

## Tokenize and Pad Sequences
BERT4Rec requires sequences of equal length. We'll map movie IDs to integer tokens and pad the sequences to a fixed length.

In [5]:
from collections import defaultdict
from torch.nn.utils.rnn import pad_sequence
import torch

# Create movieId to index mapping
movie_ids = set([movie for seq in user_sequences for movie in seq])
movie2idx = {movie: idx+1 for idx, movie in enumerate(sorted(movie_ids))}  # 0 reserved for padding
idx2movie = {idx: movie for movie, idx in movie2idx.items()}

# Convert sequences to index lists
indexed_sequences = [[movie2idx[movie] for movie in seq] for seq in user_sequences]

# Pad sequences
max_seq_len = 50  # You can adjust this
padded_sequences = pad_sequence([torch.tensor(seq[-max_seq_len:]) for seq in indexed_sequences], batch_first=True, padding_value=0)
print(f"Padded sequences shape: {padded_sequences.shape}")

Padded sequences shape: torch.Size([162541, 50])


## Prepare PyTorch Dataset and DataLoader
We'll create a custom Dataset to feed the padded sequences into the BERT4Rec model. Each sample will be a sequence for masked language modeling.

In [7]:
from torch.utils.data import Dataset, DataLoader
import random

class BERT4RecDataset(Dataset):
    def __init__(self, sequences, mask_prob=0.15):
        self.sequences = sequences
        self.mask_prob = mask_prob
        self.vocab_size = len(movie2idx) + 1  # +1 for padding

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        seq = self.sequences[idx].clone()
        labels = seq.clone()
        mask = torch.rand(seq.size()) < self.mask_prob
        seq[mask] = self.vocab_size - 1  # Use last index as [MASK] token
        labels[~mask] = -100  # Only compute loss on masked tokens
        return seq, labels

# Create dataset and dataloader
mask_token = len(movie2idx) + 1
train_dataset = BERT4RecDataset(padded_sequences)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

# Check a batch
for batch in train_loader:
    print(batch[0].shape, batch[1].shape)
    break

torch.Size([64, 50]) torch.Size([64, 50])


## Define and Train the BERT4Rec Model
We'll use HuggingFace's `BertForMaskedLM` as the base for BERT4Rec. The model will be trained to predict masked movies in user sequences.

In [8]:
from transformers import BertConfig, BertForMaskedLM
from torch.optim import AdamW

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

config = BertConfig(
    vocab_size=len(movie2idx) + 2,  # +1 for padding, +1 for [MASK]
    max_position_embeddings=max_seq_len,
    num_attention_heads=4,
    num_hidden_layers=4,
    type_vocab_size=1
)
model = BertForMaskedLM(config).to(device)

optimizer = AdamW(model.parameters(), lr=5e-4)

# Training loop (1 epoch for demonstration)
model.train()
for batch in train_loader:
    input_ids, labels = batch[0].to(device), batch[1].to(device)
    outputs = model(input_ids=input_ids, labels=labels)
    loss = outputs.loss
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    print(f"Batch loss: {loss.item():.4f}")
    break  # Remove this break to train on the full dataset

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Batch loss: 12.7385


## Generate Recommendations with BERT4Rec
To recommend movies for a user, we mask the next position in their sequence and let the model predict likely movies. We then select the top predictions as recommendations.

In [None]:
# Example: Recommend movies for the first user in the dataset
user_seq = indexed_sequences[0][-max_seq_len:]  # Most recent interactions
seen_movies = set(user_seq)

# Prepare input: mask the last position
input_seq = user_seq.copy()
if len(input_seq) < max_seq_len:
    input_seq = [0] * (max_seq_len - len(input_seq)) + input_seq
input_seq[-1] = len(movie2idx) + 1  # [MASK] token
input_tensor = torch.tensor([input_seq]).to(device)

model.eval()
with torch.no_grad():
    outputs = model(input_tensor)
    logits = outputs.logits
    masked_pos = -1  # Last position
    probs = logits[0, masked_pos].softmax(dim=-1)
    topk = torch.topk(probs, k=10)
    rec_indices = topk.indices.cpu().numpy()
    # Filter out padding, mask, and already seen movies
    rec_movies = [idx2movie[idx] for idx in rec_indices if idx in idx2movie and idx not in seen_movies][:10]  # Top 10 recommendations

print("Recommended movie IDs:", rec_movies)
# Optionally, map to movie titles
rec_movie_ids = [movie_id for rating, movie_id in rec_movies if rating > 3]

movie_titles = movies_df.set_index('movieId').loc[rec_movie_ids]['title'].tolist()
print("Recommended movie titles:", movie_titles)

Recommended movie IDs: [(2.5, 98607), (2.0, 188305), (2.0, 151605), (2.0, 46335), (4.0, 118894), (1.0, 3307), (4.0, 102263), (0.5, 97906), (3.0, 137347), (2.0, 4485)]
Recommended movie titles: ['Scooby-Doo! Abracadabra-Doo (2010)', 'Ju-on: White Ghost (2009)']


In [None]:
rec_movie_ids_only = [movie_id for _, movie_id in rec_movies]
print(rec_movie_ids_only)

{(1.0, 7167),
 (2.5, 90066),
 (3.0, 150570),
 (4.0, 7438),
 (1.0, 751),
 (3.0, 144154),
 (4.0, 1022),
 (3.5, 92920),
 (3.0, 186033),
 (2.0, 163931),
 (3.0, 173201),
 (1.0, 65261),
 (2.5, 148160),
 (4.0, 59116),
 (1.0, 4134),
 (4.5, 7882),
 (2.5, 128912),
 (4.5, 1466),
 (2.5, 170791),
 (3.0, 153953),
 (2.5, 164375),
 (2.5, 116080),
 (5.0, 101597),
 (2.5, 157959),
 (0.5, 3976),
 (2.5, 151543),
 (4.0, 68915),
 (1.5, 4578),
 (1.0, 49396),
 (4.5, 4849),
 (3.0, 192799),
 (0.5, 7359),
 (4.5, 33896),
 (1.0, 113906),
 (0.5, 943),
 (4.5, 27480),
 (4.0, 72298),
 (3.5, 164196),
 (4.0, 114177),
 (4.0, 65882),
 (1.0, 101074),
 (5.0, 140443),
 (4.5, 8232),
 (3.5, 193243),
 (2.5, 171141),
 (0.5, 59037),
 (0.5, 4326),
 (5.0, 101947),
 (4.0, 82097),
 (5.0, 179289),
 (1.0, 104457),
 (2.0, 3204),
 (4.5, 59910),
 (1.0, 139920),
 (4.0, 104728),
 (3.0, 6058),
 (4.5, 88957),
 (1.0, 127088),
 (2.0, 25835),
 (1.5, 69438),
 (4.0, 169238),
 (4.0, 162822),
 (2.0, 6587),
 (1.5, 98485),
 (0.5, 78635),
 (2.0, 171),
 