In [1]:
import pandas as pd
import numpy as np


use_cols = ['user_id', 'item_id', 'timestamp', 'click', 'like', 'follow', 'search']
df = pd.read_csv('rec_inter.csv', usecols=use_cols)

## data cleaning & preprocessing

In [2]:
# data cleaning & preprocessing

for c in ["click","like","follow","search"]:
    df[c] = df[c].fillna(0).astype(np.int8)

# only keep recommendation interaction(none search)
df = df[df['search'] == 0]

df['pos'] = ((df['click'] + df['like'] + df['follow']) > 0).astype(np.int8)

# timestamp
ts = pd.to_numeric(df['timestamp'], errors='coerce')
df = df[ts.notna()].copy()
df['ts'] = ts.astype('int64')

In [3]:
# ==========================================
# 1. Iterative Filtering (Core Stability)
# ==========================================
# Keep top K items to avoid OOM and reduce noise
target_item_count = 50000
min_user_inter = 5

print(f"Original: {len(df)} interactions")

# Filter Items first (Keep Top 50k)
item_counts = df['item_id'].value_counts()
if len(item_counts) > target_item_count:
    top_items = item_counts.head(target_item_count).index
    df_filtered = df[df['item_id'].isin(top_items)].copy()
else:
    df_filtered = df.copy()

# Filter Users (Keep >= 5 interactions)
# We might need a loop because removing users might reduce item counts, and vice versa
# But for simplicity, one pass usually works well enough for coursework
user_counts = df_filtered['user_id'].value_counts()
valid_users = user_counts[user_counts >= min_user_inter].index
df_filtered = df_filtered[df_filtered['user_id'].isin(valid_users)].copy()

print(f"Filtered: {len(df_filtered)} interactions")
print(f"Users: {df_filtered['user_id'].nunique()}, Items: {df_filtered['item_id'].nunique()}")

# ==========================================
# 2. ID Remapping
# ==========================================
unique_users = df_filtered['user_id'].unique()
unique_items = df_filtered['item_id'].unique()

user2idx = {uid: i for i, uid in enumerate(unique_users)}
item2idx = {iid: i for i, iid in enumerate(unique_items)}

df_filtered['user_idx'] = df_filtered['user_id'].map(user2idx)
df_filtered['item_idx'] = df_filtered['item_id'].map(item2idx)

num_users = len(unique_users)
num_items = len(unique_items)

# ==========================================
# 3. Train/Test Split
# ==========================================
df_filtered = df_filtered.sort_values(['user_idx', 'ts'])
grouped = df_filtered.groupby('user_idx')
test = df_filtered.loc[grouped.tail(1).index]
train = df_filtered.drop(test.index)

print(f"Train: {len(train)}, Test: {len(test)}")

Original: 7461153 interactions
Filtered: 2621943 interactions
Users: 23926, Items: 50000
Train: 2598017, Test: 23926


## Most Popular Baseline

In [4]:
# ==========================================
# 1. Define Reusable Evaluation Function
# ==========================================
def evaluate_model(model_name, test_df, topk_preds, K=50):
    """
    test_df: DataFrame with 'user_idx' and 'item_idx' (ground truth)
    topk_preds: dict or Series, user_idx -> list of top K item_indices
    """
    hits = []
    ndcgs = []

    # Convert predictions to a dict for fast lookup if it isn't already
    if not isinstance(topk_preds, dict):
        pred_dict = topk_preds.to_dict()
    else:
        pred_dict = topk_preds

    for _, row in test_df.iterrows():
        u = row['user_idx']
        gt = row['item_idx']

        # Get recommendations for this user, default to empty if missing
        recs = pred_dict.get(u, [])

        # HR@K
        if gt in recs:
            hits.append(1)
            # NDCG@K
            rank = recs.index(gt)
            ndcgs.append(1.0 / np.log2(rank + 2))
        else:
            hits.append(0)
            ndcgs.append(0.0)

    hr = np.mean(hits)
    ndcg = np.mean(ndcgs)
    print(f"[{model_name}] HR@{K}: {hr:.4f}  NDCG@{K}: {ndcg:.4f}")
    return hr, ndcg

# ==========================================
# 2. Run Most Popular Baseline
# ==========================================
# Calculate popularity on TRAIN set only (avoid data leakage)
# Using weighted popularity as you did before
train['w'] = (1*train['click'] + 2*train['like'] + 3*train['follow']).astype(np.int16)
pop_scores = train.groupby('item_idx')['w'].sum().sort_values(ascending=False)

# Get global Top-K list
K = 50
global_topk = pop_scores.index[:K].tolist()

# Assign same topk to all test users
most_pop_preds = {u: global_topk for u in test['user_idx'].unique()}

# Evaluate
hr_MP, ndcg_MP = evaluate_model("MostPopular", test, most_pop_preds, K=50)

[MostPopular] HR@50: 0.0303  NDCG@50: 0.0089


## Item CF

In [5]:
import torch
from sklearn.preprocessing import normalize

# Setup Device (GPU/CPU)
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("Using CUDA GPU")
else:
    device = torch.device("cpu")
    print("Using CPU")

# Prepare Indices and Values from Train Data
indices = torch.tensor([train['user_idx'].values, train['item_idx'].values], dtype=torch.long)
values = torch.tensor(train['w'].values, dtype=torch.float32)
shape = torch.Size((num_users, num_items))


# Construct User-Item Sparse Matrix
# user_item_mat: (Users x Items)
user_item_mat = torch.sparse_coo_tensor(indices, values, shape, device=device)
print(f"Sparse User-Item Matrix Shape: {user_item_mat.shape}")

Using CUDA GPU


  indices = torch.tensor([train['user_idx'].values, train['item_idx'].values], dtype=torch.long)


Sparse User-Item Matrix Shape: torch.Size([23926, 50000])


In [6]:
# Calculate Item-Item Similarity (Cosine)

# Transpose to (Items x Users)
item_user_mat = user_item_mat.t()

item_user_dense = item_user_mat.to_dense()
item_norms = torch.norm(item_user_dense, p=2, dim=1, keepdim=True)
item_norms[item_norms == 0] = 1e-9
item_user_norm = item_user_dense / item_norms
sim_matrix = torch.mm(item_user_norm, item_user_norm.t()) # (Items x Items)

sim_matrix.fill_diagonal_(0)


tensor([[0.0000, 0.0418, 0.0415,  ..., 0.0000, 0.0000, 0.0000],
        [0.0418, 0.0000, 0.0805,  ..., 0.0000, 0.0000, 0.0000],
        [0.0415, 0.0805, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]],
       device='cuda:0')

In [7]:
# Generate Recommendations
from tqdm import tqdm


# 5. Generate Recommendations
item_cf_preds = {}
K = 50
test_users_arr = test['user_idx'].unique()
batch_size = 200 # Adjust based on memory

for i in tqdm(range(0, len(test_users_arr), batch_size), desc="Predicting"):
    batch_uids = test_users_arr[i : i + batch_size]
    batch_uids_tensor = torch.tensor(batch_uids, device=device)

    # Get History: (Batch x Items)
    # index_select works on dense or sparse (if supported)
    # converting batch history to dense for calculation
    batch_hist = user_item_mat.index_select(0, batch_uids_tensor).to_dense()

    # Score: (Batch x Items) * (Items x Items) -> (Batch x Items)
    scores = torch.mm(batch_hist, sim_matrix)

    # Mask seen items
    scores = scores - 9999.0 * batch_hist

    # Top-K
    _, topk_indices = torch.topk(scores, k=K, dim=1)

    # Store
    topk_cpu = topk_indices.cpu().numpy()
    for idx, u in enumerate(batch_uids):
        item_cf_preds[u] = topk_cpu[idx].tolist()

# 6. Evaluate
hr_IC, ndcg_IC = evaluate_model("ItemCF", test, item_cf_preds, K=50)

Predicting: 100%|██████████| 120/120 [00:07<00:00, 15.41it/s]


[ItemCF] HR@50: 0.0548  NDCG@50: 0.0160


## NeuCF

In [11]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import random

# ==========================================
# 1. NeuMF Dataset with Negative Sampling
# ==========================================
class NeuMFDataset(Dataset):
    def __init__(self, train_df, num_items, num_neg=4):
        self.users = torch.tensor(train_df['user_idx'].values, dtype=torch.long)
        self.items = torch.tensor(train_df['item_idx'].values, dtype=torch.long)
        self.num_items = num_items
        self.num_neg = num_neg

        # Pre-compute a set of interacted items for each user for fast negative sampling
        # set lookup is O(1)
        self.user_item_set = train_df.groupby('user_idx')['item_idx'].apply(set).to_dict()

    def __len__(self):
        return len(self.users)

    def __getitem__(self, idx):
        # Positive sample
        u = self.users[idx]
        i = self.items[idx]

        # Negative sampling
        # We need to find an item 'j' that user 'u' has NOT seen
        item_indices = []
        labels = []

        # Add Positive
        item_indices.append(i)
        labels.append(1.0)

        # Add Negatives
        interacted_items = self.user_item_set.get(u.item(), set())

        for _ in range(self.num_neg):
            j = random.randint(0, self.num_items - 1)
            # Simple rejection sampling
            while j in interacted_items:
                j = random.randint(0, self.num_items - 1)

            item_indices.append(torch.tensor(j, dtype=torch.long))
            labels.append(0.0)

        return u, torch.stack(item_indices), torch.tensor(labels, dtype=torch.float32)

# Create Dataset and DataLoader
# num_neg=4 is standard for NCF/NeuMF
train_dataset = NeuMFDataset(train, num_items=num_items, num_neg=4)
train_loader = DataLoader(train_dataset, batch_size=1024, shuffle=True)

print("NeuMF Dataset Ready.")

NeuMF Dataset Ready.


In [12]:
# ==========================================
# 2. NeuMF Model Architecture
# ==========================================
class NeuMF(nn.Module):
    def __init__(self, num_users, num_items, embedding_dim=32):
        super(NeuMF, self).__init__()

        # --- GMF Part ---
        self.gmf_user_embed = nn.Embedding(num_users, embedding_dim)
        self.gmf_item_embed = nn.Embedding(num_items, embedding_dim)

        # --- MLP Part ---
        self.mlp_user_embed = nn.Embedding(num_users, embedding_dim)
        self.mlp_item_embed = nn.Embedding(num_items, embedding_dim)

        # MLP Layers: Input(2*dim) -> dim -> dim/2 -> 1
        self.mlp = nn.Sequential(
            nn.Linear(embedding_dim * 2, embedding_dim),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(embedding_dim, embedding_dim // 2),
            nn.ReLU(),
            nn.Dropout(0.2)
        )

        # --- Prediction Layer ---
        # Concatenate GMF output (dim) and MLP output (dim/2)
        self.output = nn.Linear(embedding_dim + embedding_dim // 2, 1)

        # Init weights (Optional but recommended)
        self._init_weights()

    def _init_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Embedding):
                nn.init.normal_(m.weight, std=0.01)
            elif isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight)

    def forward(self, user, item):
        # GMF
        gmf_u = self.gmf_user_embed(user)
        gmf_i = self.gmf_item_embed(item)
        gmf_vector = gmf_u * gmf_i # Element-wise product

        # MLP
        mlp_u = self.mlp_user_embed(user)
        mlp_i = self.mlp_item_embed(item)
        mlp_input = torch.cat([mlp_u, mlp_i], dim=-1)
        mlp_vector = self.mlp(mlp_input)

        # Concat & Predict
        combined = torch.cat([gmf_vector, mlp_vector], dim=-1)
        prediction = self.output(combined)

        # Note: We don't use Sigmoid here because we use BCEWithLogitsLoss later
        return prediction.squeeze()

# Instantiate Model
model = NeuMF(num_users, num_items, embedding_dim=32).to(device)
print(model)

NeuMF(
  (gmf_user_embed): Embedding(23926, 32)
  (gmf_item_embed): Embedding(50000, 32)
  (mlp_user_embed): Embedding(23926, 32)
  (mlp_item_embed): Embedding(50000, 32)
  (mlp): Sequential(
    (0): Linear(in_features=64, out_features=32, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.2, inplace=False)
    (3): Linear(in_features=32, out_features=16, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.2, inplace=False)
  )
  (output): Linear(in_features=48, out_features=1, bias=True)
)


In [13]:
# ==========================================
# 3. Training Loop
# ==========================================
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

epochs = 5 # For demo, you can increase to 10-20
model.train()

for epoch in range(epochs):
    total_loss = 0

    # Progress bar for each epoch
    pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}")

    for u, items, labels in pbar:
        u, items, labels = u.to(device), items.to(device), labels.to(device)

        # Flatten input for batch processing
        # items shape: (batch, 1+num_neg) -> (batch * (1+num_neg))
        # We repeat user '1+num_neg' times to match items
        batch_size_curr = u.shape[0]
        num_samples = items.shape[1]

        u_flat = u.repeat_interleave(num_samples)
        items_flat = items.view(-1)
        labels_flat = labels.view(-1)

        optimizer.zero_grad()
        preds = model(u_flat, items_flat)
        loss = criterion(preds, labels_flat)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        pbar.set_postfix({'loss': loss.item()})

print("Training Finished.")

# ==========================================
# 4. Generate Recommendations (Inference)
# ==========================================
model.eval()
neumf_preds = {}
test_users = test['user_idx'].unique()
K = 50

# For NeuMF, scoring every item for every user is SLOW.
# Strategy:
# 1. Compute user embeddings and item embeddings.
# 2. But MLP makes it hard to do simple dot product.
# 3. So we usually do Batch Inference on ALL items for each test user (or a candidate set).
# Given we have 50k items, we can score all 50k for each test user.

all_items = torch.arange(num_items, device=device)

with torch.no_grad():
    for u in tqdm(test_users, desc="NeuMF Prediction"):
        # Create input: user u repeated 50000 times
        u_tensor = torch.tensor([u], device=device).repeat(num_items)

        # Predict scores for all items
        scores = model(u_tensor, all_items)

        # Mask seen items (Optional but recommended)
        # For simplicity in demo, we might skip masking or do it simple
        # Here we just take Top-K directly

        # Top-K
        _, top_indices = torch.topk(scores, K)
        neumf_preds[u] = top_indices.cpu().tolist()

# Evaluate
evaluate_model("NeuMF", test, neumf_preds, K=50)

Epoch 1/5: 100%|██████████| 2538/2538 [02:08<00:00, 19.72it/s, loss=0.321]
Epoch 2/5: 100%|██████████| 2538/2538 [02:08<00:00, 19.83it/s, loss=0.269]
Epoch 3/5: 100%|██████████| 2538/2538 [02:08<00:00, 19.83it/s, loss=0.249]
Epoch 4/5: 100%|██████████| 2538/2538 [02:08<00:00, 19.82it/s, loss=0.196]
Epoch 5/5: 100%|██████████| 2538/2538 [02:07<00:00, 19.87it/s, loss=0.201]


Training Finished.


NeuMF Prediction: 100%|██████████| 23926/23926 [00:15<00:00, 1553.37it/s]


[NeuMF] HR@50: 0.0675  NDCG@50: 0.0192


(np.float64(0.06745799548608208), np.float64(0.019187278734472554))

## SASRec

In [14]:
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import numpy as np

# --- 1. Dataset Class ---
class SASRecDataset(Dataset):
    def __init__(self, train_df, num_items, max_len=50):
        """
        Initializes the SASRec Dataset, preparing fixed-length sequences for training.

        Args:
            train_df (pd.DataFrame): Training interactions DataFrame.
            num_items (int): Total number of unique items (0 to num_items-1 are valid IDs).
            max_len (int): Maximum sequence length for Transformer input.
        """
        self.max_len = max_len
        self.pad_id = num_items # Use num_items as the reserved ID for padding

        # Convert training data into sorted item sequences per user
        print("Preparing SASRec Sequences...")
        # Assume train_df is already sorted by 'user_idx' and 'ts'

        # Extract sequences
        sequences = train_df.groupby('user_idx')['item_idx'].apply(list)

        self.user_ids = []
        self.inputs = []  # Sequence input: [v1, v2, ..., v(t-1)]
        self.targets = [] # Sequence target (next item prediction): [v2, v3, ..., v(t)]

        # For each user, use the 'Leave-One-Out' concept for training sequences
        for u_id, seq in tqdm(sequences.items(), desc="Processing Users"):

            # The last item is reserved for testing, so training uses up to the second-to-last item.
            # However, for sequence modeling, we use the full history available in 'train'
            # to generate (Input Seq, Target Seq) pairs for next-item prediction.

            input_seq = seq[:-1] # E.g., [v1, v2, v3]
            target_seq = seq[1:]  # E.g., [v2, v3, v4]

            # Padding and Truncation
            input_seq = input_seq[-self.max_len:]
            target_seq = target_seq[-self.max_len:]

            # Padding (Use self.pad_id for padding)
            padding_len = self.max_len - len(input_seq)
            padded_input = [self.pad_id] * padding_len + input_seq
            padded_target = [self.pad_id] * padding_len + target_seq

            # Truncate to max_len (already handled by slicing above, but keep for clarity)
            self.inputs.append(padded_input[-self.max_len:])
            self.targets.append(padded_target[-self.max_len:])
            self.user_ids.append(u_id)

    def __len__(self):
        # We model each user's history as ONE training sample
        return len(self.user_ids)

    def __getitem__(self, idx):
        # Returns: Input sequence (MaxLen), Target sequence (MaxLen), User ID (1)
        return (torch.tensor(self.inputs[idx], dtype=torch.long),
                torch.tensor(self.targets[idx], dtype=torch.long),
                self.user_ids[idx])

# Example instantiation:
# train_dataset_sasrec = SASRecDataset(train, num_items=num_items, max_len=50)
# train_loader_sasrec = DataLoader(train_dataset_sasrec, batch_size=256, shuffle=True)
# print("SASRec Dataset Ready.")

In [15]:
# --- 2. Model Architecture ---
class FeedForward(torch.nn.Module):
    """Point-wise Feed-Forward Network (FFN)"""
    def __init__(self, hidden_size, dropout_rate=0.5):
        super(FeedForward, self).__init__()
        self.conv1 = torch.nn.Conv1d(hidden_size, hidden_size, kernel_size=1)
        self.dropout1 = torch.nn.Dropout(dropout_rate)
        self.relu = torch.nn.ReLU()
        self.conv2 = torch.nn.Conv1d(hidden_size, hidden_size, kernel_size=1)
        self.dropout2 = torch.nn.Dropout(dropout_rate)

    def forward(self, x):
        # x: (B, MaxLen, HiddenSize) -> permute to (B, HiddenSize, MaxLen) for Conv1d
        x = x.permute(0, 2, 1)
        x = self.dropout1(self.relu(self.conv1(x)))
        x = self.dropout2(self.conv2(x))
        x = x.permute(0, 2, 1) # Back to (B, MaxLen, HiddenSize)
        return x

class AttentionBlock(torch.nn.Module):
    """Single Transformer Encoder Layer (Self-Attention + FFN)"""
    def __init__(self, hidden_size, num_heads, dropout_rate=0.5):
        super(AttentionBlock, self).__init__()
        # PyTorch's MultiheadAttention includes Q, K, V linear layers internally
        self.attn = torch.nn.MultiheadAttention(hidden_size, num_heads, dropout=dropout_rate, batch_first=True)
        self.ffn = FeedForward(hidden_size, dropout_rate)

        self.layernorm1 = torch.nn.LayerNorm(hidden_size, eps=1e-6)
        self.layernorm2 = torch.nn.LayerNorm(hidden_size, eps=1e-6)
        self.dropout1 = torch.nn.Dropout(dropout_rate)
        self.dropout2 = torch.nn.Dropout(dropout_rate)

    def forward(self, x, mask):
        # 1. Self-Attention
        # attn_output: (B, MaxLen, HiddenSize)
        # key_padding_mask is not used here; mask is the causal mask
        attn_output, _ = self.attn(x, x, x, attn_mask=mask)
        x = self.layernorm1(x + self.dropout1(attn_output)) # Add & Norm

        # 2. FFN
        ffn_output = self.ffn(x)
        x = self.layernorm2(x + self.dropout2(ffn_output)) # Add & Norm
        return x

class SASRecModel(torch.nn.Module):
    """Self-Attentive Sequential Recommendation Model"""
    def __init__(self, num_items, max_len, num_layers=2, num_heads=2, hidden_size=64, dropout_rate=0.2):
        super(SASRecModel, self).__init__()

        # Item embeddings (includes one extra slot for the PAD_ID)
        self.item_embed = torch.nn.Embedding(num_items + 1, hidden_size, padding_idx=num_items)
        self.position_embed = torch.nn.Embedding(max_len, hidden_size)
        self.dropout = torch.nn.Dropout(dropout_rate)

        self.num_items = num_items
        self.max_len = max_len
        self.hidden_size = hidden_size

        # Transformer Encoder Stack
        self.attention_blocks = torch.nn.ModuleList([
            AttentionBlock(hidden_size, num_heads, dropout_rate)
            for _ in range(num_layers)
        ])

        self.layernorm = torch.nn.LayerNorm(hidden_size, eps=1e-6) # Final Layer Norm

        self._init_weights()

    def _init_weights(self):
        # Initialize item and position embeddings
        torch.nn.init.xavier_uniform_(self.item_embed.weight.data)
        torch.nn.init.xavier_uniform_(self.position_embed.weight.data)

    def forward(self, seq_in):
        # seq_in: (B, MaxLen)

        # 1. Embeddings and Positional Encoding
        item_emb = self.item_embed(seq_in) # (B, MaxLen, HiddenSize)

        # Create positional indices: [0, 1, ..., MaxLen-1]
        positions = torch.arange(self.max_len, dtype=torch.long, device=seq_in.device)
        pos_emb = self.position_embed(positions) # (MaxLen, HiddenSize)
        pos_emb = pos_emb.unsqueeze(0).repeat(seq_in.size(0), 1, 1) # (B, MaxLen, HiddenSize)

        x = item_emb + pos_emb
        x = self.dropout(x)

        # 2. Self-Attention Mask (Causality Mask)
        # This mask prevents a token from attending to subsequent tokens (future information).
        # Mask is upper triangular (disallowed to attend to future items).
        # The mask shape should be (MaxLen, MaxLen)
        attention_mask = torch.triu(torch.ones((self.max_len, self.max_len), dtype=torch.bool, device=seq_in.device), diagonal=1)
        # MultiheadAttention expects False for not masked, True for masked.
        # This mask should be applied as `attn_mask` in the attention layer.

        # 3. Transformer Encoder Blocks
        for block in self.attention_blocks:
            x = block(x, mask=attention_mask)

        # Final Layer Norm
        x = self.layernorm(x)

        # x: (B, MaxLen, HiddenSize) - Output sequence of latent vectors
        return x


In [16]:
# --- 3. Training Loop ---
def train_sasrec(model, train_loader, epochs, num_items, device):
    """
    Trains the SASRec model using Cross-Entropy Loss with sequence data.
    """
    # CrossEntropyLoss is used for multi-class classification (predicting the next item ID)
    # ignore_index=num_items ensures that padding tokens do not contribute to the loss.
    criterion = torch.nn.CrossEntropyLoss(ignore_index=num_items)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    model.train()
    for epoch in range(epochs):
        pbar = tqdm(train_loader, desc=f"SASRec Epoch {epoch+1}/{epochs}")
        total_loss = 0

        for input_seq, target_seq, _ in pbar:
            input_seq, target_seq = input_seq.to(device), target_seq.to(device)

            optimizer.zero_grad()

            # 1. Forward Pass: Get output latent vectors (B, MaxLen, HiddenSize)
            latent_seq = model(input_seq)

            # 2. Prediction Scores
            # Scores for ALL items (Dot product with Item Embeddings is the prediction head)
            # all_item_embeds: (NumItems+1, HiddenSize)
            all_item_embeds = model.item_embed.weight

            # scores: (B, MaxLen, NumItems+1)
            scores = torch.matmul(latent_seq, all_item_embeds.transpose(0, 1))

            # 3. Calculate Loss
            # scores_flat: (B * MaxLen, NumItems+1)
            scores_flat = scores.view(-1, scores.size(-1))
            # targets_flat: (B * MaxLen)
            targets_flat = target_seq.view(-1)

            loss = criterion(scores_flat, targets_flat)

            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            pbar.set_postfix({'loss': loss.item()})

    print("SASRec Training Finished.")

# --- 4. Evaluation/Prediction Function ---
def predict_sasrec(model, test_df, train_df, num_items, max_len, K=50):
    """
    Generates Top-K recommendations for test users using the trained SASRec model (Full Ranking).
    """
    model.eval()

    # 1. Extract each user's training history (needed for model input)
    user_histories = train_df.groupby('user_idx')['item_idx'].apply(list).to_dict()

    sasrec_preds = {}
    test_users = test_df['user_idx'].unique()
    pad_id = num_items

    with torch.no_grad():
        for u in tqdm(test_users, desc="SASRec Prediction"):

            # a. Prepare input sequence (History S_u[:-1])
            history = user_histories.get(u, [])
            # The last item is the ground truth, so we use the history *before* that.
            input_seq = history[:-1] # Input sequence to predict the last item (ground truth)
            input_seq = input_seq[-max_len:] # Truncate

            # b. Padding
            padding_len = max_len - len(input_seq)
            padded_input = [pad_id] * padding_len + input_seq

            # c. Convert to Tensor (Batch size=1)
            input_tensor = torch.tensor([padded_input[-max_len:]], dtype=torch.long, device=device)

            # d. Forward Pass (1, MaxLen, HiddenSize)
            latent_seq = model(input_tensor)

            # e. Get User Representation h_t
            # The prediction is based on the output at the last token position
            last_index = max_len - 1
            u_representation = latent_seq[0, last_index, :] # (HiddenSize)

            # f. Scoring (Dot Product) against all item embeddings
            # Exclude the PAD_ID embedding in the scoring
            all_item_embeds = model.item_embed.weight[:-1]
            scores = torch.matmul(u_representation, all_item_embeds.transpose(0, 1)) # (NumItems)

            # g. Mask seen items (Crucial for sequential recommendation evaluation!)
            # Items user u has already interacted with (must be excluded from recommendation)
            seen_items = set(history[:-1])
            mask = torch.ones_like(scores, dtype=torch.bool)

            for i_idx in seen_items:
                if i_idx < num_items:
                    mask[i_idx] = False

            # Set scores of seen items to a very low value to prevent their recommendation
            scores[~mask] = -1e9

            # h. Top-K
            _, top_indices = torch.topk(scores, K)
            sasrec_preds[u] = top_indices.cpu().tolist()

    return sasrec_preds

In [17]:
# --- Hyperparameters ---
max_len = 50           # Sequence length (must match the model definition)
hidden_size = 64       # Embedding dimension
epochs = 20             # Training epochs (can increase if needed)
batch_size = 256
K = 50                 # Top-K for evaluation

# Ensure the SASRecModel and SASRecDataset classes are defined in previous cells

print(f"Total Items (NumItems): {num_items}")

# 1. Model Instantiation
model_sasrec = SASRecModel(
    num_items=num_items,
    max_len=max_len,
    hidden_size=hidden_size
).to(device)

print(model_sasrec)

# 2. Dataset Instantiation (Using the corrected call with num_items)
train_dataset_sasrec = SASRecDataset(train, num_items=num_items, max_len=max_len)

# 3. DataLoader
train_loader_sasrec = DataLoader(train_dataset_sasrec, batch_size=batch_size, shuffle=True)
print("SASRec DataLoader Ready.")

# Ensure the train_sasrec function (from the previous answer) is defined in a cell above
train_sasrec(model_sasrec, train_loader_sasrec, epochs=epochs, num_items=num_items, device=device)


# Ensure the predict_sasrec function (from the previous answer) is defined in a cell above

# 1. Inference/Prediction (Full Ranking)
sasrec_preds = predict_sasrec(
    model_sasrec,
    test,
    train,
    num_items=num_items, # Pass num_items for PAD_ID calculation
    max_len=max_len,
    K=K
)

# 2. Evaluation
hr_SR, ndcg_SR = evaluate_model("SASRec", test, sasrec_preds, K=K)

# Display final results
print("\n--- Summary ---")
print(f"[SASRec] HR@{K}: {hr_SR:.4f}  NDCG@{K}: {ndcg_SR:.4f}")

Total Items (NumItems): 50000
SASRecModel(
  (item_embed): Embedding(50001, 64, padding_idx=50000)
  (position_embed): Embedding(50, 64)
  (dropout): Dropout(p=0.2, inplace=False)
  (attention_blocks): ModuleList(
    (0-1): 2 x AttentionBlock(
      (attn): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=64, out_features=64, bias=True)
      )
      (ffn): FeedForward(
        (conv1): Conv1d(64, 64, kernel_size=(1,), stride=(1,))
        (dropout1): Dropout(p=0.2, inplace=False)
        (relu): ReLU()
        (conv2): Conv1d(64, 64, kernel_size=(1,), stride=(1,))
        (dropout2): Dropout(p=0.2, inplace=False)
      )
      (layernorm1): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
      (layernorm2): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
      (dropout1): Dropout(p=0.2, inplace=False)
      (dropout2): Dropout(p=0.2, inplace=False)
    )
  )
  (layernorm): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
)
Preparing SA

Processing Users: 23926it [00:00, 179406.35it/s]


SASRec DataLoader Ready.


SASRec Epoch 1/20: 100%|██████████| 94/94 [00:04<00:00, 19.57it/s, loss=10.4]
SASRec Epoch 2/20: 100%|██████████| 94/94 [00:04<00:00, 22.49it/s, loss=10.1]
SASRec Epoch 3/20: 100%|██████████| 94/94 [00:04<00:00, 22.49it/s, loss=9.87]
SASRec Epoch 4/20: 100%|██████████| 94/94 [00:04<00:00, 22.39it/s, loss=9.52]
SASRec Epoch 5/20: 100%|██████████| 94/94 [00:04<00:00, 20.19it/s, loss=9.28]
SASRec Epoch 6/20: 100%|██████████| 94/94 [00:04<00:00, 22.49it/s, loss=8.93]
SASRec Epoch 7/20: 100%|██████████| 94/94 [00:04<00:00, 22.62it/s, loss=8.82]
SASRec Epoch 8/20: 100%|██████████| 94/94 [00:04<00:00, 22.57it/s, loss=8.68]
SASRec Epoch 9/20: 100%|██████████| 94/94 [00:04<00:00, 22.54it/s, loss=8.53]
SASRec Epoch 10/20: 100%|██████████| 94/94 [00:04<00:00, 22.36it/s, loss=8.35]
SASRec Epoch 11/20: 100%|██████████| 94/94 [00:04<00:00, 22.52it/s, loss=8.32]
SASRec Epoch 12/20: 100%|██████████| 94/94 [00:04<00:00, 22.60it/s, loss=8.15]
SASRec Epoch 13/20: 100%|██████████| 94/94 [00:04<00:00, 22.4

SASRec Training Finished.


SASRec Prediction: 100%|██████████| 23926/23926 [01:34<00:00, 253.18it/s]


[SASRec] HR@50: 0.1006  NDCG@50: 0.0309

--- Summary ---
[SASRec] HR@50: 0.1006  NDCG@50: 0.0309
