In [1]:
import os
import random
import numpy as np
import pandas as pd
import time
import gc
import matplotlib.pyplot as plt
from collections import defaultdict

# os.environ["HF_HOME"] = "D:/Python Projects/recommendation_system"
# os.environ["HF_DATASETS_CACHE"] = "D:/Python Projects/recommendation_system/recsys/data"
# os.environ["TRANSFORMERS_CACHE"] = "D:/Python Projects/recommendation_system/recsys/models"

os.environ["HF_HOME"] = "E:/Python Scripts/recsys"
os.environ['HF_DATASETS_CACHE'] = "E:/Python Scripts/recsys/data"
os.environ['TRANSFORMERS_CACHE'] = "E:/Python Scripts/recsys/models"

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
from datasets import load_dataset, Features, Value
from tqdm import tqdm
from tensorboardX import SummaryWriter

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
SEED = 42
def set_seed(seed=SEED):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

set_seed(SEED)
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print("DEVICE:", DEVICE)

DEVICE: cuda


## Dataset preparation

In [6]:
HF_DATASET = "McAuley-Lab/Amazon-Reviews-2023"

def load_amazon_reviews(domain:str,
                        save_dir:str = "data",
                        max_items:int | None = None,
                        seed:int = SEED) -> pd.DataFrame:

    os.makedirs(save_dir, exist_ok=True)
    filepath = f"{save_dir}/amazon_reviews_{domain}.csv"

    if not os.path.exists(filepath):
        print(f"File {filepath} not found. Downloading dataset for domain '{domain}'...")
        ds = load_dataset(
            "McAuley-Lab/Amazon-Reviews-2023",
            f"raw_review_{domain}",
            split="full",
            trust_remote_code=True,
        )

        # Keep only needed columns
        ds = ds.select_columns(["user_id", "parent_asin", "rating", "timestamp"])
        ds = ds.rename_columns({"user_id": "user", "parent_asin": "item"})
        ds = ds.cast(Features({
            "user": Value("string"),
            "item": Value("string"),
            "rating": Value("float32"),
            "timestamp": Value("int64"),
        }))

        # Convert to pandas (Arrow zero-copy where possible)
        df = ds.to_pandas()
        df.insert(3, "domain", domain)
        df.to_csv(f"{save_dir}/amazon_reviews_{domain}.csv", index=False)
        print(f"Saved amazon_reviews_{domain}.csv to {save_dir}/")

    final_df = pd.read_csv(filepath)
    # Random subset if max_items is set
    if max_items is not None:
        k = min(max_items, len(final_df))
        final_df = final_df.sample(n=k, random_state=seed).reset_index(drop=True)
    print(f"Loaded {filepath} with {len(final_df)} rows.")
    return final_df

def preprocess_dataset(df, min_user_interactions=5, min_item_interactions=5):
    # Make it implicit
    df["label"] = 1.0
    user_counts = df.groupby("user").size()
    valid_users = user_counts[user_counts >= min_user_interactions].index
    item_counts = df.groupby("item").size()
    valid_items = item_counts[item_counts >= min_item_interactions].index
    df_filtered = df[df["user"].isin(valid_users) & df["item"].isin(valid_items)]
    print("After interactions filtering:", len(df), "rows,", df["user"].nunique(), "users,", df["item"].nunique(), "items")
    return df_filtered

def label_encoder(df, shift_item_id=False):
    user_enc = LabelEncoder()
    item_enc = LabelEncoder()
    domain_enc = LabelEncoder()
    df["user_id"] = user_enc.fit_transform(df["user"])
    df["item_id"] = item_enc.fit_transform(df["item"])
    if shift_item_id:
        df["item_id"] = df["item_id"] + 1  # Shift item IDs by 1 to reserve 0 for padding if needed
    df["domain_id"] = domain_enc.fit_transform(df["domain"])
    return df, user_enc, item_enc, domain_enc

In [7]:
# New input
SOURCE_DOMAIN = "Books"

# Loading data from multiple domains
df = load_amazon_reviews(SOURCE_DOMAIN, max_items=3_000_000, seed=SEED)
print(f"Total rows in {SOURCE_DOMAIN}: {len(df)}")

# Preprocess the dataset
filtered_df = preprocess_dataset(df, min_user_interactions=20, min_item_interactions=20)
df_encoded, user_encoder, item_encoder, domain_encoder = label_encoder(df, shift_item_id=True)

NUM_USERS = df_encoded["user_id"].max() + 1
NUM_ITEMS = df_encoded["item_id"].max() + 1
NUM_DOMAINS = df_encoded["domain_id"].max() + 1
print(f"Number of users: {NUM_USERS}, Number of items: {NUM_ITEMS}, Number of domains: {NUM_DOMAINS}")

Loaded data/amazon_reviews_Books.csv with 3000000 rows.
Total rows in Books: 3000000
After interactions filtering: 3000000 rows, 2089252 users, 1227560 items
Number of users: 2089252, Number of items: 1227561, Number of domains: 1


In [8]:
def create_user_sequences(df):
    df_sorted = df.sort_values(["user_id", "timestamp"])
    user_sequences = {}
    for uid, group in df_sorted.groupby("user_id"):
        items = group["item_id"].tolist()
        user_sequences[uid] = items
    return user_sequences

# Create sequences
user_sequences = create_user_sequences(df_encoded)

In [10]:
def sequences_loo_split(user_sequences):
    """
    For each user: last item → test, second-to-last → validation, rest → training
    """
    train_seqs = {}
    val_data = {}
    test_data = {}

    for user, seq in user_sequences.items():
        if len(seq) < 2:  # Need at least 3 items for train/val/test
            continue

        train_seqs[user] = seq[:-2]  # All but last two
        val_data[user] = (seq[:-2], seq[-2])  # Train on all but last 2, predict second-to-last
        test_data[user] = (seq[:-1], seq[-1])  # Train on all but last, predict last

    print(f"\nData split:")
    print(f"  Training sequences: {len(train_seqs)}")
    print(f"  Validation users: {len(val_data)}")
    print(f"  Test users: {len(test_data)}")

    return train_seqs, val_data, test_data

train_sequences, val_sequences, test_sequences = sequences_loo_split(df_encoded)
print(f"Sequences - Train: {len(train_sequences)}, Val: {len(val_sequences)}, Test: {len(test_sequences)}")

KeyError: -2

## Dataset and DataLoader

In [23]:
class SASRecDataset(Dataset):
    def __init__(self, user_sequences, all_train_sequences, num_items, max_seq_len=50, mode="train", neg_samples=1):
        self.user_sequences = user_sequences
        self.all_train_sequences = all_train_sequences  # For negative sampling
        self.num_items = num_items
        self.max_seq_len = max_seq_len
        self.mode = mode
        self.neg_samples = neg_samples

        # Build user interaction history for negative sampling
        self.user_item_set = defaultdict(set)
        for u, items in all_train_sequences.items():
            self.user_item_set[u] = set(items)

        # Create samples based on mode
        self.samples = []
        for user, seq in user_sequences.items():
            if mode == "train":
                # Training: create multiple samples per sequence
                for t in range(1, len(seq)):
                    input_seq = seq[:t]
                    target = seq[t]
                    self.samples.append((user, input_seq, target))
            elif mode == "val":
                # Validation: predict the second-to-last item
                if len(seq) < 2:
                    continue
                input_seq = seq[:-2]  # Use train sequence
                target = seq[-2]      # Predict validation item
                self.samples.append((user, input_seq, target))
            else:  # test
                # Test: predict the last item
                input_seq = seq[:-1]  # Use train+val sequence
                target = seq[-1]      # Predict test item
                self.samples.append((user, input_seq, target))

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        user, input_seq, target = self.samples[idx]

        # Truncate if too long
        if len(input_seq) > self.max_seq_len:
            input_seq = input_seq[-self.max_seq_len:]

        # Pad sequence
        pad_len = self.max_seq_len - len(input_seq)
        input_seq = [0] * pad_len + input_seq

        # Sample negatives (items not in user's training history)
        neg_items = self._sample_negatives(user)

        return {
            "user": user,
            "input_seq": torch.tensor(input_seq, dtype=torch.long),
            "target": torch.tensor(target, dtype=torch.long),
            "neg_items": torch.tensor(neg_items, dtype=torch.long)
        }

    def _sample_negatives(self, user):
        """Sample items that user hasn't interacted with in training"""
        neg_items = set()
        user_items = self.user_item_set[user]
        while len(neg_items) < self.neg_samples:
            neg = random.randint(1, self.num_items - 1)
            if neg not in user_items:
                neg_items.add(neg)
        return list(neg_items)

In [24]:
# Create datasets with correct sequences
NUM_ITEMS = int(df_encoded["item_id"].max()) + 1

train_dataset = SASRecDataset(
    train_sequences, train_sequences, NUM_ITEMS,
    max_seq_len=50, mode="train", neg_samples=1
)

val_dataset = SASRecDataset(
    val_sequences, train_sequences, NUM_ITEMS,
    max_seq_len=50, mode="val", neg_samples=99
)

test_dataset = SASRecDataset(
    test_sequences, train_sequences, NUM_ITEMS,
    max_seq_len=50, mode="test", neg_samples=99
)

# Create dataloaders
train_loader = DataLoader(train_dataset, batch_size=2048, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=2048, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=2048, shuffle=False)

In [25]:
first = next(iter(val_loader))
print("Sample batch from validation loader:")
print("Input sequence shape:", first["input_seq"].shape)
print("Target shape:", first["target"].shape)
print("Negative items shape:", first["neg_items"].shape)

print("\nSample input sequence:")
random_index = []
for _ in range(10):
    random_index.append(random.randint(0, len(val_loader)-1))

for i in random_index:
    print(first["input_seq"][i])

Sample batch from validation loader:
Input sequence shape: torch.Size([2048, 50])
Target shape: torch.Size([2048])
Negative items shape: torch.Size([2048, 99])

Sample input sequence:
tensor([     0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0, 734224])
tensor([     0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,

## Create SASRec model

In [32]:
class SASRec(nn.Module):
    def __init__(self, num_items, hidden_dim=50, max_seq_len=50,
                 num_blocks=2, num_heads=1, dropout=0.2):
        super().__init__()

        self.num_items = num_items
        self.hidden_dim = hidden_dim
        self.max_seq_len = max_seq_len

        # Embeddings
        self.item_embed = nn.Embedding(num_items, hidden_dim, padding_idx=0)
        self.pos_embed = nn.Embedding(max_seq_len, hidden_dim)
        self.emb_dropout = nn.Dropout(dropout)

        # Attention blocks
        self.attention_blocks = nn.ModuleList([
            nn.ModuleDict({
                'attention': nn.MultiheadAttention(
                    hidden_dim, num_heads, dropout=dropout, batch_first=True
                ),
                'feed_forward': nn.Sequential(
                    nn.Linear(hidden_dim, hidden_dim),
                    nn.ReLU(),
                    nn.Dropout(dropout),
                    nn.Linear(hidden_dim, hidden_dim),
                    nn.Dropout(dropout)
                ),
                'ln1': nn.LayerNorm(hidden_dim),
                'ln2': nn.LayerNorm(hidden_dim)
            }) for _ in range(num_blocks)
        ])

        self.ln_out = nn.LayerNorm(hidden_dim)
        self._init_weights()

    def _init_weights(self):
        # Xavier initialization
        for param in self.parameters():
            if param.dim() > 1:
                nn.init.xavier_uniform_(param)

    def forward(self, input_seq):
        batch_size, seq_len = input_seq.shape

        # Get embeddings
        seqs_emb = self.item_embed(input_seq)  # [B, L, D]
        positions = torch.arange(seq_len, device=input_seq.device).expand(batch_size, -1)
        pos_emb = self.pos_embed(positions)    # [B, L, D]

        # Combine embeddings
        emb = self.emb_dropout(seqs_emb + pos_emb)

        # Create masks
        padding_mask = (input_seq == 0)  # [B, L]
        causal_mask = torch.triu(
            torch.ones(seq_len, seq_len, device=input_seq.device),
            diagonal=1
        ).bool()  # [L, L]

        # Process through attention blocks
        timeline_rep = emb
        for block in self.attention_blocks:
            # Multi-head attention
            residual = timeline_rep
            timeline_rep_norm = block['ln1'](timeline_rep)
            attn_out, _ = block['attention'](
                timeline_rep_norm, timeline_rep_norm, timeline_rep_norm,
                key_padding_mask=padding_mask,
                attn_mask=causal_mask
            )
            timeline_rep = residual + attn_out

            # Feed-forward
            residual = timeline_rep
            timeline_rep_norm = block['ln2'](timeline_rep)
            ff_out = block['feed_forward'](timeline_rep_norm)
            timeline_rep = residual + ff_out

        # Output layer norm
        output = self.ln_out(timeline_rep)  # [B, L, D]
        return output

    def predict(self, input_seq, candidates):
        """
        Predict scores for candidate items
        """
        seq_emb = self.forward(input_seq)  # [B, L, D]

        # Use last non-padding position
        mask = (input_seq != 0).float()
        last_pos = mask.sum(1).long() - 1  # [B]
        last_pos = last_pos.clamp(min=0)

        # Get representation at last position
        batch_idx = torch.arange(input_seq.size(0), device=input_seq.device)
        final_feat = seq_emb[batch_idx, last_pos]  # [B, D]

        # Score candidates
        cand_emb = self.item_embed(candidates)  # [B, N, D] or [B, D]

        if candidates.dim() == 2:  # Multiple candidates per user
            scores = torch.matmul(cand_emb, final_feat.unsqueeze(-1)).squeeze(-1)
        else:  # Single candidate
            scores = (cand_emb * final_feat).sum(dim=1)

        return scores

## Training and evaluation functions

In [33]:
# Training function
def train_epoch(model, train_loader, optimizer, criterion, device):
    model.train()
    total_loss = 0

    for batch in tqdm(train_loader, desc="Training"):
        input_seq = batch["input_seq"].to(device)
        pos_items = batch["target"].to(device)
        neg_items = batch["neg_items"].to(device)

        # Forward pass
        seq_output = model(input_seq)  # [B, L, D]

        # Get last position output
        batch_size = input_seq.size(0)
        mask = (input_seq != 0).float()
        last_pos = mask.sum(1).long() - 1
        last_pos = last_pos.clamp(min=0)
        batch_idx = torch.arange(batch_size, device=device)
        final_feat = seq_output[batch_idx, last_pos]  # [B, D]

        # Compute scores
        pos_emb = model.item_embed(pos_items)  # [B, D]
        neg_emb = model.item_embed(neg_items.squeeze())  # [B, D]

        pos_scores = (final_feat * pos_emb).sum(dim=1)  # [B]
        neg_scores = (final_feat * neg_emb).sum(dim=1)  # [B]

        # BPR loss
        loss = -torch.log(torch.sigmoid(pos_scores - neg_scores) + 1e-10).mean()

        # Backward
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 5.0)
        optimizer.step()

        total_loss += loss.item()

    return total_loss / len(train_loader)

In [34]:
# Evaluation function
@torch.no_grad()
def evaluate(model, eval_loader, k=10, device='cpu'):
    model.eval()

    hr_list = []
    ndcg_list = []
    mrr_list = []

    for batch in tqdm(eval_loader, desc="Evaluating"):
        input_seq = batch["input_seq"].to(device)
        target = batch["target"].to(device)
        neg_items = batch["neg_items"].to(device)

        batch_size = input_seq.size(0)

        # Create candidate set: target + negatives
        candidates = torch.cat([
            target.unsqueeze(1),  # [B, 1]
            neg_items  # [B, 99]
        ], dim=1)  # [B, 100]

        # Get predictions
        scores = model.predict(input_seq, candidates)  # [B, 100]

        # Get ranks (0 is the positive item)
        _, indices = torch.topk(scores, k=min(k, scores.size(1)), dim=1)

        # Calculate metrics
        for i in range(batch_size):
            # Check if positive item (index 0) is in top-k
            if 0 in indices[i]:
                hr_list.append(1.0)
                rank = (indices[i] == 0).nonzero(as_tuple=True)[0].item() + 1
                ndcg_list.append(1.0 / np.log2(rank + 1))
                mrr_list.append(1.0 / rank)
            else:
                hr_list.append(0.0)
                ndcg_list.append(0.0)
                mrr_list.append(0.0)

    return {
        f"HR@{k}": np.mean(hr_list),
        f"NDCG@{k}": np.mean(ndcg_list),
        f"MRR@{k}": np.mean(mrr_list)
    }

## Training the model

In [36]:
# Initialize model
model = SASRec(
    num_items=NUM_ITEMS,
    hidden_dim=64,  # Original paper uses 50
    max_seq_len=50,
    num_blocks=2,
    num_heads=2,    # Original paper uses 1
    dropout=0.5
).to(DEVICE)

# Optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 20
best_ndcg = 0
patience = 10
no_improve = 0

for epoch in range(num_epochs):
    # Train
    train_loss = train_epoch(model, train_loader, optimizer, None, DEVICE)

    # Evaluate
    val_metrics = evaluate(model, val_loader, k=10, device=DEVICE)

    print(f"Epoch {epoch+1}/{num_epochs}")
    print(f"Train Loss: {train_loss:.4f}")
    print(f"Val Metrics: {val_metrics}")

    # Early stopping
    if val_metrics["NDCG@10"] > best_ndcg:
        best_ndcg = val_metrics["NDCG@10"]
        torch.save(model.state_dict(), "best_model.pth")
        no_improve = 0
    else:
        no_improve += 1
        if no_improve >= patience:
            print(f"Early stopping at epoch {epoch+1}")
            break

# Load best model and test
model.load_state_dict(torch.load("best_model.pth"))
test_metrics = evaluate(model, test_loader, k=10, device=DEVICE)
print(f"\nTest Metrics: {test_metrics}")

Training: 100%|██████████| 178/178 [00:17<00:00, 10.43it/s]
Evaluating: 100%|██████████| 74/74 [01:27<00:00,  1.19s/it]


Epoch 1/20
Train Loss: nan
Val Metrics: {'HR@10': np.float64(1.0), 'NDCG@10': np.float64(0.3562071871080221), 'MRR@10': np.float64(0.16666666666666657)}


Training: 100%|██████████| 178/178 [00:16<00:00, 10.63it/s]
Evaluating:  62%|██████▏   | 46/74 [00:52<00:32,  1.14s/it]


KeyboardInterrupt: 