# Cross-Domain Recommendation System Development
This notebook is an experiment in building a cross-domain recommendation system using the Amazon Reviews dataset. It uses the best model from the single-domain experiments and extends it to handle multiple domains. The dataset is the same as in the single-domain experiments, but now will combine data from two different domains.

In [1]:
import os
import random
import numpy as np
import pandas as pd
import time
import gc
import matplotlib.pyplot as plt
from collections import defaultdict

os.environ["HF_HOME"] = "D:/Python Projects/recommendation_system"
os.environ["HF_DATASETS_CACHE"] = "D:/Python Projects/recommendation_system/recsys/data"
os.environ["TRANSFORMERS_CACHE"] = "D:/Python Projects/recommendation_system/recsys/models"

# os.environ["HF_HOME"] = "E:/Python Scripts/recsys"
# os.environ['HF_DATASETS_CACHE'] = "E:/Python Scripts/recsys/data"
# os.environ['TRANSFORMERS_CACHE'] = "E:/Python Scripts/recsys/models"

os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
from datasets import load_dataset, Features, Value
from tqdm import tqdm
from tensorboardX import SummaryWriter

In [2]:
SEED = 42
def set_seed(seed=SEED):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

set_seed(SEED)
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print("DEVICE:", DEVICE)

DEVICE: cuda


In [3]:
HF_DATASET = "McAuley-Lab/Amazon-Reviews-2023"

def load_amazon_reviews(domain, save_dir="data", max_items=None, seed=SEED):
    os.makedirs(save_dir, exist_ok=True)
    filepath = f"{save_dir}/amazon_reviews_{domain}.csv"

    if not os.path.exists(filepath):
        print(f"File {filepath} not found. Downloading dataset for domain '{domain}'...")
        ds = load_dataset(
            "McAuley-Lab/Amazon-Reviews-2023",
            f"raw_review_{domain}",
            split="full",
            trust_remote_code=True,
        )

        # Keep only needed columns
        ds = ds.select_columns(["user_id", "parent_asin", "rating", "timestamp"])
        ds = ds.rename_columns({"user_id": "user", "parent_asin": "item"})
        ds = ds.cast(Features({
            "user": Value("string"),
            "item": Value("string"),
            "rating": Value("float32"),
            "timestamp": Value("int64"),
        }))

        # Convert to pandas (Arrow zero-copy where possible)
        df = ds.to_pandas()
        df.insert(3, "domain", domain)
        df.to_csv(f"{save_dir}/amazon_reviews_{domain}.csv", index=False)
        print(f"Saved amazon_reviews_{domain}.csv to {save_dir}/")

    final_df = pd.read_csv(filepath)
    if max_items is not None:
        k = min(max_items, len(final_df))
        final_df = final_df.sample(n=k, random_state=seed).reset_index(drop=True)
    print(f"Loaded {filepath} with {len(final_df)} rows.")
    return final_df

def preprocess_dataset(df, min_user_interactions=5, min_item_interactions=5):
    # Make it implicit
    df["label"] = 1.0
    user_counts = df.groupby("user").size()
    valid_users = user_counts[user_counts >= min_user_interactions].index
    item_counts = df.groupby("item").size()
    valid_items = item_counts[item_counts >= min_item_interactions].index
    df_filtered = df[df["user"].isin(valid_users) & df["item"].isin(valid_items)]
    print("After interactions filtering:", len(df_filtered), "rows,", df_filtered["user"].nunique(), "users,", df_filtered["item"].nunique(), "items")
    return df_filtered

def label_encoder(df, shift_item_id=False):
    user_enc = LabelEncoder()
    item_enc = LabelEncoder()
    domain_enc = LabelEncoder()
    df["user_id"] = user_enc.fit_transform(df["user"])
    df["item_id"] = item_enc.fit_transform(df["item"])
    if shift_item_id:
        df["item_id"] = df["item_id"] + 1
    df["domain_id"] = domain_enc.fit_transform(df["domain"])
    return df, user_enc, item_enc, domain_enc

## Preparing Combined Dataset

In [4]:
# New input
SOURCE_DOMAIN = "Books"
TARGET_DOMAIN = "Movies_and_TV"
ALL_DOMAIN = [SOURCE_DOMAIN, TARGET_DOMAIN]

# Loading data from multiple domains
def load_multi_domain_data(domains, max_items_per_domain=None, seed=SEED):
    all_dfs = []
    print(f"Combining data from domains: {domains}")
    for domain in domains:
        df_domain = load_amazon_reviews(domain, max_items=max_items_per_domain, seed=seed)
        print(f"{domain} domain data shape: {df_domain.shape}")
        all_dfs.append(df_domain)
    all_df = pd.concat(all_dfs, ignore_index=True)
    all_df.to_csv("data/amazon_reviews_combined.csv", index=False)
    print(f"Total interactions across domains: {len(all_df)}")
    final_df = pd.read_csv("data/amazon_reviews_combined.csv")
    return final_df

combined_df = load_multi_domain_data(ALL_DOMAIN, max_items_per_domain=3_000_000, seed=SEED)
print(f"\nTotal interactions across domains: {len(combined_df)}")

# Preprocess the combined dataset
filtered_combined_df = preprocess_dataset(combined_df, min_user_interactions=20, min_item_interactions=20)
combined_df_encoded, user_encoder, item_encoder, domain_encoder = label_encoder(filtered_combined_df, shift_item_id=True)

df_source = combined_df_encoded[combined_df_encoded["domain"]== SOURCE_DOMAIN]
df_target = combined_df_encoded[combined_df_encoded["domain"] == TARGET_DOMAIN]

NUM_USERS_ALL = combined_df_encoded["user_id"].max() + 1
NUM_ITEMS_ALL = combined_df_encoded["item_id"].max() + 1
NUM_DOMAINS = combined_df_encoded["domain_id"].max() + 1
print(f"\nNumber of all users: {NUM_USERS_ALL}, all items: {NUM_ITEMS_ALL}, all domains: {NUM_DOMAINS}")

NUM_USERS_SOURCE = df_source["user_id"].max() + 1
NUM_ITEMS_SOURCE = df_source["item_id"].max() + 1
NUM_USERS_TARGET = df_target["user_id"].max() + 1
NUM_ITEMS_TARGET = df_target["item_id"].max() + 1
print(f"\nSource domain - users: {NUM_USERS_SOURCE}, items: {NUM_ITEMS_SOURCE}")
print(f"Target domain - users: {NUM_USERS_TARGET}, items: {NUM_ITEMS_TARGET}")

Combining data from domains: ['Books', 'Movies_and_TV']
Loaded data/amazon_reviews_Books.csv with 3000000 rows.
Books domain data shape: (3000000, 5)
File data/amazon_reviews_Movies_and_TV.csv not found. Downloading dataset for domain 'Movies_and_TV'...


Casting the dataset:   0%|          | 0/17328314 [00:00<?, ? examples/s]

Saved amazon_reviews_Movies_and_TV.csv to data/
Loaded data/amazon_reviews_Movies_and_TV.csv with 3000000 rows.
Movies_and_TV domain data shape: (3000000, 5)
Total interactions across domains: 6000000

Total interactions across domains: 6000000
After interactions filtering: 72182 rows, 7520 users, 25209 items

Number of all users: 7520, all items: 25210, all domains: 2

Source domain - users: 7520, items: 25210
Target domain - users: 7519, items: 25205


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["user_id"] = user_enc.fit_transform(df["user"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["item_id"] = item_enc.fit_transform(df["item"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["item_id"] = df["item_id"] + 1
A value is trying to be set on a copy of a slice from a DataFrame.
Tr

In [5]:
def find_overlapping_users(df, domain1, domain2):
    # 1. Get the unique set of users for each domain
    users_in_domain1 = set(df[df["domain"] == domain1]["user"].unique())
    users_in_domain2 = set(df[df["domain"] == domain2]["user"].unique())

    # 2. Find the intersection of the two sets to get overlapping users
    overlapping_users = users_in_domain1.intersection(users_in_domain2)

    # --- Reporting Statistics ---
    num_domain1 = len(users_in_domain1)
    num_domain2 = len(users_in_domain2)
    num_overlap = len(overlapping_users)

    if num_domain1 == 0 or num_domain2 == 0:
        print("Warning: One or both domains have no users in the dataframe.")
        return set()

    print(f"Total unique users in '{domain1}': {num_domain1}")
    print(f"Total unique users in '{domain2}': {num_domain2}")
    print(f"Number of users active in BOTH domains: {num_overlap}")

    # 3. Calculate overlap percentage
    overlap_pct_d1 = (num_overlap / num_domain1) * 100
    overlap_pct_d2 = (num_overlap / num_domain2) * 100
    print(f"These overlapping users represent {overlap_pct_d1:.2f}% of the '{domain1}' user base.")
    print(f"These overlapping users represent {overlap_pct_d2:.2f}% of the '{domain2}' user base.")

    return overlapping_users

In [6]:
active_in_both = find_overlapping_users(combined_df_encoded, SOURCE_DOMAIN, TARGET_DOMAIN)

Total unique users in 'Books': 4256
Total unique users in 'Movies_and_TV': 5604
Number of users active in BOTH domains: 2340
These overlapping users represent 54.98% of the 'Books' user base.
These overlapping users represent 41.76% of the 'Movies_and_TV' user base.


In [8]:
def create_user_sequences(df):
    df_sorted = df.sort_values(["user_id", "timestamp"])
    user_sequences = {}
    for uid, group in df_sorted.groupby("user_id"):
        items = group["item_id"].tolist()
        user_sequences[uid] = items

    print(f"Number of users: {len(user_sequences)}")
    print(f"Max sequence length: {max(len(seq) for seq in user_sequences.values())}")
    print(f"Min sequence length: {min(len(seq) for seq in user_sequences.values())}")

    return user_sequences

user_sequences_src = create_user_sequences(df_source)
user_sequences_tgt = create_user_sequences(df_target)
pos_items_by_user_src = {u: set(seq) for u, seq in user_sequences_src.items()}
pos_items_by_user_tgt = {u: set(seq) for u, seq in user_sequences_tgt.items()}

Number of users: 4256
Max sequence length: 41
Min sequence length: 1
Number of users: 5604
Max sequence length: 170
Min sequence length: 1


In [10]:
def sequences_loo_split(user_sequences):
    train_seqs = {}
    val_data = {}
    test_data = {}

    for user, seq in user_sequences.items():
        if len(seq) < 3:  # Need at least 3 items for train/val/test
            continue

        train_seqs[user] = seq[:-2]  # All but last two
        val_data[user] = (seq[:-2], seq[-2])  # Train on all but last 2, predict second-to-last
        test_data[user] = (seq[:-1], seq[-1])  # Train on all but last, predict last

    print(f"Training sequences: {len(train_seqs)}")
    print(f"Validation users: {len(val_data)}")
    print(f"Test users: {len(test_data)}")

    return train_seqs, val_data, test_data

train_seqs_src, val_data_src, test_data_src = sequences_loo_split(user_sequences_src)
train_seqs_tgt, val_data_tgt, test_data_tgt = sequences_loo_split(user_sequences_tgt)
print(f"\nSource Sequences - Train: {len(train_seqs_src)}, Val: {len(val_data_src)}, Test: {len(test_data_src)}")
print(f"Target Sequences - Train: {len(train_seqs_tgt)}, Val: {len(val_data_tgt)}, Test: {len(test_data_tgt)}")

Training sequences: 1946
Validation users: 1946
Test users: 1946
Training sequences: 4602
Validation users: 4602
Test users: 4602

Source Sequences - Train: 1946, Val: 1946, Test: 1946
Target Sequences - Train: 4602, Val: 4602, Test: 4602


## Compute user representations from sequences

In [12]:
# SASRec model
class PointWiseFeedForward(nn.Module):
    def __init__(self, hidden_dim, dropout=0.2):
        super().__init__()
        self.w1 = nn.Linear(hidden_dim, hidden_dim)
        self.w2 = nn.Linear(hidden_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        return self.w2(self.dropout(self.relu(self.w1(x))))

class AttentionBlock(nn.Module):
    def __init__(self, hidden_dim, num_heads, dropout=0.2):
        super().__init__()

        # Multi-head attention
        self.attn = nn.MultiheadAttention(hidden_dim, num_heads, dropout=dropout, batch_first=True)

        # Layer norms
        self.ln1 = nn.LayerNorm(hidden_dim)
        self.ln2 = nn.LayerNorm(hidden_dim)

        # Feed-forward network
        self.ffn = PointWiseFeedForward(hidden_dim, dropout)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, attn_mask=None):
        # Self-attention with residual connection
        attn_out, _ = self.attn(x, x, x, attn_mask=attn_mask)
        x = self.ln1(x + self.dropout(attn_out))

        # Feed-forward network with residual connection
        ffn_out = self.ffn(x)
        x = self.ln2(x + self.dropout(ffn_out))

        return x

class SASRec(nn.Module):
    def __init__(self,
                 num_items,
                 hidden_dim=64,
                 max_seq_len=50,
                 num_blocks=2,
                 num_heads=2,
                 dropout=0.2):
        super().__init__()

        self.num_items = num_items
        self.hidden_dim = hidden_dim
        self.max_seq_len = max_seq_len

        # Embedding layers
        self.item_embed = nn.Embedding(num_items, hidden_dim, padding_idx=0)
        self.positional_embed = nn.Embedding(max_seq_len, hidden_dim)
        self.dropout = nn.Dropout(dropout)

        # Stack of SASRec blocks
        self.blocks = nn.ModuleList([
            AttentionBlock(hidden_dim, num_heads, dropout) for _ in range(num_blocks)
        ])

        # Final layer norm
        self.ln = nn.LayerNorm(hidden_dim)

        # Initialize weights
        self._reset_parameters()

    def _reset_parameters(self):
        nn.init.xavier_normal_(self.item_embed.weight[1:])  # Skip padding idx
        nn.init.xavier_normal_(self.positional_embed.weight)

    def forward(self, input_seq, candidate_items=None):
        batch_size, seq_len = input_seq.shape

        # Get item embeddings
        item_embeds = self.item_embed(input_seq)  # [B, L, D]

        # Add positional embeddings
        positions = torch.arange(seq_len, device=input_seq.device).unsqueeze(0)
        pos_embeds = self.positional_embed(positions)  # [1, L, D]
        x = self.dropout(item_embeds + pos_embeds)

        # Create causal attention mask
        attn_mask = self._create_causal_mask(seq_len, input_seq.device)
        pad_mask = input_seq.eq(0)

        # Pass through transformer blocks
        for block in self.blocks:
            x = block(x, attn_mask=attn_mask)

        # Final layer norm
        x = self.ln(x)  # [B, L, D]
        x = x.masked_fill(pad_mask.unsqueeze(-1), 0.0)

        # If candidate_items provided, score them
        if candidate_items is not None:
            # Get embeddings for candidate items
            cand_emb = self.item_embed(candidate_items) # [B, N, D]

            # Use last position's representation for scoring
            last_hidden = x[:, -1, :].unsqueeze(1)  # [B, 1, D]

            # Compute scores via dot product
            scores = torch.matmul(last_hidden, cand_emb.transpose(1, 2)).squeeze(1) # [B, N]
            return scores

        return x

    def _create_causal_mask(self, seq_len, device):
        mask = torch.triu(torch.ones(seq_len, seq_len, device=device, dtype=torch.bool), diagonal=1)
        mask = mask.masked_fill(mask == 1, float("-inf"))
        return mask

    def predict_next(self, input_seq):
        # Get sequence representations
        seq_repr = self.forward(input_seq)  # [B, L, D]

        # Use last position for prediction
        last_hidden = seq_repr[:, -1, :]  # [B, D]

        # Score against all item embeddings
        all_item_embeds = self.item_embed.weight  # [num_items, D]
        scores = torch.matmul(last_hidden, all_item_embeds.T)  # [B, num_items]
        return scores

In [1]:
# Load trained model on source domain
def load_best_weights(model, ckpt_path="model/best_model.pth", device=None):
    if device is None:
        device = next(model.parameters()).device
    if not os.path.exists(ckpt_path):
        raise FileNotFoundError(f"Checkpoint not found: {ckpt_path}")
    state = torch.load(ckpt_path, map_location=device)
    model.load_state_dict(state)
    model.to(device)
    model.eval()
    return model

model = SASRec(
    num_items=NUM_ITEMS_SOURCE,
    hidden_dim=64,
    max_seq_len=50,
    num_blocks=2,
    num_heads=2,
    dropout=0.2
)

best_model = load_best_weights(model, ckpt_path="model_sasrec/best_model.pth", device=DEVICE)

NameError: name 'SASRec' is not defined

In [11]:
@torch.no_grad()
def compute_user_reprs_from_sequences(model_src, train_seqs_src, user_encoder_src, max_seq_len=50, device=DEVICE):
    model_src.eval().to(device)
    user_vecs = {}

    for user_id, seq in train_seqs_src.items():
        if len(seq) < 1:
            continue

        # Pad-left to max_seq_len
        seq = seq[-max_seq_len:]
        pad_len = max_seq_len - len(seq)
        input_seq = torch.tensor([([0] * pad_len + seq)], dtype=torch.long, device=device)
        hidden = model_src(input_seq)
        last_hidden = hidden[0, -1, :].squeeze(0)
        raw_user = user_encoder_src.inverse_transform([user_id])[0]
        user_vecs[raw_user] = last_hidden.detach().cpu().numpy()

    print(f"Computed user representations for {len(user_vecs)} users.")
    return user_vecs

user_vecs_src = compute_user_reprs_from_sequences(
    model_src="model_sasrec/best_model.pth",  # Path to the pre-trained source model

)