# Cross-Domain Recommendation System Development
This notebook is an experiment in building a cross-domain recommendation system using the Amazon Reviews dataset. It takes the best model from the single-domain experiments and extends it to handle multiple domains. The dataset is the same as in the single-domain experiments, but here the data from two different domains are combined.

In [3]:
import os
import random
import numpy as np
import pandas as pd
import time
import gc
import matplotlib.pyplot as plt
from collections import defaultdict

# os.environ["HF_HOME"] = "D:/Python Projects/recommendation_system"
# os.environ["HF_DATASETS_CACHE"] = "D:/Python Projects/recommendation_system/recsys/data"
# os.environ["TRANSFORMERS_CACHE"] = "D:/Python Projects/recommendation_system/recsys/models"

os.environ["HF_HOME"] = "E:/Python Scripts/recsys"
os.environ['HF_DATASETS_CACHE'] = "E:/Python Scripts/recsys/data"
os.environ['TRANSFORMERS_CACHE'] = "E:/Python Scripts/recsys/models"

os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
from datasets import load_dataset, Features, Value
from tqdm import tqdm
from tensorboardX import SummaryWriter

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
SEED = 42


def set_seed(seed=SEED):
    """Seed every random number generator this notebook relies on.

    Covers Python's ``random`` module, NumPy's legacy global generator,
    PyTorch's CPU generator, and the generators of all CUDA devices.

    Args:
        seed: Integer seed handed unchanged to each seeding function.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)


# Seed once at import time, then pick the compute device.
set_seed(SEED)
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print("DEVICE:", DEVICE)

DEVICE: cuda


In [11]:
HF_DATASET = "McAuley-Lab/Amazon-Reviews-2023"

def load_amazon_reviews(domain:str,
                        save_dir:str = "data",
                        max_items:int | None = None,
                        seed:int = SEED) -> pd.DataFrame:
    """Load the review interactions of one Amazon domain, caching them as CSV.

    On the first call for a domain the ``raw_review_<domain>`` configuration of
    ``HF_DATASET`` is downloaded, reduced to the columns
    ``user, item, rating, domain, timestamp`` and written to
    ``<save_dir>/amazon_reviews_<domain>.csv``. Every call (including the
    first) then reads that CSV back, so the returned frame always comes from
    the cache file.

    Args:
        domain: Domain name used in the dataset configuration, e.g. ``"Books"``.
        save_dir: Directory of the CSV cache; created if missing.
        max_items: Upper bound on the number of *rows* (interactions) returned.
            When set, ``min(max_items, n_rows)`` rows are drawn at random
            without replacement. ``None`` returns every row in file order.
        seed: Random state for the row sampling.

    Returns:
        DataFrame of interactions for ``domain``.
    """
    os.makedirs(save_dir, exist_ok=True)
    filepath = f"{save_dir}/amazon_reviews_{domain}.csv"

    if not os.path.exists(filepath):
        print(f"File {filepath} not found. Downloading dataset for domain '{domain}'...")
        ds = load_dataset(
            HF_DATASET,  # single source of truth for the dataset repository id
            f"raw_review_{domain}",
            split="full",
            trust_remote_code=True,
        )

        # Keep only needed columns
        ds = ds.select_columns(["user_id", "parent_asin", "rating", "timestamp"])
        ds = ds.rename_columns({"user_id": "user", "parent_asin": "item"})
        ds = ds.cast(Features({
            "user": Value("string"),
            "item": Value("string"),
            "rating": Value("float32"),
            "timestamp": Value("int64"),
        }))

        # Convert to pandas (Arrow zero-copy where possible)
        df = ds.to_pandas()
        df.insert(3, "domain", domain)
        # Write to the same path that is checked above and read back below.
        df.to_csv(filepath, index=False)
        print(f"Saved amazon_reviews_{domain}.csv to {save_dir}/")

    # Pin the identifier columns to strings: item IDs made only of digits would
    # otherwise be parsed as integers, dropping leading zeros or producing a
    # mixed int/str column.
    final_df = pd.read_csv(filepath, dtype={"user": str, "item": str, "domain": str})
    # Random subset if max_items is set
    if max_items is not None:
        k = min(max_items, len(final_df))
        final_df = final_df.sample(n=k, random_state=seed).reset_index(drop=True)
    print(f"Loaded {filepath} with {len(final_df)} rows.")
    return final_df

def preprocess_dataset(df, min_user_interactions=5, min_item_interactions=5):
    """Turn ratings into implicit feedback and drop sparse users and items.

    The interaction counts are computed once on the incoming frame and both
    thresholds are applied in a single pass. Because removing rows lowers the
    remaining counts, users/items in the result may end up with fewer
    interactions than the thresholds (this is not an iterative k-core filter).

    The input frame is left untouched; the ``label`` column is added only to
    the filtered copy that is returned.

    Args:
        df: Interactions with at least the columns ``user`` and ``item``.
        min_user_interactions: Minimum number of rows a user needs in ``df``.
        min_item_interactions: Minimum number of rows an item needs in ``df``.

    Returns:
        A new DataFrame with the surviving rows and a ``label`` column set to
        ``1.0`` for every row.
    """
    # Filter users and items with less than k interactions
    user_counts = df["user"].value_counts()
    item_counts = df["item"].value_counts()
    valid_users = user_counts[user_counts >= min_user_interactions].index
    valid_items = item_counts[item_counts >= min_item_interactions].index
    keep_mask = df["user"].isin(valid_users) & df["item"].isin(valid_items)
    filtered = df[keep_mask].copy()
    # Make it implicit. Done after filtering so the caller's frame is not
    # mutated and the column is only materialised for the rows that are kept.
    filtered["label"] = 1.0
    print("After interactions filtering:", len(filtered), "rows,", filtered["user"].nunique(), "users,", filtered["item"].nunique(), "items")
    return filtered

def label_encoder(df, shift_item_id=False):
    """Attach integer ID columns for users, items and domains to ``df``.

    A separate ``LabelEncoder`` is fitted on each of the ``user``, ``item`` and
    ``domain`` columns, and the codes are stored in ``user_id``, ``item_id``
    and ``domain_id``. The columns are written onto the frame that was passed
    in, and that same frame is returned under ``"encoded_df"``.

    Args:
        df: Frame with the columns ``user``, ``item`` and ``domain``.
        shift_item_id: If true, item codes start at 1 instead of 0 so that 0
            stays free (e.g. for padding).

    Returns:
        Dict with the keys ``encoded_df``, ``user_encoder``, ``item_encoder``
        and ``domain_encoder``.
    """
    fitted = {}
    for raw_col in ("user", "item", "domain"):
        encoder = LabelEncoder()
        codes = encoder.fit_transform(df[raw_col])
        if raw_col == "item" and shift_item_id:
            # Shift item IDs by 1 to reserve 0 for padding if needed
            codes = codes + 1
        df[f"{raw_col}_id"] = codes
        fitted[raw_col] = encoder

    result = {"encoded_df": df}
    result["user_encoder"] = fitted["user"]
    result["item_encoder"] = fitted["item"]
    result["domain_encoder"] = fitted["domain"]
    return result

## Preparing Combined Dataset

In [15]:
# New input
SOURCE_DOMAIN = "Books"
TARGET_DOMAIN = "Movies_and_TV"
ALL_DOMAIN = [SOURCE_DOMAIN, TARGET_DOMAIN]

# Loading data from multiple domains
def load_multi_domain_data(domains, max_items_per_domain=None, seed=SEED, save_dir="data"):
    """Load several Amazon review domains and stack them into one frame.

    Each domain is loaded with ``load_amazon_reviews`` and the per-domain
    frames are concatenated with a fresh index. The combined frame is also
    written to ``<save_dir>/amazon_reviews_combined.csv``.

    Args:
        domains: Iterable of domain names, e.g. ``["Books", "Movies_and_TV"]``.
        max_items_per_domain: Forwarded as ``max_items`` to
            ``load_amazon_reviews`` (caps the rows taken per domain);
            ``None`` keeps everything.
        seed: Random state forwarded to the per-domain sampling.
        save_dir: Directory holding the per-domain caches and the combined CSV.

    Returns:
        The concatenated DataFrame of all requested domains.

    Raises:
        ValueError: If ``domains`` yields no domain at all.
    """
    all_dfs = []
    print(f"Combining data from domains: {domains}")
    for domain in domains:
        df_domain = load_amazon_reviews(
            domain,
            save_dir=save_dir,
            max_items=max_items_per_domain,
            seed=seed,
        )
        print(f"{domain} domain data shape: {df_domain.shape}")
        all_dfs.append(df_domain)
    if not all_dfs:
        # pd.concat would raise a less specific "No objects to concatenate".
        raise ValueError("load_multi_domain_data needs at least one domain.")
    all_df = pd.concat(all_dfs, ignore_index=True)
    # load_amazon_reviews has already created save_dir at this point.
    all_df.to_csv(f"{save_dir}/amazon_reviews_combined.csv", index=False)
    print(f"Total interactions across domains: {len(all_df)}")
    # Return the in-memory frame directly: re-reading the CSV just written
    # would only re-parse millions of rows and re-infer the column dtypes.
    return all_df

# Load both domains, drawing at most 3M random interactions from each.
combined_df = load_multi_domain_data(ALL_DOMAIN, max_items_per_domain=3_000_000, seed=SEED)
print(f"Total interactions across domains: {len(combined_df)}")

# Preprocess the combined dataset: keep users/items with at least 20 rows
# in the combined data.
filtered_combined_df = preprocess_dataset(
    combined_df,
    min_user_interactions=20,
    min_item_interactions=20,
)

# Encode users, items and domains jointly so both domains share one ID space.
# Item IDs are shifted by one, leaving 0 unused.
le = label_encoder(filtered_combined_df, shift_item_id=True)
combined_df_encoded, user_encoder, item_encoder, domain_encoder = (
    le[key] for key in ("encoded_df", "user_encoder", "item_encoder", "domain_encoder")
)

# Per-domain views of the jointly encoded interactions.
df_source = combined_df_encoded.loc[combined_df_encoded["domain"] == SOURCE_DOMAIN]
df_target = combined_df_encoded.loc[combined_df_encoded["domain"] == TARGET_DOMAIN]

# Sizes of the shared ID spaces (largest ID + 1).
NUM_USERS_ALL, NUM_ITEMS_ALL, NUM_DOMAINS = (
    combined_df_encoded[id_col].max() + 1
    for id_col in ("user_id", "item_id", "domain_id")
)
print(f"Number of all users: {NUM_USERS_ALL}, all items: {NUM_ITEMS_ALL}, all domains: {NUM_DOMAINS}")

# NOTE: the per-domain numbers below are also "largest ID + 1" taken in the
# shared ID space. They are upper bounds on the IDs seen in each domain, not
# the number of distinct users/items of that domain.
NUM_USERS_SOURCE, NUM_ITEMS_SOURCE = (
    df_source[id_col].max() + 1 for id_col in ("user_id", "item_id")
)
NUM_USERS_TARGET, NUM_ITEMS_TARGET = (
    df_target[id_col].max() + 1 for id_col in ("user_id", "item_id")
)
print(f"Source domain - users: {NUM_USERS_SOURCE}, items: {NUM_ITEMS_SOURCE}")
print(f"Target domain - users: {NUM_USERS_TARGET}, items: {NUM_ITEMS_TARGET}")

Combining data from domains: ['Books', 'Movies_and_TV']
Loaded data/amazon_reviews_Books.csv with 3000000 rows.
Books domain data shape: (3000000, 5)
Loaded data/amazon_reviews_Movies_and_TV.csv with 3000000 rows.
Movies_and_TV domain data shape: (3000000, 5)
Total interactions across domains: 6000000
Total interactions across domains: 6000000
After interactions filtering: 72182 rows, 7520 users, 25209 items
Number of all users: 7520, all items: 25210, all domains: 2
Source domain - users: 7520, items: 25210
Target domain - users: 7519, items: 25205


In [16]:
def find_overlapping_users(df, domain1, domain2):
    """Return the users that have interactions in both ``domain1`` and ``domain2``.

    Besides computing the intersection, the function prints the number of
    unique users per domain, the size of the overlap and the share of each
    domain's users that the overlap represents.

    Args:
        df: Interactions with the columns ``domain`` and ``user``.
        domain1: Value of ``domain`` identifying the first domain.
        domain2: Value of ``domain`` identifying the second domain.

    Returns:
        Set of raw ``user`` values present in both domains. An empty set is
        returned (after printing a warning) when either domain has no users.
    """
    def _users_of(domain):
        # Distinct users of a single domain.
        return set(df.loc[df["domain"] == domain, "user"].unique())

    first_users = _users_of(domain1)
    second_users = _users_of(domain2)
    shared_users = first_users & second_users

    # Guard against a division by zero in the percentages below.
    if not first_users or not second_users:
        print("Warning: One or both domains have no users in the dataframe.")
        return set()

    per_domain = ((domain1, len(first_users)), (domain2, len(second_users)))
    shared_count = len(shared_users)

    for name, total in per_domain:
        print(f"Total unique users in '{name}': {total}")
    print(f"Number of users active in BOTH domains: {shared_count}")

    for name, total in per_domain:
        share = (shared_count / total) * 100
        print(f"These overlapping users represent {share:.2f}% of the '{name}' user base.")

    return shared_users

In [17]:
# Raw `user` values that have at least one interaction in both the source and
# the target domain of the filtered, encoded data (statistics are printed).
active_in_both = find_overlapping_users(combined_df_encoded, SOURCE_DOMAIN, TARGET_DOMAIN)

Total unique users in 'Books': 4256
Total unique users in 'Movies_and_TV': 5604
Number of users active in BOTH domains: 2340
These overlapping users represent 54.98% of the 'Books' user base.
These overlapping users represent 41.76% of the 'Movies_and_TV' user base.
