In [21]:
import os
import math
import random
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from tests.performance.recommenders.evaluation.test_python_evaluation_time_performance import test_merge_rating
from tqdm import tqdm
from collections import defaultdict

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset

In [4]:
# Seed every RNG used by this notebook (Python, NumPy, PyTorch CPU and all CUDA devices)
# so that sampling and model initialisation draw from a fixed seed.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
DEVICE

device(type='cuda')

In [116]:
# Two Amazon review categories used for cross-domain transfer: knowledge from the
# source domain is meant to help recommendations in the target domain.
SOURCE_DOMAIN = "Movies_and_TV"
TARGET_DOMAIN = "Video_Games"
DOMAINS = [SOURCE_DOMAIN, TARGET_DOMAIN]

# Minimum interaction counts for users / items.
# NOTE(review): the visible call to filter_min_interactions does not pass these
# constants — confirm they are wired in wherever the thresholds matter.
MIN_USER_INTERACTIONS = 5
MIN_ITEM_INTERACTIONS = 5
POSITIVE_THRESHOLD = 4.0  # Ratings >= 4.0 are considered positive
# Load the dataset
def load_amazon_reviews(domain:str, max_per_domain:int=100000) -> pd.DataFrame:
    """Load the first ``max_per_domain`` reviews of one Amazon-Reviews-2023 category.

    Args:
        domain: Category name; it is interpolated into the dataset config name
            ``raw_review_{domain}``.
        max_per_domain: Maximum number of records to keep, taken in the order the
            ``"full"`` split yields them (no shuffling is done here).

    Returns:
        A DataFrame with columns ``user``, ``item``, ``rating``, ``domain``,
        ``implicit``, ``verified_purchase`` and ``timestamp``. ``implicit`` is 1
        when the rating is at least the module-level ``POSITIVE_THRESHOLD``, else 0.
    """
    # NOTE: trust_remote_code=True lets the dataset repository execute its own
    # loading script on this machine.
    full_split = load_dataset("McAuley-Lab/Amazon-Reviews-2023",
                              f"raw_review_{domain}",
                              trust_remote_code=True)["full"]

    def _as_record(review):
        # Flatten one raw review into the columns used by the rest of the notebook.
        return {
            "user": review["user_id"],
            "item": review["parent_asin"],
            "rating": float(review["rating"]),
            "domain": domain,
            "implicit": 1 if review["rating"] >= POSITIVE_THRESHOLD else 0,
            "verified_purchase": review["verified_purchase"],
            "timestamp": int(review["timestamp"])
        }

    records = []
    for position, review in enumerate(full_split):
        if position >= max_per_domain:
            break
        records.append(_as_record(review))
    return pd.DataFrame(records)

# Load up to 100000 reviews per domain, stack them into one frame and order the
# rows chronologically by timestamp (index rebuilt after sorting).
dfs = [load_amazon_reviews(dom, max_per_domain=100000) for dom in DOMAINS]
df = pd.concat(dfs, ignore_index=True).sort_values("timestamp").reset_index(drop=True)

In [117]:
# Preview the earliest interactions of the combined, time-sorted frame.
df.head()

Unnamed: 0,user,item,rating,domain,implicit,verified_purchase,timestamp
0,AHAYX6YWLK52LPXFSE2QUNMMS44A,783114222,5.0,Movies_and_TV,1,False,913069725000
1,AG3S4FROO422V5KP7DJCBXVUQLJQ,800185676,5.0,Movies_and_TV,1,True,914267986000
2,AHVNRIAPM3GVNS3RH3MNIEVSSBNA,6303501281,5.0,Movies_and_TV,1,False,914297420000
3,AHAYX6YWLK52LPXFSE2QUNMMS44A,6300215571,5.0,Movies_and_TV,1,True,920170436000
4,AEKPXGAS7MDLNHMCEZMOOQUOYJLA,6304279485,5.0,Movies_and_TV,1,False,921529732000


In [118]:
# Count verified vs. non-verified purchases in the combined frame.
df["verified_purchase"].value_counts()

verified_purchase
True     162906
False     37094
Name: count, dtype: int64

In [112]:
def filter_min_interactions(df, min_user_interactions=5, min_item_interactions=5, iterative=False):
    """Keep only interactions whose user and item are both active enough.

    Args:
        df: Interaction frame with ``user`` and ``item`` columns.
        min_user_interactions: Minimum number of rows a user needs to be kept.
        min_item_interactions: Minimum number of rows an item needs to be kept.
        iterative: When False (default) a single pass is made, with counts taken
            on the input frame; rows dropped in that pass can leave surviving
            users/items below the thresholds. When True the filter is repeated
            until no more rows are removed, so every remaining user and item
            satisfies its threshold (k-core style filtering).

    Returns:
        A filtered DataFrame that keeps the original index labels of the
        surviving rows; the input frame is not modified.
    """
    df_clean = df
    while True:
        user_counts = df_clean["user"].value_counts()
        item_counts = df_clean["item"].value_counts()

        valid_users = user_counts[user_counts >= min_user_interactions].index
        valid_items = item_counts[item_counts >= min_item_interactions].index

        keep = df_clean["user"].isin(valid_users) & df_clean["item"].isin(valid_items)
        df_next = df_clean[keep]

        # Single-pass mode stops immediately; iterative mode stops at the fixed
        # point where a pass removes nothing.
        if not iterative or len(df_next) == len(df_clean):
            return df_next
        df_clean = df_next

# Drop sparse users/items from the combined frame, rebuild a 0..n-1 index and preview.
df_filtered = filter_min_interactions(df).reset_index(drop=True)
df_filtered.head()

Unnamed: 0,user,item,rating,domain,implicit,timestamp
0,AF2XY372UVDMEJUPRCPBKTL6MUAA,6305657947,5.0,Movies_and_TV,1,950129107000
1,AGHR3R7QF7YM6PRFO7KV4V7NLXWQ,B001ELJPW8,5.0,Video_Games,1,970115992000
2,AHVRJMMQMNEWRCZJZ6T5XHMER2PA,B00000K2R4,5.0,Video_Games,1,971974663000
3,AESB3UJ5JUPZRG7WXEPX7ARJ6MTA,B001ELJPW8,5.0,Video_Games,1,979854439000
4,AHVRJMMQMNEWRCZJZ6T5XHMER2PA,B001EYUQ96,4.0,Video_Games,1,988223200000


In [113]:
# Report how many interactions survive the minimum-interaction filter.
print(f"Length of original dataset: {len(df)}")
print(f"Length after filtering: {len(df_filtered)}")

Length of original dataset: 200000
Length after filtering: 50765


In [114]:
def encode_users_items(df):
    """Add contiguous integer ids for users (global) and items (per domain).

    ``user_id`` is shared across domains: each distinct ``user`` value gets one
    id, numbered from 0 in order of first appearance. ``item_id`` is numbered
    from 0 independently inside every ``domain``, again in order of first
    appearance, so the same ``item_id`` value can denote different items in
    different domains.

    Args:
        df: Interaction frame with ``user``, ``item`` and ``domain`` columns.
            Assumes these columns contain no missing values.

    Returns:
        A new DataFrame with the same rows, order and index as ``df`` plus
        int64 ``user_id`` and ``item_id`` columns. The input frame is left
        untouched, so re-running the calling cell is idempotent.
    """
    # factorize numbers values by first appearance, matching enumerate(unique()).
    user_codes, _ = pd.factorize(df["user"])
    item_codes = df.groupby("domain")["item"].transform(
        lambda items: pd.factorize(items)[0]
    )
    return df.assign(
        user_id=user_codes.astype(np.int64),
        item_id=item_codes.astype(np.int64),
    )

# Attach integer user/item ids to the filtered interactions and preview the result.
df_encoded = encode_users_items(df_filtered)
df_encoded.head()

Unnamed: 0,user,item,rating,domain,implicit,timestamp,user_id,item_id
0,AF2XY372UVDMEJUPRCPBKTL6MUAA,6305657947,5.0,Movies_and_TV,1,950129107000,0,0
1,AGHR3R7QF7YM6PRFO7KV4V7NLXWQ,B001ELJPW8,5.0,Video_Games,1,970115992000,1,0
2,AHVRJMMQMNEWRCZJZ6T5XHMER2PA,B00000K2R4,5.0,Video_Games,1,971974663000,2,1
3,AESB3UJ5JUPZRG7WXEPX7ARJ6MTA,B001ELJPW8,5.0,Video_Games,1,979854439000,3,0
4,AHVRJMMQMNEWRCZJZ6T5XHMER2PA,B001EYUQ96,4.0,Video_Games,1,988223200000,2,2


In [115]:
# Cardinalities of the encoded data. Item counts are taken within each domain's
# rows because encode_users_items numbers item_id separately per domain.
n_users = df_encoded["user_id"].nunique()
n_items_target = df_encoded[df_encoded["domain"] == TARGET_DOMAIN]["item_id"].nunique()
n_items_source = df_encoded[df_encoded["domain"] == SOURCE_DOMAIN]["item_id"].nunique()
print(f"Users: {n_users}, {SOURCE_DOMAIN} items: {n_items_source}, {TARGET_DOMAIN} items: {n_items_target}")

Users: 8502, Movies_and_TV items: 2845, Video_Games items: 4206


In [None]:
def negative_sampling(df, n_negatives=1):
    interact_status = (df.groupby("user_id")["item_id"]
                       .apply(set)
                       .reset_index()
                       .rename(columns={"item_id", "interacted_items"}))
    interact_status["negative_items"] = interact_status["interacted_items"].apply(
        lambda x: set(df["item_id"].unique() - x)
    )
    interact_status["negative_samples"] = interact_status["negative_items"].apply(
        lambda x: random.sample(x, n_negatives)
    )
    return interact_status[["user_id", "negative_items", "negative_samples"]]

In [104]:
def leave_one_out(df):
    """Split interactions per user by recency into train / validation / test.

    Within every ``user_id`` the rows are ranked by ``timestamp`` from newest
    (rank 1) to oldest; ties are broken by row order. The newest row goes to
    validation, the second newest to test and all older rows to train.

    NOTE(review): many leave-one-out setups hold out the most recent
    interaction for *test* rather than validation — confirm this assignment
    is intended.

    Args:
        df: Interaction frame with ``user_id`` and ``timestamp`` columns.

    Returns:
        ``(train, val, test)`` DataFrames with the same columns as ``df``.
        The input frame is not modified: the rank is kept in a local Series
        instead of being written into ``df`` as a temporary column.
    """
    rank_latest = df.groupby(["user_id"])["timestamp"].rank(method="first", ascending=False)
    train = df[rank_latest > 2]
    val = df[rank_latest == 1]
    test = df[rank_latest == 2]
    return train, val, test

def leave_one_out_per_domain(df: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Run ``leave_one_out`` separately inside every domain and stack the results.

    Each ``domain`` group is split on its own, so a user's held-out rows are
    chosen per domain rather than across all of their interactions.

    Args:
        df: Interaction frame with a ``domain`` column plus whatever
            ``leave_one_out`` requires.

    Returns:
        ``(train, val, test)`` frames, each the concatenation of the per-domain
        splits with a fresh 0..n-1 index.
    """
    per_domain_splits = [
        leave_one_out(group)
        for _, group in df.groupby("domain", group_keys=False)
    ]
    train_all = pd.concat([split[0] for split in per_domain_splits], ignore_index=True)
    val_all = pd.concat([split[1] for split in per_domain_splits], ignore_index=True)
    test_all = pd.concat([split[2] for split in per_domain_splits], ignore_index=True)
    return train_all, val_all, test_all

# Split the encoded interactions into train / validation / test, domain by domain.
train_df, val_df, test_df = leave_one_out_per_domain(df_encoded)

# Row counts of each split, reported separately for the source and target domain.
print("Source domain (train/val/test):",
      train_df[train_df.domain == SOURCE_DOMAIN].shape[0],
      val_df[val_df.domain == SOURCE_DOMAIN].shape[0],
      test_df[test_df.domain == SOURCE_DOMAIN].shape[0])

print("Target domain (train/val/test):",
      train_df[train_df.domain == TARGET_DOMAIN].shape[0],
      val_df[val_df.domain == TARGET_DOMAIN].shape[0],
      test_df[test_df.domain == TARGET_DOMAIN].shape[0])

Source domain (train/val/test): 84229 15154 13596
Target domain (train/val/test): 73724 16393 14242


In [109]:
# Balance of positive (1) vs. non-positive (0) implicit labels in the training split.
train_df["implicit"].value_counts()

implicit
1    127556
0     30397
Name: count, dtype: int64

In [99]:
def build_positive_pairs(df, domain):
    """Map each user to the set of items they interacted with in one domain.

    NOTE(review): rows are selected by ``domain`` only; the ``implicit`` label
    is not consulted, so low-rated interactions are included as well — confirm
    that this is the intended meaning of "positive".

    Args:
        df: Interaction frame with ``domain``, ``user_id`` and ``item_id`` columns.
        domain: Domain whose rows are used.

    Returns:
        A ``defaultdict(set)`` keyed by ``user_id``. Looking up a user without
        rows in ``domain`` returns an empty set and, as with any defaultdict,
        inserts that key.
    """
    in_domain = df[df["domain"] == domain]
    pos = defaultdict(set)
    # groupby never yields empty groups, so every user seen here has >= 1 item.
    for user_id, item_ids in in_domain.groupby("user_id")["item_id"]:
        pos[user_id] = set(item_ids.to_list())
    return pos

# Per-user item sets for the target domain, built from the training split.
train_pos_target = build_positive_pairs(train_df, TARGET_DOMAIN)
# The result is a defaultdict(set): looking up a user with no target-domain
# training rows yields an empty set (and inserts that key) instead of raising.
train_pos_target[3]

set()

Unnamed: 0,user,interacted_items,negative_items
0,AE223GHNZEI5MRMBVVRGJONDNWRQ,"{B0B19ZNXXH, B09ZFBBSC6, B09VT2M351, B0044WZNB6}","{B00002ZMNV, B07G2MYV7Z, B00BMT9SQA, B008133O3..."
1,AE224GVO7OHTYF26U6ER6BEVIUAQ,"{B07GKZX4HP, B07GCQG6W5, B003CSV7TK, B00EV4EUT8}","{B00002ZMNV, B07G2MYV7Z, B00BMT9SQA, B008133O3..."
2,AE22CFXT3QZKUQJORVTGL3VQXAAA,"{B00YMIP30I, B005IF0NK4, B00VMIWA3O, B00932AGH...","{B00002ZMNV, B07G2MYV7Z, B00BMT9SQA, B008133O3..."
3,AE22CS5OIRQTN7WDKQ2F3BYP5CXA,"{B01IS2PWV8, B00BL5OJ14, B00SACPBU4, B017I3KWX...","{B00002ZMNV, B07G2MYV7Z, B00BMT9SQA, B008133O3..."
4,AE22Z3RLVIRU6RT5PNRK5CFFNEFQ,"{B07439CMGM, B073RPZK88}","{B00002ZMNV, B07G2MYV7Z, B00BMT9SQA, B008133O3..."


In [36]:
# NOTE(review): source_df_encoded is not defined in any visible cell — this cell
# presumably relies on leftover kernel state and may fail on Restart & Run All.
dummy = source_df_encoded.copy()
# Binarise ratings on the copy: 1.0 when the rating is greater than 0, otherwise 0.0.
dummy["rating"] = source_df_encoded["rating"].apply(lambda x: float(x > 0))

44755