In [1]:
import os
import math
import random
import numpy as np
import pandas as pd
import time
from recommenders.datasets.python_splitters import python_chrono_split, python_stratified_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
from collections import defaultdict

# os.environ["HF_HOME"] = "E:/Python Scripts/recsys"
# os.environ['HF_DATASETS_CACHE'] = "E:/Python Scripts/recsys/data"
# os.environ['TRANSFORMERS_CACHE'] = "E:/Python Scripts/recsys/models"

os.environ["HF_HOME"] = "D:/Python Projects/recommendation_system"
os.environ['HF_DATASETS_CACHE'] = "D:/Python Projects/recommendation_system/recsys/data"
os.environ['TRANSFORMERS_CACHE'] = "D:/Python Projects/recommendation_system/recsys/models"

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset
from tensorboardX import SummaryWriter

In [2]:
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
DEVICE

device(type='cuda')

In [3]:
# Select 2 categories to highlight cross-domain transfer
SOURCE_DOMAIN = "Movies_and_TV"
TARGET_DOMAIN = "Video_Games"
DOMAINS = [SOURCE_DOMAIN, TARGET_DOMAIN]

MIN_USER_INTERACTIONS = 10
MIN_ITEM_INTERACTIONS = 10
POSITIVE_THRESHOLD = 4.0  # Ratings >= 4.0 are considered positive

# Load the dataset
def load_amazon_reviews(domain:str, max_per_domain:int=100000) -> pd.DataFrame:
    dataset = load_dataset("McAuley-Lab/Amazon-Reviews-2023",
                           f"raw_review_{domain}",
                           trust_remote_code=True)
    rows = []
    for i, r in enumerate(dataset["full"]):
        if i >= max_per_domain:
            break
        rows.append({
            "user": r["user_id"],
            "item": r["parent_asin"],
            "rating": float(r["rating"]),
            "domain": domain,
            "verified_purchase": r["verified_purchase"],
            "timestamp": int(r["timestamp"])
        })

    return pd.DataFrame(rows)

# Sample 100k reviews per domain for model development
# dfs = [load_amazon_reviews(dom, max_per_domain=300000) for dom in DOMAINS]
# df = pd.concat(dfs, ignore_index=True).sort_values("timestamp").reset_index(drop=True)
df = load_amazon_reviews(SOURCE_DOMAIN, max_per_domain=100000)

In [4]:
class RatingDataset(torch.utils.data.Dataset):
	def __init__(self, user_list, item_list, rating_list):
		super().__init__()
		self.user_list = user_list
		self.item_list = item_list
		self.rating_list = rating_list

	def __len__(self):
		return len(self.user_list)

	def __getitem__(self, idx):
		user = self.user_list[idx]
		item = self.item_list[idx]
		rating = self.rating_list[idx]

		return (
			torch.tensor(user, dtype=torch.long),
			torch.tensor(item, dtype=torch.long),
			torch.tensor(rating, dtype=torch.float)
			)

In [24]:
class NCFData(object):
    def __init__(self, df, num_neg=4, num_neg_test=99, batch_size=1024):
        self.df = df
        self.num_neg = num_neg
        self.num_neg_test = num_neg_test
        self.batch_size = batch_size

        self.preprocess_df = self._reindex(self.df)

        ok_users = self.preprocess_df.groupby("user_id").size()
        ok_users = ok_users[ok_users >= 2].index
        self.preprocess_df = self.preprocess_df[self.preprocess_df.user_id.isin(ok_users)].copy()

        self.user_pool = set(self.preprocess_df["user_id"].unique().tolist())
        self.item_pool = set(self.preprocess_df["item_id"].unique().tolist())

        self.train_df, self.val_df, self.test_df = self._leave_one_out(self.preprocess_df)
        self.negatives = self._negative_sampling(self.preprocess_df)

    def _reindex(self, df):
        """Preprocess dataset to reindex userID and itemID, also set rating as binary feedback"""
        print("Reindexing users and items...")
        df = df.copy()
        u2id = {u: idx for idx, u in enumerate(df["user"].dropna().unique())}
        i2id = {i: idx for idx, i in enumerate(df["item"].dropna().unique())}
        df = df.dropna(subset=["user", "item", "rating", "timestamp"]).copy()

        df["user_id"] = df["user"].apply(lambda x: u2id[x]).astype(np.int64)
        df["item_id"] = df["item"].apply(lambda x: i2id[x]).astype(np.int64)
        df["rating"] = df["rating"].apply(lambda x: x > 0).astype(np.float32)
        print(f"Reindexed {len(u2id)} users and {len(i2id)} items.\n")
        return df

    def _leave_one_out(self, df):
        """Leave one out split for training and testing"""
        print("Leave one out split...")
        df = df.sort_values(["user_id", "timestamp"], ascending=[True, True]).copy()
        df["rank_latest"] = df.groupby("user_id")["timestamp"].rank(method="first", ascending=False)

        train = df.loc[df["rank_latest"] > 2, ["user_id", "item_id", "rating"]].copy()
        val = df.loc[df["rank_latest"] == 1, ["user_id", "item_id", "rating"]].copy()
        test = df.loc[df["rank_latest"] == 2, ["user_id", "item_id", "rating"]].copy()

        # Ensure same user sets exist in train/test
        common_users = set(train["user_id"].unique()) & set(test["user_id"].unique())
        train = train[train["user_id"].isin(common_users)]
        val = val[val["user_id"].isin(common_users)]
        test = test[test["user_id"].isin(common_users)]
        print(f"Train: {len(train)}, Val: {len(val)}, Test: {len(test)}\n")
        return train, val, test

    def _negative_sampling(self, df):
        """Negative sampling for training"""
        print("Negative sampling...")
        interact_status = (
            df.groupby("user_id")["item_id"]
            .apply(set)
            .reset_index()
            .rename(columns={"item_id": "interacted_items"}))
        all_items = self.item_pool

        def sample_negatives(user_set, k):
            pool = list(all_items - user_set)
            if len(pool) == 0:
                return []
            k = min(k, len(pool))
            return random.sample(pool, k)

        interact_status["negative_items"] = (interact_status["interacted_items"].apply(
            lambda x: all_items - x))
        interact_status["negative_samples"] = (interact_status["negative_items"].apply(
            lambda x: sample_negatives(x, self.num_neg_test)))
        print(f"Negative sampling done. Sampled {self.num_neg_test} negatives per user.\n")
        return interact_status[["user_id", "interacted_items", "negative_items", "negative_samples"]]

    def get_train_instance(self):
        """Get training instances with negative sampling"""
        print("Getting training instances...")
        users, items, ratings = [], [], []
        train_df = pd.merge(self.train_df,
                            self.negatives[["user_id", "negative_items"]],
                            on="user_id",
                            how="inner")

        train_df["negatives"] = (train_df["negative_items"].apply(
            lambda x: random.sample(list(x), min(self.num_neg, len(x))) if len(x) > 0 else []))

        for row in train_df.itertuples(index=False):
            users.append(int(row["user_id"]))
            items.append(int(row["item_id"]))
            ratings.append(float(row["rating"]))
            for neg in row["negatives"]:
                users.append(int(row["user_id"]))
                items.append(int(row["negatives"]))
                ratings.append(float(0))  # negative samples get 0 rating

        dataset = RatingDataset(users, items, ratings)
        print(f"Total training instances: {len(users)}\n")
        return DataLoader(dataset, batch_size=self.batch_size, shuffle=True, drop_last=False)

    def _pack_eval(self, split_df):
        """Pack evaluation instances with negative sampling"""
        users, items, ratings = [], [], []
        eval_df = pd.merge(split_df,
                           self.negatives[["user_id", "negative_samples"]],
                           on="user_id",
                           how="inner")

        for row in eval_df.itertuples(index=False):
            # positive first, then its negatives (candidate set == 1 + num_neg_test)
            users.append(int(row.user_id))
            items.append(int(row.item_id))
            ratings.append(float(row.rating))
            for neg in row["negative_samples"]:
                users.append(int(row["user_id"]))
                items.append(int(neg))
                ratings.append(float(0))  # negative samples get 0 rating

        dataset = RatingDataset(users, items, ratings)
        print(f"Total {split_df.name} instances: {len(users)}\n")
        return DataLoader(dataset, batch_size=self.num_neg_test + 1, shuffle=False, drop_last=False)

    def get_val_instance(self):
        """Get val instances with negative sampling"""
        print("Getting validation instances...")
        return self._pack_eval(self.val_df)

    def get_test_instance(self):
        """Get test instances with negative sampling"""
        print("Getting test instances...")
        return self._pack_eval(self.test_df)

In [25]:
def hit(ng_item, pred_items):
    return 1 if ng_item in pred_items else 0

def ndcg(ng_item, pred_items):
    if ng_item in pred_items:
        idx = pred_items.index(ng_item)
        return float(1.0 / np.log2(idx + 2))
    return 0.0

@torch.no_grad()
def metrics(model, test_loader, top_k, device):
    model.eval()
    hr, ndcg = [], []
    for user, item, label in test_loader:
        user = user.to(device); item = item.to(device)
        preds = model(user, item)
        _, indices = torch.topk(preds, k=top_k)
        recommends = torch.take(item, indices).cpu().tolist()
        ng_item = int(item[0].item())   # first element is the positive by construction
        hr.append(hit(ng_item, recommends))
        ndcg.append(ndcg(ng_item, recommends))
    return float(np.mean(hr)), float(np.mean(ndcg))

In [29]:
class SimpleMatrixFactorization(nn.Module):
    def __init__(self, n_users, n_items, embedding_dim):
        super().__init__()
        self.user_embedding = nn.Embedding(n_users, embedding_dim)
        self.item_embedding = nn.Embedding(n_items, embedding_dim)
        self.user_bias = nn.Embedding(n_users, 1)
        self.item_bias = nn.Embedding(n_items, 1)
        self.global_bias = nn.Parameter(torch.zeros(1))

        nn.init.normal_(self.user_embedding.weight, std=0.05)
        nn.init.normal_(self.item_embedding.weight, std=0.05)
        nn.init.zeros_(self.user_bias.weight)
        nn.init.zeros_(self.item_bias.weight)

    def forward(self, user_ids, item_ids):
        ue = self.user_embedding(user_ids)
        ie = self.item_embedding(item_ids)
        dot = (ue * ie).sum(dim=1, keepdim=True)
        out = dot + self.user_bias(user_ids) + self.item_bias(item_ids) + self.global_bias
        return out.squeeze(1)

In [27]:
# class GeneralizedMatrixFactorization(nn.Module):
#     def __init__(self, num_users, num_items, num_embeddings=32):
#         super().__init__()
#         self.user_embeddings = nn.Embedding(num_users, num_embeddings)
#         self.item_embeddings = nn.Embedding(num_items, num_embeddings)
#         self.affine_output = nn.Linear(in_features=num_embeddings, out_features=1)
#         self.sigmoid = nn.Sigmoid()
#
#     def forward(self, user_ids, item_ids):
#         user_embedding = self.user_embeddings(user_ids)
#         item_embedding = self.item_embeddings(item_ids)
#         element_product = torch.mul(user_embedding, item_embedding)
#         logits = self.affine_output(element_product)
#         rating = self.sigmoid(logits)
#         return rating
#
#     def init_weight(self):
#         pass
#
# class MultiLayerPerceptron(nn.Module):
#     def __init__(self, num_users, num_items, num_embeddings=32, num_layers=3):
#         super().__init__()
#         self.user_embeddings = nn.Embedding(num_users, num_embeddings)
#         self.item_embeddings = nn.Embedding(num_items, num_embeddings)
#
#         self.fc_layers = nn.ModuleList()
#         for idx, (in_size, out_size) in enumerate(zip(num_layers[:-1], num_layers[1:])):
#             self.fc_layers.append(nn.Linear(in_size, out_size))
#
#         self.affine_output = nn.Linear(num_layers[-1], out_features=1)
#         self.logistic = nn.Sigmoid()
#
#     def forward(self, user_indices, item_indices):
#         user_embedding = self.embedding_user(user_indices)
#         item_embedding = self.embedding_item(item_indices)
#         vector = torch.cat([user_embedding, item_embedding], dim=-1)  # the concat latent vector
#         for idx, _ in enumerate(range(len(self.fc_layers))):
#             vector = self.fc_layers[idx](vector)
#             vector = nn.ReLU()(vector)
#             # vector = nn.BatchNorm1d()(vector)
#             # vector = nn.Dropout(p=0.5)(vector)
#         logits = self.affine_output(vector)
#         rating = self.logistic(logits)
#         return rating
#
#     def init_weight(self):
#         pass
#
# class NeuMF(nn.Module):
#     def __init__(self, args, num_users, num_items):
#         super(NeuMF, self).__init__()
#         self.num_users = num_users
#         self.num_items = num_items
#         self.factor_num_mf = args.factor_num
#         self.factor_num_mlp =  int(args.layers[0]/2)
#         self.layers = args.layers
#         self.dropout = args.dropout
#
#         self.embedding_user_mlp = nn.Embedding(num_embeddings=self.num_users, embedding_dim=self.factor_num_mlp)
#         self.embedding_item_mlp = nn.Embedding(num_embeddings=self.num_items, embedding_dim=self.factor_num_mlp)
#
#         self.embedding_user_mf = nn.Embedding(num_embeddings=self.num_users, embedding_dim=self.factor_num_mf)
#         self.embedding_item_mf = nn.Embedding(num_embeddings=self.num_items, embedding_dim=self.factor_num_mf)
#
#         self.fc_layers = nn.ModuleList()
#         for idx, (in_size, out_size) in enumerate(zip(args.layers[:-1], args.layers[1:])):
#             self.fc_layers.append(torch.nn.Linear(in_size, out_size))
#             self.fc_layers.append(nn.ReLU())
#
#         self.affine_output = nn.Linear(in_features=args.layers[-1] + self.factor_num_mf, out_features=1)
#         self.logistic = nn.Sigmoid()
#         self.init_weight()
#
#     def init_weight(self):
#         nn.init.normal_(self.embedding_user_mlp.weight, std=0.01)
#         nn.init.normal_(self.embedding_item_mlp.weight, std=0.01)
#         nn.init.normal_(self.embedding_user_mf.weight, std=0.01)
#         nn.init.normal_(self.embedding_item_mf.weight, std=0.01)
#
#         for m in self.fc_layers:
#             if isinstance(m, nn.Linear):
#                 nn.init.xavier_uniform_(m.weight)
#
#         nn.init.xavier_uniform_(self.affine_output.weight)
#
#         for m in self.modules():
#             if isinstance(m, nn.Linear) and m.bias is not None:
#                 m.bias.data.zero_()
#
#     def forward(self, user_indices, item_indices):
#         user_embedding_mlp = self.embedding_user_mlp(user_indices)
#         item_embedding_mlp = self.embedding_item_mlp(item_indices)
#
#         user_embedding_mf = self.embedding_user_mf(user_indices)
#         item_embedding_mf = self.embedding_item_mf(item_indices)
#
#         mlp_vector = torch.cat([user_embedding_mlp, item_embedding_mlp], dim=-1)
#         mf_vector =torch.mul(user_embedding_mf, item_embedding_mf)
#
#         for idx, _ in enumerate(range(len(self.fc_layers))):
#             mlp_vector = self.fc_layers[idx](mlp_vector)
#
#         vector = torch.cat([mlp_vector, mf_vector], dim=-1)
#         logits = self.affine_output(vector)
#         rating = self.logistic(logits)
#         return rating.squeeze()

In [None]:
data = NCFData(df, num_neg=4, num_neg_test=99, batch_size=1024)
train_loader = data.get_train_instance()
val_loader   = data.get_val_instance()
test_loader  = data.get_test_instance()

n_users = int(data.preprocess_df["user_id"].nunique())
n_items = int(data.preprocess_df["item_id"].nunique())
embedding_dim = 32
epochs = 10

model = SimpleMatrixFactorization(n_users, n_items, embedding_dim).to(DEVICE)
loss_fn = nn.BCEWithLogitsLoss()  # <-- changed
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
writer = SummaryWriter()

global_step = 0
best_hr = 0.0

for epoch in range(epochs):
    model.train()
    start_time = time.time()

    for user, item, label in train_loader:
        user = user.to(DEVICE); item = item.to(DEVICE); label = label.to(DEVICE)

        optimizer.zero_grad()
        logits = model(user, item)
        loss = loss_fn(logits, label)
        loss.backward()
        optimizer.step()

        writer.add_scalar("loss/train", float(loss.item()), global_step)
        global_step += 1

    hr, ndcg = metrics(model, test_loader, top_k=10, device=DEVICE)
    writer.add_scalar("perf/HR@10", hr, epoch)
    writer.add_scalar("perf/NDCG@10", ndcg, epoch)

    elapsed_time = time.time() - start_time
    print(f"The time elapsed for epoch {epoch+1} is {time.strftime('%H:%M:%S', time.gmtime(elapsed_time))} seconds")
    print(f"Epoch {epoch+1}: HR@10 = {hr:.4f}, NDCG@10 = {ndcg:.4f}")

writer.close()

Reindexing users and items...
Reindexed 17217 users and 56308 items.

Leave one out split...
Train: 72399, Val: 7347, Test: 7347

Negative sampling...
Negative sampling done. Sampled 99 negatives per user.

Getting training instances...
