# Recommender System - Baseline Model
This notebook implements a simple baseline which is content-based cosine similarity.

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd

from torch.utils.data import Dataset, DataLoader

In [2]:
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [3]:
# Importing processed dataset
df = pd.read_csv("data/amazon_reviews_2023_filtered.csv")
df.head()

Unnamed: 0,user,item,rating,domain,timestamp,implicit_rating,user_id,item_id,domain_id
0,AFERCDY2EFJKT7QUQ75GISNHTFOQ,0345409469,5.0,Books,874018431000,1,12197,588,0
1,AEAFMJT3QRZZEJ3CTGB4FNDL5FPA,0446606324,5.0,Books,912129267000,1,1818,1004,0
2,AEAFMJT3QRZZEJ3CTGB4FNDL5FPA,0440212561,5.0,Books,933620912000,1,1818,976,0
3,AE2HHCZARXNN3PDMFANR7XK23LIA,0800141601,1.0,Movies_and_TV,939519163000,0,114,1894,1
4,AFTFGYARCCB72L37G6RBSPPJZ3RA,B00002EPYD,2.0,Video_Games,949468790000,0,16286,3953,2


## Splitting the dataset chronologically and by user

In [4]:
from recommenders.datasets.python_splitters import python_chrono_split, python_random_split

train, temp = python_chrono_split(
    df, ratio=0.8, filter_by="user",
    col_user="user_id", col_item="item_id", col_timestamp="timestamp"
)

val, test = python_random_split(
    temp, ratio=0.5
)

print(f"Training dataset percentage: {(len(train) / len(df)) * 100}%")
print(f"Validation dataset percentage: {(len(val) / len(df)) * 100}%")
print(f"Test dataset percentage: {(len(test) / len(df)) * 100}%")

Training dataset percentage: 80.06679160231765%
Validation dataset percentage: 9.966425036817796%
Test dataset percentage: 9.966783360864564%


In [78]:
# # Splitting the datasets into train and test sets by user and time
# def stratified_temporal_split(dataframe, val_ratio=0.1, test_ratio=0.1):
#     dataframe = dataframe.sort_values("timestamp").reset_index(drop=True)
#     grouped = dataframe.groupby("user_id")
#
#     train_indices, val_indices, test_indices = [], [], []
#     for _, group in grouped:
#         n_records = len(group)
#         indices = group.index
#
#         # If user has less that 3 interactions, put all in training set
#         if n_records < 3:
#             train_indices.extend(indices)
#             continue
#
#         # Calculate split points
#         test_split_idx = int(n_records * (1 - test_ratio))
#         val_split_idx = int(n_records * (1 - test_ratio - val_ratio))
#
#         # Ensure validation set has at least one record if percentages are non-zero
#         if val_split_idx <= 0:
#             val_split_idx = 1
#
#         # Assign indices to respective sets
#         train_indices.extend(indices[:val_split_idx])
#         val_indices.extend(indices[val_split_idx:test_split_idx])
#         test_indices.extend(indices[test_split_idx:])
#
#     # Create the final datasets
#     train_df = dataframe.loc[train_indices].sort_values("timestamp")
#     val_df = dataframe.loc[val_indices].sort_values("timestamp")
#     test_df = dataframe.loc[test_indices].sort_values("timestamp")
#
#     return train_df, val_df, test_df
#
# train, val, test = stratified_temporal_split(df, val_ratio=0.2, test_ratio=0.1)

## Creating dataset and dataloader

In [6]:
class MFDataset(Dataset):
    def __init__(self, data, all_item_ids, is_train=False, num_neg=4):
        self.data = data
        self.all_item_ids = all_item_ids
        self.is_train = is_train
        self.num_neg = num_neg if self.is_train else 0
        self.user_item_pairs = set(zip(self.data["user_id"], self.data["item_id"]))

    def __len__(self):
        return len(self.data) * (1 + self.num_neg)

    def __getitem__(self, idx):
        original_idx = idx
        idx = idx // (1 + self.num_neg)
        user = self.data["user_id"].iloc[idx]
        item = self.data["item_id"].iloc[idx]

        if self.is_train:
            label = 1 if original_idx % (1 + self.num_neg) == 0 else 0
            if label == 0:
                neg_item = np.random.choice(self.all_item_ids)
                while (user, neg_item) in self.user_item_pairs:
                    neg_item = np.random.choice(self.all_item_ids)
                item = neg_item
        else:
            label = float(self.data["implicit_rating"].iloc[idx])

        return torch.tensor(user, dtype=torch.long), torch.tensor(item, dtype=torch.long), torch.tensor(label, dtype=torch.float32)

In [5]:
class MFDataset(Dataset):
    def __init__(self, data, all_item_ids, is_train=False, num_neg=4, include_neg_val=True):
        self.data = data
        self.all_item_ids = all_item_ids
        self.is_train = is_train
        self.include_neg_val = include_neg_val  # New parameter for validation negative sampling

        # Determine negative sampling based on mode
        if self.is_train:
            self.num_neg = num_neg
        elif self.include_neg_val and not self.is_train:
            self.num_neg = 1  # Include 1 negative sample for validation/test
        else:
            self.num_neg = 0

        self.user_item_pairs = set(zip(self.data["user_id"], self.data["item_id"]))

        # Pre-compute user interactions for faster negative sampling
        self.user_items = {}
        for user, item in self.user_item_pairs:
            if user not in self.user_items:
                self.user_items[user] = set()
            self.user_items[user].add(item)

    def __len__(self):
        return len(self.data) * (1 + self.num_neg)

    def __getitem__(self, idx):
        original_idx = idx
        idx = idx // (1 + self.num_neg)
        user = self.data["user_id"].iloc[idx]
        item = self.data["item_id"].iloc[idx]

        if self.num_neg > 0:  # If using negative sampling
            label = 1 if original_idx % (1 + self.num_neg) == 0 else 0
            if label == 0:
                # More efficient negative sampling
                neg_item = np.random.choice(self.all_item_ids)
                while neg_item in self.user_items.get(user, set()):
                    neg_item = np.random.choice(self.all_item_ids)
                item = neg_item
        else:
            label = float(self.data["implicit_rating"].iloc[idx])

        return torch.tensor(user, dtype=torch.long), torch.tensor(item, dtype=torch.long), torch.tensor(label, dtype=torch.float32)

In [13]:
class MFDataset(Dataset):
    def __init__(self, data, all_item_ids, is_train=False, num_neg=4):
        self.data = data
        self.all_item_ids = all_item_ids
        self.is_train = is_train
        self.num_neg = num_neg
        self.user_positives = data.groupby('user_id')['item_id'].apply(set).to_dict()

    def __len__(self):
        return len(self.data) * (1 + self.num_neg) if self.is_train else len(self.data)

    def __getitem__(self, idx):
        if self.is_train:
            user_idx = idx // (1 + self.num_neg)
            user = self.data["user_id"].iloc[user_idx]
            item = self.data["item_id"].iloc[user_idx]

            if idx % (1 + self.num_neg) == 0:
                label = 1.0
            else:
                label = 0.0
                positive_items = self.user_positives.get(user, set())
                item = np.random.choice(self.all_item_ids)
                while item in positive_items:
                    item = np.random.choice(self.all_item_ids)
        else:
            user = self.data["user_id"].iloc[idx]
            item = self.data["item_id"].iloc[idx]
            label = float(self.data["implicit_rating"].iloc[idx])

        return torch.tensor(user, dtype=torch.long), torch.tensor(item, dtype=torch.long), torch.tensor(label, dtype=torch.float32)

In [14]:
all_item_ids = np.unique(df["item_id"])

train_ds = MFDataset(train, all_item_ids, is_train=True, num_neg=4)
val_ds = MFDataset(val, all_item_ids, is_train=False)
test_ds = MFDataset(test, all_item_ids, is_train=False)

train_dataloader = DataLoader(train_ds, batch_size=4096, shuffle=True)
val_dataloader = DataLoader(val_ds, batch_size=4096, shuffle=False)
test_dataloader = DataLoader(test_ds, batch_size=4096, shuffle=False)

In [25]:
ub, ib, lb = next(iter(test_dataloader))
print(ub)
print("\n")
print(ib)
print("\n")
print(np.unique_counts(lb))

tensor([35180, 23714, 24932,  ...,  4889, 20470,  3891])


tensor([24382, 26106,  7448,  ..., 27040,  7879,  8356])


UniqueCountsResult(values=array([0., 1.], dtype=float32), counts=array([ 781, 3315]))


In [103]:
# ### TRY WITH DIFFERENT APPROACH
# class MFDatasetV2(Dataset):
#     def __init__(self, df):
#         self.df = df[["user_id", "item_id", "implicit_rating"]]
#
#     def __len__(self):
#         return len(self.df)
#
#     def __getitem__(self, idx):
#         return list(self.df.iloc[idx])
#
# # Split the dataset
# train, val = python_chrono_split(
#     df, ratio=0.9, filter_by="user",
#     col_user="user_id", col_item="item_id", col_timestamp="timestamp"
# )
#
# train_ds = MFDatasetV2(train)
# val_ds = MFDatasetV2(val)
#
# train_dataloader = DataLoader(train_ds, batch_size=128, shuffle=True)
# val_dataloader = DataLoader(val_ds, batch_size=128, shuffle=False)

In [104]:
# ub = next(iter(train_dataloader))
# print(ub)

[tensor([16502, 23185, 12225,  1373, 25865, 33463, 19502, 23621, 33698, 29719,
        11128, 25724, 15476, 29323, 17185,  5056, 20964, 34612, 17988, 24759,
        10800,  7807, 22101,  9254, 27756,  4480,  3495, 19864, 12376, 30971,
        29056,  6054, 13898, 34624, 13640,  2279,  3144,  9160, 22642, 28481,
        31769, 16833, 16703,  6778, 12110, 11448,  7765, 16974, 30431,   761,
         1840,   731, 24754, 35819, 24174,   656,  1326, 14519, 34281, 22202,
         6371, 15507, 14158,  2653,  8423, 30071, 34556, 19799, 18132, 31736,
        19323, 15548,  2984, 16084, 22410, 33688, 29555,   332, 18537, 23425,
        11104, 12959, 16168, 27987,   555,  1854, 28824, 19777,  8945,  2551,
        31418, 12150, 13975,  2084, 11873, 17871, 19337, 21398, 21616,   231,
        10829,  9674,  6075,  6288, 12295,  7375, 19882, 34695, 30755,  7136,
         5865, 20922,  1743, 35113, 15657,  5272, 18015, 23059, 29833, 11389,
        20522,   391, 25758, 15829, 17956,  9923, 12713, 34723]

In [26]:
# Simple matrix factorization
class MatrixFactorization(nn.Module):
    def __init__(self, num_users, num_items, embedding_dim=64):
        super().__init__()
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.item_embedding = nn.Embedding(num_items, embedding_dim)

        nn.init.xavier_uniform_(self.user_embedding.weight)
        nn.init.xavier_uniform_(self.item_embedding.weight)

    def forward(self, user_ids, item_ids):
        return (self.user_embedding(user_ids) * self.item_embedding(item_ids)).sum(-1)

In [27]:
# Set up model, optimizer, and loss function
from tqdm import tqdm

n_users = len(df["user_id"].unique())
n_items = len(df["item_id"].unique())

# Model hyperparameters
embedding_dim = 32
lr = 1e-3
epochs = 10

model = MatrixFactorization(n_users, n_items, embedding_dim=embedding_dim).to(DEVICE)
optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=1e-3)
pos_weight = torch.tensor([4.0], device=DEVICE)
loss_fn = nn.BCEWithLogitsLoss(pos_weight=pos_weight)

epoch_train_losses, epoch_val_losses = [], []

# Training loop
for epoch in range(epochs):
    # TRAINING
    model.train()
    train_loss = 0.0

    pbar = tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{epochs} [Train]")
    for users, items, labels in pbar:
        users, items, labels = users.to(DEVICE), items.to(DEVICE), labels.to(DEVICE)
        optimizer.zero_grad()
        output = model(users, items)
        loss = loss_fn(output, labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        pbar.set_postfix({"loss": loss.item()})

    avg_train_loss = train_loss / len(train_dataloader)
    epoch_train_losses.append(avg_train_loss)

    # VALIDATION
    model.eval()
    val_loss = 0.0
    correct_preds = 0
    total_preds = 0

    with torch.no_grad():
        pbar_val = tqdm(val_dataloader, desc=f"Epoch {epoch+1}/{epochs} [Val]")
        for users, items, labels in pbar_val:
            users, items, labels = users.to(DEVICE), items.to(DEVICE), labels.to(DEVICE)
            output = model(users, items)
            loss = loss_fn(output, labels)
            val_loss += loss.item()
            probs = torch.sigmoid(output)
            preds = (output > 0.5).float()
            correct_preds += (preds == labels).sum().item()
            total_preds += labels.size(0)
            pbar_val.set_postfix({"val_loss": loss.item()})

    avg_val_loss = val_loss / len(val_dataloader)
    val_accuracy = correct_preds / total_preds
    print(f"Epoch {epoch+1}/{epochs} - Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}")

print("/n/nTraining completed.")

Epoch 1/10 [Train]: 100%|██████████| 273/273 [00:57<00:00,  4.78it/s, loss=1.12]
Epoch 1/10 [Val]: 100%|██████████| 7/7 [00:01<00:00,  6.67it/s, val_loss=2.39]


Epoch 1/10 - Train Loss: 1.1090, Val Loss: 2.3822, Val Accuracy: 0.1879


Epoch 2/10 [Train]: 100%|██████████| 273/273 [00:56<00:00,  4.82it/s, loss=1.14]
Epoch 2/10 [Val]: 100%|██████████| 7/7 [00:01<00:00,  6.77it/s, val_loss=2.39]


Epoch 2/10 - Train Loss: 1.1091, Val Loss: 2.3822, Val Accuracy: 0.1879


Epoch 3/10 [Train]:  98%|█████████▊| 268/273 [00:55<00:01,  4.79it/s, loss=1.12]


KeyboardInterrupt: 

In [105]:
# # Training the model
# n_users = len(df["user_id"].unique())
# n_items = len(df["item_id"].unique())
#
# # Hyperparameters
# EPOCHS = 50
# LEARNING_RATE = 0.005
#
# model = MatrixFactorization(n_users, n_items, embedding_dim=32).to(DEVICE)
# optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
# loss_fn = nn.BCEWithLogitsLoss()
# epoch_train_losses, epoch_val_losses = [], []
#
# for i in range(EPOCHS):
#     train_losses = []
#     model.train()
#
#     pbar = tqdm(train_dataloader, desc=f"Epoch {i+1}/{EPOCHS} [Train]")
#     for users, items, ratings in train_dataloader:
#         users, items, ratings = users.to(DEVICE, dtype=torch.long), items.to(DEVICE, dtype=torch.long), ratings.to(DEVICE, dtype=torch.float32)
#         optimizer.zero_grad()
#         preds = model(users, items).reshape(-1)
#         loss = loss_fn(preds, ratings)
#         train_losses.append(loss.item())
#         loss.backward()
#         optimizer.step()
#
#     model.eval()
#     val_losses = []
#     with torch.no_grad():
#         for users, items, ratings in val_dataloader:
#             users, items, ratings = users.to(DEVICE, dtype=torch.long), items.to(DEVICE, dtype=torch.long), ratings.to(DEVICE, dtype=torch.float32)
#             preds = model(users, items).reshape(-1)
#             loss = loss_fn(preds, ratings)
#             val_losses.append(loss.item())
#
#     # Start logging
#     epoch_train_loss = np.mean(train_losses)
#     epoch_val_loss = np.mean(val_losses)
#     epoch_train_losses.append(epoch_train_loss)
#     epoch_val_losses.append(epoch_val_loss)
#     print(f"Epoch {i+1}/{EPOCHS} - Train Loss: {epoch_train_loss:.4f}, Val Loss: {epoch_val_loss:.4f}")

Epoch 1/50 - Train Loss: 0.6758, Val Loss: 0.6337
Epoch 2/50 - Train Loss: 0.3072, Val Loss: 0.5899
Epoch 3/50 - Train Loss: 0.0549, Val Loss: 0.6442
Epoch 4/50 - Train Loss: 0.0130, Val Loss: 0.7055


KeyboardInterrupt: 