# Baseline Recommender System
This notebook implements a simple baseline which is content-based cosine similarity.

In [76]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd

from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

In [41]:
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [2]:
# Importing processed dataset
df = pd.read_csv("data/amazon_reviews_2023/amazon_reviews_2023_filtered.csv")
df.head()

Unnamed: 0,user,item,rating,title,text,domain,timestamp,label,user_id,item_id,domain_id
0,AGWDYYVVWM3DC3CASUZKXK67G6IA,0394800796,5.0,A grouchy Grinch turns loveable and reveals a ...,When the Grinch looks down on Whoville from hi...,Books,974942691000,1,3154,58,0
1,AF4KRDA6XVQE357OWPILTPXV7TSA,0805047905,5.0,Fun for mom AND baby,...The large colorful pictures are great (albe...,Books,989349077000,1,1196,167,0
2,AF4KRDA6XVQE357OWPILTPXV7TSA,0307120007,4.0,Kid loves it...I,My 15 month old son has enjoyed this book for ...,Books,989349159000,1,1196,22,0
3,AHRGTIMQO47C2VLJILIDU53BQKSA,B00005ALS0,4.0,It's a dog-eat-dog world out there ...,... and Christopher Guest manages to find the ...,Movies_and_TV,990492274000,1,4054,397,1
4,AEBTXSUFLRBUQXLA4RPUU7DJ7WPQ,B004O4C0G0,1.0,What's with the BUGS??,10-15 minutes in the game and it FREEZES(crash...,Video_Games,992873810000,0,272,1230,2


In [78]:
# Splitting the datasets into train and test sets by user and time
def stratified_temporal_split(dataframe, val_ratio=0.1, test_ratio=0.1):
    dataframe = dataframe.sort_values("timestamp").reset_index(drop=True)
    grouped = dataframe.groupby("user_id")

    train_indices, val_indices, test_indices = [], [], []
    for _, group in grouped:
        n_records = len(group)
        indices = group.index

        # If user has less that 3 interactions, put all in training set
        if n_records < 3:
            train_indices.extend(indices)
            continue

        # Calculate split points
        test_split_idx = int(n_records * (1 - test_ratio))
        val_split_idx = int(n_records * (1 - test_ratio - val_ratio))

        # Ensure validation set has at least one record if percentages are non-zero
        if val_split_idx <= 0:
            val_split_idx = 1

        # Assign indices to respective sets
        train_indices.extend(indices[:val_split_idx])
        val_indices.extend(indices[val_split_idx:test_split_idx])
        test_indices.extend(indices[test_split_idx:])

    # Create the final datasets
    train_df = dataframe.loc[train_indices].sort_values("timestamp")
    val_df = dataframe.loc[val_indices].sort_values("timestamp")
    test_df = dataframe.loc[test_indices].sort_values("timestamp")

    return train_df, val_df, test_df

train, val, test = stratified_temporal_split(df, val_ratio=0.2, test_ratio=0.1)

In [61]:
# def leave_one_out_split(dataframe):
#     train, val, test = [], [], []
#     for u, g in dataframe.sort_values("timestamp").groupby("user_id"):
#         items = g["item_id"].tolist()
#         if len(items) < 3:
#             train += g.to_dict("records")
#             continue
#         train += g.iloc[:-2].to_dict("records")
#         val += [g.iloc[-2].to_dict]
#         test += [g.iloc[-1].to_dict]
#     return pd.DataFrame(train), pd.DataFrame(val), pd.DataFrame(test)
#
# train, val, test = leave_one_out_split(df)

In [79]:
n_users = len(df["user_id"].unique())
n_items = len(df["item_id"].unique())
n_domains = len(df["domain"].unique())

In [74]:
# Domain-aware item key
df["item_key"] = df["domain"].astype(str) + "::" + df["item"].astype(str)

# Map ids to continuous integers
uid2ix = {u:i for i, u in enumerate(df["user"].unique())}
ix2uid = {i:u for u, i in uid2ix.items()}

iid2ix = {it:i for i, it in enumerate(df["item_key"].unique())}
ix2iid = {i:it for it, i in iid2ix.items()}

df["uid"] = df["user"].map(uid2ix)
df["iid"] = df["item_key"].map(iid2ix)
dom2ix = {d:i for i, d in enumerate(df["domain"].unique())}
df["did"] = df["domain"].map(dom2ix)

In [None]:
def sample_negative(u, n=1):
    pool = []
    while len(pool) < n:
        cand = np.random.randint(0, )

In [57]:
class MFDataset(Dataset):
    def __init__(self, dataframe):
        super().__init__()
        self.df = dataframe[["user_id", "item_id", "label"]].copy()
        self.x_user_item = list(zip(self.df["user_id"].values, self.df["item_id"].values))
        self.y_label = self.df["label"].values

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        return self.x_user_item[idx], self.y_label[idx]

In [93]:
# Testing the dataset
train_ds = MFDataset(train)
train_dataloader = DataLoader(train_ds, batch_size=128, shuffle=True)
valid_ds = MFDataset(val)
valid_dataloader = DataLoader(valid_ds, batch_size=128, shuffle=False)

xb, yb = next(iter(train_dataloader))
print(xb)
print("\n")
print(yb)

[tensor([2167,  292,  286, 1537, 2502, 1494,  712, 1774,  330,  262, 3253, 1652,
        2373, 3214, 1238, 3371, 1087, 3963, 3127, 1290,  769, 1788, 3020,  828,
        1130, 1991, 1106,  401, 3522,  325,  941, 3780, 1537, 2257, 3179, 2759,
        1754, 3419, 1354, 3446, 2725, 2481, 2423, 2944, 1747,  625, 2622, 1650,
        2313, 3963, 1517, 2536,  772,  333, 3769, 1745,  957, 1528, 1670, 3648,
        3272, 3020, 1227, 3578,  402, 3929, 3725, 3396, 2148, 2350, 3412, 2055,
        1656, 1046, 1239, 3123, 1751, 1235, 2711, 2514, 2153, 3316,  458, 2425,
        1603, 1257, 1529, 2717,  679,  262, 4314, 1689, 3353, 2339, 3666, 2507,
         247,   35, 1721, 3151, 2272, 2194, 2347, 3806, 3543, 3570, 1558, 1169,
         308, 2478, 3335, 3788, 1133, 4131, 1237,  980, 1837, 1088,  146, 1597,
        1959, 3574,  426, 1333, 3564, 1017, 2876, 1254]), tensor([3784,  710, 1486, 3071,  626, 1547, 1356, 3634,  742,  581, 3576, 3810,
        4385, 2423, 3010, 2580, 2408, 2197, 1380, 2258, 2894,

In [104]:
# Simple matrix factorization
class MatrixFactorization(nn.Module):
    def __init__(self, num_users, num_items, embedding_dim=32):
        super(MatrixFactorization, self).__init__()
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.item_embedding = nn.Embedding(num_items, embedding_dim)
        self.user_bias = nn.Embedding(num_users, 1)
        self.item_bias = nn.Embedding(num_items, 1)
        self.dropout = nn.Dropout(0.2)
        self.reset_parameters()

    def reset_parameters(self):
        nn.init.normal_(self.user_embedding.weight, std=0.02)
        nn.init.normal_(self.item_embedding.weight, std=0.02)
        nn.init.zeros_(self.user_bias.weight)
        nn.init.zeros_(self.item_bias.weight)

    def forward(self, user_ids, item_ids):
        ue = self.dropout(self.user_embedding(user_ids))
        ie = self.dropout(self.item_embedding(item_ids))
        dot = (ue * ie).sum(dim=1)
        out = dot + self.user_bias(user_ids).squeeze(1) + self.item_bias(item_ids).squeeze(1)
        return torch.sigmoid(out)

In [105]:
n_users = len(df["user_id"].unique())
n_items = len(df["item_id"].unique())
baseline = MatrixFactorization(n_users, n_items, embedding_dim=32)
baseline.to(DEVICE)
print(baseline)

MatrixFactorization(
  (user_embedding): Embedding(4320, 32)
  (item_embedding): Embedding(4736, 32)
  (user_bias): Embedding(4320, 1)
  (item_bias): Embedding(4736, 1)
  (dropout): Dropout(p=0.2, inplace=False)
)


In [106]:
# Training the model
# Hyperparameters
EPOCHS = 50
LEARNING_RATE = 0.005

optimizer = optim.Adam(baseline.parameters(), lr=LEARNING_RATE)
loss_fn = nn.BCELoss()
epoch_train_losses, epoch_val_losses = [], []

for i in range(EPOCHS):
    train_losses = []
    baseline.train()
    for xb, yb in train_dataloader:
        x_user = xb[0].to(DEVICE, dtype=torch.long)
        x_item = xb[1].to(DEVICE, dtype=torch.long)
        y_label = yb.to(DEVICE, dtype=torch.float)
        preds = baseline(x_user, x_item)
        loss = loss_fn(preds, y_label)
        train_losses.append(loss.item())
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    baseline.eval()
    val_losses = []
    with torch.no_grad():
        for xb, yb in valid_dataloader:
            x_user = xb[0].to(DEVICE, dtype=torch.long)
            x_item = xb[1].to(DEVICE, dtype=torch.long)
            y_label = yb.to(DEVICE, dtype=torch.float)
            preds = baseline(x_user, x_item)
            loss = loss_fn(preds, y_label)
            val_losses.append(loss.item())

    # Start logging
    epoch_train_loss = np.mean(train_losses)
    epoch_val_loss = np.mean(val_losses)
    epoch_train_losses.append(epoch_train_loss)
    epoch_val_losses.append(epoch_val_loss)
    print(f"Epoch {i+1}/{EPOCHS} - Train Loss: {epoch_train_loss:.4f}, Val Loss: {epoch_val_loss:.4f}")

Epoch 1/50 - Train Loss: 0.6581, Val Loss: 0.6228
Epoch 2/50 - Train Loss: 0.5415, Val Loss: 0.5676
Epoch 3/50 - Train Loss: 0.3718, Val Loss: 0.5361
Epoch 4/50 - Train Loss: 0.2130, Val Loss: 0.5290
Epoch 5/50 - Train Loss: 0.1178, Val Loss: 0.5357
Epoch 6/50 - Train Loss: 0.0697, Val Loss: 0.5475
Epoch 7/50 - Train Loss: 0.0471, Val Loss: 0.5601
Epoch 8/50 - Train Loss: 0.0332, Val Loss: 0.5730
Epoch 9/50 - Train Loss: 0.0260, Val Loss: 0.5856
Epoch 10/50 - Train Loss: 0.0205, Val Loss: 0.5973
Epoch 11/50 - Train Loss: 0.0172, Val Loss: 0.6089
Epoch 12/50 - Train Loss: 0.0140, Val Loss: 0.6195
Epoch 13/50 - Train Loss: 0.0123, Val Loss: 0.6300
Epoch 14/50 - Train Loss: 0.0111, Val Loss: 0.6399
Epoch 15/50 - Train Loss: 0.0103, Val Loss: 0.6503
Epoch 16/50 - Train Loss: 0.0088, Val Loss: 0.6597
Epoch 17/50 - Train Loss: 0.0083, Val Loss: 0.6688
Epoch 18/50 - Train Loss: 0.0074, Val Loss: 0.6792
Epoch 19/50 - Train Loss: 0.0077, Val Loss: 0.6886
Epoch 20/50 - Train Loss: 0.0068, Val Lo

KeyboardInterrupt: 