In [1]:
import os
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
from recommenders.datasets.python_splitters import python_chrono_split, python_stratified_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from tqdm import tqdm

In [2]:
def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

set_seed(42)

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {DEVICE}")

Using device: cuda


In [3]:
df = pd.read_csv("ratings_electronics.csv", names=["user", "item", "rating", "timestamp"])
df.head()

Unnamed: 0,user,item,rating,timestamp
0,AKM1MP6P0OYPR,132793040,5.0,1365811200
1,A2CX7LUOHB2NDG,321732944,5.0,1341100800
2,A2NWSAGRHCP8N5,439886341,1.0,1367193600
3,A2WNBOD3WNDNKT,439886341,3.0,1374451200
4,A1GI0U4ZRJA8WN,439886341,1.0,1334707200


In [4]:
df["user"].nunique()

4201696

In [5]:
# of interactions, and that every user has at least certain number of interactions
def filter_interactions(df, min_user_interactions, min_item_interactions):
    while True:
        before = len(df)
        # Count interactions per user and item
        user_counts = df.groupby("user")["item"].count()
        item_counts = df.groupby("item")["user"].count()
        # Filter users and items based on interaction counts
        users_to_keep = set(user_counts[user_counts >= min_user_interactions].index)
        items_to_keep = set(item_counts[item_counts >= min_item_interactions].index)
        # Filter the DataFrame
        df = df[df["user"].isin(users_to_keep) & df["item"].isin(items_to_keep)]
        after = len(df)
        if after == before:
            break
    print(f"Dataframe after interactions filtering: {after}")
    return df

df = filter_interactions(df, 10, 10)
# df = df.sample(min(300_000, len(df)), random_state=42) # Speed up training by sampling

Dataframe after interactions filtering: 347393


In [6]:
df.nunique()

user         20247
item         11589
rating           5
timestamp     4103
dtype: int64

In [7]:
def encode_categorical_columns(df, columns):
    for col in columns:
        le = LabelEncoder()
        df[col + "_enc"] = le.fit_transform(df[col])
    return df

df = encode_categorical_columns(df, ["user", "item"])

In [8]:
# def scale_column(df, column):
#     scaler = MinMaxScaler()
#     df[column + "_norm"] = scaler.fit_transform(df[[column]])
#     return df
#
# df = scale_column(df, "rating")
# df.head()

In [9]:
# def custom_split(df, require_min_interactions=True, min_interactions=5):
#     # Work on a copy to avoid side effects
#     tmp = df.copy()
#     tmp["rank_latest"] = tmp.groupby("user_enc")["timestamp"] \
#                            .rank(method="first", ascending=False)
#
#     if require_min_interactions:
#         counts = tmp.groupby("user_enc")["item_enc"].transform("count")
#         tmp = tmp[counts >= min_interactions]
#
#     data_test = tmp[tmp["rank_latest"] == 1]
#     data_val = tmp[tmp["rank_latest"] == 2]
#     data_train = tmp[tmp["rank_latest"] > 2]
#
#     return (
#         data_train.drop(columns="rank_latest"),
#         data_val.drop(columns="rank_latest"),
#         data_test.drop(columns="rank_latest"),
#     )
#
# data_train, data_val, data_test = custom_split(df)

In [10]:
# len(data_train), len(data_val), len(data_test)

In [11]:
def split_chronological_by_user(df, test_size=0.2):
    data_train, data_test = python_chrono_split(
        df, ratio=1 - test_size, filter_by="user",
        col_user="user_enc", col_item="item_enc", col_timestamp="timestamp"
    )
    return data_train, data_test

def split_random_by_user(df, test_size=0.2):
    data_train, data_test = python_stratified_split(
        df, ratio=1 - test_size, filter_by="user",
        col_user="user_enc", col_item="item_enc"
    )
    return data_train, data_test

data_train, data_test = split_chronological_by_user(df)

In [12]:
class MFDataset(torch.utils.data.Dataset):
    def __init__(self, dataframe):
        self.dataframe = dataframe

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        row = self.dataframe.iloc[idx]
        return {
            "users": torch.tensor(row["user_enc"], dtype=torch.long),
            "items": torch.tensor(row["item_enc"], dtype=torch.long),
            "ratings": torch.tensor(row["rating"], dtype=torch.float)
        }

In [13]:
train_dataset = MFDataset(data_train)
test_dataset = MFDataset(data_test)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=2048, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=2048, shuffle=False)

In [22]:
class MFModel(torch.nn.Module):
    def __init__(self, n_users, n_items, embedding_dim=64):
        super().__init__()
        self.user_embedding = torch.nn.Embedding(n_users, embedding_dim)
        self.item_embedding = torch.nn.Embedding(n_items, embedding_dim)
        self.user_bias = torch.nn.Embedding(n_users, 1)
        self.item_bias = torch.nn.Embedding(n_items, 1)
        self.global_bias = torch.nn.Parameter(torch.zeros(1))

        # init
        torch.nn.init.normal_(self.user_embedding.weight, std=0.05)
        torch.nn.init.normal_(self.item_embedding.weight, std=0.05)
        torch.nn.init.zeros_(self.user_bias.weight)
        torch.nn.init.zeros_(self.item_bias.weight)

    def forward(self, user_ids, item_ids):
        user_embeds = self.user_embedding(user_ids)
        item_embeds = self.item_embedding(item_ids)
        dot_product = (user_embeds * item_embeds).sum(dim=1, keepdim=True)
        out = dot_product + self.user_bias(user_ids) + self.item_bias(item_ids) + self.global_bias
        return out.squeeze(1)

In [23]:
# import torch.nn as nn
#
# class NeuralMF(nn.Module):
#     def __init__(self, n_users, n_items, mf_embedding_dim=32, mlp_embedding_dim=32, mlp_layers=[64, 32, 16]):
#         """
#         Neural Matrix Factorization Model.
#
#         Args:
#             n_users (int): Number of unique users.
#             n_items (int): Number of unique items.
#             mf_embedding_dim (int): Embedding dimension for the GMF path.
#             mlp_embedding_dim (int): Embedding dimension for the MLP path.
#             mlp_layers (list of int): A list of layer sizes for the MLP path.
#         """
#         super().__init__()
#
#         # --- GMF Path ---
#         self.mf_user_embedding = nn.Embedding(n_users, mf_embedding_dim)
#         self.mf_item_embedding = nn.Embedding(n_items, mf_embedding_dim)
#
#         # --- MLP Path ---
#         self.mlp_user_embedding = nn.Embedding(n_users, mlp_embedding_dim)
#         self.mlp_item_embedding = nn.Embedding(n_items, mlp_embedding_dim)
#
#         # Build the MLP layers dynamically
#         mlp_input_dim = mlp_embedding_dim * 2
#
#         self.mlp = nn.Sequential()
#         for i, layer_size in enumerate(mlp_layers):
#             self.mlp.add_module(f"linear_{i}", nn.Linear(mlp_input_dim, layer_size))
#             self.mlp.add_module(f"relu_{i}", nn.ReLU())
#             self.mlp.add_module(f"dropout_{i}", nn.Dropout(0.3)) # Adding dropout for regularization
#             mlp_input_dim = layer_size
#
#         # --- Final Prediction Layer ---
#         # The input size is the GMF output dim (mf_embedding_dim) + MLP output dim (last layer size)
#         prediction_input_dim = mf_embedding_dim + mlp_layers[-1]
#         self.prediction_layer = nn.Linear(prediction_input_dim, 1)
#
#         # Initialize weights
#         self._init_weights()
#
#     def _init_weights(self):
#         # Use Xavier initialization for embeddings and linear layers
#         nn.init.xavier_uniform_(self.mf_user_embedding.weight)
#         nn.init.xavier_uniform_(self.mf_item_embedding.weight)
#         nn.init.xavier_uniform_(self.mlp_user_embedding.weight)
#         nn.init.xavier_uniform_(self.mlp_item_embedding.weight)
#
#         for module in self.mlp.modules():
#             if isinstance(module, nn.Linear):
#                 nn.init.xavier_uniform_(module.weight)
#
#         nn.init.xavier_uniform_(self.prediction_layer.weight)
#         self.prediction_layer.bias.data.fill_(0.01)
#
#     def forward(self, users, items):
#         # --- GMF Path ---
#         mf_user_emb = self.mf_user_embedding(users)
#         mf_item_emb = self.mf_item_embedding(items)
#         mf_vector = mf_user_emb * mf_item_emb # Element-wise product
#
#         # --- MLP Path ---
#         mlp_user_emb = self.mlp_user_embedding(users)
#         mlp_item_emb = self.mlp_item_embedding(items)
#         mlp_vector = torch.cat([mlp_user_emb, mlp_item_emb], dim=1)
#         mlp_vector = self.mlp(mlp_vector)
#
#         # --- Concatenate GMF and MLP outputs ---
#         combined_vector = torch.cat([mf_vector, mlp_vector], dim=1)
#
#         # --- Final Prediction ---
#         prediction = self.prediction_layer(combined_vector)
#
#         return prediction.squeeze()

In [24]:
dataiter = next(iter(train_loader))
print(dataiter)

{'users': tensor([ 9871, 17857,  4363,  ..., 19869,  5576,   206]), 'items': tensor([ 7419, 10677,  8970,  ...,  2236,  6270,  2199]), 'ratings': tensor([5., 4., 5.,  ..., 1., 5., 3.])}


In [25]:
n_users = df["user_enc"].nunique()
n_items = df["item_enc"].nunique()

user_embed = torch.nn.Embedding(n_users, 32)
item_embed = torch.nn.Embedding(n_items, 32)
linear = torch.nn.Linear(32, 1)

In [26]:
ue = user_embed(dataiter["users"])
ie = item_embed(dataiter["items"])
print(f"ue {ue.size()}")
print(f"ue {ie}")
print(f"ie {ie.size()}")
print(f"ie {ie}")

ue torch.Size([2048, 32])
ue tensor([[ 6.1860e-01, -1.4095e+00, -6.6337e-01,  ..., -5.1911e-01,
          3.8000e-01, -6.4145e-01],
        [-4.2898e-01, -1.1247e+00,  9.6239e-01,  ...,  7.0314e-01,
         -7.9808e-01, -2.0124e+00],
        [-2.4749e+00,  6.1989e-01,  2.0604e+00,  ...,  2.7381e+00,
          1.2434e+00,  1.7003e+00],
        ...,
        [ 6.3711e-01,  1.6581e+00, -4.3391e-01,  ..., -2.8884e-01,
          1.2085e+00,  6.3904e-01],
        [-6.3111e-01,  4.7434e-01, -5.8121e-01,  ..., -3.7808e-01,
          1.2493e-01, -3.3517e+00],
        [-7.1588e-01, -2.0351e+00,  1.4031e-01,  ...,  6.1738e-01,
          3.0129e-01, -8.2433e-04]], grad_fn=<EmbeddingBackward0>)
ie torch.Size([2048, 32])
ie tensor([[ 6.1860e-01, -1.4095e+00, -6.6337e-01,  ..., -5.1911e-01,
          3.8000e-01, -6.4145e-01],
        [-4.2898e-01, -1.1247e+00,  9.6239e-01,  ...,  7.0314e-01,
         -7.9808e-01, -2.0124e+00],
        [-2.4749e+00,  6.1989e-01,  2.0604e+00,  ...,  2.7381e+00,
       

In [27]:
output = (ue * ie).sum(dim=1)
print(f"output {output.size()}")
print(f"output {output}")

output torch.Size([2048])
output tensor([-0.3017, -1.7871, 11.6894,  ...,  2.8224, 11.4021,  0.9011],
       grad_fn=<SumBackward1>)


In [28]:
def train_one_epoch(model, optimizer, loss_fn, train_loader, device):
    model.train()
    total_loss = 0.0
    pbar = tqdm(train_loader, desc="Training", unit="batch", leave=False)
    for batch in pbar:
        users = batch["users"].to(device)
        items = batch["items"].to(device)
        ratings = batch["ratings"].to(device)

        optimizer.zero_grad()
        predictions = model(users, items)
        loss = loss_fn(predictions, ratings)
        loss.backward()
        optimizer.step()

        total_loss += loss.item() * users.size(0)
        pbar.set_postfix({"loss": loss.item()})

    return total_loss / len(train_loader.dataset)

def evaluate(model, loss_fn, data_loader, device):
    model.eval()
    target_rating_list = []
    model_output_list = []
    total_loss = 0.0

    with torch.no_grad():
        pbar = tqdm(data_loader, desc="Evaluating", unit="batch", leave=False)
        for batch in pbar:
            users = batch["users"].to(device)
            items = batch["items"].to(device)
            ratings = batch["ratings"].to(device)

            predictions = model(users, items)
            predictions = predictions.clamp(min=1.0, max=5.0)
            loss = loss_fn(predictions, ratings)
            total_loss += loss.item() * users.size(0)

            target_rating_list.extend(ratings.cpu().numpy())
            model_output_list.extend(predictions.cpu().numpy())
            pbar.set_postfix({"loss": loss.item()})

    rmse = np.sqrt(np.mean((np.array(model_output_list) - np.array(target_rating_list))**2))
    avg_loss = total_loss / len(data_loader.dataset)
    return avg_loss, rmse

In [29]:
n_users = df["user_enc"].nunique()
n_items = df["item_enc"].nunique()
epochs = 10

model = MFModel(n_users, n_items, embedding_dim=32).to(DEVICE)
# model = NeuralMF(n_users, n_items, mf_embedding_dim=32, mlp_embedding_dim=32, mlp_layers=[64, 32, 16]).to(DEVICE)

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.7)
loss_fn = torch.nn.MSELoss()

epoch_train_losses, epoch_val_losses = [], []
best_val_rmse = float('inf')

for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")

    # TRAINING
    train_loss = train_one_epoch(model, optimizer, loss_fn, train_loader, DEVICE)
    epoch_train_losses.append(train_loss)

    # VALIDATION (not test set!)
    # The lists are now local to the evaluate function, fixing the bug
    val_loss, val_rmse = evaluate(model, loss_fn, test_loader, DEVICE)
    epoch_val_losses.append(val_loss)

    print(f"  Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f} | Val RMSE: {val_rmse:.4f}")

    # Model Checkpointing
    if val_rmse < best_val_rmse:
        best_val_rmse = val_rmse
        torch.save(model.state_dict(), 'best_model.pth')
        print(f"  New best model saved with RMSE: {best_val_rmse:.4f}")

    scheduler.step()

print("\nTraining finished.")
print(f"Best validation RMSE: {best_val_rmse:.4f}")

Epoch 1/10


                                                                         

  Train Loss: 18.6891 | Val Loss: 12.1352 | Val RMSE: 3.4836
  New best model saved with RMSE: 3.4836
Epoch 2/10


                                                                         

  Train Loss: 16.6690 | Val Loss: 12.1352 | Val RMSE: 3.4836
Epoch 3/10


                                                                         

  Train Loss: 14.7952 | Val Loss: 12.1348 | Val RMSE: 3.4835
  New best model saved with RMSE: 3.4835
Epoch 4/10


                                                                         

  Train Loss: 13.0027 | Val Loss: 11.9615 | Val RMSE: 3.4585
  New best model saved with RMSE: 3.4585
Epoch 5/10


                                                                         

KeyboardInterrupt: 