# Matrix Factorization

## Prepare the data

In [None]:
# Fetch and unpack the MovieLens "latest small" archive.
# -nc: skip the download when the archive is already present (avoids creating
#      ml-latest-small.zip.1 on re-runs).
# -o : overwrite extracted files without prompting, since an interactive
#      prompt cannot be answered from a notebook cell.
!wget -nc https://files.grouplens.org/datasets/movielens/ml-latest-small.zip
!unzip -o ml-latest-small.zip

## Parameters

In [None]:
# Hyperparameters and runtime configuration shared by the rest of the notebook.
import torch  # imported here only to probe CUDA availability

use_gpu = True  # request GPU execution when a CUDA device is present
batch_size = 256  # samples per dataloader batch
learning_rate = 0.001  # optimizer step size
embedding_dim = 16  # width of each user/item embedding vector
epochs = 500  # number of passes over the training split
# Fall back to the CPU when CUDA is requested but not available, so the
# notebook still runs on machines without a GPU instead of crashing.
device = "cuda" if use_gpu and torch.cuda.is_available() else "cpu"

## Imports and initialization

In [None]:
import torch
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [None]:
# Load the ratings table from the archive extracted by the download cell.
ratings_df = pd.read_csv(filepath_or_buffer="ml-latest-small/ratings.csv")

In [None]:
# Preview the first five rows to check that the file was parsed as expected.
ratings_df.head(n=5)

From here on we make a slight change of notation and refer to movies as items, which is the more generic term in recommendation.

In [None]:
# Use the generic recommender term "item" for the movie identifier column.
ratings_df = ratings_df.rename(
    columns={"movieId": "itemId"},
)

In [None]:
# Largest raw identifiers in the data; used later to size the embedding tables.
max_user_id = ratings_df["userId"].max()
max_item_id = ratings_df["itemId"].max()

In [None]:
# Divide every rating by 5.0; the logging code later multiplies losses by 5.0.
ratings_df["rating"] = ratings_df["rating"].div(5.0)

Produce the following splits: 60% train, 20% val, 20% test

In [None]:
# Stage 1: keep 60% of the ratings for training, stratified on the rating
# value so each split has the same rating distribution.
df_train, df_val_test = train_test_split(
    ratings_df,
    train_size=0.6,
    stratify=ratings_df["rating"],
)
# Stage 2: halve the remaining 40% into validation and test (20% each).
df_val, df_test = train_test_split(
    df_val_test,
    train_size=0.5,
    stratify=df_val_test["rating"],
)

In [None]:
def df_to_dataset(df):
    """Wrap a ratings frame as a ``TensorDataset`` of (ids, rating) samples.

    Args:
        df: DataFrame providing ``userId``, ``itemId`` and ``rating`` columns.

    Returns:
        A ``torch.utils.data.TensorDataset`` whose first tensor holds one
        ``[userId, itemId]`` pair per row and whose second tensor holds the
        matching ratings cast to ``float32``.
    """
    id_pairs = torch.tensor(df[["userId", "itemId"]].to_numpy())
    ratings = torch.tensor(df["rating"].to_numpy()).to(torch.float32)
    return torch.utils.data.TensorDataset(id_pairs, ratings)

# Build one tensor dataset per split.
train_dataset, val_dataset, test_dataset = (
    df_to_dataset(split) for split in (df_train, df_val, df_test)
)

# Wrap each dataset in a dataloader. Only the training loader shuffles;
# validation and test keep a fixed order.
train_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True
)
val_loader = torch.utils.data.DataLoader(
    val_dataset, batch_size=batch_size, shuffle=False
)
test_loader = torch.utils.data.DataLoader(
    test_dataset, batch_size=batch_size, shuffle=False
)

## Model definition

In [None]:
class Model(torch.nn.Module):
    """Matrix-factorization model scoring a (user, item) pair by a dot product."""

    def __init__(self, embedding_dim, cardinalities):
        """Create the user and item embedding tables.

        Args:
            embedding_dim: Width of every embedding vector.
            cardinalities: Mapping with ``"user"`` and ``"item"`` entries. Each
                table is allocated with ``value + 1`` rows, so ids from 0 up to
                and including ``value`` are valid lookup indices.
        """
        super().__init__()
        self.user_embedding_table = torch.nn.Embedding(cardinalities["user"] + 1, embedding_dim)
        self.item_embedding_table = torch.nn.Embedding(cardinalities["item"] + 1, embedding_dim)

    def interaction(self, user_embeddings, item_embeddings):
        """Return the per-row dot product of two ``(batch, dim)`` tensors.

        Both operands share the ``d`` subscript so that matching components are
        multiplied before summing. Using two different subscripts would instead
        yield ``sum(user) * sum(item)`` per row, which is not a dot product.
        """
        return torch.einsum('bd,bd->b', user_embeddings, item_embeddings)

    def forward(self, data):
        """Predict one score per row of ``data``.

        Args:
            data: Integer tensor whose column 0 holds user ids and column 1
                holds item ids.

        Returns:
            1-D tensor with one predicted score per input row.
        """
        # Assume that data has the form of (uid, iid)
        user_ids = data[:, 0]
        item_ids = data[:, 1]
        # Embed
        user_embeddings = self.user_embedding_table(user_ids)
        item_embeddings = self.item_embedding_table(item_ids)
        # Reconstruct
        reconstruction = self.interaction(user_embeddings, item_embeddings)
        return reconstruction

In [None]:
def train_epoch(model, dataloader, optimizer, criterion, device, epoch, print_every=100):
    """Run one optimisation pass over ``dataloader`` and report the mean loss.

    Args:
        model: Module called as ``model(inputs)`` to produce predictions.
        dataloader: Iterable yielding ``(inputs, feedback_gt)`` tensor pairs.
        optimizer: Optimizer updating the model parameters.
        criterion: Callable ``criterion(prediction, target)`` returning a
            scalar loss tensor.
        device: Device the batches are moved to before the forward pass.
        epoch: Epoch index, used only in the progress messages.
        print_every: Print the current loss every this many steps.

    Returns:
        ``{"loss": value}`` where ``value`` is the mean per-batch loss
        multiplied by 5.0. This is the same scale as the printed progress
        losses and as ``evaluate``, so train and validation curves are directly
        comparable. ``value`` is NaN when the dataloader yields no batches.
    """
    model.train()
    step_losses = []
    for current_step, (inputs, feedback_gt) in enumerate(dataloader):
        inputs = inputs.to(device)
        feedback_gt = feedback_gt.to(device)
        # Forward
        feedback_pred = model(inputs)
        loss = criterion(feedback_pred, feedback_gt)
        # Backward
        optimizer.zero_grad()
        loss.backward()
        # Optimizer step
        optimizer.step()
        # Log: detach so the stored value holds no autograd references, and
        # apply the same x5 scaling that is used for printing and evaluation.
        step_losses.append(loss.detach() * 5.0)
        if current_step % print_every == 0:
            print(f"[{epoch}, {current_step}] Loss: {loss.item() * 5.0}")
    if not step_losses:
        # torch.stack raises on an empty list; report "no data" explicitly.
        return {"loss": float("nan")}
    loss_avg = torch.mean(torch.stack(step_losses)).item()
    return {"loss": loss_avg}

In [None]:
@torch.no_grad()
def evaluate(model, dataloader, device, criterion=None):
    """Compute the mean loss of ``model`` over ``dataloader`` without gradients.

    Args:
        model: Module called as ``model(inputs)`` to produce predictions.
        dataloader: Iterable yielding ``(inputs, feedback_gt)`` tensor pairs.
        device: Device the batches are moved to before the forward pass.
        criterion: Optional callable ``criterion(prediction, target)`` returning
            a scalar loss tensor. Defaults to ``torch.nn.MSELoss()``, the loss
            this notebook trains with. Taking it as an argument removes the
            previous hidden dependency on a module-level ``criterion`` variable.

    Returns:
        ``{"loss": value}`` where ``value`` is the mean per-batch loss
        multiplied by 5.0, or NaN when the dataloader yields no batches.
    """
    if criterion is None:
        criterion = torch.nn.MSELoss()
    model.eval()
    batch_losses = []
    for inputs, feedback_gt in dataloader:
        inputs = inputs.to(device)
        feedback_gt = feedback_gt.to(device)
        # Forward
        feedback_pred = model(inputs)
        loss = criterion(feedback_pred, feedback_gt)
        batch_losses.append(loss * 5.0)
    if not batch_losses:
        # torch.stack raises on an empty list; report "no data" explicitly.
        return {"loss": float("nan")}
    loss_avg = torch.mean(torch.stack(batch_losses)).item()
    return {"loss": loss_avg}

In [None]:
# Instantiate the model with embedding tables sized from the largest raw ids,
# then move it to the configured device.
model = Model(
    embedding_dim,
    dict(user=max_user_id, item=max_item_id),
).to(device)
# Mean-squared error between predicted and normalised ratings.
criterion = torch.nn.MSELoss()
# Plain SGD over all model parameters.
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

## Training

In [None]:
# Per-epoch loss history, plotted in the results section.
train_epoch_losses, val_epoch_losses = [], []
for epoch in range(epochs):
    # One optimisation pass over the training split, logging every 50 steps.
    train_stats = train_epoch(
        model, train_loader, optimizer, criterion, device, epoch, print_every=50
    )
    # Score the updated model on the validation split.
    val_stats = evaluate(model, val_loader, device)

    # Record both losses for this epoch.
    train_epoch_losses.append(train_stats["loss"])
    val_epoch_losses.append(val_stats["loss"])

    print(f"[{epoch}] Validation loss: {val_stats['loss']}")

## Results

Let us look at the results of training: the loss curves on the training and validation sets.

In [None]:
# Plot the per-epoch loss curves recorded during training.
plt.plot(train_epoch_losses, label="Train")
# These values come from the validation split, not the test split, so the
# legend entry must say "Validation".
plt.plot(val_epoch_losses, label="Validation")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()
plt.show()

Finally, evaluate the model on the test set.

In [None]:
# Score the trained model once on the held-out test split.
test_stats = evaluate(model=model, dataloader=test_loader, device=device)

In [None]:
# Report the loss measured on the test split.
final_test_loss = test_stats["loss"]
print(f"Test set loss: {final_test_loss}")

To get a better loss, you could try tuning the hyperparameters, such as the learning rate, optimizer, and batch size. You could also try a bigger dataset such as MovieLens 25M.