In [1]:
%load_ext autoreload
%autoreload 2

import torch
from torchmetrics import MeanSquaredError
import numpy as np
import pandas as pd
from dynaconf import Dynaconf
import lightning as L

from src.models.mf_with_bias import MatrixFactorizationWithBias
from src.lit_models.base import LightningModel

from src.datasets.movielens import MovielensDataModule

## Load config with dynaconf

In [2]:
cfg = Dynaconf(root_path="configs", settings_files=["config_mf.yaml"])

Load model inputs from the config file

In [3]:
n_users = cfg.model.pytorch_model.init_args.n_users
n_items = cfg.model.pytorch_model.init_args.n_items
n_factors = cfg.model.pytorch_model.init_args.n_factors
print(f"{n_users=}, {n_items=}, {n_factors=}")

n_users=943, n_items=1625, n_factors=128


## Load model

In [4]:
pytorch_model = MatrixFactorizationWithBias(n_users, n_items, 128)
pytorch_model

MatrixFactorizationWithBias(
  (user_emb): Embedding(943, 128)
  (user_bias): Embedding(943, 1)
  (item_emb): Embedding(1625, 128)
  (item_bias): Embedding(1625, 1)
)

In [5]:
checkpoint_file = "lightning_logs/embedding_dim/version_1/checkpoints/best_model.ckpt"

# load weights
model = LightningModel.load_from_checkpoint(
    checkpoint_path=checkpoint_file, pytorch_model=pytorch_model
)

In [6]:
device = model.device
print(device)

cpu


In [7]:
dm = MovielensDataModule(dataset="ml-100k", target="rating", batch_size=32)
dm.setup(stage="test")

## Predict on new data

Here we are going to use test set as a new data but, of course, we could use any new dataset.

In [8]:
test_dataloader = dm.test_dataloader()

In [9]:
# Get first batch of data
for batch_data in test_dataloader:
    users = batch_data["user"].to(device)
    items = batch_data["item"].to(device)
    ratings = batch_data["rating"].to(device)
    break

### Pytorch

We can make our predictions with plain Pytorch our using the Lighning Trainer.

Docs: https://lightning.ai/docs/pytorch/stable/deploy/production_intermediate.html

In [10]:
model.eval()
with torch.inference_mode():
    y_hat = model(users, items)  #* (5.5 - 1) + 1

y_hat

tensor([1.9380, 2.1209, 1.5993, 2.1122, 2.0556, 1.7461, 2.0709, 1.9205, 1.8691,
        1.5280, 1.5109, 1.1762, 1.3715, 1.8609, 2.2000, 2.2489, 2.2388, 2.3173,
        2.0523, 2.7145, 2.6110, 2.5191, 2.5386, 1.6416, 2.2497, 2.2378, 2.0332,
        1.7170, 2.5124, 2.0731, 2.1645, 2.2891])

Which is equivalent to call the `forward` of the pytorch model directly 

In [11]:
model.eval()
with torch.inference_mode():
    print(model.pytorch_model(users, items))

tensor([1.9380, 2.1209, 1.5993, 2.1122, 2.0556, 1.7461, 2.0709, 1.9205, 1.8691,
        1.5280, 1.5109, 1.1762, 1.3715, 1.8609, 2.2000, 2.2489, 2.2388, 2.3173,
        2.0523, 2.7145, 2.6110, 2.5191, 2.5386, 1.6416, 2.2497, 2.2378, 2.0332,
        1.7170, 2.5124, 2.0731, 2.1645, 2.2891])


In [12]:
# mse = MeanSquaredError().to(device)
# pred_list = []
# for batch_data in test_dataloader:
#     users = batch_data["user"].to(device)
#     items = batch_data["item"].to(device)
#     ratings = batch_data["rating"].to(device)
#     with torch.inference_mode():
#         y_hat = model.predict_step(batch_data) * (5.5 - 1) + 1
#         pred_list.append(y_hat.cpu().detach().numpy().squeeze())
    
#     mse(y_hat, ratings)

### Using Lightning Trainer

In [13]:
trainer = L.Trainer(enable_checkpointing=False)
batched_predictions = trainer.predict(model, dataloaders=[test_dataloader])

GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


Predicting DataLoader 0: 100%|██████████| 501/501 [00:01<00:00, 361.43it/s]


For comparison, show first batch fo predictions. Why are they different from the pure pytorch predictions? That's because in the LightningModule predictions are transformed using `Sigmoid` to normalize the oputpus.

In [14]:
batched_predictions[0]

tensor([0.8741, 0.8929, 0.8319, 0.8921, 0.8865, 0.8515, 0.8880, 0.8722, 0.8664,
        0.8217, 0.8192, 0.7643, 0.7976, 0.8654, 0.9003, 0.9046, 0.9037, 0.9103,
        0.8862, 0.9379, 0.9316, 0.9255, 0.9268, 0.8378, 0.9046, 0.9036, 0.8842,
        0.8477, 0.9250, 0.8883, 0.8970, 0.9080])

Concatenate all predictions in a vector

In [15]:
predictions = torch.cat(batched_predictions)
predictions

tensor([0.8741, 0.8929, 0.8319,  ..., 0.7197, 0.7358, 0.6126])

Get true ratings

In [16]:
users = torch.cat([batch_data["user"] for batch_data in test_dataloader], dim=0)
items = torch.cat([batch_data["item"] for batch_data in test_dataloader], dim=0)
ratings = torch.cat([batch_data["rating"] for batch_data in test_dataloader], dim=0)
ratings

tensor([5., 3., 3.,  ..., 2., 3., 1.])

In [17]:
def scale_predictions(x, range=(1, 5.5)):
    min_y, max_y = range
    return x * (max_y - min_y) + min_y

In [18]:
mse = MeanSquaredError()
rmse = MeanSquaredError(squared=False)

scaled_predictions = scale_predictions(predictions)

test_mse = mse(scaled_predictions, ratings)
test_rmse = rmse(scaled_predictions, ratings)

print(f"Test MSE: {test_mse:.4f}")
print(f"Test RMSE: {torch.sqrt(test_rmse):.3f}")

Test MSE: 2.0449
Test RMSE: 1.196


In [19]:
test_mae = mse.compute()
print(f"Test MSE: {test_mse:.4f}")
print(f"Test RMSE: {torch.sqrt(test_mse):.3f}")

Test MSE: 2.0449
Test RMSE: 1.430


In [20]:
# Load mappng
import joblib


item2int = joblib.load("output/encoders/ml-100k/title_encoder.joblib")
int2item = {v: k for k, v in item2int.items()}

In [21]:
# veamos las diferencias...
data = torch.vstack([users, items, ratings, scaled_predictions]).detach().cpu().numpy().T

# Convert to DataFrame
df = pd.DataFrame.from_records(data, columns=["users_enc", "items_enc", "ratings", "predictions"])
df.insert(loc=0, column="title", value=df.items_enc.map(int2item))
df["error"] = abs(df.ratings - df.predictions)
df.head()

Unnamed: 0,title,users_enc,items_enc,ratings,predictions,error
0,Apt Pupil (1998),625.0,1489.0,5.0,4.933583,0.066417
1,"Peacemaker, The (1997)",625.0,994.0,3.0,5.018129,2.018129
2,Crash (1996),625.0,308.0,3.0,4.743651,1.743651
3,Starship Troopers (1997),625.0,1325.0,3.0,5.014387,2.014387
4,"Devil's Own, The (1997)",625.0,224.0,2.0,4.989292,2.989292


In [22]:
errors_df = df.groupby(["title"]).agg(error=("error", "mean"), count=("title", "count"))
errors_df.sort_values(by="error", ascending=True)

# Movies with lower error
errors_df.head()

Unnamed: 0_level_0,error,count
title,Unnamed: 1_level_1,Unnamed: 2_level_1
'Til There Was You (1997),1.388391,4
101 Dalmatians (1996),1.296285,12
12 Angry Men (1957),0.528588,23
187 (1997),1.362163,11
2 Days in the Valley (1996),1.154927,17


In [23]:
# Movies with biggest errors
errors_df.sort_values(by="error", ascending=False).head()

Unnamed: 0_level_0,error,count
title,Unnamed: 1_level_1,Unnamed: 2_level_1
"Old Lady Who Walked in the Sea, The (Vieille qui marchait dans la mer, La) (1991)",4.086637,1
In the Army Now (1994),4.081493,1
Mighty Morphin Power Rangers: The Movie (1995),4.073963,1
Herbie Rides Again (1974),4.055377,1
"Ciao, Professore! (1993)",4.023501,1
