# Evaluation of Recommender Systems

This notebook shows how to evaluate the performance of a recommender
system using the tools provided by the `evaluation.benchmark` module.

Copyright 2024 Bernardo C. Rodrigues

See COPYING file for license details

In [1]:
# Setup notebook
import random
import numpy as np
import evaluation.plot as plot

# Load the autoreload extension
%load_ext autoreload

# Set autoreload to reload all modules every time a cell is executed
%autoreload 2

# Customize the default plot template (helper from evaluation.plot)
plot.customize_default_template()

# Seed NumPy's global generator and the standard library's generator with the
# same value so that repeated runs of the notebook draw the same random numbers
seed = 0
np.random.seed(seed)
random.seed(seed)

In [2]:
from dataset.common import resolve_folds
from dataset.movie_lens import load_ml_100k_folds

# Load the dataset and its fold description, then resolve them into folds
# (presumably the MovieLens 100k ratings, judging by the loader's name)
data, k_fold = load_ml_100k_folds()
folds = resolve_folds(data, k_fold)

# Each resolved fold is a pair; its second element is the (trainset, testset)
# split, which is all the benchmark helpers used below receive
folds_without_index = [fold[1] for fold in folds]

# Keep the first fold's split for the single-fold example. Unpacking from
# folds_without_index avoids binding `_`, which would otherwise stop IPython
# from updating its "last output" variable for the rest of the session.
trainset, testset = folds_without_index[0]

Already downloaded!. Nothing to do.


In [3]:
from evaluation.strategies import (
    MAEStrategy,
    RMSEStrategy,
    MicroAveragedRecallStrategy,
    MacroAveragedRecallStrategy,
    RecallAtKStrategy,
    MicroAveragedPrecisionStrategy,
    MacroAveragedPrecisionStrategy,
    PrecisionAtKStrategy,
    F1ScoreStrategy,
    NDCGStrategy,
    PredictionCoverageStrategy,
)

# Parameters shared by the measures below, named once so that they cannot
# drift apart between strategies.
# NOTE(review): presumably ratings >= RELEVANCE_THRESHOLD count as relevant and
# TOP_K is the ranking cutoff -- confirm against evaluation.strategies.
RELEVANCE_THRESHOLD = 4.0
TOP_K = 20

# No measures are computed on the training split in this notebook
train_measures = []

# Measures computed on the test split of every fold
test_measures = [
    MAEStrategy(verbose=False),
    RMSEStrategy(verbose=False),
    MicroAveragedRecallStrategy(threshold=RELEVANCE_THRESHOLD),
    MacroAveragedRecallStrategy(threshold=RELEVANCE_THRESHOLD),
    RecallAtKStrategy(k=TOP_K, threshold=RELEVANCE_THRESHOLD),
    MicroAveragedPrecisionStrategy(threshold=RELEVANCE_THRESHOLD),
    MacroAveragedPrecisionStrategy(threshold=RELEVANCE_THRESHOLD),
    PrecisionAtKStrategy(k=TOP_K, threshold=RELEVANCE_THRESHOLD),
    F1ScoreStrategy(k=TOP_K, threshold=RELEVANCE_THRESHOLD),
    NDCGStrategy(k=TOP_K, threshold=RELEVANCE_THRESHOLD),
    PredictionCoverageStrategy(),
]


`fit_and_score` benchmarks the recommender system against a single fold.

In [4]:
from evaluation.benchmark import fit_and_score
from surprise.prediction_algorithms import SVD

# SVD with its default hyper-parameters is the recommender under evaluation
recommender = SVD()

# Benchmark it on the first fold only: the call hands back the test and train
# measurements together with the fit and test timings
test_measurements, train_measurements, fit_time, test_time = fit_and_score(
    recommender_system=recommender,
    trainset=trainset,
    testset=testset,
    train_measures=train_measures,
    test_measures=test_measures,
    verbose=True,
)

# One row per test measure: name left-aligned in a 30-character column,
# value rounded to three decimals
for measure, measurement in test_measurements.items():
    print(f"{measure:<30}|  {measurement:.3f}")

# Timings use the same column layout as the measure rows above
print(f"Fit time:                     |  {fit_time:.3f}")
print(f"Test time:                    |  {test_time:.3f}")

Completed 20000/20000 | Avg. time/task: 0m 0.0s | Time left: 0m 0.0s
All tasks completed.
Total time: 0h 0m 3.8s
mae                           |  0.750
rmse                          |  0.951
micro_averaged_recall         |  0.383
macro_averaged_recall         |  0.334
recall_at_20                  |  0.372
micro_averaged_precision      |  0.842
macro_averaged_precision      |  0.696
precision_at_20               |  0.770
f1_score                      |  0.502
nDCG_at_20                    |  0.443
prediction_coverage           |  1.000
Fit time:                     |  0.276
Test time:                    |  3.846


`cross_validate` benchmarks the recommender system against multiple folds and returns a list of scores.

In [10]:
from evaluation.benchmark import cross_validate

# Benchmark the recommender on every fold, using up to 16 workers.
# NOTE(review): `recommender` is the instance already fitted by the single-fold
# example; presumably cross_validate refits it for each fold -- confirm.
measurements = cross_validate(
    recommender_system=recommender,
    folds=folds_without_index,
    train_measures=train_measures,
    test_measures=test_measures,
    verbose=True,
    max_workers=16,
)

Completed 5/5 | Avg. time/task: 0m 0.2s | Time left: 0m 0.0s | Estimated completion time: 17:09:28
All tasks completed.
Total time: 0h 0m 1.1s

In [None]:
# Print one row per measure: the name left-aligned in a 30-character column,
# followed by whatever cross_validate returned for it (printed unformatted)
for measure, measurement in measurements.items():
    print(f"{measure:<30}|  {measurement}")

`cross_validate_recommenders` benchmarks a list of recommenders using the provided folds and measures. The function returns a dictionary with the measurements for each recommender.

In [None]:
from surprise.prediction_algorithms import SVD
from evaluation.benchmark import cross_validade_recommenders

# SVD variants to compare, one per (n_factors, n_epochs) pair, in this order
recommenders = [
    SVD(n_factors=n_factors, n_epochs=n_epochs)
    for n_factors, n_epochs in [(10, 10), (10, 20), (20, 20), (20, 40), (40, 40)]
]

# Benchmark every variant on the same folds with the same measures.
# NOTE(review): the imported helper is spelled `cross_validade_recommenders`
# ("validade") -- confirm this matches the name defined in evaluation.benchmark.
recommenders_measurements = cross_validade_recommenders(
    recommenders=recommenders,
    folds=folds_without_index,
    train_measures=train_measures,
    test_measures=test_measures,
    verbose=True,
    max_workers=16,
)

In [None]:
# NOTE(review): this loop treats the result as an iterable of mappings (one per
# recommender), while the notebook text describes the return value as a
# dictionary -- confirm which shape cross_validade_recommenders returns.
for recommender_measurements in recommenders_measurements:
    # Rows carry no recommender label; blocks are printed in iteration order
    for measure, measurement in recommender_measurements.items():
        print(f"{measure:<30}|  {measurement}")

In [None]:
from surprise.prediction_algorithms import SVD
from evaluation.benchmark import GridSearch

# Candidate values for each SVD keyword argument: 3 x 3 x 2 = 18 combinations
# if the search is exhaustive (presumably it is -- confirm in GridSearch)
parameters_grid = {
    "n_factors": [50, 100, 200],
    "n_epochs": [10, 20, 40],
    "biased": [True, False],
}

# The SVD class itself (not an instance) is handed over together with the grid
# and the same test/train measures used by the earlier examples
grid_search = GridSearch(
    SVD,
    parameters_grid,
    test_measures,
    train_measures,
    max_workers=16,
)

# `best` is read by the reporting cell that follows as a mapping from measure
# name to a dict with 'parameters' and 'mean' entries; `ordering` and `raw`
# are not used in the visible part of this notebook
best, ordering, raw = grid_search.fit(folds_without_index)

In [None]:
# One row per measure: its name, the mean stored for it (three decimals) and
# the parameter set stored alongside that mean
for measure, result in best.items():
    mean, parameters = result["mean"], result["parameters"]

    print(f"{measure:<30}|  {mean:.3f}  |  {parameters}")