In [None]:
# Load IPython's autoreload extension so that edits to imported modules
# are picked up without restarting the kernel
%load_ext autoreload

# Mode 2: reload all modules every time a cell is executed
%autoreload 2

In [None]:
# Seed Python's and NumPy's global random generators with one fixed value
# so that repeated runs draw the same pseudo-random numbers.

import random

import numpy as np

seed = 0
np.random.seed(seed)
random.seed(seed)

In [None]:
from pathlib import Path
from scripts.generate_movielens_folds import download_movielens, generate_folds

# Working directory for the experiment data; the "ml-100k" folder inside it
# is the path handed to the fold generator.
output_dir = Path("/tmp/folds")
movielens_path = output_dir.joinpath("ml-100k")

# Project helpers: presumably fetch MovieLens into `output_dir` and write the
# fold files under `movielens_path` (verify against scripts.generate_movielens_folds).
download_movielens(output_dir)
generate_folds(movielens_path)

In [None]:
from surprise import Dataset, Reader
from surprise.model_selection import PredefinedKFold

# Reader configured for the MovieLens 100k file layout
reader = Reader("ml-100k")

# One (train file, test file) pair per predefined fold: u1 .. u5
folds_files = [
    (movielens_path / f"u{fold_number}.base", movielens_path / f"u{fold_number}.test")
    for fold_number in range(1, 6)
]

data = Dataset.load_from_folds(folds_files, reader=reader)
pkf = PredefinedKFold()

# Pair every fold with its index so it is easier to track
folds = list(enumerate(pkf.split(data)))

In [None]:
# Hyperparameters and evaluation settings used by the recommender runs below
EPOCHS = 10000                           # Number of training epochs for the models that take an `epochs` argument
WEIGHTS_BINARIZATION_THRESHOLD = 0.7    # Threshold that controls which weights are considered part of an itemset
BINARIZATION_THRESHOLD = 4              # Threshold for a rating to be considered True
KNN_K = 20                              # Number of neighbors considered by the KNN and KNN-based algorithms
RELEVANCE_THRESHOLD = 4                 # Threshold for a rating to be considered relevant or selected
NUMBER_OF_TOP_RECOMMENDATIONS = 20      # Number of top recommendations considered in Precision@K and Recall@K

In [None]:
import time
import itertools
import statistics
from multiprocessing import Pool, Manager, cpu_count

from recommenders.grecond_recommender import GreConDRecommender
from recommenders.common import cosine_distance

# Manager-backed dictionary shared with the worker processes; it is passed to
# every worker and read back once the pool has finished.
manager = Manager()
output = manager.dict()

# Build one argument tuple per (fold, recommender) combination
recommenders = [
    GreConDRecommender(
        dataset_binarization_threshold=BINARIZATION_THRESHOLD,
        knn_k=KNN_K,
        knn_distance_strategy=cosine_distance,
    )
]
thread_args = list(
    itertools.product(
        folds, [output], recommenders, [RELEVANCE_THRESHOLD], [NUMBER_OF_TOP_RECOMMENDATIONS]
    )
)

# Run the workers in a process pool. perf_counter() is a monotonic clock, so the
# measured duration is not skewed by system clock adjustments (unlike time.time()).
start_time = time.perf_counter()
with Pool(cpu_count()) as pool:
    pool.starmap(GreConDRecommender.thread, iterable=thread_args)
end_time = time.perf_counter()
grecond_runtime = end_time - start_time

# Snapshot the collected results once; only the values are needed, the keys are not.
grecond_results = list(output.values())

# Gather each metric across all collected results
grecond_actual_coverage = [result["actual_coverage"] for result in grecond_results]
grecond_number_of_factors = [result["number_of_factors"] for result in grecond_results]
grecond_mae = [result["mae"] for result in grecond_results]
grecond_rmse = [result["rmse"] for result in grecond_results]
grecond_micro_averaged_recall = [result["micro_averaged_recall"] for result in grecond_results]
grecond_macro_averaged_recall = [result["macro_averaged_recall"] for result in grecond_results]
grecond_recall_at_k = [result["recall_at_k"] for result in grecond_results]
grecond_micro_averaged_precision = [result["micro_averaged_precision"] for result in grecond_results]
grecond_macro_averaged_precision = [result["macro_averaged_precision"] for result in grecond_results]
grecond_precision_at_k = [result["precision_at_k"] for result in grecond_results]

# Average every metric over the collected results
mean_grecond_actual_coverage = statistics.mean(grecond_actual_coverage)
mean_grecond_number_of_factors = statistics.mean(grecond_number_of_factors)
mean_grecond_mae = statistics.mean(grecond_mae)
mean_grecond_rmse = statistics.mean(grecond_rmse)
mean_grecond_micro_averaged_recall = statistics.mean(grecond_micro_averaged_recall)
mean_grecond_macro_averaged_recall = statistics.mean(grecond_macro_averaged_recall)
mean_grecond_recall_at_k = statistics.mean(grecond_recall_at_k)
mean_grecond_micro_averaged_precision = statistics.mean(grecond_micro_averaged_precision)
mean_grecond_macro_averaged_precision = statistics.mean(grecond_macro_averaged_precision)
mean_grecond_precision_at_k = statistics.mean(grecond_precision_at_k)

In [None]:
from surprise.prediction_algorithms import KNNBasic
from recommenders.common import generic_thread

# Fresh shared dictionary for this run, created from the notebook's existing
# Manager; it is passed to every worker and read back once the pool has finished.
output = manager.dict()

# Build one argument tuple per (fold, recommender) combination
recommenders = [KNNBasic(k=KNN_K, sim_options={"name": "cosine"})]
thread_args = list(
    itertools.product(
        folds, [output], recommenders, [RELEVANCE_THRESHOLD], [NUMBER_OF_TOP_RECOMMENDATIONS]
    )
)

# Run the workers in a process pool. perf_counter() is a monotonic clock, so the
# measured duration is not skewed by system clock adjustments (unlike time.time()).
start_time = time.perf_counter()
with Pool(cpu_count()) as pool:
    pool.starmap(generic_thread, iterable=thread_args)
end_time = time.perf_counter()
knn_runtime = end_time - start_time

# Snapshot the collected results once; only the values are needed, the keys are not.
knn_results = list(output.values())

# Gather each metric across all collected results
knn_mae = [result["mae"] for result in knn_results]
knn_rmse = [result["rmse"] for result in knn_results]
knn_micro_averaged_recall = [result["micro_averaged_recall"] for result in knn_results]
knn_macro_averaged_recall = [result["macro_averaged_recall"] for result in knn_results]
knn_recall_at_k = [result["recall_at_k"] for result in knn_results]
knn_micro_averaged_precision = [result["micro_averaged_precision"] for result in knn_results]
knn_macro_averaged_precision = [result["macro_averaged_precision"] for result in knn_results]
knn_precision_at_k = [result["precision_at_k"] for result in knn_results]

# Average every metric over the collected results
mean_knn_mae = statistics.mean(knn_mae)
mean_knn_rmse = statistics.mean(knn_rmse)
mean_knn_micro_averaged_recall = statistics.mean(knn_micro_averaged_recall)
mean_knn_macro_averaged_recall = statistics.mean(knn_macro_averaged_recall)
mean_knn_recall_at_k = statistics.mean(knn_recall_at_k)
mean_knn_micro_averaged_precision = statistics.mean(knn_micro_averaged_precision)
mean_knn_macro_averaged_precision = statistics.mean(knn_macro_averaged_precision)
mean_knn_precision_at_k = statistics.mean(knn_precision_at_k)

In [None]:
from surprise.prediction_algorithms import SVD
from recommenders.common import generic_thread

# Fresh shared dictionary for this run, created from the notebook's existing
# Manager; it is passed to every worker and read back once the pool has finished.
output = manager.dict()

# Build one argument tuple per (fold, recommender) combination
recommenders = [SVD()]
thread_args = list(
    itertools.product(
        folds, [output], recommenders, [RELEVANCE_THRESHOLD], [NUMBER_OF_TOP_RECOMMENDATIONS]
    )
)

# Run the workers in a process pool. perf_counter() is a monotonic clock, so the
# measured duration is not skewed by system clock adjustments (unlike time.time()).
start_time = time.perf_counter()
with Pool(cpu_count()) as pool:
    pool.starmap(generic_thread, iterable=thread_args)
end_time = time.perf_counter()
svd_runtime = end_time - start_time

# Snapshot the collected results once; only the values are needed, the keys are not.
svd_results = list(output.values())

# Gather each metric across all collected results
svd_mae = [result["mae"] for result in svd_results]
svd_rmse = [result["rmse"] for result in svd_results]
svd_micro_averaged_recall = [result["micro_averaged_recall"] for result in svd_results]
svd_macro_averaged_recall = [result["macro_averaged_recall"] for result in svd_results]
svd_recall_at_k = [result["recall_at_k"] for result in svd_results]
svd_micro_averaged_precision = [result["micro_averaged_precision"] for result in svd_results]
svd_macro_averaged_precision = [result["macro_averaged_precision"] for result in svd_results]
svd_precision_at_k = [result["precision_at_k"] for result in svd_results]

# Average every metric over the collected results
mean_svd_mae = statistics.mean(svd_mae)
mean_svd_rmse = statistics.mean(svd_rmse)
mean_svd_micro_averaged_recall = statistics.mean(svd_micro_averaged_recall)
mean_svd_macro_averaged_recall = statistics.mean(svd_macro_averaged_recall)
mean_svd_recall_at_k = statistics.mean(svd_recall_at_k)
mean_svd_micro_averaged_precision = statistics.mean(svd_micro_averaged_precision)
mean_svd_macro_averaged_precision = statistics.mean(svd_macro_averaged_precision)
mean_svd_precision_at_k = statistics.mean(svd_precision_at_k)

In [None]:
from recommenders.pedro import PedroRecommender
from recommenders.common import generic_thread

# Fresh shared dictionary for this run, created from the notebook's existing
# Manager; it is passed to every worker and read back once the pool has finished.
output = manager.dict()

# Build one argument tuple per (fold, recommender) combination
recommenders = [
    PedroRecommender(
        epochs=EPOCHS,
        weights_binarization_threshold=WEIGHTS_BINARIZATION_THRESHOLD,
        dataset_binarization_threshold=BINARIZATION_THRESHOLD,
    )
]
thread_args = list(
    itertools.product(
        folds, [output], recommenders, [RELEVANCE_THRESHOLD], [NUMBER_OF_TOP_RECOMMENDATIONS]
    )
)

# Run the workers in a process pool. perf_counter() is a monotonic clock, so the
# measured duration is not skewed by system clock adjustments (unlike time.time()).
start_time = time.perf_counter()
with Pool(cpu_count()) as pool:
    pool.starmap(generic_thread, iterable=thread_args)
end_time = time.perf_counter()
pedro_runtime = end_time - start_time

# Snapshot the collected results once; only the values are needed, the keys are not.
pedro_results = list(output.values())

# Gather each metric across all collected results
pedro_mae = [result["mae"] for result in pedro_results]
pedro_rmse = [result["rmse"] for result in pedro_results]
pedro_micro_averaged_recall = [result["micro_averaged_recall"] for result in pedro_results]
pedro_macro_averaged_recall = [result["macro_averaged_recall"] for result in pedro_results]
pedro_recall_at_k = [result["recall_at_k"] for result in pedro_results]
pedro_micro_averaged_precision = [result["micro_averaged_precision"] for result in pedro_results]
pedro_macro_averaged_precision = [result["macro_averaged_precision"] for result in pedro_results]
pedro_precision_at_k = [result["precision_at_k"] for result in pedro_results]

# Average every metric over the collected results
mean_pedro_mae = statistics.mean(pedro_mae)
mean_pedro_rmse = statistics.mean(pedro_rmse)
mean_pedro_micro_averaged_recall = statistics.mean(pedro_micro_averaged_recall)
mean_pedro_macro_averaged_recall = statistics.mean(pedro_macro_averaged_recall)
mean_pedro_recall_at_k = statistics.mean(pedro_recall_at_k)
mean_pedro_micro_averaged_precision = statistics.mean(pedro_micro_averaged_precision)
mean_pedro_macro_averaged_precision = statistics.mean(pedro_macro_averaged_precision)
mean_pedro_precision_at_k = statistics.mean(pedro_precision_at_k)

In [None]:
from recommenders.binaps_recommender import BinaPsRecommender
from recommenders.common import generic_thread, cosine_distance

# Fresh shared dictionary for this run, created from the notebook's existing
# Manager; it is passed to every worker and read back once the pool has finished.
output = manager.dict()

# Build one argument tuple per (fold, recommender) combination
recommenders = [
    BinaPsRecommender(
        epochs=EPOCHS,
        dataset_binarization_threshold=BINARIZATION_THRESHOLD,
        weights_binarization_threshold=WEIGHTS_BINARIZATION_THRESHOLD,
        knn_k=KNN_K,
        knn_distance_strategy=cosine_distance,
    )
]
thread_args = list(
    itertools.product(
        folds, [output], recommenders, [RELEVANCE_THRESHOLD], [NUMBER_OF_TOP_RECOMMENDATIONS]
    )
)

# Run the workers in a process pool. perf_counter() is a monotonic clock, so the
# measured duration is not skewed by system clock adjustments (unlike time.time()).
start_time = time.perf_counter()
with Pool(cpu_count()) as pool:
    pool.starmap(generic_thread, iterable=thread_args)
end_time = time.perf_counter()
binaps_runtime = end_time - start_time

# Snapshot the collected results once; only the values are needed, the keys are not.
binaps_results = list(output.values())

# Gather each metric across all collected results
binaps_mae = [result["mae"] for result in binaps_results]
binaps_rmse = [result["rmse"] for result in binaps_results]
binaps_micro_averaged_recall = [result["micro_averaged_recall"] for result in binaps_results]
binaps_macro_averaged_recall = [result["macro_averaged_recall"] for result in binaps_results]
binaps_recall_at_k = [result["recall_at_k"] for result in binaps_results]
binaps_micro_averaged_precision = [result["micro_averaged_precision"] for result in binaps_results]
binaps_macro_averaged_precision = [result["macro_averaged_precision"] for result in binaps_results]
binaps_precision_at_k = [result["precision_at_k"] for result in binaps_results]

# Average every metric over the collected results
mean_binaps_mae = statistics.mean(binaps_mae)
mean_binaps_rmse = statistics.mean(binaps_rmse)
mean_binaps_micro_averaged_recall = statistics.mean(binaps_micro_averaged_recall)
mean_binaps_macro_averaged_recall = statistics.mean(binaps_macro_averaged_recall)
mean_binaps_recall_at_k = statistics.mean(binaps_recall_at_k)
mean_binaps_micro_averaged_precision = statistics.mean(binaps_micro_averaged_precision)
mean_binaps_macro_averaged_precision = statistics.mean(binaps_macro_averaged_precision)
mean_binaps_precision_at_k = statistics.mean(binaps_precision_at_k)

In [None]:
import pandas as pd

# Row labels of the summary table; every column below lists its values in this order.
metric_labels = [
    "MAE",
    "RMSE",
    "Micro Averaged Recall",
    "Macro Averaged Recall",
    "Recall@K",
    "Micro Averaged Precision",
    "Macro Averaged Precision",
    "Precision@K",
    "Runtime (in seconds)",
]

# One column per recommender: mean metrics followed by the measured pool runtime.
summary_columns = {
    "GreConD-KNN": [
        mean_grecond_mae,
        mean_grecond_rmse,
        mean_grecond_micro_averaged_recall,
        mean_grecond_macro_averaged_recall,
        mean_grecond_recall_at_k,
        mean_grecond_micro_averaged_precision,
        mean_grecond_macro_averaged_precision,
        mean_grecond_precision_at_k,
        grecond_runtime,
    ],
    "KNN (Surprise)": [
        mean_knn_mae,
        mean_knn_rmse,
        mean_knn_micro_averaged_recall,
        mean_knn_macro_averaged_recall,
        mean_knn_recall_at_k,
        mean_knn_micro_averaged_precision,
        mean_knn_macro_averaged_precision,
        mean_knn_precision_at_k,
        knn_runtime,
    ],
    "SVD (Surprise)": [
        mean_svd_mae,
        mean_svd_rmse,
        mean_svd_micro_averaged_recall,
        mean_svd_macro_averaged_recall,
        mean_svd_recall_at_k,
        mean_svd_micro_averaged_precision,
        mean_svd_macro_averaged_precision,
        mean_svd_precision_at_k,
        svd_runtime,
    ],
    "Pedro": [
        mean_pedro_mae,
        mean_pedro_rmse,
        mean_pedro_micro_averaged_recall,
        mean_pedro_macro_averaged_recall,
        mean_pedro_recall_at_k,
        mean_pedro_micro_averaged_precision,
        mean_pedro_macro_averaged_precision,
        mean_pedro_precision_at_k,
        pedro_runtime,
    ],
    "Bernardo": [
        mean_binaps_mae,
        mean_binaps_rmse,
        mean_binaps_micro_averaged_recall,
        mean_binaps_macro_averaged_recall,
        mean_binaps_recall_at_k,
        mean_binaps_micro_averaged_precision,
        mean_binaps_macro_averaged_precision,
        mean_binaps_precision_at_k,
        binaps_runtime,
    ],
}

df = pd.DataFrame(summary_columns, index=metric_labels)

# Show floats with thousands separators and three decimals, then display the table
pd.options.display.float_format = "{:,.3f}".format
df