# Comparison between the recommender systems

This notebook compares different recommender systems in terms of various metrics. We benchmark the
algorithms on the MovieLens 100k and MovieLens 1M datasets.

Copyright 2023 Bernardo C. Rodrigues

See COPYING file for license details

In [None]:
# Setup notebook: imports, autoreload, plot styling and random seeding.
import random
import numpy as np
import evaluation.plot as plot

# Load the IPython autoreload extension.
%load_ext autoreload

# Mode 2: reload all modules before executing each cell, so edits to the project sources are
# picked up without restarting the kernel.
%autoreload 2

# Apply the project's plot customization (see evaluation.plot.customize_default_template).
plot.customize_default_template()

# Seed Python's `random` module and NumPy's global random state with the same fixed value.
# NOTE(review): presumably meant to make runs repeatable; components that keep their own random
# state are not touched by these two calls.
seed = 0
random.seed(seed)
np.random.seed(seed)

In [None]:
# Load the MovieLens 100k folds consumed by the benchmark cell below.
from dataset.movie_lens import load_ml_100k_folds

# NOTE(review): `predefined=True` presumably selects the splits shipped with the dataset instead of
# generating new ones -- confirm in dataset.movie_lens.
folds = load_ml_100k_folds(predefined=True)

In [None]:
# Set global parameters shared by the recommender definitions and benchmark cells below.

# Threshold that controls which weights are going to be considered part of an itemset.
WEIGHTS_BINARIZATION_THRESHOLD = 0.7

# Share of the dataset that must be covered by the mined formal concepts before GreConD stops.
# NOTE(review): presumably a fraction in [0, 1] (0.1 == 10%) -- confirm against the GreConD code.
GRECOND_COVERAGE = 0.1

# Threshold that defines whether a rating is considered positive or not before forwarding it to
# GreConD.
DATASET_BINARIZATION_THRESHOLD = 4.0

# Minimum bicluster sparsity derived from a pattern to be considered in the neighborhood of a user.
MINIMUM_PATTERN_BICLUSTER_SPARSITY = 0.0

# The threshold used to binarize the user when generating the user-item neighborhood.
USER_BINARIZATION_THRESHOLD = 1.0

# How many patterns should be considered when generating the user-item neighborhood. High values
# mean all patterns will be considered.
TOP_K_PATTERNS = 10000

# How many neighbors should be considered when estimating a rating for a user-item pair. The same
# value is passed as `k` to every kNN-based recommender defined below.
KNN_K = 20

# Threshold that defines whether a rating is considered relevant in the metrics computation.
RELEVANCE_THRESHOLD = 4.0

# Number of top recommendations to be considered in the metrics computation (e.g. precision@k).
NUMBER_OF_TOP_RECOMMENDATIONS = 20

# Number of times each experiment should be repeated. Keep in mind that an experiment already
# involves a 5 fold cross validation.
REPEAT = 1

# Number of epochs to be used in the training of BinaPs.
EPOCHS = 100

In [None]:
# Define the recommender variations to be used in the experiments.
from surprise.prediction_algorithms import KNNBasic, SVD, KNNWithMeans, KNNBaseline
from recommenders import grecond_recommender, binaps_based_recommenders, knn_based_recommenders
import evaluation.threads as threads

# Variations passed to `benchmark` as its parallel group. Every entry is a
# threads.RecommenderVariation pairing a label with a recommender instance. All kNN-based entries
# share KNN_K neighbors and, for the Surprise ones, cosine similarity.
parallel_recommender_variations = [
    # Surprise baselines: user-based (UB) and item-based (IB) flavors of each kNN algorithm.
    threads.RecommenderVariation(
        "UBkNN (Surprise)",
        KNNBasic(k=KNN_K, sim_options={"name": "cosine", "user_based": True}, verbose=False),
    ),
    threads.RecommenderVariation(
        "IBkNN (Surprise)",
        KNNBasic(k=KNN_K, sim_options={"name": "cosine", "user_based": False}, verbose=False),
    ),
    threads.RecommenderVariation(
        "UBkNN Means (Surprise)",
        KNNWithMeans(k=KNN_K, sim_options={"name": "cosine", "user_based": True}, verbose=False),
    ),
    threads.RecommenderVariation(
        "IBkNN Means (Surprise)",
        KNNWithMeans(k=KNN_K, sim_options={"name": "cosine", "user_based": False}, verbose=False),
    ),
    threads.RecommenderVariation(
        "UBkNN Baseline (Surprise)",
        KNNBaseline(k=KNN_K, sim_options={"name": "cosine", "user_based": True}, verbose=False),
    ),
    threads.RecommenderVariation(
        "IBkNN Baseline (Surprise)",
        KNNBaseline(k=KNN_K, sim_options={"name": "cosine", "user_based": False}, verbose=False),
    ),
    threads.RecommenderVariation("SVD", SVD(verbose=False)),
    # GreConD-based recommenders. This entry used to be a bare tuple; it is wrapped in
    # RecommenderVariation so that every element of the list has the same type.
    threads.RecommenderVariation(
        "GreConD+MF+UBkNN",
        grecond_recommender.GreConDKNNRecommender(
            GRECOND_COVERAGE,
            DATASET_BINARIZATION_THRESHOLD,
            KNN_K,
        ),
    ),
    threads.RecommenderVariation(
        "GreConD+IBkNN",
        grecond_recommender.GreConDBiAKNNRecommender(
            GRECOND_COVERAGE,
            DATASET_BINARIZATION_THRESHOLD,
            MINIMUM_PATTERN_BICLUSTER_SPARSITY,
            USER_BINARIZATION_THRESHOLD,
            TOP_K_PATTERNS,
            KNN_K,
        ),
    ),
]


# Variations passed to `benchmark` as its sequential group (only the BinaPs-based recommender).
sequential_recommender_variations = [
    threads.RecommenderVariation(
        "BinaPs+IBkNN",
        binaps_based_recommenders.BinaPsKNNRecommender(
            epochs=EPOCHS,
            dataset_binarization_threshold=DATASET_BINARIZATION_THRESHOLD,
            minimum_pattern_bicluster_sparsity=MINIMUM_PATTERN_BICLUSTER_SPARSITY,
            top_k_patterns=TOP_K_PATTERNS,
            knn_k=KNN_K,
        ),
    )
]

In [None]:
# Run the benchmarks on MovieLens 100k
from evaluation.plot import benchmark
from evaluation.threads import generic_benchmark_thread


# Evaluate every recommender variation on the loaded folds. Arguments are positional: the folds,
# the parallel and sequential variation lists, the repetition count, the relevance threshold and
# number of top recommendations used by the metrics, and the thread function that runs one
# benchmark. The outcome is kept in `results` for the plotting cell.
results = benchmark(
    folds,
    parallel_recommender_variations,
    sequential_recommender_variations,
    REPEAT,
    RELEVANCE_THRESHOLD,
    NUMBER_OF_TOP_RECOMMENDATIONS,
    generic_benchmark_thread,
)

In [None]:
# Plot results: one box plot followed by one printed result table per metric.
from evaluation.plot import plot_metric_box_plot, get_result_table
from evaluation.threads import GENERIC_METRIC_NAMES


for metric_name in GENERIC_METRIC_NAMES:
    # Draw the box plot first so that it precedes the table in the cell output.
    plot_metric_box_plot(metric_name, results)

    result_table = get_result_table(metric_name, results)
    print(result_table)

In [None]:
# Benchmark MovieLens 1M: reload the folds, rebuild the recommender variations with the coverage
# tuned for this dataset, run the benchmark and plot the results.
from dataset.movie_lens import load_ml_1m_folds
from surprise.prediction_algorithms import KNNBasic, SVD, KNNWithMeans, KNNBaseline
from recommenders import grecond_recommender, binaps_based_recommenders
import evaluation.threads as threads
from evaluation.plot import benchmark
from evaluation.threads import generic_benchmark_thread
from evaluation.plot import plot_metric_box_plot, get_result_table
from evaluation.threads import GENERIC_METRIC_NAMES

# NOTE: this rebinds the notebook-wide `folds`; cells executed after this one see the 1M folds.
folds = load_ml_1m_folds()

# For MovieLens 1M, the best results were obtained with the following parameter.
# NOTE: this rebinds the notebook-wide GRECOND_COVERAGE, so any cell re-run after this one also
# uses 0.2.
GRECOND_COVERAGE = 0.2

# Variations passed to `benchmark` as its parallel group. Every entry is a
# threads.RecommenderVariation pairing a label with a recommender instance.
parallel_recommender_variations = [
    # Surprise baselines: user-based (UB) and item-based (IB) flavors of each kNN algorithm.
    threads.RecommenderVariation(
        "UBkNN (Surprise)",
        KNNBasic(k=KNN_K, sim_options={"name": "cosine", "user_based": True}, verbose=False),
    ),
    threads.RecommenderVariation(
        "IBkNN (Surprise)",
        KNNBasic(k=KNN_K, sim_options={"name": "cosine", "user_based": False}, verbose=False),
    ),
    threads.RecommenderVariation(
        "UBkNN Means (Surprise)",
        KNNWithMeans(k=KNN_K, sim_options={"name": "cosine", "user_based": True}, verbose=False),
    ),
    threads.RecommenderVariation(
        "IBkNN Means (Surprise)",
        KNNWithMeans(k=KNN_K, sim_options={"name": "cosine", "user_based": False}, verbose=False),
    ),
    threads.RecommenderVariation(
        "UBkNN Baseline (Surprise)",
        KNNBaseline(k=KNN_K, sim_options={"name": "cosine", "user_based": True}, verbose=False),
    ),
    threads.RecommenderVariation(
        "IBkNN Baseline (Surprise)",
        KNNBaseline(k=KNN_K, sim_options={"name": "cosine", "user_based": False}, verbose=False),
    ),
    threads.RecommenderVariation("SVD", SVD(verbose=False)),
    # GreConD-based recommenders. This entry used to be a bare tuple; it is wrapped in
    # RecommenderVariation so that every element of the list has the same type.
    threads.RecommenderVariation(
        "GreConD+MF+UBkNN",
        grecond_recommender.GreConDKNNRecommender(
            GRECOND_COVERAGE,
            DATASET_BINARIZATION_THRESHOLD,
            KNN_K,
        ),
    ),
    threads.RecommenderVariation(
        "GreConD+IBkNN",
        grecond_recommender.GreConDBiAKNNRecommender(
            GRECOND_COVERAGE,
            DATASET_BINARIZATION_THRESHOLD,
            MINIMUM_PATTERN_BICLUSTER_SPARSITY,
            USER_BINARIZATION_THRESHOLD,
            TOP_K_PATTERNS,
            KNN_K,
        ),
    ),
]


# Variations passed to `benchmark` as its sequential group (only the BinaPs-based recommender).
sequential_recommender_variations = [
    threads.RecommenderVariation(
        "BinaPs+IBkNN",
        binaps_based_recommenders.BinaPsKNNRecommender(
            epochs=EPOCHS,
            dataset_binarization_threshold=DATASET_BINARIZATION_THRESHOLD,
            minimum_pattern_bicluster_sparsity=MINIMUM_PATTERN_BICLUSTER_SPARSITY,
            top_k_patterns=TOP_K_PATTERNS,
            knn_k=KNN_K,
        ),
    )
]

# Evaluate every variation on the MovieLens 1M folds; arguments are positional.
results = benchmark(
    folds,
    parallel_recommender_variations,
    sequential_recommender_variations,
    REPEAT,
    RELEVANCE_THRESHOLD,
    NUMBER_OF_TOP_RECOMMENDATIONS,
    generic_benchmark_thread,
)

# One box plot followed by one printed result table per metric.
for metric_name in GENERIC_METRIC_NAMES:
    plot_metric_box_plot(metric_name, results)
    print(get_result_table(metric_name, results))