# 'An FCA-based Boolean Matrix Factorization for Collaborative Filtering' replication

This notebook replicates the results from the paper [1]. Our results differ somewhat from the ones reported in the
paper, but the general trends are the same. We believe the differences arise because the authors used different folds
for the training and test sets. Although the folds themselves differ, the ratio between the training and test set sizes
is the same.

[1] Elena Nenova, Dmitry I. Ignatov, and Andrey V. Konstantinov, 'An FCA-based Boolean Matrix Factorization for
    Collaborative Filtering', <https://publications.hse.ru/pubs/share/folder/2yoq2ezea5/97014436.pdf>

Copyright 2022 Bernardo C. Rodrigues

See COPYING file for license details

In [None]:
# Load the complete MovieLens 100K dataset and split it into 5 cross-validation folds

from surprise import Dataset
from surprise.model_selection import KFold

dataset = Dataset.load_builtin("ml-100k", prompt=False)

# KFold shuffles the ratings before splitting. Fixing the seed makes the folds (and therefore
# every number reported below) identical across kernel restarts.
FOLDS_RANDOM_STATE = 42

kf = KFold(n_splits=5, random_state=FOLDS_RANDOM_STATE)

# Materialize the folds once as (fold_index, (trainset, testset)) tuples so that the very same
# splits are reused by every experiment in this notebook.
folds = list(enumerate(kf.split(dataset)))

In [None]:
# Define the worker function for parallelized (multi-process) GreConD execution

from surprise.accuracy import mae, rmse

from recommenders.grecond_recommender import GreConDRecommender
from recommenders.common import cosine_distance

from evaluation import (
    get_global_recall,
    get_user_averaged_recall,
    get_recall_at_k,
    get_global_precision,
    get_user_averaged_precision,
    get_precision_at_k,
)

# Passed as 'relevance_threshold' to every precision/recall metric computed in grecond_thread.
# NOTE(review): presumably a rating at or above this value counts as relevant - confirm against
# the 'evaluation' module.
RELEVANCE_THRESHOLD = 5
# Passed as 'k' to get_recall_at_k and get_precision_at_k in grecond_thread.
NUMBER_OF_TOP_RECOMMENDATIONS = 20


def grecond_thread(grecond_coverage, knn_k, fold):
    """
    Worker entry point: fit one GreConD recommender on one fold and record its metrics.

    Nothing is returned. The metrics are stored in the global dictionary 'output' under the key
    (grecond_coverage, knn_k, fold_index). 'output' is expected to be a Manager().dict() object
    since it is shared between processes.

    Args:
        grecond_coverage (float): The coverage of the GreConD recommender.
        knn_k (int): The number of neighbors to consider in the GreConD recommender.
        fold (tuple): A (fold_index, (trainset, testset)) tuple with the fold to use.
    """
    global output
    fold_index, (trainset, testset) = fold

    recommender = GreConDRecommender(
        grecond_coverage=grecond_coverage,
        knn_k=knn_k,
        knn_distance_strategy=cosine_distance,
    )
    recommender.fit(trainset)
    predictions = recommender.test(testset)

    # Keyword arguments shared by the threshold-based metrics and by the top-k metrics.
    threshold_kwargs = dict(predictions=predictions, relevance_threshold=RELEVANCE_THRESHOLD)
    top_k_kwargs = dict(threshold_kwargs, k=NUMBER_OF_TOP_RECOMMENDATIONS)

    metrics = {
        "actual_coverage": recommender.actual_coverage,
        "number_of_factors": recommender.number_of_factors,
        "mae": mae(predictions=predictions, verbose=False),
        "rmse": rmse(predictions=predictions, verbose=False),
        "global_recall": get_global_recall(**threshold_kwargs),
        "user_averaged_recall": get_user_averaged_recall(**threshold_kwargs),
        "recall_at_k": get_recall_at_k(**top_k_kwargs),
        "global_precision": get_global_precision(**threshold_kwargs),
        "user_averaged_precision": get_user_averaged_precision(**threshold_kwargs),
        "precision_at_k": get_precision_at_k(**top_k_kwargs),
    }

    output[(grecond_coverage, knn_k, fold_index)] = metrics

In [None]:
# Reproduction of Table 2 from page 64 of the paper [1]

import itertools
import statistics
import pandas as pd
from collections import defaultdict
from multiprocessing import Pool, Manager, cpu_count

# 'output' is the process-shared dictionary that grecond_thread writes its metrics into.
manager = Manager()
output = manager.dict()

# ks is just a placeholder to fit function signature. We are not interested in the predictions
# in this experiment.
ks = [30]
coverages = [1.0, 0.8, 0.6]

# One task per (coverage, k, fold) combination.
thread_args = list(itertools.product(coverages, ks, folds))

with Pool(cpu_count()) as pool:
    pool.starmap(grecond_thread, iterable=thread_args)

# Group the per-fold values by the requested coverage level.
actual_coverage = defaultdict(list)
number_of_factors = defaultdict(list)

for key, value in output.items():
    grecond_coverage, _, _ = key
    actual_coverage[grecond_coverage].append(value["actual_coverage"])
    number_of_factors[grecond_coverage].append(value["number_of_factors"])

# Rows and row labels are both derived from 'coverages', in the same order, so every label is
# guaranteed to sit next to the data of its own coverage level.
results = [
    (statistics.mean(actual_coverage[coverage]), statistics.mean(number_of_factors[coverage]))
    for coverage in coverages
]
row_labels = [f"{coverage:.0%}" for coverage in coverages]  # "100%", "80%", "60%"

print(pd.DataFrame(results, columns=["Actual Coverage", "Number of factors"], index=row_labels))

In [None]:
# Replication of Figures 2, 3, 4 and part of 7 and 8 from [1]

from plotly.subplots import make_subplots
import plotly.graph_objects as go

# Evaluate the GreConD recommender on every (coverage, k, fold) combination of the grid below.
# The worker processes store their metrics in the shared 'output' dictionary.
coverages = [1.0, 0.8, 0.6]
ks = [1, 5, 10, 20, 30, 40, 50, 60]

thread_args = list(itertools.product(coverages, ks, folds))

with Pool(processes=cpu_count()) as pool:
    pool.starmap(grecond_thread, thread_args)

# Compile the results into a dictionary that aggregates fold results for each coverage and k:
# results[coverage][k][metric] is the list of per-fold values of that metric.
results = {coverage: {k: defaultdict(list) for k in ks} for coverage in coverages}

# Maps each key written by grecond_thread to the key under which its per-fold values are
# collected in 'results'.
output_to_results_key = {
    "actual_coverage": "actual_coverages",
    "number_of_factors": "numbers_of_factors",
    "mae": "maes",
    "rmse": "rmses",
    "global_recall": "global_recalls",
    "user_averaged_recall": "user_averaged_recalls",
    "recall_at_k": "recalls_at_k",
    "global_precision": "global_precisions",
    "user_averaged_precision": "user_averaged_precisions",
    "precision_at_k": "precisions_at_k",
}

for (grecond_coverage, knn_k, fold_index), value in output.items():
    # 'output' is shared with the previous experiments and is never cleared, so it may still hold
    # entries for (coverage, k) pairs that are not part of the current grid. Ignore those instead
    # of failing with a KeyError.
    if grecond_coverage not in results or knn_k not in results[grecond_coverage]:
        continue

    fold_results = results[grecond_coverage][knn_k]
    for output_key, results_key in output_to_results_key.items():
        fold_results[results_key].append(value[output_key])


# Generate the curves for the GreConD recommender with different coverages and k values
def _fold_mean_curves(results_key):
    """
    Build the curves of one metric: for each coverage, the list (ordered like 'ks') of the mean
    over folds of the values stored under 'results_key' in 'results'.
    """
    curves = defaultdict(list)
    for curve_coverage in coverages:
        for curve_k in ks:
            curves[curve_coverage].append(
                statistics.mean(results[curve_coverage][curve_k][results_key])
            )
    return curves


actual_coverages_curves = _fold_mean_curves("actual_coverages")
numbers_of_factors_curves = _fold_mean_curves("numbers_of_factors")

mae_curves = _fold_mean_curves("maes")
rmse_curves = _fold_mean_curves("rmses")

global_recall_curves = _fold_mean_curves("global_recalls")
user_averaged_recall_curves = _fold_mean_curves("user_averaged_recalls")
recalls_at_k_curve = _fold_mean_curves("recalls_at_k")

global_precision_curves = _fold_mean_curves("global_precisions")
user_averaged_precision_curves = _fold_mean_curves("user_averaged_precisions")
precisions_at_k_curve = _fold_mean_curves("precisions_at_k")

# Plot the curves for the GreConD recommender with different coverages and k values.
# Each entry is (subplot title, curves of that metric). Entries are laid out row by row on a
# two-column grid, and the title doubles as the y-axis label of its subplot.
subplot_columns = 2
subplot_curves = [
    ("Actual Coverage", actual_coverages_curves),
    ("Number of factors", numbers_of_factors_curves),
    ("MAE", mae_curves),
    ("RMSE", rmse_curves),
    ("Global Recall", global_recall_curves),
    ("User Averaged Recall", user_averaged_recall_curves),
    ("Global Precision", global_precision_curves),
    ("User Averaged Precision", user_averaged_precision_curves),
    ("Recall@K", recalls_at_k_curve),
    ("Precision@K", precisions_at_k_curve),
]
# Ceiling division: enough rows to hold every subplot.
subplot_rows = (len(subplot_curves) + subplot_columns - 1) // subplot_columns


def _subplot_position(index):
    """Return the 1-based (row, col) grid position of the index-th subplot (row-major order)."""
    return index // subplot_columns + 1, index % subplot_columns + 1


fig = make_subplots(
    rows=subplot_rows,
    cols=subplot_columns,
    subplot_titles=tuple(title for title, _ in subplot_curves),
)

fig.update_xaxes(title_text="Number of nearest neighbors used (k)")

for index, (title, _) in enumerate(subplot_curves):
    row, col = _subplot_position(index)
    fig.update_yaxes(title_text=title, row=row, col=col)

fig.update_layout(legend_title_text="Rating Coverage Level", height=2000)

line_color = {
    1.0: dict(color="red"),
    0.8: dict(color="green"),
    0.6: dict(color="blue"),
}

for coverage in coverages:
    name_string = f"{coverage*100}%"

    for index, (_, curves) in enumerate(subplot_curves):
        row, col = _subplot_position(index)
        fig.add_trace(
            go.Scatter(
                x=ks,
                y=curves[coverage],
                mode="lines+markers",
                name=name_string,
                line=line_color[coverage],
                # Every subplot uses the same color per coverage, so only the traces of the
                # first subplot contribute legend entries.
                showlegend=(index == 0),
            ),
            row=row,
            col=col,
        )


fig.show()