In [None]:
# Load the autoreload extension
%load_ext autoreload

# Set autoreload to reload all modules every time a cell is executed
%autoreload 2

In [None]:
# Connect to the database

import sqlite3

# Module-level SQLite connection reused by every cell that queries `binaps_experiments`.
# NOTE(review): "your_database.db" looks like a placeholder - point it at the real experiments file.
DATABASE = sqlite3.connect("your_database.db")

In [None]:
# Customize plotly's template
import plotly.io as pio
import plotly.graph_objects as go


# Export settings shared by the plotting cells of this notebook.
DPI = 300  # target resolution; the write_image calls pass scale=DPI / 96
WIDTH = 1200  # default figure width, applied to the template layout below
HEIGHT = 800  # default figure height, applied to the template layout below
FORMAT = "png"  # image format for write_image and name of the default plotly renderer


def customize_default_template():
    """
    Apply the notebook's house style to Plotly's current default template.

    The template registered under ``pio.templates.default`` is modified in place
    (font, margins, figure size, plot background, both axes and the legend
    background), so figures created afterwards pick the settings up. The default
    renderer is also switched to ``FORMAT``.

    Returns:
        None
    """

    light_grey = "rgb(245,245,245)"

    # Work directly on the layout of the template that is currently the default
    layout = pio.templates[pio.templates.default].layout

    # Typography
    layout.font.family = "Latin Modern"
    layout.font.size = 16
    layout.font.color = "black"

    # Figure geometry
    layout.margin = go.layout.Margin(t=50, b=50, l=50, r=50)
    layout.width = WIDTH
    layout.height = HEIGHT

    # Plot area and legend share the same light grey background
    layout.plot_bgcolor = light_grey
    layout.legend = dict(bgcolor=light_grey)

    # Both axes get the same boxed look: mirrored black lines, outside ticks, light grid
    for axis_name in ("yaxis", "xaxis"):
        setattr(
            layout,
            axis_name,
            dict(
                mirror=True,
                ticks="outside",
                showline=True,
                linecolor="black",
                gridcolor="lightgrey",
            ),
        )

    # Render figures as static images in the configured FORMAT
    pio.renderers.default = FORMAT


# Call the function to customize the default template
customize_default_template()

In [None]:
# Load an Experiment: Full 100K @ 50K epochs
from lib.BinaryDataset import BinaryDataset
import pickle

binary_dataset = BinaryDataset.load_from_binaps_compatible_input("datasets/movielens_100k.dat")

# Set the experiment ID
EXPERIMENT_ID = 45

# Create a cursor object to interact with the database
cursor = DATABASE.cursor()

# Fetch the experiment details from the database
cursor.execute("SELECT * FROM binaps_experiments WHERE id = ?", (EXPERIMENT_ID,))
results = cursor.fetchone()

# fetchone() returns None when no row matches; fail with a clear message instead of
# the opaque "cannot unpack non-iterable NoneType" raised by the unpacking below.
if results is None:
    raise LookupError(f"No experiment with id={EXPERIMENT_ID} in binaps_experiments")

# Unpack the row positionally (the query is SELECT *, so the order must match the table columns).
# NOTE(review): `id` shadows the Python builtin for the rest of the notebook.
(
    id,
    dataset,
    train_set_size,
    batch_size,
    test_batch_size,
    epochs,
    learning_rate,
    weight_decay,
    gamma,
    seed,
    hidden_dimension,
    serialized_weights,
    serialized_training_losses,
    serialized_test_losses,
    runtime,
) = results

# Deserialize the weights, training losses, and test losses from their serialized forms.
# SECURITY: pickle.loads can execute arbitrary code - only open databases you trust.
weights = pickle.loads(serialized_weights)
training_losses = pickle.loads(serialized_training_losses)
test_losses = pickle.loads(serialized_test_losses)

In [None]:
# Experiment Runtime
import datetime

# Raw stored runtime first, then the same value (interpreted as seconds) in H:MM:SS form
print(runtime)
runtime_delta = datetime.timedelta(seconds=runtime)
str(runtime_delta)

In [None]:
# Plot the Loss convergence

from statsmodels.nonparametric.smoothers_lowess import lowess
import plotly.graph_objects as go
import numpy as np

# Generate x-values
# One x position per recorded test loss
x = np.arange(len(test_losses))

# LOWESS smoothing with frac=0.1; column 0 of the result is used as x, column 1 as the fitted y
lowess_data = lowess(test_losses, x, frac=0.1)
lowess_x, lowess_y = lowess_data[:, 0], lowess_data[:, 1]

# Running minimum: the lowest test loss seen up to each position
cumulative_min = np.minimum.accumulate(test_losses)

In [None]:
# Create the scatter plot
# Three traces: raw test losses, the LOWESS trend and the running minimum
loss_traces = [
    go.Scatter(
        x=x,
        y=test_losses,
        mode="markers",
        marker=dict(color="black", size=2, opacity=0.2),
        name="Test Loss",
        showlegend=True,
    ),
    go.Scatter(
        x=lowess_x,
        y=lowess_y,
        mode="lines",
        line=dict(color="cyan", dash="dash"),
        name="LOWESS Trendline",
        showlegend=True,
    ),
    go.Scatter(
        x=x,
        y=cumulative_min,
        mode="lines",
        line=dict(color="red", dash="dot"),
        name="Cumulative Minimum",
        showlegend=True,
    ),
]

fig = go.Figure(data=loss_traces)

# Axis titles and a half-height layout with a slightly wider left margin
fig.update_layout(
    xaxis_title="Epochs",
    yaxis_title="Loss",
    width=WIDTH,
    height=HEIGHT / 2,
    margin_l=60,
)
fig.show(renderer="png")

fig.write_image("loss.png", format=FORMAT, width=WIDTH, height=HEIGHT / 2, scale=DPI / 96)

In [None]:
# Generate Binaps formal context

from fca.FormalConceptAnalysis import construct_context_from_binaps_patterns
from lib.BinapsWrapper import get_patterns_from_weights

# Extract patterns from the trained weights (second argument 0.2 - presumably the binarization threshold)
patterns = get_patterns_from_weights(weights, 0.2)

# Build the formal concepts that the patterns induce on the dataset
binaps_context = construct_context_from_binaps_patterns(binary_dataset, patterns, True)

print(len(binaps_context))

# Size of each concept's intent and extent
binaps_intent_sizes = [len(concept.intent) for concept in binaps_context]
binaps_extent_sizes = [len(concept.extent) for concept in binaps_context]

# Concept size: intent size times extent size, scaled down by 20
binaps_concept_sizes = [
    intent_size * extent_size / 20
    for intent_size, extent_size in zip(binaps_intent_sizes, binaps_extent_sizes)
]

In [None]:
# Generate GreCond formal context

# import fca
from fca.FormalConceptAnalysis import GreConD

# Run GreConD with coverage=1; only the first element of the returned pair is kept
grecond_formal_context, _ = GreConD(binary_dataset, coverage=1)

# Number of formal concepts found
print(len(grecond_formal_context))

# Size of each concept's intent and extent
grecond_intent_sizes = [len(concept.intent) for concept in grecond_formal_context]
grecond_extent_sizes = [len(concept.extent) for concept in grecond_formal_context]

# Concept size: intent size times extent size, scaled down by 20 (same scaling as for BinaPs)
grecond_concept_sizes = [
    intent_size * extent_size / 20
    for intent_size, extent_size in zip(grecond_intent_sizes, grecond_extent_sizes)
]

In [None]:
# Plot Binaps and Grecond concepts

import plotly.graph_objects as go

# BinaPs scatter plot
# Settings shared by both scatter plots so the two panels are directly comparable
concept_marker = dict(color="black", size=10, opacity=0.5)
shared_concept_layout = dict(
    xaxis_title="Concept Intent Size",
    yaxis_title="Concept Extent Size",
    height=500,
    xaxis=dict(range=[-100, 800], tickvals=[0, 200, 400, 600, 800]),
    yaxis=dict(range=[-100, 800], tickvals=[0, 200, 400, 600, 800]),
    margin_l=80,
)

# BinaPs concepts (legend hidden)
fig1 = go.Figure(
    data=go.Scatter(
        x=binaps_intent_sizes,
        y=binaps_extent_sizes,
        mode="markers",
        marker=dict(concept_marker),
        name="Concept Area",
    )
)
fig1.update_layout(width=500, showlegend=False, **shared_concept_layout)

# GreConD concepts (legend shown, hence the wider figure)
fig2 = go.Figure(
    data=go.Scatter(
        x=grecond_intent_sizes,
        y=grecond_extent_sizes,
        mode="markers",
        marker=dict(concept_marker),
        name="Concept",
    )
)
fig2.update_layout(width=575, showlegend=True, **shared_concept_layout)

# Show the figures
fig1.show()
fig2.show()

# NOTE(review): fig1 is laid out at 500x500 but exported at HEIGHT / 2 (400x400),
# whereas fig2 is exported at its layout size - confirm this is intended.
fig1.write_image(
    "concepts_binaps.png", format=FORMAT, width=HEIGHT / 2, height=HEIGHT / 2, scale=DPI / 96
)
fig2.write_image("concepts_grecond.png", format=FORMAT, width=575, height=500, scale=DPI / 96)

In [None]:
# boxplots for Binaps and Grecond concepts

# One box-plot figure per measure: (BinaPs values, GreConD values, y-axis title, output file)
box_plot_specs = [
    (binaps_intent_sizes, grecond_intent_sizes, "Concept Intent Size", "intent_size.png"),
    (binaps_extent_sizes, grecond_extent_sizes, "Concept Extent Size", "extent_size.png"),
    (binaps_concept_sizes, grecond_concept_sizes, "Concept Area", "area_size.png"),
]

box_figures = []
for binaps_values, grecond_values, axis_title, output_file in box_plot_specs:
    box_figure = go.Figure()
    box_figure.add_trace(go.Box(y=binaps_values, name="BinaPs", marker_color="black"))
    box_figure.add_trace(go.Box(y=grecond_values, name="GreConD"))
    box_figure.update_layout(width=WIDTH, height=HEIGHT / 2, yaxis_title=axis_title, margin_l=80)
    box_figures.append(box_figure)

# Keep the figure names other cells may refer to
fig1, fig2, fig3 = box_figures

# Display everything first, then export
for box_figure in box_figures:
    box_figure.show()

for box_figure, (binaps_values, grecond_values, axis_title, output_file) in zip(
    box_figures, box_plot_specs
):
    box_figure.write_image(
        output_file, format=FORMAT, width=WIDTH, height=HEIGHT / 2, scale=DPI / 96
    )

In [None]:
# Generate the latent space matrices

from fca.FormalConceptAnalysis import get_factor_matrices_from_concepts

# Both factorizations are built against the dimensions of the original dataset
dataset_shape = binary_dataset.shape

binaps_Af, binaps_Bf = get_factor_matrices_from_concepts(
    binaps_context, dataset_shape[0], dataset_shape[1]
)

# Only the first factor matrix is needed for GreConD
grecond_Af, _ = get_factor_matrices_from_concepts(
    grecond_formal_context, dataset_shape[0], dataset_shape[1]
)

# Product of the two BinaPs factor matrices
I = np.matmul(binaps_Af, binaps_Bf)

# Ratio of non-zero cells in the product to non-zero cells in the original dataset
covered_cells = np.count_nonzero(I)
original_cells = np.count_nonzero(binary_dataset._binary_dataset)
real_coverage = covered_cells / original_cells
print(real_coverage)

In [None]:
# Choose cosine as the similarity distance strategy

from lib.BooleanMatrixBasedRecomenders import cosine_distance as distance_strategy

In [None]:
# Generate the similarities matrices

from lib.BooleanMatrixBasedRecomenders import get_similarity_matrix

# Reference similarities computed on the original dataset
binary_dataset_similarities = get_similarity_matrix(binary_dataset, distance_strategy)

# Wrap both Af factor matrices so they go through the same similarity routine
binaps_latent_binary_dataset = BinaryDataset(binaps_Af)
grecond_latent_binary_dataset = BinaryDataset(grecond_Af)

# Similarities in each latent space
binaps_latent_binary_dataset_similarities = get_similarity_matrix(
    binaps_latent_binary_dataset, distance_strategy
)
grecond_latent_binary_dataset_similarities = get_similarity_matrix(
    grecond_latent_binary_dataset, distance_strategy
)

# Difference between each latent-space similarity and the reference
binaps_similarity_delta = binaps_latent_binary_dataset_similarities - binary_dataset_similarities
grecond_similarity_delta = grecond_latent_binary_dataset_similarities - binary_dataset_similarities

In [None]:
# Plot all similarity matrices

import plotly.graph_objects as go


def plot_similarities(binary_dataset_similarities, colorscale, delta_range, file_name):
    """
    Draw a similarity matrix as a heatmap, display it and save it as an image.

    Args:
        binary_dataset_similarities: 2-D matrix used as the heatmap ``z`` values
            (both axes are labelled "User").
        colorscale: Plotly colorscale used to map values to colors.
        delta_range: Two-element sequence ``(zmin, zmax)`` that fixes the color range.
        file_name: Path of the image written with ``FORMAT`` at ``DPI / 96`` scale.

    Returns:
        None
    """
    fig = go.Figure(
        data=go.Heatmap(
            z=binary_dataset_similarities,
            # `colorbar.titleside` is deprecated (removed in plotly 6); the nested
            # `title.side` form is the supported equivalent.
            colorbar=dict(
                title=dict(text="Similarity", side="right"),
            ),
            colorscale=colorscale,
            zmin=delta_range[0],
            zmax=delta_range[1],
        ),
    )
    fig.update_layout(
        xaxis_title="User",
        yaxis_title="User",
        width=470,
        height=400,
        margin_l=60,
    )
    # Put row 0 at the top so the heatmap reads like a matrix
    fig.update_yaxes(autorange="reversed")
    fig.show()
    fig.write_image(file_name, format=FORMAT, scale=DPI / 96)

# Blue -> cyan -> green -> yellow -> red ramp shared by all heatmaps
similary_colorscale = [
    [0.0, "blue"],
    [0.25, "cyan"],
    [0.5, "green"],
    [0.75, "yellow"],
    [1.0, "red"],
]

# (matrix, color range, output file): similarities use (0, 1), deltas a narrow symmetric range
similarity_heatmaps = [
    (
        binary_dataset_similarities,
        (0, 1),
        f"dataset_similarities_{distance_strategy.__name__}.png",
    ),
    (
        binaps_latent_binary_dataset_similarities,
        (0, 1),
        f"binaps_latent_binary_dataset_similarities_{distance_strategy.__name__}.png",
    ),
    (
        binaps_similarity_delta,
        (-0.15, 0.15),
        f"binaps_similarity_delta_{distance_strategy.__name__}.png",
    ),
    (
        grecond_latent_binary_dataset_similarities,
        (0, 1),
        f"grecond_latent_binary_dataset_similarities_{distance_strategy.__name__}.png",
    ),
    (
        grecond_similarity_delta,
        (-0.15, 0.15),
        f"grecond_similarity_delta_{distance_strategy.__name__}.png",
    ),
]

for similarity_matrix, color_range, output_file in similarity_heatmaps:
    plot_similarities(similarity_matrix, similary_colorscale, color_range, output_file)

In [None]:
# Compute neighbor similarities correlations

from statistics import mean
from scipy.stats import spearmanr, kendalltau
from lib.BooleanMatrixBasedRecomenders import get_k_nearest_neighbors


def get_jaccard_distance_from_sets(A, B):
    """
    Return the Jaccard distance between two collections, treated as sets.

    The distance is ``1 - |A & B| / |A | B|``: 0 for identical sets, 1 for disjoint ones.
    Duplicates in the inputs are ignored.

    Args:
        A: Iterable of hashable items.
        B: Iterable of hashable items.

    Returns:
        The Jaccard distance in ``[0, 1]``. Two empty inputs are considered identical
        and yield ``0.0`` (the original raised ``ZeroDivisionError`` here).
    """
    set_a = set(A)
    set_b = set(B)

    union = set_a | set_b
    if not union:
        # Both inputs are empty: avoid 0 / 0 and report them as identical
        return 0.0

    jaccard_similarity = len(set_a & set_b) / len(union)

    return 1 - jaccard_similarity


def get_correlations_between_neighbors(ks, neighbors_a, neighbors_b):
    """
    Compare the k-nearest-neighbor lists derived from two similarity matrices.

    For every ``k`` in ``ks`` and every row index of ``neighbors_a``, the k nearest
    neighbors are looked up in both matrices with ``get_k_nearest_neighbors`` and compared
    using Spearman's rho, Kendall's tau and the Jaccard distance. The per-row values are
    then averaged for each ``k``.

    Args:
        ks: Iterable of neighborhood sizes.
        neighbors_a: First similarity matrix; its first dimension defines the rows compared.
        neighbors_b: Second similarity matrix, queried with the same row indices.

    Returns:
        Three lists aligned with ``ks``: mean Spearman correlations, mean Kendall
        correlations and mean Jaccard distances. An entry is NaN when every row was
        skipped for that ``k``.
    """
    # Take the row count from the argument rather than from the notebook-level
    # `binary_dataset_similarities`, so the function works for any pair of matrices.
    number_of_rows = neighbors_a.shape[0]

    spearman_correlations_per_k = []
    kendall_correlations_per_k = []
    jaccards_per_k = []

    for k in ks:
        spearman_correlations = []
        kendall_correlations = []
        jaccards = []

        for index in range(number_of_rows):
            a_rankings = get_k_nearest_neighbors(neighbors_a, index, k)
            b_rankings = get_k_nearest_neighbors(neighbors_b, index, k)

            # Skip rows without usable neighbors in either matrix.
            # NOTE(review): `.any()` is also False for a non-empty array of zeros; if the
            # rankings hold neighbor indices, a list containing only index 0 is skipped too.
            if not a_rankings.any() or not b_rankings.any():
                continue

            spearman_correlation, _ = spearmanr(a_rankings, b_rankings)
            kendall_correlation, _ = kendalltau(a_rankings, b_rankings)
            this_jaccard_distance = get_jaccard_distance_from_sets(a_rankings, b_rankings)

            spearman_correlations.append(spearman_correlation)
            kendall_correlations.append(kendall_correlation)
            jaccards.append(this_jaccard_distance)

        # statistics.mean raises on empty data; report NaN when nothing was comparable
        has_values = bool(jaccards)
        spearman_correlations_per_k.append(
            mean(spearman_correlations) if has_values else float("nan")
        )
        kendall_correlations_per_k.append(
            mean(kendall_correlations) if has_values else float("nan")
        )
        jaccards_per_k.append(mean(jaccards) if has_values else float("nan"))

    return spearman_correlations_per_k, kendall_correlations_per_k, jaccards_per_k


# Neighborhood sizes 1..99
ks = [*range(1, 100)]

# Original dataset vs. BinaPs latent space
binaps_correlations = get_correlations_between_neighbors(
    ks, binary_dataset_similarities, binaps_latent_binary_dataset_similarities
)
(
    binaps_spearman_correlations_per_k,
    binaps_kendall_correlations_per_k,
    binaps_jaccards_per_k,
) = binaps_correlations

# Original dataset vs. GreConD latent space
grecond_correlations = get_correlations_between_neighbors(
    ks, binary_dataset_similarities, grecond_latent_binary_dataset_similarities
)
(
    grecond_spearman_correlations_per_k,
    grecond_kendall_correlations_per_k,
    grecond_jaccards_per_k,
) = grecond_correlations

In [None]:
# plot neighbor similarities correlations

import plotly.graph_objects as go

fig = go.Figure()

# (legend label, line color, GreConD series, BinaPs series) - one color per metric
correlation_series = [
    (
        "Mean Spearman Correlation",
        "blue",
        grecond_spearman_correlations_per_k,
        binaps_spearman_correlations_per_k,
    ),
    (
        "Mean Kendall Correlation",
        "green",
        grecond_kendall_correlations_per_k,
        binaps_kendall_correlations_per_k,
    ),
    (
        "Mean Jaccard Distance",
        "red",
        grecond_jaccards_per_k,
        binaps_jaccards_per_k,
    ),
]

for series_position, (label, color, grecond_values, binaps_values) in enumerate(
    correlation_series
):
    # GreConD is drawn solid, BinaPs dotted; each method has its own legend group
    grecond_trace_args = dict(
        x=ks,
        y=grecond_values,
        name=label,
        legendgroup="group",
        line=dict(color=color, width=4),
        showlegend=True,
    )
    binaps_trace_args = dict(
        x=ks,
        y=binaps_values,
        name=label,
        legendgroup="group2",
        line=dict(color=color, dash="dot", width=4),
        showlegend=True,
    )

    # The legend group title is set on the first trace of each group only
    if series_position == 0:
        grecond_trace_args["legendgrouptitle_text"] = "GreCond"
        binaps_trace_args["legendgrouptitle_text"] = "BinaPs"

    fig.add_trace(go.Scatter(**grecond_trace_args))
    fig.add_trace(go.Scatter(**binaps_trace_args))

fig.update_layout(height=HEIGHT / 2)
fig.update_xaxes(title="K nearest neighbors")
fig.show()

fig.write_image("top_k_correlation.png", format=FORMAT, scale=DPI / 96)

In [None]:
# Create folds from movie lens 100k

import os
from surprise import Dataset, Reader
from surprise.model_selection import PredefinedKFold


# Pre-split MovieLens 100K files: u1..u5, each with a .base (train) and .test part
fold_files_dir = os.path.expanduser("/workdir/datasets/ml-100k")
folds_files = [
    (f"{fold_files_dir}/u{i}.base", f"{fold_files_dir}/u{i}.test") for i in range(1, 6)
]

reader = Reader("ml-100k")
dataset = Dataset.load_from_folds(folds_files, reader=reader)
pkf = PredefinedKFold()

# Each fold is stored as (fold index, trainset, binary view of the trainset, testset)
folds = []
for index, (trainset, testset) in enumerate(pkf.split(dataset)):
    fold = (index, trainset, BinaryDataset.load_from_trainset(trainset), testset)
    folds.append(fold)

In [None]:
# Load range of experiments

import numpy as np
import pickle

from lib.BooleanMatrixBasedRecomenders import BinapsRecommender, cosine_distance
from fca.FormalConceptAnalysis import construct_context_from_binaps_patterns
from lib.BinapsWrapper import get_patterns_from_weights

# First experiment id of each epoch setting; the five folds of a setting are stored
# under consecutive ids starting there.
EPOCH_START_EXPERIMENT_IDS = [56, 61, 66, 71, 76, 81, 86]

experiment_predictions_per_epoch = []
number_of_patterns_per_epoch = []
real_coverages_per_epoch = []

for epoch_start_id in EPOCH_START_EXPERIMENT_IDS:

    fold_ids = [epoch_start_id + i for i in range(5)]

    experiment_predictions = []
    number_of_patterns = []
    real_coverages = []

    # `experiment_id` (not `id`) keeps the builtin id() usable in later cells
    for experiment_id, (index, trainset, fold_binary_dataset, testset) in zip(fold_ids, folds):

        cursor = DATABASE.cursor()
        cursor.execute("SELECT * FROM binaps_experiments WHERE id = ?", (experiment_id,))
        row = cursor.fetchone()

        # fetchone() returns None when the id does not exist; fail with a clear message
        if row is None:
            raise LookupError(f"No experiment with id={experiment_id} in binaps_experiments")

        # Positional unpacking of the SELECT * row. `row_id` / `dataset_name` avoid
        # clobbering the builtin `id` and the surprise `dataset` object.
        (
            row_id,
            dataset_name,
            train_set_size,
            batch_size,
            test_batch_size,
            epochs,
            learning_rate,
            weight_decay,
            gamma,
            seed,
            hidden_dimension,
            serialized_weights,
            serialized_training_losses,
            serialized_test_losses,
            runtime,
        ) = row

        # Only the weights are needed here; the serialized losses are left untouched.
        # SECURITY: pickle.loads can execute arbitrary code - only open databases you trust.
        weights = pickle.loads(serialized_weights)

        patterns = get_patterns_from_weights(weights, 0.4)
        number_of_patterns.append(len(patterns))

        context = construct_context_from_binaps_patterns(fold_binary_dataset, patterns, True)
        Af, Bf = get_factor_matrices_from_concepts(
            context, fold_binary_dataset.shape[0], fold_binary_dataset.shape[1]
        )
        I = np.matmul(Af, Bf)

        # Every cell selected by I must also be set in the fold's own matrix
        assert np.all(fold_binary_dataset._binary_dataset[I])

        # Coverage is measured against the fold the factors were built from. The original
        # divided by the full `binary_dataset`, mixing two different matrices.
        real_coverage = np.count_nonzero(I) / np.count_nonzero(
            fold_binary_dataset._binary_dataset
        )
        real_coverages.append(real_coverage)

        recommender = BinapsRecommender.from_previously_computed_patterns(patterns, k=60, threshold=1, distance_strategy=cosine_distance)
        recommender.fit(trainset)

        fold_predictions = recommender.test(testset)
        experiment_predictions.append(fold_predictions)

    # Average the per-fold figures for this epoch setting
    number_of_patterns_per_epoch.append(mean(number_of_patterns))
    real_coverages_per_epoch.append(mean(real_coverages))
    experiment_predictions_per_epoch.append(experiment_predictions)


print(number_of_patterns_per_epoch)
print(real_coverages_per_epoch)



In [None]:
# pruned_NUM = []

def prune_impossible_predictions(predictions):
    """
    Drop the predictions that are flagged as impossible.

    Each prediction is indexed positionally: element 4 is a details mapping whose
    ``"was_impossible"`` entry marks predictions to discard.

    Args:
        predictions: Iterable of prediction tuples.

    Returns:
        A new list with the remaining predictions, in their original order.
    """
    kept = []
    for prediction in predictions:
        details = prediction[4]
        if details["was_impossible"]:
            continue
        kept.append(prediction)
    return kept

In [None]:
import statistics
from surprise.accuracy import mae, rmse

# Mean MAE / RMSE over the folds of every epoch setting (impossible predictions excluded)
mae_array = []
rmse_array = []

for experiment_predictions in experiment_predictions_per_epoch:
    # Filter each fold once and reuse the result for both metrics
    possible_predictions_per_fold = [
        prune_impossible_predictions(fold_predictions)
        for fold_predictions in experiment_predictions
    ]

    maes = [
        mae(predictions=possible_predictions, verbose=False)
        for possible_predictions in possible_predictions_per_fold
    ]
    rmses = [
        rmse(predictions=possible_predictions, verbose=False)
        for possible_predictions in possible_predictions_per_fold
    ]

    mae_array.append(statistics.mean(maes))
    rmse_array.append(statistics.mean(rmses))

In [None]:
# Mean fraction of test predictions that were possible, over all folds and epoch settings.
# `pruned_NUM` was only ever filled by code that is now commented out, so the original
# expression raised NameError on a fresh run. The counts are recomputed here, and each is
# divided by its own fold size instead of a hard-coded 20000.
possible_fractions = [
    len(prune_impossible_predictions(fold_predictions)) / len(fold_predictions)
    for experiment_predictions in experiment_predictions_per_epoch
    for fold_predictions in experiment_predictions
]
mean(possible_fractions)

In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Epoch counts on the x axis.
# NOTE(review): assumed to line up one-to-one with EPOCH_START_EXPERIMENT_IDS - confirm.
x_array = [500, 1000, 2000, 5000, 10000, 30000, 50000]

# Single panel with two y axes: MAE on the primary, RMSE on the secondary
fig1 = make_subplots(specs=[[{"secondary_y": True}]])

mae_trace = go.Scatter(
    x=x_array,
    y=mae_array,
    mode="lines+markers",
    name="MAE",
    line=dict(color="blue", width=4, dash="dash"),
)
rmse_trace = go.Scatter(
    x=x_array,
    y=rmse_array,
    mode="lines+markers",
    name="RMSE",
    line=dict(color="red", width=4),
)

fig1.add_trace(mae_trace)
fig1.add_trace(rmse_trace, secondary_y=True)

fig1.update_xaxes(title_text="Epochs")
fig1.update_yaxes(title_text="MAE", secondary_y=False)
fig1.update_yaxes(title_text="RMSE", secondary_y=True)

# Grid colors are tinted per axis so each grid can be matched to its trace
fig1.update_layout(
    margin_l=100,
    height=HEIGHT / 2,
    yaxis=dict(gridcolor="lightblue"),
    yaxis2=dict(gridcolor="pink"),
)

fig1.show()

fig1.write_image("mae_rmse_epochs.png", format=FORMAT, scale=DPI / 96)

In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

fig = make_subplots(specs=[[{"secondary_y": True}]])

# (values, legend label, y-axis reference, line style) for the three series
epoch_series = [
    (
        number_of_patterns_per_epoch,
        "Number of mined Concepts",
        "y2",
        dict(color="green", width=4),
    ),
    (
        real_coverages_per_epoch,
        "Original matrix reconstruction (%)",
        "y",
        dict(color="blue", width=4, dash="dot"),
    ),
    (
        rmse_array,
        "RMSE",
        "y3",
        dict(color="red", width=4, dash="dash"),
    ),
]

for series_values, series_label, axis_reference, line_style in epoch_series:
    fig.add_trace(
        go.Scatter(
            x=x_array,
            y=series_values,
            mode="lines+markers",
            name=series_label,
            yaxis=axis_reference,
            line=line_style,
        )
    )

# Three y axes: the x domain is narrowed to leave room for the free-floating
# concept-count axis on the left; RMSE sits on the right.
fig.update_layout(
    xaxis=dict(domain=[0.15, 0.95]),
    yaxis=dict(
        title="Original matrix reconstruction (%)",
        tickformat=".0%",
        gridcolor="lightblue",
    ),
    yaxis2=dict(
        title="Number of mined Concepts",
        anchor="free",
        overlaying="y",
        side="left",
        position=0.05,
        gridcolor="lightgreen",
    ),
    yaxis3=dict(
        title="RMSE",
        anchor="x",
        overlaying="y",
        side="right",
        gridcolor="pink",
    ),
    height=HEIGHT / 2,
)

fig.update_xaxes(title_text="Epochs")

fig.show()

fig.write_image("reconstruction_x_epochs.png", format=FORMAT, scale=DPI / 96)

In [None]:
# Run binaps




# Load range of experiments

import numpy as np
import pickle

from lib.BooleanMatrixBasedRecomenders import BinapsRecommender, cosine_distance
from fca.FormalConceptAnalysis import construct_context_from_binaps_patterns
from lib.BinapsWrapper import get_patterns_from_weights

# First experiment id of each epoch setting; the five folds of a setting are stored
# under consecutive ids starting there.
EPOCH_START_EXPERIMENT_IDS = [56, 61, 66, 71, 76, 81, 86]

# Neighborhood sizes to sweep
ks = [1, 5, 10, 20, 30, 40, 50, 60]

# Flat (epochs, k, mean RMSE) triples for the 3D plot
x_array = []
y_array = []
z_array = []

for epoch_start_id in EPOCH_START_EXPERIMENT_IDS:

    fold_ids = [epoch_start_id + i for i in range(5)]

    # The rows do not depend on k, so fetch each fold's row once per epoch setting
    # instead of once per (k, fold) pair.
    fold_rows = []
    for experiment_id in fold_ids:
        cursor = DATABASE.cursor()
        cursor.execute("SELECT * FROM binaps_experiments WHERE id = ?", (experiment_id,))
        row = cursor.fetchone()
        if row is None:
            raise LookupError(f"No experiment with id={experiment_id} in binaps_experiments")
        fold_rows.append(row)

    for k in ks:
        # The original set `recommender.k = k` here, on a recommender leaked from an
        # earlier cell; a fresh recommender with k=k is built for every fold below.

        experiment_rmse = []

        for row, (index, trainset, fold_binary_dataset, testset) in zip(fold_rows, folds):

            # Positional unpacking of the SELECT * row. `row_id` / `dataset_name` avoid
            # clobbering the builtin `id` and the surprise `dataset` object.
            (
                row_id,
                dataset_name,
                train_set_size,
                batch_size,
                test_batch_size,
                epochs,
                learning_rate,
                weight_decay,
                gamma,
                seed,
                hidden_dimension,
                serialized_weights,
                serialized_training_losses,
                serialized_test_losses,
                runtime,
            ) = row

            # Unpickled fresh for every run in case downstream helpers modify the weights.
            # SECURITY: pickle.loads can execute arbitrary code - only open databases you trust.
            weights = pickle.loads(serialized_weights)

            patterns = get_patterns_from_weights(weights, 0.7)

            recommender = BinapsRecommender.from_previously_computed_patterns(patterns, k=k, threshold=1, distance_strategy=cosine_distance)
            recommender.fit(trainset)

            fold_predictions = recommender.test(testset)
            # NOTE(review): impossible predictions are not pruned here, unlike the
            # MAE/RMSE-per-epoch cell - confirm this is intended.
            fold_rmse = rmse(predictions=fold_predictions)
            experiment_rmse.append(fold_rmse)

        # `epochs` comes from the last fold's row of this epoch setting
        x_array.append(epochs)
        y_array.append(k)
        z_array.append(statistics.mean(experiment_rmse))


In [None]:
import plotly.graph_objects as go

# 3D view of mean RMSE over (epochs, k): a colored mesh through the measured points,
# with the points themselves overlaid as yellow markers.
fig = go.Figure(
    data=[
        go.Mesh3d(
            x=x_array,
            y=y_array,
            z=z_array,
            # opacity=0.6,
            intensity=z_array,  # color the surface by the RMSE value itself
            # colorscale="Viridis",
            colorscale=[[0, "cyan"], [0.05, "blue"], [1.0, "purple"]],
            colorbar_title="RMSE",
        ),
        go.Scatter3d(
            x=x_array,
            y=y_array,
            z=z_array,
            mode="markers",
            marker=dict(size=4, color="yellow"),
            hoverinfo="none",
        ),
    ]
)


fig.update_layout(
    scene=dict(
        xaxis_title="Epochs",
        yaxis_title="K",
        zaxis_title="RMSE",
        # xaxis=dict(autorange="reversed"),  # Reversing the x-axis
        yaxis=dict(autorange="reversed"),  # Reverse the scene's y-axis (K)
    ),
    width=500,
    height=HEIGHT / 2,
    # Fixed camera so the exported image always shows the same viewpoint
    scene_camera=dict(
        up=dict(x=0, y=0, z=1), center=dict(x=-0.2, y=0, z=-0.1), eye=dict(x=-1.25, y=-1., z=1.5)
    ),
)

# NOTE(review): update_yaxes targets the 2D layout.yaxis, not scene.yaxis; the scene's
# y-axis is already reversed above, so this call is probably redundant - confirm.
fig.update_yaxes(autorange="reversed")

fig.show()

fig.write_image("rmse_epochs_k.png", format=FORMAT, scale=DPI / 96)

In [None]:
# Run binaps


# Load range of experiments

import numpy as np
import pickle

from lib.BooleanMatrixBasedRecomenders import BinapsRecommender, cosine_distance
from fca.FormalConceptAnalysis import construct_context_from_binaps_patterns
from lib.BinapsWrapper import get_patterns_from_weights

# First experiment id of each epoch setting; the five folds of a setting are stored
# under consecutive ids starting there.
EPOCH_START_EXPERIMENT_IDS = [56, 61, 66, 71, 76, 81, 86]

# Thresholds passed to get_patterns_from_weights
thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7]

# Flat (epochs, threshold, mean RMSE) triples for the 3D plot
x_array = []
y_array = []
z_array = []

for epoch_start_id in EPOCH_START_EXPERIMENT_IDS:

    fold_ids = [epoch_start_id + i for i in range(5)]

    # The rows do not depend on the threshold, so fetch each fold's row once per epoch
    # setting instead of once per (threshold, fold) pair.
    fold_rows = []
    for experiment_id in fold_ids:
        cursor = DATABASE.cursor()
        cursor.execute("SELECT * FROM binaps_experiments WHERE id = ?", (experiment_id,))
        row = cursor.fetchone()
        if row is None:
            raise LookupError(f"No experiment with id={experiment_id} in binaps_experiments")
        fold_rows.append(row)

    for threshold in thresholds:

        experiment_rmse = []

        for row, (index, trainset, fold_binary_dataset, testset) in zip(fold_rows, folds):

            # Positional unpacking of the SELECT * row. `row_id` / `dataset_name` avoid
            # clobbering the builtin `id` and the surprise `dataset` object.
            (
                row_id,
                dataset_name,
                train_set_size,
                batch_size,
                test_batch_size,
                epochs,
                learning_rate,
                weight_decay,
                gamma,
                seed,
                hidden_dimension,
                serialized_weights,
                serialized_training_losses,
                serialized_test_losses,
                runtime,
            ) = row

            # Unpickled fresh for every threshold in case the pattern extraction
            # modifies the weights in place.
            # SECURITY: pickle.loads can execute arbitrary code - only open databases you trust.
            weights = pickle.loads(serialized_weights)

            patterns = get_patterns_from_weights(weights, threshold)

            recommender = BinapsRecommender.from_previously_computed_patterns(patterns, k=60, threshold=1, distance_strategy=cosine_distance)
            recommender.fit(trainset)

            fold_predictions = recommender.test(testset)
            # NOTE(review): impossible predictions are not pruned here, unlike the
            # MAE/RMSE-per-epoch cell - confirm this is intended.
            fold_rmse = rmse(predictions=fold_predictions)
            experiment_rmse.append(fold_rmse)

        # `epochs` comes from the last fold's row of this epoch setting
        x_array.append(epochs)
        y_array.append(threshold)
        z_array.append(statistics.mean(experiment_rmse))



# import numpy as np

# from lib.BooleanMatrixBasedRecomenders import BinapsRecommender
# from lib.BinapsWrapper import get_patterns_from_weights

# x = 5  # epochs
# x_array = []
# y_array = []
# z_array = []

# predictions = []
# number_of_patterns = []
# real_coverages = []

# thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7]

# for experiment in experiments:
#     (
#         id,
#         dataset,
#         train_set_size,
#         batch_size,
#         test_batch_size,
#         epochs,
#         learning_rate,
#         weight_decay,
#         gamma,
#         seed,
#         hidden_dimension,
#         serialized_weights,
#         serialized_training_losses,
#         serialized_test_losses,
#         runtime,
#     ) = experiment

#     weights = pickle.loads(serialized_weights)
#     training_losses = pickle.loads(serialized_training_losses)
#     test_losses = pickle.loads(serialized_test_losses)

#     for threshold in thresholds:
#         patterns = get_patterns_from_weights(weights, threshold)

#         recommender = BinapsRecommender.from_previously_computed_patterns(
#             patterns, k=60, threshold=1
#         )

#         experiment_rmse = []

#         for index, trainset, testset in folds:
#             recommender.fit(trainset)

#             fold_predictions = recommender.test(testset)
#             fold_rmse = rmse(predictions=fold_predictions)
#             experiment_rmse.append(fold_rmse)

#         x_array.append(experiment[x])
#         y_array.append(threshold)
#         z_array.append(statistics.mean(experiment_rmse))

#     # predictions.append(experiment_predictions)

In [None]:
import plotly.graph_objects as go

# 3D view of mean RMSE over (epochs, binarization threshold): a colored mesh through the
# measured points, with the points themselves overlaid as yellow markers.
fig = go.Figure(
    data=[
        go.Mesh3d(
            x=x_array,
            y=y_array,
            z=z_array,
            opacity=0.9,
            intensity=z_array,  # color the surface by the RMSE value itself
            colorscale=[[0, "cyan"], [0.1, "blue"], [1.0, "purple"]],
            colorbar_title="RMSE",
        ),
        go.Scatter3d(
            x=x_array,
            y=y_array,
            z=z_array,
            mode="markers",
            marker=dict(size=4, color="yellow"),  # Marker size and color
            hoverinfo="none",  # Disable hoverinfo for the markers
        ),
    ]
)

# Initial axis titles; the x and y titles are set again by the next update_layout call
fig.update_layout(
    scene=dict(xaxis_title="Epochs", yaxis_title="Binarization Threshold", zaxis_title="RMSE")
)

fig.update_layout(
    scene=dict(
        xaxis=dict(
            title="Epochs",
            tickmode="array",  # Use specific tick values
            tickvals=[0, 25000, 50000],  # Tick values for the x-axis
        ),
        yaxis=dict(
            title="Binarization<br>Threshold",
            tickmode="array",  # Use specific tick values
            tickvals=[
                0,
                0.2,
                0.4,
                0.6,
                0.8,
            ],  # Tick values for the y-axis
            autorange="reversed",  # Reverse the y-axis (threshold)
        ),
        zaxis_title="RMSE",
        # xaxis=dict(autorange="reversed"),  # Reversing the x-axis
    ),
    width=500,
    height=HEIGHT / 2,
    # Fixed camera so the exported image always shows the same viewpoint
    scene_camera=dict(
        up=dict(x=0, y=0, z=1),
        center=dict(x=0.1, y=0, z=-0.3),
        eye=dict(x=-1.3625, y=-1.3625, z=1.1),
    ),
)

# fig.update_yaxes(autorange="reversed")/

fig.show()

fig.write_image("rmse_epochs_threshold.png", format=FORMAT, scale=DPI / 96)

In [None]:
# Create a trainset from the complete MovieLens 100K dataset
from surprise import Dataset

# NOTE(review): `dataset` and `trainset` are reassigned by the fold-loading cell that
# follows, so these values are only usable until that cell runs.
dataset = Dataset.load_builtin("ml-100k")
trainset = dataset.build_full_trainset()

In [None]:
import itertools
import os
from surprise import Dataset, Reader
from surprise.model_selection import PredefinedKFold
from lib.BooleanMatrixBasedRecomenders import cosine_distance, jaccard_distance, BinaryDataset

# Directory holding the predefined split files u1..u5 (.base = train, .test = test).
fold_files_dir = os.path.expanduser("/workdir/datasets/ml-100k")
folds_files = [
    (f"{fold_files_dir}/u{i}.base", f"{fold_files_dir}/u{i}.test") for i in range(1, 6)
]

reader = Reader("ml-100k")
dataset = Dataset.load_from_folds(folds_files, reader=reader)
pkf = PredefinedKFold()

# One tuple per fold:
# (fold index, trainset, BinaryDataset built from that trainset, testset).
folds = []
for index, (trainset, testset) in enumerate(pkf.split(dataset)):
    folds.append(
        (
            index,
            trainset,
            BinaryDataset.load_from_trainset(trainset),
            testset,
        )
    )

ks = [1, 5, 10, 20, 30, 40, 50, 60]
distance_strategies = [cosine_distance, jaccard_distance]

# Every (distance strategy, fold tuple) combination; each entry is one unit of work.
thread_args = list(itertools.product(distance_strategies, folds))


In [None]:
import statistics
from collections import defaultdict


def get_global_precision(predictions, relevance_threshold=1):
    """
    Returns the global precision, or micro-averaged precision, from a predictions list.

    Precision is defined as the fraction of relevant instances among the retrieved instances. In recommender systems,
    it measures the fraction of items that are liked by the user among the items that are recommended by the system.

    Precision = True Positives / (True Positives + False Positives)

    For example, if you have a recommender system that suggests movies to a user, and you have 100 movies in total,
    10 of which are liked by the user. If your system recommends 8 movies to the user, 4 of which are liked by the user
    (true positives), but also 4 of which are disliked by the user (false positives), then your precision is 4 / (4 + 4)
    = 0.5. This means that 50% of the movies that your system recommended were actually liked by the user.

    Global precision gives equal weight to each item, regardless of which user rated or was recommended it.

    Args:
        predictions: Iterable of 5-tuples ``(user_id, item_id, true_rating, estimate, details)``. Only
            ``true_rating`` and ``estimate`` are read.
        relevance_threshold: A rating or estimate counts as relevant when it is ``>=`` this value.

    Returns:
        float: True positives divided by the number of predictions whose estimate is relevant, or ``0.0`` when
        no estimate is relevant (this includes an empty ``predictions``), where the ratio is undefined.
    """

    def is_relevant(measure):
        return measure >= relevance_threshold

    true_positives = 0
    false_positives = 0

    for _, _, true_rating, estimate, _ in predictions:
        if not is_relevant(estimate):
            # Not recommended: does not take part in precision.
            continue
        if is_relevant(true_rating):
            true_positives += 1
        else:
            false_positives += 1

    retrieved = true_positives + false_positives
    if retrieved == 0:
        # Nothing was recommended; report 0.0 instead of raising ZeroDivisionError.
        return 0.0

    return true_positives / retrieved


def get_user_averaged_precision(predictions, relevance_threshold=1):
    """
    Returns the precision computed separately for each user and then averaged over users.

    A rating or estimate is relevant when it is ``>= relevance_threshold``. For each user, precision is the
    number of predictions where both the estimate and the true rating are relevant, divided by the number of
    predictions whose estimate is relevant. Users with no relevant estimate are left out of the average.

    Args:
        predictions: Iterable of 5-tuples ``(user_id, item_id, true_rating, estimate, details)``.
        relevance_threshold: Relevance cut-off applied to both estimates and true ratings.

    Returns:
        float: Mean of the per-user precisions.

    Raises:
        statistics.StatisticsError: If no user has a relevant estimate (including empty ``predictions``).
    """
    pairs_by_user = defaultdict(list)
    for user_id, _, true_rating, estimate, _ in predictions:
        pairs_by_user[user_id].append((estimate, true_rating))

    per_user_precision = []
    for pairs in pairs_by_user.values():
        # True ratings of the items this user was recommended.
        recommended = [
            true_rating for estimate, true_rating in pairs if estimate >= relevance_threshold
        ]
        if not recommended:
            # Precision is undefined for this user; skip them.
            continue

        hits = sum(1 for true_rating in recommended if true_rating >= relevance_threshold)
        per_user_precision.append(hits / len(recommended))

    return statistics.mean(per_user_precision)


def get_precision_at_k(predictions, relevance_threshold=1, k=20):
    """
    Returns precision@k averaged over users.

    For each user the predictions are ranked by estimate (highest first) and the true ratings of the first
    ``k`` are inspected. The per-user value is the number of those true ratings that are
    ``>= relevance_threshold``, divided by ``k``. The divisor is always ``k``, even for a user with fewer
    than ``k`` predictions.

    Args:
        predictions: Iterable of 5-tuples ``(user_id, item_id, true_rating, estimate, details)``.
        relevance_threshold: Relevance cut-off applied to the true ratings.
        k: Number of top-ranked predictions considered per user.

    Returns:
        float: Mean of the per-user precision@k values.

    Raises:
        statistics.StatisticsError: If ``predictions`` is empty.
    """
    pairs_by_user = defaultdict(list)
    for user_id, _, true_rating, estimate, _ in predictions:
        pairs_by_user[user_id].append((estimate, true_rating))

    per_user_precision = []
    for pairs in pairs_by_user.values():
        top_k = sorted(pairs, key=lambda pair: pair[0], reverse=True)[:k]
        hits = sum(1 for _, true_rating in top_k if true_rating >= relevance_threshold)
        per_user_precision.append(hits / k)

    return statistics.mean(per_user_precision)


def get_global_recall(predictions, relevance_threshold=1):
    """
    Returns the global (micro-averaged) recall from a predictions list.

    Recall is defined as the fraction of relevant instances that were retrieved. In recommender systems,
    it measures the fraction of the items liked by the user that the system actually recommended.

    Recall = True Positives / (True Positives + False Negatives)

    For example, if you have a recommender system that suggests movies to a user, and you have 100 movies in total,
    10 of which are liked by the user. If your system recommends 4 of the liked movies (true positives) and misses
    the other 6 (false negatives), then your recall is 4 / (4 + 6) = 0.4. This means that 40% of the movies that are
    actually liked by the user were recommended by your system.

    Args:
        predictions: Iterable of 5-tuples ``(user_id, item_id, true_rating, estimate, details)``. Only
            ``true_rating`` and ``estimate`` are read.
        relevance_threshold: A rating or estimate counts as relevant when it is ``>=`` this value.

    Returns:
        float: True positives divided by the number of predictions whose true rating is relevant, or ``0.0``
        when no true rating is relevant (this includes an empty ``predictions``), where the ratio is undefined.
    """

    def is_relevant(measure):
        return measure >= relevance_threshold

    true_positives = 0
    false_negatives = 0

    for _, _, true_rating, estimate, _ in predictions:
        if not is_relevant(true_rating):
            # Items the user does not like never count towards recall.
            continue
        if is_relevant(estimate):
            true_positives += 1
        else:
            false_negatives += 1

    relevant = true_positives + false_negatives
    if relevant == 0:
        # No relevant item exists; report 0.0 instead of raising ZeroDivisionError.
        return 0.0

    return true_positives / relevant


def get_user_averaged_recall(predictions, relevance_threshold=1):
    """
    Returns the recall computed separately for each user and then averaged over users.

    A rating or estimate is relevant when it is ``>= relevance_threshold``. For each user, recall is the
    number of predictions where both the estimate and the true rating are relevant, divided by the number of
    predictions whose true rating is relevant. Users with no relevant true rating are left out of the average.

    Args:
        predictions: Iterable of 5-tuples ``(user_id, item_id, true_rating, estimate, details)``.
        relevance_threshold: Relevance cut-off applied to both estimates and true ratings.

    Returns:
        float: Mean of the per-user recalls.

    Raises:
        statistics.StatisticsError: If no user has a relevant true rating (including empty ``predictions``).
    """
    pairs_by_user = defaultdict(list)
    for user_id, _, true_rating, estimate, _ in predictions:
        pairs_by_user[user_id].append((estimate, true_rating))

    per_user_recall = []
    for pairs in pairs_by_user.values():
        # Estimates given to the items this user actually likes.
        liked_estimates = [
            estimate for estimate, true_rating in pairs if true_rating >= relevance_threshold
        ]
        if not liked_estimates:
            # Recall is undefined for this user; skip them.
            continue

        hits = sum(1 for estimate in liked_estimates if estimate >= relevance_threshold)
        per_user_recall.append(hits / len(liked_estimates))

    return statistics.mean(per_user_recall)


def get_recall_at_k(predictions, relevance_threshold=1, k=20):
    """
    Returns recall@k averaged over users.

    For each user the predictions are ranked by estimate (highest first). The per-user value is the number of
    relevant true ratings among the first ``k`` divided by the number of relevant true ratings among all of the
    user's predictions, where relevant means ``>= relevance_threshold``. Users with no relevant true rating are
    left out of the average.

    Args:
        predictions: Iterable of 5-tuples ``(user_id, item_id, true_rating, estimate, details)``.
        relevance_threshold: Relevance cut-off applied to the true ratings.
        k: Number of top-ranked predictions considered per user.

    Returns:
        float: Mean of the per-user recall@k values.

    Raises:
        statistics.StatisticsError: If no user has a relevant true rating (including empty ``predictions``).
    """
    pairs_by_user = defaultdict(list)
    for user_id, _, true_rating, estimate, _ in predictions:
        pairs_by_user[user_id].append((estimate, true_rating))

    per_user_recall = []
    for pairs in pairs_by_user.values():
        ranked = sorted(pairs, key=lambda pair: pair[0], reverse=True)

        total_relevant = sum(1 for _, true_rating in ranked if true_rating >= relevance_threshold)
        if total_relevant == 0:
            # Recall is undefined for this user; skip them.
            continue

        hits_in_top_k = sum(
            1 for _, true_rating in ranked[:k] if true_rating >= relevance_threshold
        )
        per_user_recall.append(hits_in_top_k / total_relevant)

    return statistics.mean(per_user_recall)

In [None]:
from lib.BooleanMatrixBasedRecomenders import FcaBmf


def work(distance_strategy, dataset_tuple):
    """
    Fit one FcaBmf recommender on the trainset of a single fold.

    Args:
        distance_strategy: Passed unchanged to ``FcaBmf(distance_strategy=...)``.
        dataset_tuple: 4-tuple ``(index, trainset, binary_dataset, testset)``; only ``index`` and
            ``trainset`` are used.

    Returns:
        dict: ``{"index": index, "distance_strategy": distance_strategy, "algo": fitted FcaBmf}``.
    """
    index, trainset, _binary_dataset, _testset = dataset_tuple

    algo = FcaBmf(distance_strategy=distance_strategy)
    algo.fit(trainset)

    return {
        "index": index,
        "distance_strategy": distance_strategy,
        "algo": algo,
    }


In [None]:
from multiprocessing import Pool

# Run work() once per entry of thread_args in a pool of 16 worker processes;
# starmap unpacks each (distance_strategy, fold tuple) entry into work()'s two
# positional arguments and returns the result dicts in input order.
# NOTE(review): the pool size is hard-coded — confirm it suits the machine.
with Pool(16) as pool:
    raw_results = pool.starmap(work, iterable=thread_args)

In [None]:
# consolidated[fold index][recommender key] -> fitted recommender.
# The five fold indices 0..4 are created up front.
consolidated = {fold_index: {} for fold_index in range(5)}

for result in raw_results:
    # Key is "fcabmf_" + the distance function's name, e.g. "fcabmf_cosine_distance".
    strategy_name = result["distance_strategy"].__name__
    consolidated[result["index"]][f"fcabmf_{strategy_name}"] = result["algo"]

print(consolidated)

In [None]:
from surprise.prediction_algorithms import KNNBasic

# Fit one Surprise KNNBasic recommender (cosine similarity) per fold and store it
# under the "knn" key of that fold's entry in `consolidated`.
# k=5 is only the value used at construction; the evaluation cell later assigns
# a new `k` to every stored recommender before testing it.
for index, trainset, binary_dataset, testset in folds:
    KNN_recommender = KNNBasic(k=5, sim_options={"name": "cosine"})
    KNN_recommender.fit(trainset)

    consolidated[index]["knn"] = KNN_recommender

In [None]:
from lib.BooleanMatrixBasedRecomenders import BinapsRecommender

# for index, trainset, testset in folds:
#     binaps_recommender = BinapsRecommender(
#         epochs=50000, binarization_threshold=0.4, distance_strategy=jaccard_distance
#     )
#     binaps_recommender.fit(trainset)
#     consolidated[index]["binaps_recommender_jaccard"] = binaps_recommender

#     binaps_recommender = BinapsRecommender(
#         epochs=50000, binarization_threshold=0.4, distance_strategy=cosine_distance
#     )
#     binaps_recommender.fit(trainset)
#     consolidated[index]["binaps_recommender_cosine"] = binaps_recommender

import numpy as np
import pickle

from lib.BooleanMatrixBasedRecomenders import BinapsRecommender, cosine_distance, jaccard_distance
from fca.FormalConceptAnalysis import construct_context_from_binaps_patterns
from lib.BinapsWrapper import get_patterns_from_weights





# Ids of the binaps_experiments rows to load, paired with the folds in order
# (row 86 with the first fold, ..., row 90 with the fifth).
fold_ids = [86 + i for i in range(5)]

# (key in `consolidated`, distance strategy), in the order the keys are inserted.
binaps_variants = (
    ("binaps_recommender_jaccard", jaccard_distance),
    ("binaps_recommender_cosine", cosine_distance),
)

# `experiment_id` is used instead of `id` so the builtin is not shadowed.
for experiment_id, (index, trainset, fold_binary_dataset, testset) in zip(fold_ids, folds):

    # Connection.execute() opens a cursor, runs the statement and returns the cursor.
    row = DATABASE.execute(
        "SELECT * FROM binaps_experiments WHERE id = ?", (experiment_id,)
    ).fetchone()
    if row is None:
        # fetchone() returns None when no row matches; fail with a clear message
        # instead of an unpacking TypeError.
        raise LookupError(f"No row with id {experiment_id} in binaps_experiments")

    # Columns in the order returned by SELECT *. The first two are bound to
    # underscore names so they do not overwrite the loop id or the notebook-level
    # `dataset` (the Surprise dataset).
    (
        _row_id,
        _dataset_name,
        train_set_size,
        batch_size,
        test_batch_size,
        epochs,
        learning_rate,
        weight_decay,
        gamma,
        seed,
        hidden_dimension,
        serialized_weights,
        serialized_training_losses,
        serialized_test_losses,
        runtime,
    ) = row

    # NOTE(review): pickle.loads can execute arbitrary code; only run this against a
    # database whose contents you trust.
    weights = pickle.loads(serialized_weights)
    # The losses are not used below; they are still decoded so they stay available
    # in the kernel, as before.
    training_losses = pickle.loads(serialized_training_losses)
    test_losses = pickle.loads(serialized_test_losses)

    patterns = get_patterns_from_weights(weights, 0.7)

    # Build one recommender per distance strategy from the same patterns.
    for result_key, distance_strategy in binaps_variants:
        recommender = BinapsRecommender.from_previously_computed_patterns(
            patterns, k=60, threshold=1, distance_strategy=distance_strategy
        )
        recommender.fit(trainset)

        consolidated[index][result_key] = recommender

In [None]:
from surprise.accuracy import mae, rmse


# Values assigned to each recommender's `k` attribute before it is tested.
ks = [1, 5, 10, 20, 30, 40, 50, 60]

RELEVANCE_THRESHOLD = 3  # ratings/estimates >= this value count as relevant
NUMBER_OF_TOP_RECOMMENDATIONS = 20  # cut-off passed as `k` to the @k metrics

recommenders = consolidated[0].keys()

# results[recommender][k][metric name] -> list with one value per fold.
results = {recommender: {k: defaultdict(list) for k in ks} for recommender in recommenders}

for recommender in recommenders:
    for k in ks:
        fold_scores = results[recommender][k]

        for index, trainset, _, testset in folds:
            recommender_object = consolidated[index][recommender]
            recommender_object.k = k
            predictions = prune_impossible_predictions(recommender_object.test(testset))

            # Keyword arguments shared by the threshold-based metrics.
            thresholded = dict(predictions=predictions, relevance_threshold=RELEVANCE_THRESHOLD)
            top_n = dict(thresholded, k=NUMBER_OF_TOP_RECOMMENDATIONS)

            # Error metrics.
            fold_scores["maes"].append(mae(predictions=predictions, verbose=False))
            fold_scores["rmses"].append(rmse(predictions=predictions, verbose=False))

            # Recall metrics.
            fold_scores["global_recalls"].append(get_global_recall(**thresholded))
            fold_scores["user_averaged_recalls"].append(get_user_averaged_recall(**thresholded))
            fold_scores["recalls_at_k"].append(get_recall_at_k(**top_n))

            # Precision metrics.
            fold_scores["global_precisions"].append(get_global_precision(**thresholded))
            fold_scores["user_averaged_precisions"].append(
                get_user_averaged_precision(**thresholded)
            )
            fold_scores["precisions_at_k"].append(get_precision_at_k(**top_n))


# print(results)

In [None]:
import statistics

# One curve per recommender for each metric: curve[recommender] is a list with
# one point (the mean over folds) per value of k, in the order of `ks`.
mae_curves = defaultdict(list)
rmse_curves = defaultdict(list)

global_recall_curves = defaultdict(list)
user_averaged_recall_curves = defaultdict(list)
recalls_at_k_curve = defaultdict(list)

global_precision_curves = defaultdict(list)
user_averaged_precision_curves = defaultdict(list)
precisions_at_k_curve = defaultdict(list)

# (curve to fill, key of the per-fold values in `results`).
curve_sources = (
    (rmse_curves, "rmses"),
    (mae_curves, "maes"),
    (global_recall_curves, "global_recalls"),
    (user_averaged_recall_curves, "user_averaged_recalls"),
    (recalls_at_k_curve, "recalls_at_k"),
    (global_precision_curves, "global_precisions"),
    (user_averaged_precision_curves, "user_averaged_precisions"),
    (precisions_at_k_curve, "precisions_at_k"),
)

for recommender in recommenders:
    for k in ks:
        for curve, metric_name in curve_sources:
            curve[recommender].append(statistics.mean(results[recommender][k][metric_name]))

In [None]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# for recommender in recommenders:


#     fig.add_trace(go.Scatter(x=ks, y=mae_curves[recommender], mode='lines+markers', name=recommender, showlegend=False), row=1, col=1)
#     fig.add_trace(go.Scatter(x=ks, y=rmse_curves[recommender], mode='lines+markers', name=recommender, showlegend=False), row=1, col=2)

#     fig.add_trace(go.Scatter(x=ks, y=global_recall_curves[recommender], mode='lines+markers', name=recommender, showlegend=False), row=2, col=1)
#     fig.add_trace(go.Scatter(x=ks, y=user_averaged_recall_curves[recommender], mode='lines+markers', name=recommender, showlegend=False), row=2, col=2)

#     fig.add_trace(go.Scatter(x=ks, y=global_precision_curves[recommender], mode='lines+markers', name=recommender, showlegend=False), row=3, col=1)
#     fig.add_trace(go.Scatter(x=ks, y=user_averaged_precision_curves[recommender], mode='lines+markers', name=recommender, showlegend=False), row=3, col=2)

#     fig.add_trace(go.Scatter(x=ks, y=recalls_at_k_curve[recommender], mode='lines+markers', name=recommender, showlegend=False), row=4, col=1)
#     fig.add_trace(go.Scatter(x=ks, y=precisions_at_k_curve[recommender], mode='lines+markers', name=recommender, showlegend=False), row=4, col=2)


# fig.show()
# Legend text for each recommender key (the keys used in `consolidated`).
label = {
    "fcabmf_cosine_distance": "GreConD / Cosine Similarity",
    "fcabmf_jaccard_distance": "GreConD / Jaccard Distance",
    "knn": "KNN / Cosine Similarity",
    "binaps_recommender_jaccard": "BinaPs / Jaccard Distance",
    "binaps_recommender_cosine": "BinaPs / Cosine Similarity",
}

# Line style for each recommender key, passed as `line=` to go.Scatter.
# One colour per recommender family; dotted lines are the cosine variants and
# solid lines the Jaccard variants.
colors = {
    "fcabmf_cosine_distance": dict(color="red", dash="dot", width=4),
    "fcabmf_jaccard_distance": dict(color="red", dash="solid", width=4),
    "knn": dict(color="green", dash="dot", width=4),
    "binaps_recommender_jaccard": dict(color="blue", dash="solid", width=4),
    "binaps_recommender_cosine": dict(color="blue", dash="dot", width=4),
}


def generate_plot(data, title):
    """
    Plot one metric against K with one line per recommender, show it and save it as an image.

    Args:
        data: Mapping from recommender key to the list of y values, one per entry of the notebook-level ``ks``.
        title: Used as the y-axis title and as the stem of the output file name (``f"{title}.png"``).

    Returns:
        None
    """
    traces = [
        go.Scatter(
            x=ks,
            y=data[recommender],
            name=label[recommender],
            line=colors[recommender],
        )
        for recommender in recommenders
    ]
    fig = go.Figure(data=traces)

    fig.update_traces(showlegend=True)
    fig.update_layout(height=HEIGHT / 2, margin_l=80)
    fig.update_xaxes(title="K")
    fig.update_yaxes(title=title)

    # Render inline as a static PNG, then write the same figure to disk.
    fig.show(renderer="png")
    fig.write_image(f"{title}.png", format=FORMAT, scale=DPI / 96)


# One figure (and one "<title>.png" file) per metric, in this order.
for curves, axis_title in (
    (mae_curves, "MAE"),
    (rmse_curves, "RMSE"),
    (global_recall_curves, "Global Recall"),
    (user_averaged_recall_curves, "User Averaged Recall"),
    (global_precision_curves, "Global Precision"),
    (user_averaged_precision_curves, "User Averaged Precision"),
    (recalls_at_k_curve, "Recall@N"),
    (precisions_at_k_curve, "Precision@N"),
):
    generate_plot(curves, axis_title)

# fig.write_image("top_k_correlation.png", format=FORMAT, scale=DPI/96)