In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold,train_test_split
from collections import defaultdict
import matplotlib.pyplot as plt

from models.simple import SimpleRS
from models.max_coverage import MaxCoverageRS

In [2]:
# Load the ratings table from the ml-latest-small dataset directory (path is
# relative to the notebook's working directory). The evaluation code below
# groups this frame by its 'userId' and 'movieId' columns.
ratings_df = pd.read_csv('datasets/ml-latest-small/ratings.csv', encoding='latin-1')

In [3]:
def print_metrics(metrics):
    """Print one line per metric: its name, then its values joined by " & ".

    Args:
        metrics: sequence of dicts mapping a metric name to a numeric value,
            one dict per evaluation run. Values are shown with four decimals.

    Metric rows appear in the order in which each name is first seen. An empty
    ``metrics`` sequence prints nothing, and a metric that only shows up in a
    later dict gets its own (shorter) row instead of raising ``KeyError``.
    """
    values_by_metric = {}
    for run_metrics in metrics:
        for name, value in run_metrics.items():
            values_by_metric.setdefault(name, []).append(value)

    for name, values in values_by_metric.items():
        # The ".4f" format spec already rounds to four decimals, so no separate
        # round() pass is needed beforehand.
        print(name, " & ".join(f"{value:.4f}" for value in values))

def run(model_class, data, start=5, stop=11, test_size=0.2, random_state=None):
    """Evaluate ``model_class`` once for every integer in ``range(start, stop)``.

    For each loop value ``i`` the data is split into train and test parts, a
    fresh ``model_class(i)`` is fitted on the training part, and its
    predictions for the test part are scored with ``model.calculate_metrics``
    against the per-user lists of movie ids found in the test part.

    Args:
        model_class: callable that accepts the loop value and returns an object
            exposing ``fit``, ``predict`` and ``calculate_metrics``.
            NOTE(review): the loop value is presumably the recommendation-list
            size -- confirm against the model classes.
        data: DataFrame with at least ``userId`` and ``movieId`` columns.
        start: first loop value (inclusive).
        stop: end of the loop range (exclusive).
        test_size: forwarded to ``train_test_split``; the default of 0.2 is the
            value that used to be hard-coded here.
        random_state: forwarded to ``train_test_split``. ``None`` (default)
            keeps the previous unseeded behaviour. An int makes every iteration
            use the same split, so results are reproducible and differences
            between loop values are not mixed with split-to-split variance.

    Returns:
        A list holding one ``calculate_metrics`` result per loop value, in
        loop order.
    """
    metrics = []

    for i in range(start, stop):
        train_df, test_df = train_test_split(
            data, test_size=test_size, random_state=random_state
        )

        # Ground truth: one list of test-set movie ids per user. groupby sorts
        # its keys by default, so the lists are in ascending userId order.
        # NOTE(review): assumes model.predict returns its per-user results in
        # that same order -- confirm against the model implementations.
        true = test_df.groupby('userId')['movieId'].apply(list).tolist()

        model = model_class(i)
        model.fit(train_df)

        predictions = model.predict(test_df)
        metrics.append(model.calculate_metrics(predictions, true))
    return metrics

# The train/test splits made inside run() are drawn from NumPy's global RNG
# when no random_state is given. Re-seeding before each evaluation makes the
# numbers below reproducible on a fresh "Run All" and starts both models from
# the same first split. Later splits also match as long as the models do not
# themselves draw from the global RNG.
SEED = 42

np.random.seed(SEED)
topk_metrics = run(SimpleRS, ratings_df)

np.random.seed(SEED)
mc_metrics = run(MaxCoverageRS, ratings_df)

In [7]:
# SimpleRS results: one row per metric, one " & "-separated value per loop
# value evaluated by run() (5 through 10 with its default start/stop).
print_metrics(topk_metrics)

accuracy 0.0990 & 0.0888 & 0.0959 & 0.0918 & 0.0887 & 0.0919
coverage 0.3902 & 0.3770 & 0.4795 & 0.4902 & 0.5295 & 0.5428
novelty 0.0000 & 0.0000 & 0.0000 & 0.0000 & 0.0000 & 0.0000
diversity 0.8255 & 0.7787 & 0.8180 & 0.8922 & 0.7503 & 0.8357


In [8]:
# MaxCoverageRS results: one row per metric, one " & "-separated value per loop
# value evaluated by run() (5 through 10 with its default start/stop).
print_metrics(mc_metrics)

accuracy 0.0961 & 0.0883 & 0.0806 & 0.0854 & 0.0778 & 0.0826
coverage 0.3672 & 0.3820 & 0.4095 & 0.4663 & 0.4705 & 0.5230
novelty 0.0000 & 0.0000 & 0.0000 & 0.0000 & 0.0000 & 0.0000
diversity 0.8303 & 0.8469 & 0.9558 & 0.8164 & 0.9327 & 0.8723
