In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold,train_test_split
from collections import defaultdict
import matplotlib.pyplot as plt

from models.simple import SimpleRS
from models.max_coverage import MaxCoverageRS
from tools.print import print_metrics

In [2]:
# Load the ratings table from a path relative to the notebook's working
# directory. Later cells group this frame by its 'userId' column and collect
# its 'movieId' values.
ratings_df = pd.read_csv('datasets/ml-latest-small/ratings.csv', encoding='latin-1')

In [3]:
def run(model_class, data, start=5, stop=11, random_state=None):
    """Evaluate ``model_class`` once for every integer in ``range(start, stop)``.

    On each iteration ``data`` is split 80/20 into train and test parts, a
    model is built as ``model_class(i)``, fitted on the train part, asked to
    predict for the test part, and scored with its own ``calculate_metrics``.

    Parameters
    ----------
    model_class : callable
        Called with the current loop value as its only argument. The returned
        object must provide ``fit``, ``predict`` and ``calculate_metrics``.
    data : pandas.DataFrame
        Interaction table; must contain ``userId`` and ``movieId`` columns.
    start, stop : int
        Bounds of the half-open range of values passed to ``model_class``.
    random_state : int or None, default None
        Forwarded to ``train_test_split``. ``None`` keeps the previous
        behaviour (a fresh, unseeded split on every iteration). An integer
        makes the run reproducible and gives every iteration the same split,
        so results for different loop values are directly comparable.

    Returns
    -------
    list
        One entry per loop value, in order; each entry is whatever
        ``model.calculate_metrics`` returned.
    """
    metrics = []

    for i in range(start, stop):
        train_df, test_df = train_test_split(
            data, test_size=0.2, random_state=random_state
        )

        # Ground truth: one list of held-out movieIds per user, ordered by
        # userId (groupby sorts its keys by default).
        # NOTE(review): this assumes model.predict returns its rows in the same
        # per-user order -- confirm against the model implementations.
        true = test_df.groupby('userId')['movieId'].apply(list).tolist()

        model = model_class(i)
        model.fit(train_df)

        predictions = model.predict(test_df)
        metrics.append(model.calculate_metrics(predictions, true))
    return metrics

# Evaluate both recommenders over run's default range (start=5, stop=11).
# No seed is passed, so each call draws its own random train/test splits:
# the two result sets are not computed on identical data and will differ
# between executions of this cell.
topk_metrics = run(SimpleRS, ratings_df)
mc_metrics = run(MaxCoverageRS, ratings_df)

In [4]:
# Print the SimpleRS results; the output below shows one row per metric with
# one '&'-separated value per evaluated loop value.
print_metrics(topk_metrics)

accuracy 0.1000 & 0.1036 & 0.0927 & 0.0826 & 0.0814 & 0.0816
coverage 0.3803 & 0.4475 & 0.4696 & 0.4377 & 0.4762 & 0.5393
novelty 0.0000 & 0.0000 & 0.0000 & 0.0000 & 0.0000 & 0.0000
diversity 0.9181 & 0.8645 & 0.8857 & 0.9462 & 0.8598 & 0.9382


In [5]:
# Print the MaxCoverageRS results in the same row-per-metric layout, for
# side-by-side comparison with the SimpleRS output.
print_metrics(mc_metrics)

accuracy 0.0898 & 0.0872 & 0.0856 & 0.0922 & 0.0786 & 0.0670
coverage 0.3475 & 0.4131 & 0.4253 & 0.4984 & 0.4647 & 0.4548
novelty 0.0000 & 0.0000 & 0.0000 & 0.0000 & 0.0000 & 0.0000
diversity 0.8678 & 0.7793 & 0.8416 & 0.8710 & 0.8781 & 0.9605
