In [2]:
import pandas as pd
import numpy as np

In [3]:
readers = pd.read_csv("../data/readers.csv")
readers = readers.rename(columns={"id":"user_id", "art_id":"nzz_id"})
readers.head()

Unnamed: 0,user_id,nzz_id
0,1,ld.154103
1,1,ld.142559
2,1,1.18331199
3,1,ld.144819
4,1,ld.1293110


In [4]:
read_counts = readers["user_id"].value_counts(sort=True)
read_counts = read_counts.rename_axis("user_id").reset_index(name="read_count")

# Biorę pod uwagę tylko użytkowników, którzy przeczytali minimum 5 artykułów
min_read_count = 5
read_counts = read_counts[read_counts["read_count"] > min_read_count]

readers = readers[readers["user_id"].isin(read_counts["user_id"])]

In [5]:
def bootstrap_split(readers):
    all_samples = []
    for user in readers["user_id"].unique():
        all_samples.append(readers[readers["user_id"] == user].sample(3, replace=True))

    df_test = pd.concat(all_samples)
    df_train = pd.merge(readers, df_test, how='outer', indicator=True)
    df_train = df_train.loc[df_train["_merge"] == "left_only", ["user_id", "nzz_id"]]
    return df_train, df_test

In [6]:
import sys
sys.path.append('../code')
from cf_model import CFModel
from model_evaluator import ModelEvaluator
from random_model import RandomModel
from implicit_model import ImplicitModel
model_evaluator = ModelEvaluator(k_list = [10])

In [7]:
#cf_recommender_model = CFModel(n_latent_factors=200)
cf_recommender_model = ImplicitModel(n_latent_factors=500, regularization=150, alpha=50, iterations=10)

In [9]:
bootstrap_results = []
for i in range(2):
    cf_recommender_model = ImplicitModel(n_latent_factors=500, regularization=150, alpha=50, iterations=10)
    train, test = bootstrap_split(readers)
    print(f"training #{i}...", flush=True)
    cf_recommender_model.fit(train)
    print(f"evaluating #{i}...", flush=True)
    cf_global_metrics, cf_detailed_results_df = model_evaluator.evaluate_model(cf_recommender_model, readers, train, test, interactions=0)
    bootstrap_results.append(cf_global_metrics)

training #0...
100%|██████████| 10/10 [00:03<00:00,  2.68it/s]evaluating #0...

979 users processed
training #1...
100%|██████████| 10/10 [00:09<00:00,  1.01it/s]evaluating #1...

979 users processed


In [10]:
import json

with open('../output/bootstrap.json', 'w') as fp:
    json.dump(bootstrap_results, fp)