In [1]:
import pandas as pd
import numpy as np
import random


In [2]:
readers = pd.read_csv("../data/readers.csv")
readers = readers.rename(columns={"id":"user_id", "art_id":"nzz_id"})
readers.head()

Unnamed: 0,user_id,nzz_id
0,1,ld.154103
1,1,ld.142559
2,1,1.18331199
3,1,ld.144819
4,1,ld.1293110


In [3]:
read_counts = readers["user_id"].value_counts(sort=True)
read_counts = read_counts.rename_axis("user_id").reset_index(name="read_count")

# Biorę pod uwagę tylko użytkowników, którzy przeczytali minimum 5 artykułów
min_read_count = 3
read_counts = read_counts[read_counts["read_count"] > min_read_count]

readers = readers[readers["user_id"].isin(read_counts["user_id"])]

In [4]:
# Train/Test split
from sklearn.model_selection import train_test_split

readers_train, readers_test = train_test_split(readers,
                                   stratify=readers["user_id"], 
                                   test_size=0.20,
                                   random_state=123)

print(f"Train set size {len(readers_train)}")
print(f"Test set size {len(readers_test)}")

Train set size 22284
Test set size 5571


In [5]:
import sys
sys.path.append('../code')
from cf_model import CFModel


In [9]:
from model_evaluator import ModelEvaluator
model_evaluator = ModelEvaluator()

In [7]:
from random_model import RandomModel

In [10]:

cf_recommender_model = CFModel(n_latent_factors=100)
cf_recommender_model.fit(readers_train)

In [11]:

from datetime import datetime
print('Evaluating Collaborative Filtering (SVD Matrix Factorization) model...')
start = datetime.now()
cf_global_metrics, cf_detailed_results_df = model_evaluator.evaluate_model(cf_recommender_model, readers, readers_train, readers_test)
end = datetime.now()
print(f"eval time {end - start}")
print('\nGlobal metrics:\n%s' % cf_global_metrics)
cf_detailed_results_df.head(10)

Evaluating Collaborative Filtering (SVD Matrix Factorization) model...
999 users processed
eval time 0:00:26.820241

Global metrics:
{'modelName': 'CF_model', 'recall@5': 0.21881170346436907, 'recall@10': 0.29204810626458444}


Unnamed: 0,hits@5_count,hits@10_count,interacted_count,recall@5,recall@10,_person_id
358,1,1,10,0.1,0.1,907
721,2,2,10,0.2,0.2,938
510,0,2,10,0.0,0.2,64
86,0,1,10,0.0,0.1,887
484,0,0,10,0.0,0.0,397
472,1,2,10,0.1,0.2,218
259,1,2,10,0.1,0.2,273
264,1,3,10,0.1,0.3,803
797,1,2,10,0.1,0.2,865
276,0,1,10,0.0,0.1,443


In [16]:
articles = pd.read_csv("../data/articles_cleaned.csv")
random_model = RandomModel()
random_model.fit(articles)

from datetime import datetime
print('Evaluating Collaborative Filtering (SVD Matrix Factorization) model...')
start = datetime.now()
cf_global_metrics, cf_detailed_results_df = model_evaluator.evaluate_model(random_model)
end = datetime.now()
print(f"eval time {end - start}")
print('\nGlobal metrics:\n%s' % cf_global_metrics)
cf_detailed_results_df.head(10)

Evaluating Collaborative Filtering (SVD Matrix Factorization) model...
999 users processed
eval time 0:01:00.230484

Global metrics:
{'modelName': 'random_model', 'recall@5': 0.04792676359719979, 'recall@10': 0.09262250942380183}


Unnamed: 0,hits@5_count,hits@10_count,interacted_count,recall@5,recall@10,_person_id
358,0,0,10,0.0,0.0,907
721,0,1,10,0.0,0.1,938
510,0,2,10,0.0,0.2,64
86,0,0,10,0.0,0.0,887
484,0,0,10,0.0,0.0,397
472,0,0,10,0.0,0.0,218
259,0,0,10,0.0,0.0,273
264,0,1,10,0.0,0.1,803
797,0,0,10,0.0,0.0,865
276,0,0,10,0.0,0.0,443


In [17]:
cf_recommender_model.recommend(221, topn=5).sort_values(by=["recommendation_strength"], ascending=False)

Unnamed: 0,nzz_id,recommendation_strength
0,ld.138721,0.329429
1,ld.152339,0.194284
2,ld.153813,0.192678
3,ld.150619,0.19075
4,ld.149510,0.190037


In [18]:
cf_detailed_results_df.sort_values(by=["recall@5"], ascending=False).head(20)

Unnamed: 0,hits@5_count,hits@10_count,interacted_count,recall@5,recall@10,_person_id
845,1,1,1,1.0,1.0,474
791,1,1,2,0.5,0.5,183
663,1,1,2,0.5,0.5,604
378,1,1,2,0.5,0.5,590
994,1,1,2,0.5,0.5,128
741,1,1,2,0.5,0.5,539
64,1,1,2,0.5,0.5,672
836,1,1,2,0.5,0.5,120
293,1,1,2,0.5,0.5,747
223,1,1,2,0.5,0.5,152


In [19]:
recommended_items = cf_recommender_model.recommend(17)

recommended_items.head(3)

Unnamed: 0,nzz_id,recommendation_strength
0,1.18108994,0.710432
1,ld.137200,0.447095
2,ld.146967,0.207135


In [20]:
articles = pd.read_csv("../data/articles_cleaned.csv")
articles = articles[articles["nzz_id"].isin(recommended_items["nzz_id"])]
articles

Unnamed: 0,nzz_id,author,catchline,department,lead_text,pub_date,title,paragraph
199,ld.1295009,Daniele Muscionico,«Die Schwarze Spinne» als Freilichtaufführung,Feuilleton,Das hätte man nicht gedacht: Jeremias Gotthelf...,2017-05-19 16:28:29.844,Chillen mit Gotthelf,"Ein Schrei, entsetzt und jäh. Der Himmel ist k..."
2048,ld.154109,Fabian Urech,US-Kongress,Digital,Die USA weichen den Datenschutz von Internet-N...,2017-03-29 06:10:00.0,Vorschriften zum Internet-Datenschutz gekippt,
7440,1.18108994,Unknown,Impressionen aus Tessin und Graubünden,Tessin,Die Kantone Graubünden und Tessin (Gastkanton ...,2017-04-11 13:59:48.182,Impressionen aus Tessin und Graubünden,
10617,ld.146967,Franziska Engelhardt,Was heute wichtig ist,Briefing,Bayrou gibt seine Stimmen Macron / Anti-Terror...,2017-02-22 20:15:00.0,Was heute wichtig ist,Bayrou gibt seine Stimmen Macron / Anti-Terror...
12586,ld.140699,Christian Berzins,Hamburger Elbphilharmonie,NZZaS,Die Elbphilharmonie in Hamburg ist bereits ein...,2017-01-15 08:55:00.0,Sie kann nicht singen,Die Elbphilharmonie in Hamburg ist bereits ein...
15027,ld.138132,Nina Fargahi,Was heute wichtig ist,Briefing,Tote nach Explosion in Izmir / Serbien will Ha...,2017-01-05 05:01:52.0,Was heute wichtig ist,Tote nach Explosion in Izmir / Serbien will Ha...
15951,ld.146001,Eduard Kaeser,Für das gepflegte Spiel der Argumente,Meinung,Von der postmodernen Theorie haben wir gelernt...,2017-02-17 07:30:00.0,Vorwärts zu den Fakten zurück,Von der postmodernen Theorie haben wir gelernt...
16637,ld.154221,Jochen Siegle,Social Media,Digital,Die populäre «Stories»-Funktion von Vorreiter ...,2017-03-29 13:22:30.981,Facebook macht auf Snapchat,
17802,ld.144423,Jochen Siegle,Virtual Reality,Digital,Ein japanisches Startup will mit einem kleinen...,2017-02-08 15:52:02.0,Der Duft der virtuellen Welt,Ein japanisches Startup will mit einem kleinen...
18907,ld.137200,Unknown,Ein letzter Blick zurück,Jahresrückblick 2016,"Die grossen Themen in der Bilanz, die besten G...",2017-01-02 04:30:00.0,Der grosse NZZ-Jahresrückblick,


## Parameter tuning

In [21]:
n_latent_factors_list = range(5, 100, 5)
metrics = []

for n_latent_factors in n_latent_factors_list:
    recommender = CFModel(n_latent_factors=n_latent_factors)
    recommender.fit(readers_train)
    cf_global_metrics, cf_detailed_results_df = model_evaluator.evaluate_model(recommender)
    metrics.append({"n_latent_factors": n_latent_factors, "metrics": cf_global_metrics})

999 users processed
999 users processed
999 users processed
999 users processed
999 users processed
999 users processed


KeyboardInterrupt: 

In [47]:
for metric in metrics:
    print(metric["n_latent_factors"], metric["metrics"])

5 {'modelName': 'CF_model', 'recall@5': 0.16837192604559326, 'recall@10': 0.23981331897325436}
10 {'modelName': 'CF_model', 'recall@5': 0.17985998922994076, 'recall@10': 0.24699335846347156}
15 {'modelName': 'CF_model', 'recall@5': 0.1981690899299946, 'recall@10': 0.2642254532399928}
20 {'modelName': 'CF_model', 'recall@5': 0.1985280919045055, 'recall@10': 0.272661999640998}
25 {'modelName': 'CF_model', 'recall@5': 0.2022976126368695, 'recall@10': 0.2785855322204272}
30 {'modelName': 'CF_model', 'recall@5': 0.20319511757314665, 'recall@10': 0.2769700233351283}
35 {'modelName': 'CF_model', 'recall@5': 0.20714413929276612, 'recall@10': 0.28109854604200324}
40 {'modelName': 'CF_model', 'recall@5': 0.2084006462035541, 'recall@10': 0.28235505295279123}
45 {'modelName': 'CF_model', 'recall@5': 0.21145216298689642, 'recall@10': 0.28917609046849757}
50 {'modelName': 'CF_model', 'recall@5': 0.20445162448393467, 'recall@10': 0.2882785855322204}
55 {'modelName': 'CF_model', 'recall@5': 0.20804164

In [211]:
def get_coverage(predicted, catalog):
    predicted_flattened = [p for sublist in predicted for p in sublist]
    unique_predictions = len(set(predicted_flattened))
    prediction_coverage = round(unique_predictions/(len(catalog)* 1.0)*100,2)
    return prediction_coverage

In [236]:
import scipy.sparse as sp
from sklearn.metrics.pairwise import cosine_similarity
def personalization(predicted):
    """
    Personalization measures recommendation similarity across users.
    A high score indicates good personalization (user's lists of recommendations are different).
    A low score indicates poor personalization (user's lists of recommendations are very similar).
    A model is "personalizing" well if the set of recommendations for each user is different.
    Parameters:
    ----------
    predicted : a list of lists
        Ordered predictions
        example: [['X', 'Y', 'Z'], ['X', 'Y', 'Z']]
    Returns:
    -------
        The personalization score for all recommendations.
    """

    def make_rec_matrix(predicted):
        df = pd.DataFrame(data=predicted).reset_index().melt(
            id_vars='index', value_name='item',
        )
        df = df[['index', 'item']].pivot(index='index', columns='item', values='item')
        df = pd.notna(df)*1
        rec_matrix = sp.csr_matrix(df.values)
        return rec_matrix

    #create matrix for recommendations
    predicted = np.array(predicted)
    rec_matrix_sparse = make_rec_matrix(predicted)

    #calculate similarity for every user's recommendation list
    similarity = cosine_similarity(X=rec_matrix_sparse, dense_output=False)

    #get indicies for upper right triangle w/o diagonal
    upper_right = np.triu_indices(similarity.shape[0], k=1)

    #calculate average similarity
    personalization = np.mean(similarity[upper_right])
    return 1-personalization

In [233]:
# Coverage evaluation
def evaluate_coverage(model, topn=5):
    all_recs = []
    for idx, person_id in enumerate(list(interactions_test_indexed_df.index.unique().values)):
            person_recs_df = model.recommend(person_id, articles_to_ignore=get_items_interacted(person_id, interactions_train_indexed_df), topn=topn)
            all_recs.append(person_recs_df["nzz_id"].values)

    coverage_result = get_coverage(all_recs, interactions_train_indexed_df["nzz_id"].unique())
    personalization_result = personalization(all_recs)
    print(f"Model: {model.get_model_name()} pokrywa: {coverage_result}% artykułów")
    print(f"Model: {model.get_model_name()} personalizacja: {personalization_result}")



In [237]:
evaluate_coverage(cf_recommender_model, topn=10)


NameError: name 'cosine_similarity' is not defined