In [1]:
import pandas as pd
import numpy as np
import random


In [2]:
readers = pd.read_csv("../data/readers.csv")
readers = readers.rename(columns={"id":"user_id", "art_id":"nzz_id"})
readers.head()

Unnamed: 0,user_id,nzz_id
0,1,ld.154103
1,1,ld.142559
2,1,1.18331199
3,1,ld.144819
4,1,ld.1293110


In [3]:
read_counts = readers["user_id"].value_counts(sort=True)
read_counts = read_counts.rename_axis("user_id").reset_index(name="read_count")

# Biorę pod uwagę tylko użytkowników, którzy przeczytali minimum 5 artykułów
min_read_count = 3
read_counts = read_counts[read_counts["read_count"] > min_read_count]

readers = readers[readers["user_id"].isin(read_counts["user_id"])]

In [4]:
# Train/Test split
from sklearn.model_selection import train_test_split

readers_train, readers_test = train_test_split(readers,
                                   stratify=readers["user_id"], 
                                   test_size=0.20,
                                   random_state=123)

print(f"Train set size {len(readers_train)}")
print(f"Test set size {len(readers_test)}")

Train set size 22284
Test set size 5571


In [5]:
reader_article_matrix_df = pd.crosstab(readers_train["user_id"], readers_train["nzz_id"]).fillna(0)
reader_article_matrix_df.head(10)

nzz_id,1.10001800,1.10142798,1.10474932,1.10676585,1.10985668,1.11025867,1.11137056,1.11221430,1.11316388,1.11563462,...,ld.887983,ld.904757,ld.907977,ld.911838,ld.944195,ld.953156,ld.959496,ld.987818,ld.9951,ld.995338
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
unpivot_test = reader_article_matrix_df.stack().to_frame()
unpivot_test.reset_index()

Unnamed: 0,user_id,nzz_id,0
0,1,1.10001800,0
1,1,1.10142798,0
2,1,1.10474932,0
3,1,1.10676585,0
4,1,1.10985668,0
...,...,...,...
10056995,1000,ld.953156,0
10056996,1000,ld.959496,0
10056997,1000,ld.987818,0
10056998,1000,ld.9951,0


In [7]:
unpivot_test.to_csv("../data/readers_all.csv", encoding="utf-8", index=False)

In [8]:
reader_article_matrix = reader_article_matrix_df.to_numpy()
reader_article_matrix[:10]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [9]:
#Indexing by personId to speed up the searches during evaluation
interactions_full_indexed_df = readers.set_index('user_id')
interactions_train_indexed_df = readers_train.set_index('user_id')
interactions_test_indexed_df = readers_test.set_index('user_id')

In [10]:
def get_items_interacted(person_id, interactions_df):
    # Get the user's data and merge in the movie information.
    interacted_items = interactions_df.loc[person_id]['nzz_id']
    return set(interacted_items if type(interacted_items) == pd.Series else [interacted_items])

In [43]:
#Top-N accuracy metrics consts
EVAL_RANDOM_SAMPLE_NON_INTERACTED_ITEMS = 100

class ModelEvaluator:


    def get_not_interacted_items_sample(self, person_id, sample_size, seed=42):
        interacted_items = get_items_interacted(person_id, interactions_full_indexed_df)
        all_items = set(readers['nzz_id'])
        non_interacted_items = all_items - interacted_items

        random.seed(seed)
        non_interacted_items_sample = random.sample(non_interacted_items, sample_size)
        return set(non_interacted_items_sample)

    def _verify_hit_top_n(self, item_id, recommended_items, topn):     
            try:
                index = next(i for i, c in enumerate(recommended_items) if c == item_id)
            except:
                index = -1
            hit = int(index in range(0, topn))
            return hit, index

    def evaluate_model_for_user(self, model, person_id):
        #Getting the items in test set
        interacted_values_testset = interactions_test_indexed_df.loc[person_id]
        if type(interacted_values_testset['nzz_id']) == pd.Series:
            person_interacted_items_testset = set(interacted_values_testset['nzz_id'])
        else:
            person_interacted_items_testset = set([interacted_values_testset['nzz_id']])  
        interacted_items_count_testset = len(person_interacted_items_testset) 

        #Getting a ranked recommendation list from a model for a given user
        person_recs_df = model.recommend(person_id, 
                                               articles_to_ignore=get_items_interacted(person_id, interactions_train_indexed_df),
                                               topn=1000000)

        hits_at_5_count = 0
        hits_at_10_count = 0
        #For each item the user has interacted in test set
        for item_id in person_interacted_items_testset:
            #Getting a random sample (100) items the user has not interacted 
            #(to represent items that are assumed to be no relevant to the user)
            #seed = int.from_bytes(item_id.encode('utf-8'), 'little')
            non_interacted_items_sample = self.get_not_interacted_items_sample(person_id, 
                                                                          sample_size=EVAL_RANDOM_SAMPLE_NON_INTERACTED_ITEMS, 
                                                                          seed=12)

            #Combining the current interacted item with the 100 random items
            items_to_filter_recs = non_interacted_items_sample.union(set([item_id]))

            #Filtering only recommendations that are either the interacted item or from a random sample of 100 non-interacted items
            valid_recs_df = person_recs_df[person_recs_df['nzz_id'].isin(items_to_filter_recs)]                    
            valid_recs = valid_recs_df['nzz_id'].values
            #Verifying if the current interacted item is among the Top-N recommended items
            hit_at_5, index_at_5 = self._verify_hit_top_n(item_id, valid_recs, 5)
            hits_at_5_count += hit_at_5
            hit_at_10, index_at_10 = self._verify_hit_top_n(item_id, valid_recs, 10)
            hits_at_10_count += hit_at_10

        #Recall is the rate of the interacted items that are ranked among the Top-N recommended items, 
        #when mixed with a set of non-relevant items
        recall_at_5 = hits_at_5_count / float(interacted_items_count_testset)
        recall_at_10 = hits_at_10_count / float(interacted_items_count_testset)

        person_metrics = {'hits@5_count':hits_at_5_count, 
                          'hits@10_count':hits_at_10_count, 
                          'interacted_count': interacted_items_count_testset,
                          'recall@5': recall_at_5,
                          'recall@10': recall_at_10}
        return person_metrics

    def evaluate_model(self, model):
        #print('Running evaluation for users')
        people_metrics = []
        for idx, person_id in enumerate(list(interactions_test_indexed_df.index.unique().values)):
            #if idx % 100 == 0 and idx > 0:
            #    print('%d users processed' % idx)
            person_metrics = self.evaluate_model_for_user(model, person_id)  
            person_metrics['_person_id'] = person_id
            people_metrics.append(person_metrics)
        print('%d users processed' % idx)

        detailed_results_df = pd.DataFrame(people_metrics) \
                            .sort_values('interacted_count', ascending=False)
        
        global_recall_at_5 = detailed_results_df['hits@5_count'].sum() / float(detailed_results_df['interacted_count'].sum())
        global_recall_at_10 = detailed_results_df['hits@10_count'].sum() / float(detailed_results_df['interacted_count'].sum())
        
        global_metrics = {'modelName': model.get_model_name(),
                          'recall@5': global_recall_at_5,
                          'recall@10': global_recall_at_10}    
        return global_metrics, detailed_results_df
    
model_evaluator = ModelEvaluator() 

In [12]:
import sys
sys.path.append('../code')
from cf_model import CFModel


In [13]:
from random_model import RandomModel

In [14]:

cf_recommender_model = CFModel(n_latent_factors=100)
cf_recommender_model.fit(readers_train)

In [15]:

from datetime import datetime
print('Evaluating Collaborative Filtering (SVD Matrix Factorization) model...')
start = datetime.now()
cf_global_metrics, cf_detailed_results_df = model_evaluator.evaluate_model(cf_recommender_model)
end = datetime.now()
print(f"eval time {end - start}")
print('\nGlobal metrics:\n%s' % cf_global_metrics)
cf_detailed_results_df.head(10)

Evaluating Collaborative Filtering (SVD Matrix Factorization) model...
999 users processed
eval time 0:00:32.437028

Global metrics:
{'modelName': 'CF_model', 'recall@5': 0.21468318075749418, 'recall@10': 0.2942021181116496}


Unnamed: 0,hits@5_count,hits@10_count,interacted_count,recall@5,recall@10,_person_id
358,1,3,10,0.1,0.3,907
721,2,2,10,0.2,0.2,938
510,0,0,10,0.0,0.0,64
86,1,1,10,0.1,0.1,887
484,0,0,10,0.0,0.0,397
472,2,2,10,0.2,0.2,218
259,1,1,10,0.1,0.1,273
264,3,4,10,0.3,0.4,803
797,1,1,10,0.1,0.1,865
276,0,0,10,0.0,0.0,443


In [34]:
articles = pd.read_csv("../data/articles_cleaned.csv")
random_model = RandomModel()
random_model.fit(articles)

from datetime import datetime
print('Evaluating Collaborative Filtering (SVD Matrix Factorization) model...')
start = datetime.now()
cf_global_metrics, cf_detailed_results_df = model_evaluator.evaluate_model(random_model)
end = datetime.now()
print(f"eval time {end - start}")
print('\nGlobal metrics:\n%s' % cf_global_metrics)
cf_detailed_results_df.head(10)

Evaluating Collaborative Filtering (SVD Matrix Factorization) model...
999 users processed
eval time 0:00:28.491800

Global metrics:
{'modelName': 'random_model', 'recall@5': 0.004667025668641177, 'recall@10': 0.004667025668641177}


Unnamed: 0,hits@5_count,hits@10_count,interacted_count,recall@5,recall@10,_person_id
358,0,0,10,0.0,0.0,907
721,0,0,10,0.0,0.0,938
510,0,0,10,0.0,0.0,64
86,0,0,10,0.0,0.0,887
484,1,1,10,0.1,0.1,397
472,0,0,10,0.0,0.0,218
259,0,0,10,0.0,0.0,273
264,0,0,10,0.0,0.0,803
797,0,0,10,0.0,0.0,865
276,0,0,10,0.0,0.0,443


In [16]:
cf_recommender_model.recommend(221, topn=5).sort_values(by=["recommendation_strength"], ascending=False)

Unnamed: 0,nzz_id,recommendation_strength
0,ld.138721,0.329429
1,ld.152339,0.194284
2,ld.153813,0.192678
3,ld.150619,0.19075
4,ld.149510,0.190037


In [17]:
cf_detailed_results_df.sort_values(by=["recall@5"], ascending=False).head(20)

Unnamed: 0,hits@5_count,hits@10_count,interacted_count,recall@5,recall@10,_person_id
999,1,1,1,1.0,1.0,891
995,1,1,1,1.0,1.0,751
949,4,4,4,1.0,1.0,378
34,4,4,4,1.0,1.0,212
414,2,2,2,1.0,1.0,666
885,2,2,2,1.0,1.0,569
771,2,2,2,1.0,1.0,224
64,2,2,2,1.0,1.0,672
741,2,2,2,1.0,1.0,539
946,3,3,3,1.0,1.0,608


In [30]:
recommended_items = cf_recommender_model.recommend(17)

recommended_items.head(3)

Unnamed: 0,nzz_id,recommendation_strength
0,1.18108994,0.710432
1,ld.137200,0.447095
2,ld.146967,0.207135


In [19]:
articles = pd.read_csv("../data/articles_cleaned.csv")
articles = articles[articles["nzz_id"].isin(recommended_items["nzz_id"])]
articles

Unnamed: 0,nzz_id,author,catchline,department,lead_text,pub_date,title,paragraph
199,ld.1295009,Daniele Muscionico,«Die Schwarze Spinne» als Freilichtaufführung,Feuilleton,Das hätte man nicht gedacht: Jeremias Gotthelf...,2017-05-19 16:28:29.844,Chillen mit Gotthelf,"Ein Schrei, entsetzt und jäh. Der Himmel ist k..."
2048,ld.154109,Fabian Urech,US-Kongress,Digital,Die USA weichen den Datenschutz von Internet-N...,2017-03-29 06:10:00.0,Vorschriften zum Internet-Datenschutz gekippt,
7440,1.18108994,Unknown,Impressionen aus Tessin und Graubünden,Tessin,Die Kantone Graubünden und Tessin (Gastkanton ...,2017-04-11 13:59:48.182,Impressionen aus Tessin und Graubünden,
10617,ld.146967,Franziska Engelhardt,Was heute wichtig ist,Briefing,Bayrou gibt seine Stimmen Macron / Anti-Terror...,2017-02-22 20:15:00.0,Was heute wichtig ist,Bayrou gibt seine Stimmen Macron / Anti-Terror...
12586,ld.140699,Christian Berzins,Hamburger Elbphilharmonie,NZZaS,Die Elbphilharmonie in Hamburg ist bereits ein...,2017-01-15 08:55:00.0,Sie kann nicht singen,Die Elbphilharmonie in Hamburg ist bereits ein...
15027,ld.138132,Nina Fargahi,Was heute wichtig ist,Briefing,Tote nach Explosion in Izmir / Serbien will Ha...,2017-01-05 05:01:52.0,Was heute wichtig ist,Tote nach Explosion in Izmir / Serbien will Ha...
15951,ld.146001,Eduard Kaeser,Für das gepflegte Spiel der Argumente,Meinung,Von der postmodernen Theorie haben wir gelernt...,2017-02-17 07:30:00.0,Vorwärts zu den Fakten zurück,Von der postmodernen Theorie haben wir gelernt...
16637,ld.154221,Jochen Siegle,Social Media,Digital,Die populäre «Stories»-Funktion von Vorreiter ...,2017-03-29 13:22:30.981,Facebook macht auf Snapchat,
17802,ld.144423,Jochen Siegle,Virtual Reality,Digital,Ein japanisches Startup will mit einem kleinen...,2017-02-08 15:52:02.0,Der Duft der virtuellen Welt,Ein japanisches Startup will mit einem kleinen...
18907,ld.137200,Unknown,Ein letzter Blick zurück,Jahresrückblick 2016,"Die grossen Themen in der Bilanz, die besten G...",2017-01-02 04:30:00.0,Der grosse NZZ-Jahresrückblick,


## Parameter tuning

In [46]:
n_latent_factors_list = range(5, 100, 5)
metrics = []

for n_latent_factors in n_latent_factors_list:
    recommender = CFModel(n_latent_factors=n_latent_factors)
    recommender.fit(readers_train)
    cf_global_metrics, cf_detailed_results_df = model_evaluator.evaluate_model(recommender)
    metrics.append({"n_latent_factors": n_latent_factors, "metrics": cf_global_metrics})

999 users processed
999 users processed
999 users processed
999 users processed
999 users processed
999 users processed
999 users processed
999 users processed
999 users processed
999 users processed
999 users processed
999 users processed
999 users processed
999 users processed
999 users processed
999 users processed
999 users processed
999 users processed
999 users processed


In [47]:
for metric in metrics:
    print(metric["n_latent_factors"], metric["metrics"])

5 {'modelName': 'CF_model', 'recall@5': 0.16837192604559326, 'recall@10': 0.23981331897325436}
10 {'modelName': 'CF_model', 'recall@5': 0.17985998922994076, 'recall@10': 0.24699335846347156}
15 {'modelName': 'CF_model', 'recall@5': 0.1981690899299946, 'recall@10': 0.2642254532399928}
20 {'modelName': 'CF_model', 'recall@5': 0.1985280919045055, 'recall@10': 0.272661999640998}
25 {'modelName': 'CF_model', 'recall@5': 0.2022976126368695, 'recall@10': 0.2785855322204272}
30 {'modelName': 'CF_model', 'recall@5': 0.20319511757314665, 'recall@10': 0.2769700233351283}
35 {'modelName': 'CF_model', 'recall@5': 0.20714413929276612, 'recall@10': 0.28109854604200324}
40 {'modelName': 'CF_model', 'recall@5': 0.2084006462035541, 'recall@10': 0.28235505295279123}
45 {'modelName': 'CF_model', 'recall@5': 0.21145216298689642, 'recall@10': 0.28917609046849757}
50 {'modelName': 'CF_model', 'recall@5': 0.20445162448393467, 'recall@10': 0.2882785855322204}
55 {'modelName': 'CF_model', 'recall@5': 0.20804164

In [211]:
def get_coverage(predicted, catalog):
    predicted_flattened = [p for sublist in predicted for p in sublist]
    unique_predictions = len(set(predicted_flattened))
    prediction_coverage = round(unique_predictions/(len(catalog)* 1.0)*100,2)
    return prediction_coverage

In [236]:
import scipy.sparse as sp
from sklearn.metrics.pairwise import cosine_similarity
def personalization(predicted):
    """
    Personalization measures recommendation similarity across users.
    A high score indicates good personalization (user's lists of recommendations are different).
    A low score indicates poor personalization (user's lists of recommendations are very similar).
    A model is "personalizing" well if the set of recommendations for each user is different.
    Parameters:
    ----------
    predicted : a list of lists
        Ordered predictions
        example: [['X', 'Y', 'Z'], ['X', 'Y', 'Z']]
    Returns:
    -------
        The personalization score for all recommendations.
    """

    def make_rec_matrix(predicted):
        df = pd.DataFrame(data=predicted).reset_index().melt(
            id_vars='index', value_name='item',
        )
        df = df[['index', 'item']].pivot(index='index', columns='item', values='item')
        df = pd.notna(df)*1
        rec_matrix = sp.csr_matrix(df.values)
        return rec_matrix

    #create matrix for recommendations
    predicted = np.array(predicted)
    rec_matrix_sparse = make_rec_matrix(predicted)

    #calculate similarity for every user's recommendation list
    similarity = cosine_similarity(X=rec_matrix_sparse, dense_output=False)

    #get indicies for upper right triangle w/o diagonal
    upper_right = np.triu_indices(similarity.shape[0], k=1)

    #calculate average similarity
    personalization = np.mean(similarity[upper_right])
    return 1-personalization

In [233]:
# Coverage evaluation
def evaluate_coverage(model, topn=5):
    all_recs = []
    for idx, person_id in enumerate(list(interactions_test_indexed_df.index.unique().values)):
            person_recs_df = model.recommend(person_id, articles_to_ignore=get_items_interacted(person_id, interactions_train_indexed_df), topn=topn)
            all_recs.append(person_recs_df["nzz_id"].values)

    coverage_result = get_coverage(all_recs, interactions_train_indexed_df["nzz_id"].unique())
    personalization_result = personalization(all_recs)
    print(f"Model: {model.get_model_name()} pokrywa: {coverage_result}% artykułów")
    print(f"Model: {model.get_model_name()} personalizacja: {coverage_result}")



In [237]:
evaluate_coverage(cf_recommender_model, topn=10)


NameError: name 'cosine_similarity' is not defined