In [1]:
import pandas as pd
import numpy as np
import random

In [2]:
readers = pd.read_csv("../data/readers.csv")
readers = readers.rename(columns={"id":"user_id", "art_id":"nzz_id"})
readers.head()

Unnamed: 0,user_id,nzz_id
0,1,ld.154103
1,1,ld.142559
2,1,1.18331199
3,1,ld.144819
4,1,ld.1293110


In [67]:
read_counts = readers["user_id"].value_counts(sort=True)
read_counts = read_counts.rename_axis("user_id").reset_index(name="read_count")

# Biorę pod uwagę tylko użytkowników, którzy przeczytali minimum 5 artykułów
min_read_count = 3
read_counts = read_counts[read_counts["read_count"] > min_read_count]

readers = readers[readers["user_id"].isin(read_counts["user_id"])]

In [5]:
# Train/Test split
from sklearn.model_selection import train_test_split

readers_train, readers_test = train_test_split(readers,
                                   stratify=readers["user_id"], 
                                   test_size=0.20,
                                   random_state=123)

print(f"Train set size {len(readers_train)}")
print(f"Test set size {len(readers_test)}")

Train set size 22284
Test set size 5571


In [10]:
#Indexing by personId to speed up the searches during evaluation
interactions_full_indexed_df = readers.set_index('user_id')
interactions_train_indexed_df = readers_train.set_index('user_id')
interactions_test_indexed_df = readers_test.set_index('user_id')

In [11]:
def get_items_interacted(person_id, interactions_df):
    # Get the user's data and merge in the movie information.
    interacted_items = interactions_df.loc[person_id]['nzz_id']
    return set(interacted_items if type(interacted_items) == pd.Series else [interacted_items])

In [8]:
#Top-N accuracy metrics consts
EVAL_RANDOM_SAMPLE_NON_INTERACTED_ITEMS = 100

class ModelEvaluator:


    def get_not_interacted_items_sample(self, person_id, sample_size, seed=42):
        interacted_items = get_items_interacted(person_id, interactions_full_indexed_df)
        all_items = set(readers['nzz_id'])
        non_interacted_items = all_items - interacted_items

        random.seed(seed)
        non_interacted_items_sample = random.sample(non_interacted_items, sample_size)
        return set(non_interacted_items_sample)

    def _verify_hit_top_n(self, item_id, recommended_items, topn):     
            try:
                index = next(i for i, c in enumerate(recommended_items) if c == item_id)
            except:
                index = -1
            hit = int(index in range(0, topn))
            return hit, index

    def evaluate_model_for_user(self, model, person_id):
        #Getting the items in test set
        interacted_values_testset = interactions_test_indexed_df.loc[person_id]
        if type(interacted_values_testset['nzz_id']) == pd.Series:
            person_interacted_items_testset = set(interacted_values_testset['nzz_id'])
        else:
            person_interacted_items_testset = set([interacted_values_testset['nzz_id']])  
        interacted_items_count_testset = len(person_interacted_items_testset) 

        #Getting a ranked recommendation list from a model for a given user
        person_recs_df = model.recommend(person_id, 
                                               articles_to_ignore=get_items_interacted(person_id,interactions_train_indexed_df),
                                               topn=10000000000)

        hits_at_5_count = 0
        hits_at_10_count = 0
        #For each item the user has interacted in test set
        for item_id in person_interacted_items_testset:
            #Getting a random sample (100) items the user has not interacted 
            #(to represent items that are assumed to be no relevant to the user)
            #seed = int.from_bytes(item_id.encode('utf-8'), 'little')
            non_interacted_items_sample = self.get_not_interacted_items_sample(person_id, 
                                                                          sample_size=EVAL_RANDOM_SAMPLE_NON_INTERACTED_ITEMS, 
                                                                          seed=12)

            #Combining the current interacted item with the 100 random items
            items_to_filter_recs = non_interacted_items_sample.union(set([item_id]))

            #Filtering only recommendations that are either the interacted item or from a random sample of 100 non-interacted items
            valid_recs_df = person_recs_df[person_recs_df['nzz_id'].isin(items_to_filter_recs)]                    
            valid_recs = valid_recs_df['nzz_id'].values
            #Verifying if the current interacted item is among the Top-N recommended items
            hit_at_5, index_at_5 = self._verify_hit_top_n(item_id, valid_recs, 5)
            hits_at_5_count += hit_at_5
            hit_at_10, index_at_10 = self._verify_hit_top_n(item_id, valid_recs, 10)
            hits_at_10_count += hit_at_10

        #Recall is the rate of the interacted items that are ranked among the Top-N recommended items, 
        #when mixed with a set of non-relevant items
        recall_at_5 = hits_at_5_count / float(interacted_items_count_testset)
        recall_at_10 = hits_at_10_count / float(interacted_items_count_testset)

        person_metrics = {'hits@5_count':hits_at_5_count, 
                          'hits@10_count':hits_at_10_count, 
                          'interacted_count': interacted_items_count_testset,
                          'recall@5': recall_at_5,
                          'recall@10': recall_at_10}
        return person_metrics

    def evaluate_model(self, model):
        #print('Running evaluation for users')
        people_metrics = []
        for idx, person_id in enumerate(list(interactions_test_indexed_df.index.unique().values)):
            #if idx % 100 == 0 and idx > 0:
            #    print('%d users processed' % idx)
            person_metrics = self.evaluate_model_for_user(model, person_id)  
            person_metrics['_person_id'] = person_id
            people_metrics.append(person_metrics)
        print('%d users processed' % idx)

        detailed_results_df = pd.DataFrame(people_metrics) \
                            .sort_values('interacted_count', ascending=False)
        
        global_recall_at_5 = detailed_results_df['hits@5_count'].sum() / float(detailed_results_df['interacted_count'].sum())
        global_recall_at_10 = detailed_results_df['hits@10_count'].sum() / float(detailed_results_df['interacted_count'].sum())
        
        global_metrics = {'modelName': model.get_model_name(),
                          'recall@5': global_recall_at_5,
                          'recall@10': global_recall_at_10}    
        return global_metrics, detailed_results_df
    
model_evaluator = ModelEvaluator() 

In [3]:
import sys
sys.path.append('../code')
from cf_model import CFModel


In [6]:

cf_recommender_model = CFModel(n_latent_factors=100)
cf_recommender_model.fit(readers_train)

In [12]:
from datetime import datetime
print('Evaluating Collaborative Filtering (SVD Matrix Factorization) model...')
start = datetime.now()
cf_global_metrics, cf_detailed_results_df = model_evaluator.evaluate_model(cf_recommender_model)
end = datetime.now()
print(f"eval time {end - start}")
print('\nGlobal metrics:\n%s' % cf_global_metrics)
cf_detailed_results_df.head(10)

Evaluating Collaborative Filtering (SVD Matrix Factorization) model...
999 users processed
eval time 0:00:26.046176

Global metrics:
{'modelName': 'CF_model', 'recall@5': 0.2155806856937713, 'recall@10': 0.290612098366541}


Unnamed: 0,hits@5_count,hits@10_count,interacted_count,recall@5,recall@10,_person_id
358,2,3,10,0.2,0.3,907
721,2,2,10,0.2,0.2,938
510,0,0,10,0.0,0.0,64
86,1,1,10,0.1,0.1,887
484,0,0,10,0.0,0.0,397
472,2,3,10,0.2,0.3,218
259,1,1,10,0.1,0.1,273
264,3,5,10,0.3,0.5,803
797,1,1,10,0.1,0.1,865
276,0,1,10,0.0,0.1,443


In [13]:
cf_detailed_results_df.sort_values(by=["recall@5"], ascending=False).head(20)

Unnamed: 0,hits@5_count,hits@10_count,interacted_count,recall@5,recall@10,_person_id
937,2,2,2,1.0,1.0,575
518,1,1,1,1.0,1.0,881
741,2,2,2,1.0,1.0,539
347,1,1,1,1.0,1.0,854
771,2,2,2,1.0,1.0,224
998,1,1,1,1.0,1.0,167
976,1,1,1,1.0,1.0,833
968,1,1,1,1.0,1.0,639
995,1,1,1,1.0,1.0,751
983,1,1,1,1.0,1.0,369


In [22]:
recommended_items = cf_recommender_model.recommend_items(17)
recommended_items = recommended_items.head(3)

In [23]:
articles = pd.read_csv("../data/articles_cleaned.csv")
articles = articles[articles["nzz_id"].isin(recommended_items["nzz_id"])]
articles

Unnamed: 0,nzz_id,author,catchline,department,lead_text,pub_date,title,paragraph
7440,1.18108994,Unknown,Impressionen aus Tessin und Graubünden,Tessin,Die Kantone Graubünden und Tessin (Gastkanton ...,2017-04-11 13:59:48.182,Impressionen aus Tessin und Graubünden,
10617,ld.146967,Franziska Engelhardt,Was heute wichtig ist,Briefing,Bayrou gibt seine Stimmen Macron / Anti-Terror...,2017-02-22 20:15:00.0,Was heute wichtig ist,Bayrou gibt seine Stimmen Macron / Anti-Terror...
18907,ld.137200,Unknown,Ein letzter Blick zurück,Jahresrückblick 2016,"Die grossen Themen in der Bilanz, die besten G...",2017-01-02 04:30:00.0,Der grosse NZZ-Jahresrückblick,


## Parameter tuning

In [24]:
n_latent_factors = range(5, 1000, 10)
metrics = []

for n_latent_factor in n_latent_factors:
    cf_global_metrics, cf_detailed_results_df = model_evaluator.evaluate_model(cf_recommender_model)
    metrics.append(cf_global_metrics)

999 users processed
999 users processed
999 users processed
999 users processed
999 users processed
999 users processed
999 users processed
999 users processed
999 users processed
999 users processed
999 users processed
999 users processed
999 users processed
999 users processed
999 users processed
999 users processed
999 users processed
999 users processed
999 users processed
999 users processed
999 users processed
999 users processed
999 users processed
999 users processed
999 users processed
999 users processed
999 users processed
999 users processed
999 users processed
999 users processed
999 users processed
999 users processed
999 users processed
999 users processed
999 users processed
999 users processed
999 users processed
999 users processed
999 users processed
999 users processed
999 users processed
999 users processed
999 users processed
999 users processed
999 users processed
999 users processed
999 users processed
999 users processed
999 users processed
999 users processed


KeyboardInterrupt: 