In [89]:
import pandas as pd
import numpy as np
import random

In [90]:
readers = pd.read_csv("../data/readers.csv")
readers = readers.rename(columns={"id":"user_id"})
readers.head()

Unnamed: 0,user_id,art_id
0,1,ld.154103
1,1,ld.142559
2,1,1.18331199
3,1,ld.144819
4,1,ld.1293110


In [91]:
read_counts = readers["user_id"].value_counts(sort=True)
read_counts = read_counts.rename_axis("user_id").reset_index(name="read_count")

# Biorę pod uwagę tylko użytkowników, którzy przeczytali minimum 5 artykułów
min_read_count = 10
read_counts = read_counts[read_counts["read_count"] > min_read_count]

readers = readers[readers["user_id"].isin(read_counts["user_id"])]

In [92]:
# Train/Test split
from sklearn.model_selection import train_test_split

readers_train, readers_test = train_test_split(readers,
                                   stratify=readers["user_id"], 
                                   test_size=0.20,
                                   random_state=123)

print(f"Train set size {len(readers_train)}")
print(f"Test set size {len(readers_test)}")

Train set size 21507
Test set size 5377


In [93]:
reader_article_matrix_df = pd.crosstab(readers_train["user_id"], readers_train["art_id"]).fillna(0)
reader_article_matrix_df.head(10)

art_id,1.10001800,1.10142798,1.10474932,1.10858409,1.10985668,1.11025867,1.11137056,1.11221430,1.11316388,1.11563462,...,ld.855114,ld.887983,ld.904757,ld.907977,ld.911838,ld.944195,ld.953156,ld.987818,ld.9951,ld.995338
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [94]:
reader_article_matrix = reader_article_matrix_df.to_numpy()
reader_article_matrix[:10]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [95]:
reader_ids = list(reader_article_matrix_df.index)
reader_ids[:10]

[3, 4, 5, 6, 7, 8, 9, 10, 12, 13]

In [96]:
from scipy.sparse import csr_matrix

reader_article_csr_matrix = csr_matrix(reader_article_matrix)

# Type cast do float bo inczej metoda nie obsługuje
reader_article_csr_matrix = reader_article_csr_matrix.asfptype()
reader_article_csr_matrix

<871x9857 sparse matrix of type '<class 'numpy.float64'>'
	with 21507 stored elements in Compressed Sparse Row format>

In [97]:
from scipy.sparse.linalg import svds

# The number of factors to factor the user-item matrix.
NUMBER_OF_FACTORS_MF = 15
#Performs matrix factorization of the original user item matrix
#U, sigma, Vt = svds(users_items_pivot_matrix, k = NUMBER_OF_FACTORS_MF)
U, sigma, Vt = svds(reader_article_csr_matrix, k = NUMBER_OF_FACTORS_MF)

In [98]:
sigma = np.diag(sigma)

print(f"{U.shape}")
print(f"{Vt.shape}")
print(f"{sigma.shape}")

(871, 15)
(15, 9857)
(15, 15)


In [99]:
reader_predictions = np.dot(np.dot(U, sigma), Vt) 
reader_predictions

array([[ 9.05510630e-05,  6.94367426e-04, -5.60644823e-04, ...,
        -2.41197935e-03, -5.60961662e-03,  3.76247665e-04],
       [ 2.17622407e-03, -1.97701753e-03, -1.91399387e-03, ...,
        -3.29539631e-03, -1.11596766e-02,  9.37267847e-04],
       [-1.42382915e-03,  1.40906157e-04, -2.33309465e-04, ...,
        -2.11714603e-04,  1.16920981e-02,  7.96918011e-05],
       ...,
       [-3.72358537e-03, -2.72681695e-03, -2.49241667e-03, ...,
         1.63179078e-03, -1.31633809e-02, -3.38607318e-05],
       [ 3.95467047e-03, -1.57992507e-03,  1.42194758e-05, ...,
         1.09610559e-03,  2.71770116e-03,  7.09056443e-04],
       [ 1.17838175e-02,  2.54173834e-02, -8.55325967e-04, ...,
         7.22323618e-03, -1.02951214e-02,  6.84647007e-04]])

In [100]:
reader_predictions_norm = (reader_predictions - reader_predictions.min()) / (reader_predictions.max() - reader_predictions.min())

In [101]:
#Converting the reconstructed matrix back to a Pandas dataframe
cf_preds_df = pd.DataFrame(reader_predictions_norm, columns = reader_article_matrix_df.columns, index=reader_ids).transpose()
cf_preds_df.head(10)

Unnamed: 0_level_0,3,4,5,6,7,8,9,10,12,13,...,990,991,992,993,994,996,997,998,999,1000
art_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.100018,0.236763,0.238054,0.235825,0.237426,0.236774,0.234032,0.23625,0.237596,0.235172,0.238656,...,0.238574,0.236502,0.235653,0.23997,0.235735,0.235331,0.236558,0.234401,0.239155,0.244003
1.10142798,0.237137,0.235483,0.236794,0.236771,0.236845,0.233594,0.236883,0.235526,0.234151,0.238967,...,0.240413,0.233718,0.236072,0.236048,0.236108,0.236865,0.235164,0.235018,0.235729,0.252444
1.10474932,0.23636,0.235522,0.236562,0.236464,0.236613,0.235608,0.236419,0.23617,0.234812,0.236274,...,0.235916,0.236438,0.236328,0.236435,0.236537,0.23651,0.242101,0.235164,0.236716,0.236177
1.10858409,0.239359,0.230131,0.235719,0.235763,0.237012,0.238736,0.236967,0.229987,0.234127,0.237462,...,0.236393,0.238894,0.236776,0.236496,0.238405,0.24085,0.235329,0.226863,0.235154,0.234554
1.10985668,0.236339,0.238134,0.235851,0.236884,0.236743,0.232579,0.2361,0.236299,0.237118,0.236732,...,0.236247,0.235576,0.236061,0.236075,0.236805,0.235724,0.242713,0.235038,0.236906,0.236328
1.11025867,0.239148,0.233046,0.237255,0.23598,0.236792,0.247689,0.23767,0.232994,0.238199,0.237322,...,0.240765,0.237827,0.236083,0.236206,0.236748,0.240096,0.235774,0.237553,0.239096,0.238862
1.11137056,0.237042,0.240535,0.237124,0.23698,0.2368,0.237617,0.236954,0.238924,0.236802,0.236994,...,0.238086,0.237279,0.23671,0.236591,0.237051,0.237256,0.23954,0.237347,0.236513,0.238808
1.1122143,0.23619,0.233386,0.235297,0.237324,0.236734,0.235895,0.236585,0.234849,0.249842,0.23782,...,0.236154,0.241633,0.236529,0.235574,0.235098,0.23646,0.235514,0.233594,0.236857,0.242602
1.11316388,0.236909,0.23372,0.236932,0.2363,0.236371,0.241752,0.236936,0.235697,0.23467,0.235961,...,0.237491,0.245387,0.236039,0.236291,0.236169,0.237488,0.237188,0.236279,0.238741,0.235173
1.11563462,0.236387,0.243424,0.236421,0.236656,0.236836,0.239803,0.236865,0.235665,0.243724,0.236796,...,0.235935,0.235927,0.236805,0.240392,0.239559,0.236715,0.234892,0.237299,0.24303,0.236653


In [102]:
#Indexing by personId to speed up the searches during evaluation
interactions_full_indexed_df = readers.set_index('user_id')
interactions_train_indexed_df = readers_train.set_index('user_id')
interactions_test_indexed_df = readers_test.set_index('user_id')

In [103]:
def get_items_interacted(person_id, interactions_df):
    # Get the user's data and merge in the movie information.
    interacted_items = interactions_df.loc[person_id]['art_id']
    return set(interacted_items if type(interacted_items) == pd.Series else [interacted_items])

In [150]:
#Top-N accuracy metrics consts
EVAL_RANDOM_SAMPLE_NON_INTERACTED_ITEMS = 100

class ModelEvaluator:


    def get_not_interacted_items_sample(self, person_id, sample_size, seed=42):
        interacted_items = get_items_interacted(person_id, interactions_full_indexed_df)
        all_items = set(readers['art_id'])
        non_interacted_items = all_items - interacted_items

        random.seed(seed)
        non_interacted_items_sample = random.sample(non_interacted_items, sample_size)
        return set(non_interacted_items_sample)

    def _verify_hit_top_n(self, item_id, recommended_items, topn):        
            try:
                index = next(i for i, c in enumerate(recommended_items) if c == item_id)
            except:
                index = -1
            hit = int(index in range(0, topn))
            return hit, index

    def evaluate_model_for_user(self, model, person_id):
        #Getting the items in test set
        interacted_values_testset = interactions_test_indexed_df.loc[person_id]
        if type(interacted_values_testset['art_id']) == pd.Series:
            person_interacted_items_testset = set(interacted_values_testset['art_id'])
        else:
            person_interacted_items_testset = set([interacted_values_testset['art_id']])  
        interacted_items_count_testset = len(person_interacted_items_testset) 

        #Getting a ranked recommendation list from a model for a given user
        person_recs_df = model.recommend_items(person_id, 
                                               items_to_ignore=get_items_interacted(person_id,interactions_train_indexed_df),
                                               topn=10000000000)

        hits_at_5_count = 0
        hits_at_10_count = 0
        #For each item the user has interacted in test set
        for item_id in person_interacted_items_testset:
            #Getting a random sample (100) items the user has not interacted 
            #(to represent items that are assumed to be no relevant to the user)
            seed = int.from_bytes(item_id.encode('utf-8'), 'little')
            non_interacted_items_sample = self.get_not_interacted_items_sample(person_id, 
                                                                          sample_size=EVAL_RANDOM_SAMPLE_NON_INTERACTED_ITEMS, 
                                                                          seed=seed)

            #Combining the current interacted item with the 100 random items
            items_to_filter_recs = non_interacted_items_sample.union(set([item_id]))

            #Filtering only recommendations that are either the interacted item or from a random sample of 100 non-interacted items
            valid_recs_df = person_recs_df[person_recs_df['art_id'].isin(items_to_filter_recs)]                    
            valid_recs = valid_recs_df['art_id'].values
            #Verifying if the current interacted item is among the Top-N recommended items
            hit_at_5, index_at_5 = self._verify_hit_top_n(item_id, valid_recs, 5)
            hits_at_5_count += hit_at_5
            hit_at_10, index_at_10 = self._verify_hit_top_n(item_id, valid_recs, 10)
            hits_at_10_count += hit_at_10

        #Recall is the rate of the interacted items that are ranked among the Top-N recommended items, 
        #when mixed with a set of non-relevant items
        recall_at_5 = hits_at_5_count / float(interacted_items_count_testset)
        recall_at_10 = hits_at_10_count / float(interacted_items_count_testset)

        person_metrics = {'hits@5_count':hits_at_5_count, 
                          'hits@10_count':hits_at_10_count, 
                          'interacted_count': interacted_items_count_testset,
                          'recall@5': recall_at_5,
                          'recall@10': recall_at_10}
        return person_metrics

    def evaluate_model(self, model):
        #print('Running evaluation for users')
        people_metrics = []
        for idx, person_id in enumerate(list(interactions_test_indexed_df.index.unique().values)):
            #if idx % 100 == 0 and idx > 0:
            #    print('%d users processed' % idx)
            person_metrics = self.evaluate_model_for_user(model, person_id)  
            person_metrics['_person_id'] = person_id
            people_metrics.append(person_metrics)
        print('%d users processed' % idx)

        detailed_results_df = pd.DataFrame(people_metrics) \
                            .sort_values('interacted_count', ascending=False)
        
        global_recall_at_5 = detailed_results_df['hits@5_count'].sum() / float(detailed_results_df['interacted_count'].sum())
        global_recall_at_10 = detailed_results_df['hits@10_count'].sum() / float(detailed_results_df['interacted_count'].sum())
        
        global_metrics = {'modelName': model.get_model_name(),
                          'recall@5': global_recall_at_5,
                          'recall@10': global_recall_at_10}    
        return global_metrics, detailed_results_df
    
model_evaluator = ModelEvaluator() 

In [140]:
class CFRecommender:
    
    MODEL_NAME = 'CF_model'
    
    def __init__(self, cf_predictions_df, items_df=None):
        self.cf_predictions_df = cf_predictions_df
        self.items_df = items_df
        
    def get_model_name(self):
        return self.MODEL_NAME

    def persist(self, path):
        with open(f"{path}/{self.MODEL_NAME}.pickle", mode="wb+") as model_file:
            pickle.dump(self, model_file)

    @classmethod
    def from_file(cls, path):
        try:
            with open(path, mode="rb") as model_file:
                model = pickle.load(model_file)

            cf_predictions_df = model.cf_predictions_df
            items_df = model.items_df

            return cls(model.cf_predictions_df, model.items_df)
        except:
            print("Couldn't load model")


    def recommend_items(self, user_id, items_to_ignore=[], topn=10, verbose=False):
        # Get and sort the user's predictions
        sorted_user_predictions = self.cf_predictions_df[user_id].sort_values(ascending=False) \
                                    .reset_index().rename(columns={user_id: 'recStrength'})

        # Recommend the highest predicted rating movies that the user hasn't seen yet.
        recommendations_df = sorted_user_predictions[~sorted_user_predictions['art_id'].isin(items_to_ignore)] \
                               .sort_values('recStrength', ascending = False) \
                               .head(topn)

        if verbose:
            if self.items_df is None:
                raise Exception('"items_df" is required in verbose mode')

            recommendations_df = recommendations_df.merge(self.items_df, how = 'left', 
                                                          left_on = 'art_id', 
                                                          right_on = 'art_id')[['recStrength', 'art_id', 'title', 'url', 'lang']]


        return recommendations_df
    
cf_recommender_model = CFRecommender(cf_preds_df, readers)

In [126]:
cf_recommender_model.persist("../data")

In [134]:
loaded_model = CFRecommender.from_file("../data/CF_model.pickle")

In [151]:
print('Evaluating Collaborative Filtering (SVD Matrix Factorization) model...')
cf_global_metrics, cf_detailed_results_df = model_evaluator.evaluate_model(cf_recommender_model)
print('\nGlobal metrics:\n%s' % cf_global_metrics)
cf_detailed_results_df.head(10)

Evaluating Collaborative Filtering (SVD Matrix Factorization) model...
870 users processed

Global metrics:
{'modelName': 'CF_model', 'recall@5': 0.18709317463269481, 'recall@10': 0.25850846196763994}


Unnamed: 0,hits@5_count,hits@10_count,interacted_count,recall@5,recall@10,_person_id
672,4,4,10,0.4,0.4,8
350,1,1,10,0.1,0.1,73
449,1,2,10,0.1,0.2,153
279,1,1,10,0.1,0.1,598
336,2,2,10,0.2,0.2,908
622,1,2,10,0.1,0.2,858
433,0,2,10,0.0,0.2,376
432,0,0,10,0.0,0.0,451
42,3,3,10,0.3,0.3,526
87,0,1,10,0.0,0.1,443


In [79]:
cf_detailed_results_df.sort_values(by=["recall@5"], ascending=False).head(20)

Unnamed: 0,hits@5_count,hits@10_count,interacted_count,recall@5,recall@10,_person_id
543,4,4,4,1.0,1.0,853
794,2,2,2,1.0,1.0,575
623,3,3,3,1.0,1.0,954
344,4,4,5,0.8,0.8,798
144,3,3,4,0.75,0.75,95
501,6,6,8,0.75,0.75,465
637,6,6,8,0.75,0.75,649
737,3,4,4,0.75,1.0,673
203,3,3,4,0.75,0.75,851
72,3,3,4,0.75,0.75,87


In [136]:
recommended_items = cf_recommender_model.recommend_items(17)
recommended_items = recommended_items.head(3)

In [137]:
articles = pd.read_csv("../data/articles_cleaned.csv")
articles = articles[articles["nzz_id"].isin(recommended_items["art_id"])]
articles

Unnamed: 0,nzz_id,author,catchline,department,lead_text,pub_date,title,paragraph
7197,ld.142634,"Marie-José Kolly, Alexandra Kohler",Analysemethode,Storytelling,Die NZZ hat zu den Ski-Weltmeisterschaften in ...,2017-02-06 04:30:00.0,Wie aus Datenbergen Karrieretypen wurden,Die NZZ hat zu den Ski-Weltmeisterschaften in ...
13555,ld.150497,Unknown,Schiesserei in Basel,Video,Nach tödlichen Schüssen in einer Bar in Basel ...,2017-03-10 10:47:40.0,"Zwei Tote, ein Schwerverletzter",Nach tödlichen Schüssen in einer Bar in Basel ...
18907,ld.137200,Unknown,Ein letzter Blick zurück,Jahresrückblick 2016,"Die grossen Themen in der Bilanz, die besten G...",2017-01-02 04:30:00.0,Der grosse NZZ-Jahresrückblick,
