In [255]:
import pandas as pd
import numpy as np
import random
import pickle

In [256]:
readers = pd.read_csv("../data/readers.csv")
readers = readers.rename(columns={"id":"user_id"})
readers.head()

Unnamed: 0,user_id,art_id
0,1,ld.154103
1,1,ld.142559
2,1,1.18331199
3,1,ld.144819
4,1,ld.1293110


In [257]:
read_counts = readers["user_id"].value_counts(sort=True)
read_counts = read_counts.rename_axis("user_id").reset_index(name="read_count")

# Biorę pod uwagę tylko użytkowników, którzy przeczytali minimum 3 artykułów
min_read_count = 3
read_counts = read_counts[read_counts["read_count"] > min_read_count]

readers = readers[readers["user_id"].isin(read_counts["user_id"])]

In [258]:
# Train/Test split
from sklearn.model_selection import train_test_split

readers_train, readers_test = train_test_split(readers,
                                   stratify=readers["user_id"], 
                                   test_size=0.20,
                                   random_state=123)

print(f"Train set size {len(readers_train)}")
print(f"Test set size {len(readers_test)}")

Train set size 22284
Test set size 5571


In [259]:
reader_article_matrix_df = pd.crosstab(readers_train["user_id"], readers_train["art_id"]).fillna(0)
reader_article_matrix_df.head(10)

art_id,1.10001800,1.10142798,1.10474932,1.10676585,1.10985668,1.11025867,1.11137056,1.11221430,1.11316388,1.11563462,...,ld.887983,ld.904757,ld.907977,ld.911838,ld.944195,ld.953156,ld.959496,ld.987818,ld.9951,ld.995338
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [260]:
reader_article_matrix = reader_article_matrix_df.to_numpy()
reader_article_matrix[:10]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [261]:
reader_ids = list(reader_article_matrix_df.index)
reader_ids[:10]

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

In [262]:
from scipy.sparse import csr_matrix

reader_article_csr_matrix = csr_matrix(reader_article_matrix)

# Type cast do float bo inczej metoda nie obsługuje
reader_article_csr_matrix = reader_article_csr_matrix.asfptype()
reader_article_csr_matrix

<1000x10057 sparse matrix of type '<class 'numpy.float64'>'
	with 22284 stored elements in Compressed Sparse Row format>

In [263]:
from scipy.sparse.linalg import svds

# The number of factors to factor the user-item matrix.
NUMBER_OF_FACTORS_MF = 100
#Performs matrix factorization of the original user item matrix
#U, sigma, Vt = svds(users_items_pivot_matrix, k = NUMBER_OF_FACTORS_MF)
U, sigma, Vt = svds(reader_article_csr_matrix, k = NUMBER_OF_FACTORS_MF)

In [264]:
sigma = np.diag(sigma)

print(f"{U.shape}")
print(f"{Vt.shape}")
print(f"{sigma.shape}")

(1000, 100)
(100, 10057)
(100, 100)


In [265]:
reader_predictions = np.dot(np.dot(U, sigma), Vt) 
reader_predictions

array([[ 0.00126406, -0.00091568,  0.00087472, ...,  0.001089  ,
        -0.00252528, -0.00011383],
       [ 0.00489697, -0.00028578, -0.00028638, ..., -0.00141694,
         0.00766448, -0.00062164],
       [ 0.00249037, -0.004516  , -0.00107365, ...,  0.00052246,
        -0.00929867, -0.00222214],
       ...,
       [ 0.00764014,  0.00712054, -0.0025289 , ..., -0.00333712,
        -0.00148746, -0.00503753],
       [-0.00141353,  0.01089344,  0.00083585, ..., -0.01226793,
        -0.00465555,  0.00402787],
       [ 0.02535716,  0.06575882, -0.00062212, ...,  0.00664736,
        -0.01153612, -0.00167098]])

In [266]:
reader_predictions_norm = (reader_predictions - reader_predictions.min()) / (reader_predictions.max() - reader_predictions.min())

In [267]:
#Converting the reconstructed matrix back to a Pandas dataframe
cf_preds_df = pd.DataFrame(reader_predictions_norm, columns = reader_article_matrix_df.columns, index=reader_ids).transpose()
cf_preds_df.head(10)

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,991,992,993,994,995,996,997,998,999,1000
art_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.100018,0.155031,0.157202,0.155764,0.152544,0.151067,0.154852,0.154421,0.157366,0.152505,0.156557,...,0.144291,0.15261,0.15681,0.154032,0.153707,0.151264,0.150056,0.158841,0.153431,0.169427
1.10142798,0.153729,0.154105,0.151577,0.154,0.152468,0.151892,0.154588,0.146476,0.156938,0.152234,...,0.158054,0.154811,0.158317,0.160552,0.153252,0.153446,0.157603,0.15853,0.160785,0.193568
1.10474932,0.154798,0.154105,0.153634,0.150024,0.152808,0.15354,0.154091,0.159051,0.15465,0.151893,...,0.152349,0.153134,0.153666,0.154598,0.15415,0.154169,0.153468,0.152765,0.154775,0.153904
1.10676585,0.154429,0.154125,0.154621,0.153085,0.153789,0.154459,0.153918,0.154076,0.154283,0.153878,...,0.153476,0.153732,0.154086,0.154586,0.154172,0.153762,0.154512,0.15514,0.152658,0.154457
1.10985668,0.154212,0.153436,0.155743,0.159602,0.153573,0.156171,0.152458,0.14214,0.155193,0.152243,...,0.144013,0.153342,0.15626,0.15675,0.15352,0.154776,0.151464,0.159906,0.158664,0.161918
1.11025867,0.153873,0.154666,0.15221,0.149027,0.149645,0.15284,0.155252,0.145017,0.161488,0.150731,...,0.151762,0.15662,0.154664,0.155555,0.15332,0.149985,0.152119,0.145978,0.143745,0.147066
1.11137056,0.153542,0.154178,0.15286,0.158088,0.148194,0.155891,0.154046,0.148904,0.151333,0.152096,...,0.157321,0.165998,0.153665,0.153763,0.157521,0.152489,0.154665,0.170847,0.162028,0.154056
1.1122143,0.154822,0.153749,0.151285,0.162751,0.146612,0.157971,0.155282,0.148743,0.154801,0.146098,...,0.150109,0.155595,0.154437,0.151315,0.153598,0.1498,0.15361,0.161904,0.15476,0.157647
1.11316388,0.154283,0.153875,0.152956,0.151883,0.153611,0.152083,0.153092,0.157184,0.156168,0.155618,...,0.169237,0.150574,0.153476,0.154173,0.154229,0.156551,0.155888,0.150692,0.150115,0.148969
1.11563462,0.154952,0.156572,0.149081,0.150446,0.143799,0.157476,0.154559,0.148681,0.159401,0.15324,...,0.137339,0.154453,0.169701,0.153626,0.154138,0.146474,0.151907,0.169562,0.159415,0.151572


In [268]:
#Indexing by personId to speed up the searches during evaluation
interactions_full_indexed_df = readers.set_index('user_id')
interactions_train_indexed_df = readers_train.set_index('user_id')
interactions_test_indexed_df = readers_test.set_index('user_id')

In [269]:
def get_items_interacted(person_id, interactions_df):
    # Get the user's data and merge in the movie information.
    interacted_items = interactions_df.loc[person_id]['art_id']
    return set(interacted_items if type(interacted_items) == pd.Series else [interacted_items])

In [270]:
#Top-N accuracy metrics consts
EVAL_RANDOM_SAMPLE_NON_INTERACTED_ITEMS = 100

class ModelEvaluator:


    def get_not_interacted_items_sample(self, person_id, sample_size, seed=42):
        interacted_items = get_items_interacted(person_id, interactions_full_indexed_df)
        all_items = set(readers['art_id'])
        non_interacted_items = all_items - interacted_items

        random.seed(seed)
        non_interacted_items_sample = random.sample(non_interacted_items, sample_size)
        return set(non_interacted_items_sample)

    def _verify_hit_top_n(self, item_id, recommended_items, topn):        
            try:
                index = next(i for i, c in enumerate(recommended_items) if c == item_id)
            except:
                index = -1
            hit = int(index in range(0, topn))
            return hit, index

    def evaluate_model_for_user(self, model, person_id):
        #Getting the items in test set
        interacted_values_testset = interactions_test_indexed_df.loc[person_id]
        if type(interacted_values_testset['art_id']) == pd.Series:
            person_interacted_items_testset = set(interacted_values_testset['art_id'])
        else:
            person_interacted_items_testset = set([interacted_values_testset['art_id']])  
        interacted_items_count_testset = len(person_interacted_items_testset) 

        #Getting a ranked recommendation list from a model for a given user
        person_recs_df = model.recommend_items(person_id, 
                                               items_to_ignore=get_items_interacted(person_id,interactions_train_indexed_df),
                                               topn=10000000000)

        hits_at_5_count = 0
        hits_at_10_count = 0
        #For each item the user has interacted in test set
        for item_id in person_interacted_items_testset:
            #Getting a random sample (100) items the user has not interacted 
            #(to represent items that are assumed to be no relevant to the user)
            seed = int.from_bytes(item_id.encode('utf-8'), 'little')
            non_interacted_items_sample = self.get_not_interacted_items_sample(person_id, 
                                                                          sample_size=EVAL_RANDOM_SAMPLE_NON_INTERACTED_ITEMS, 
                                                                          seed=seed)

            #Combining the current interacted item with the 100 random items
            items_to_filter_recs = non_interacted_items_sample.union(set([item_id]))

            #Filtering only recommendations that are either the interacted item or from a random sample of 100 non-interacted items
            valid_recs_df = person_recs_df[person_recs_df['art_id'].isin(items_to_filter_recs)]                    
            valid_recs = valid_recs_df['art_id'].values
            #Verifying if the current interacted item is among the Top-N recommended items
            hit_at_5, index_at_5 = self._verify_hit_top_n(item_id, valid_recs, 5)
            hits_at_5_count += hit_at_5
            hit_at_10, index_at_10 = self._verify_hit_top_n(item_id, valid_recs, 10)
            hits_at_10_count += hit_at_10

        #Recall is the rate of the interacted items that are ranked among the Top-N recommended items, 
        #when mixed with a set of non-relevant items
        recall_at_5 = hits_at_5_count / float(interacted_items_count_testset)
        recall_at_10 = hits_at_10_count / float(interacted_items_count_testset)

        person_metrics = {'hits@5_count':hits_at_5_count, 
                          'hits@10_count':hits_at_10_count, 
                          'interacted_count': interacted_items_count_testset,
                          'recall@5': recall_at_5,
                          'recall@10': recall_at_10}
        return person_metrics

    def evaluate_model(self, model):
        #print('Running evaluation for users')
        people_metrics = []
        for idx, person_id in enumerate(list(interactions_test_indexed_df.index.unique().values)):
            #if idx % 100 == 0 and idx > 0:
            #    print('%d users processed' % idx)
            person_metrics = self.evaluate_model_for_user(model, person_id)  
            person_metrics['_person_id'] = person_id
            people_metrics.append(person_metrics)
        print('%d users processed' % idx)

        detailed_results_df = pd.DataFrame(people_metrics) \
                            .sort_values('interacted_count', ascending=False)
        
        global_recall_at_5 = detailed_results_df['hits@5_count'].sum() / float(detailed_results_df['interacted_count'].sum())
        global_recall_at_10 = detailed_results_df['hits@10_count'].sum() / float(detailed_results_df['interacted_count'].sum())
        
        global_metrics = {'modelName': model.get_model_name(),
                          'recall@5': global_recall_at_5,
                          'recall@10': global_recall_at_10}    
        return global_metrics, detailed_results_df
    
model_evaluator = ModelEvaluator() 

In [271]:
class CFRecommender:
    
    MODEL_NAME = 'CF_model'
    
    def __init__(self, cf_predictions_df, items_df=None):
        self.cf_predictions_df = cf_predictions_df
        self.items_df = items_df
        
    def get_model_name(self):
        return self.MODEL_NAME

    def persist(self, path):
        with open(f"{path}/{self.MODEL_NAME}.pickle", mode="wb+") as model_file:
            pickle.dump(self, model_file)

    @classmethod
    def from_file(cls, path):
        try:
            with open(path, mode="rb") as model_file:
                model = pickle.load(model_file)

            cf_predictions_df = model.cf_predictions_df
            items_df = model.items_df

            return cls(model.cf_predictions_df, model.items_df)
        except:
            print("Couldn't load model")


    def recommend_items(self, user_id, items_to_ignore=[], topn=10, verbose=False):
        # Get and sort the user's predictions
        sorted_user_predictions = self.cf_predictions_df[user_id].sort_values(ascending=False) \
                                    .reset_index().rename(columns={user_id: 'recStrength'})
        # Recommend the highest predicted rating movies that the user hasn't seen yet.
        recommendations_df = sorted_user_predictions[~sorted_user_predictions['art_id'].isin(items_to_ignore)] \
                               .sort_values('recStrength', ascending = False) \
                               .head(topn)

        if verbose:
            if self.items_df is None:
                raise Exception('"items_df" is required in verbose mode')

            recommendations_df = recommendations_df.merge(self.items_df, how = 'left', 
                                                          left_on = 'art_id', 
                                                          right_on = 'art_id')[['recStrength', 'art_id', 'title', 'url', 'lang']]


        return recommendations_df
    
cf_recommender_model = CFRecommender(cf_preds_df, readers)

In [272]:
cf_recommender_model.persist("../data")

In [273]:
loaded_model = CFRecommender.from_file("../data/CF_model.pickle")

In [274]:
print("Evaluating Collaborative Filtering (SVD Matrix Factorization) model...")
cf_global_metrics, cf_detailed_results_df = model_evaluator.evaluate_model(cf_recommender_model)
print(f"\nGlobal metrics:\n{cf_global_metrics}")
cf_detailed_results_df.head(10)

Evaluating Collaborative Filtering (SVD Matrix Factorization) model...
999 users processed

Global metrics:
{'modelName': 'CF_model', 'recall@5': 0.21647819063004847, 'recall@10': 0.3019206605636331}


Unnamed: 0,hits@5_count,hits@10_count,interacted_count,recall@5,recall@10,_person_id
358,2,2,10,0.2,0.2,907
721,2,2,10,0.2,0.2,938
510,0,0,10,0.0,0.0,64
86,1,1,10,0.1,0.1,887
484,0,0,10,0.0,0.0,397
472,1,2,10,0.1,0.2,218
259,1,1,10,0.1,0.1,273
264,2,4,10,0.2,0.4,803
797,1,2,10,0.1,0.2,865
276,0,1,10,0.0,0.1,443


In [275]:
cf_detailed_results_df.sort_values(by=["recall@5"], ascending=False).head(20)

Unnamed: 0,hits@5_count,hits@10_count,interacted_count,recall@5,recall@10,_person_id
771,2,2,2,1.0,1.0,224
845,1,1,1,1.0,1.0,474
741,2,2,2,1.0,1.0,539
531,4,4,4,1.0,1.0,642
64,2,2,2,1.0,1.0,672
347,1,1,1,1.0,1.0,854
998,1,1,1,1.0,1.0,167
414,2,2,2,1.0,1.0,666
937,2,2,2,1.0,1.0,575
976,1,1,1,1.0,1.0,833


In [276]:
recommended_items = cf_recommender_model.recommend_items(17)
recommended_items = recommended_items.head(3)

In [277]:
articles = pd.read_csv("../data/articles_cleaned.csv")
articles = articles[articles["nzz_id"].isin(recommended_items["art_id"])]
articles

Unnamed: 0,nzz_id,author,catchline,department,lead_text,pub_date,title,paragraph
7440,1.18108994,Unknown,Impressionen aus Tessin und Graubünden,Tessin,Die Kantone Graubünden und Tessin (Gastkanton ...,2017-04-11 13:59:48.182,Impressionen aus Tessin und Graubünden,
10617,ld.146967,Franziska Engelhardt,Was heute wichtig ist,Briefing,Bayrou gibt seine Stimmen Macron / Anti-Terror...,2017-02-22 20:15:00.0,Was heute wichtig ist,Bayrou gibt seine Stimmen Macron / Anti-Terror...
18907,ld.137200,Unknown,Ein letzter Blick zurück,Jahresrückblick 2016,"Die grossen Themen in der Bilanz, die besten G...",2017-01-02 04:30:00.0,Der grosse NZZ-Jahresrückblick,
