In [2]:
import numpy as np
import scipy
import pandas as pd
import math
import matplotlib.pyplot as plt
import random
import sklearn

import re
from bs4 import BeautifulSoup
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler

from scipy.sparse.linalg import svds
from scipy.sparse import csr_matrix

In [3]:
articles = pd.read_csv('articles_nlp.csv')
articles = articles.drop('content_length', axis=1)

In [4]:
articles.head()

Unnamed: 0,nzz_id,author,catchline,content,department,lead_text,pub_date,title,content_len
0,ld.149648,Claudia Gabriel,Obligationenfonds mit fixer Laufzeit,obligationenfonds mit fixer laufzeit es gi...,Finanzen,Die Idee ist gut: Statt einer einzigen Obligat...,2017-03-09 08:01:21.000,Es gibt noch interessante Varianten,718
1,1.18145900,Unknown,Unknown,,Panorama,Zum Auftakt der Fashion Week in New York zeige...,2017-04-11 14:00:29.473,Fashion Week New York,0
2,ld.138769,Unknown,E-Banking-Ausfall,e banking ausfall postfinance kämpft mit d...,Finanzen,Seit Sonntag funktioniert das E-Banking der Po...,2017-01-09 13:55:00.000,Postfinance kämpft mit dem System,525
3,ld.143700,Unknown,Terror in Frankreich,terror in frankreich louvre nach macheten ...,International,Einen Tag nach dem Angriff auf Soldaten beim P...,2017-02-04 12:50:25.000,Louvre nach Macheten-Angriff wieder geöffnet,181
4,ld.149385,Unknown,Unglück in Panama,unglück in panama bus prallt gegen eine ma...,Panorama,Bei einem Busunglück in Panama sind 17 Persone...,2017-03-06 07:31:21.000,Bus prallt gegen eine Mauer und stürzt in Fluss,110


### Lematyzacja

In [5]:
def lemmatizer(data): # do zmiany !
    lemma = WordNetLemmatizer()
    text = [lemma.lemmatize(row) for row in data]
    text = " ".join(text)
    return text

In [6]:
tokens = pd.read_csv('tokens.csv')

In [49]:
tokens.head()

Unnamed: 0,content
0,"['obligationenfonds', 'fixer', 'laufzeit', 'gi..."
1,[]
2,"['e', 'banking', 'ausfall', 'postfinance', 'kä..."
3,"['terror', 'frankreich', 'louvre', 'macheten',..."
4,"['unglück', 'panama', 'bus', 'prallt', 'mauer'..."


In [65]:
# row = articles.loc[0,'content']
# lemma = WordNetLemmatizer()
# text = lemma.lemmatize(row)
# text = "".join(text)
# text

## Wektoryzacja

In [9]:
stopwords_list = stopwords.words('german') # are there other languages in text

### Bag of words

In [28]:
bag_vectorizer = CountVectorizer(analyzer='word',
                     ngram_range=(1, 3), # bi - gram (?)
                     min_df=0.01,
                     max_df=0.7,
                     max_features=5000,
                     stop_words=stopwords_list)


### TFIDF

In [11]:

tfidf_vectorizer = TfidfVectorizer(analyzer='word',
                     ngram_range=(1, 3), # bi - gram (?)
                     min_df=0.01,
                     max_df=0.7,
                     max_features=5000,
                     stop_words=stopwords_list)


### Word2vec

In [14]:
# to do 

### FastText

In [13]:
# to do 

In [30]:
def vectorize(vectorizer, data): # tylko do tfidf/count (?)
    matrix  = vectorizer.fit_transform(data['content'])
    feature_names = vectorizer.get_feature_names()
    return matrix, feature_names

In [31]:
item_ids = articles['nzz_id'].tolist() # w lemma te same 

In [66]:
tfidf_matrix, tfidf_features = vectorize(tfidf_vectorizer, articles)
# tfidf_matrix_lemma, tfidf_features_lemma = vectorize(tfidf_vectorizer, articles_lemma)
bag_matrix, bag_features = vectorize(bag_vectorizer, articles)
# bag_matrix_lemma, bag_features_lemma = vectorize(bag_vectorizer, articles_lemma)

### User profiles (?)

In [67]:
readers = pd.read_csv('./../readers.csv')

In [68]:
readers['eventStrength'] = 1

In [69]:
# users_interactions_count_df = readers.groupby(['id', 'art_id']).size().groupby('id').size()
# print('# users: %d' % len(users_interactions_count_df))
# users_with_enough_interactions_df = users_interactions_count_df[users_interactions_count_df >= 5].reset_index()[['id']]
# print('# users with at least 5 interactions: %d' % len(users_with_enough_interactions_df))

In [70]:
interactions_train, interactions_test = train_test_split(readers,
                                   stratify=readers['id'], 
                                   test_size=0.20,
                                   random_state=123)

print('# interactions on Train set: %d' % len(interactions_train))
print('# interactions on Test set: %d' % len(interactions_test))

# interactions on Train set: 22284
# interactions on Test set: 5571


In [71]:
#Indexing by personId to speed up the searches during evaluation
interactions_total_ind = readers.set_index('id')
interactions_train_ind = interactions_train.set_index('id')
interactions_test_ind = interactions_test.set_index('id')

def get_items_interacted(person_id, interactions):
    # Get the user's data and merge in the movie information.
    interacted_items = interactions.loc[person_id]['art_id']
    return set(interacted_items if type(interacted_items) == pd.Series else [interacted_items])

In [72]:
get_items_interacted(1,interactions_total_ind)

{'1.18331199', 'ld.1293110', 'ld.142559', 'ld.144819', 'ld.154103'}

In [76]:
# To model the user profile, we take all the item profiles the user has interacted and average them. The average is weighted by the interaction strength, in other words, the articles the user has interacted the most (eg. liked or commented) will have a higher strength in the final user profile.

def get_item_profile(item_id, matrix):
    idx = item_ids.index(item_id)
    item_profile = matrix[idx:idx+1]
    return item_profile

def get_item_profiles(ids,matrix):
    item_profiles_list = [get_item_profile(x,matrix) for x in ids]
    item_profiles = scipy.sparse.vstack(item_profiles_list)
    return item_profiles

def build_users_profile(person_id, interactions_indexed_df,matrix):
    interactions_person_df = interactions_indexed_df.loc[person_id]
    user_item_profiles = get_item_profiles(interactions_person_df['art_id'],matrix)
    user_item_strengths = np.array(interactions_person_df['eventStrength']).reshape(-1,1) # czy potrzebne od tego momentu?
    #Weighted average of item profiles by the interactions strength
    user_item_strengths_weighted_avg = np.sum(user_item_profiles.multiply(user_item_strengths), axis=0) / np.sum(user_item_strengths)
    user_profile_norm = sklearn.preprocessing.normalize(user_item_strengths_weighted_avg)
    return user_profile_norm
    # return user_item_profiles

def build_users_profiles(matrix): 
    interactions_indexed_df = interactions_train[interactions_train['art_id'] \
                                                   .isin(articles['nzz_id'])].set_index('id')
    user_profiles = {}
    for person_id in interactions_indexed_df.index.unique():
        user_profiles[person_id] = build_users_profile(person_id, interactions_indexed_df,matrix)
    return user_profiles

In [77]:
interactions_ind = interactions_train[interactions_train['art_id'] \
                                                   .isin(articles['nzz_id'])].set_index('id')
interactions_person = interactions_ind.loc[1]
get_item_profiles(interactions_person['art_id'], tfidf_matrix)

<4x2282 sparse matrix of type '<class 'numpy.float64'>'
	with 132 stored elements in Compressed Sparse Row format>

In [78]:
tfidf_profiles = build_users_profiles(tfidf_matrix)
# tfidf_profiles_lemma = build_users_profiles(tfidf_matrix_lemma)
bag_profiles = build_users_profiles(bag_matrix)
# bag_profiles_lemma = build_users_profiles(tfidf_matrix_lemma)
len(tfidf_profiles)

1000

In [79]:
myprofile = tfidf_profiles[1]
print(myprofile.shape)
pd.DataFrame(sorted(zip(tfidf_features, 
                        myprofile.flatten().tolist()), key=lambda x: -x[1])[:20],
             columns=['token', 'relevance'])

(1, 2282)


Unnamed: 0,token,relevance
0,insel,0.248814
1,papier,0.234971
2,frauen,0.227556
3,mal,0.216718
4,überall,0.210696
5,de,0.179554
6,leicht,0.177821
7,verkaufen,0.174356
8,zudem,0.162986
9,wasser,0.161277


## Model

In [90]:
class ContentBasedRecommender:
    
    MODEL_NAME = 'Content-Based'
    
    def __init__(self, items_df=None, user_profiles=None, matrix=None):
        self.item_ids = item_ids
        self.items_df = items_df
        self.user_profiles = user_profiles
        self.matrix = matrix
        
    def get_model_name(self):
        return self.MODEL_NAME
        
    def _get_similar_items_to_user_profile(self, person_id, topn=1000):
        #Computes the cosine similarity between the user profile and all item profiles
        cosine_similarities = cosine_similarity(self.user_profiles[person_id], self.matrix)
        #Gets the top similar items
        similar_indices = cosine_similarities.argsort().flatten()[-topn:]
        #Sort the similar items by similarity
        similar_items = sorted([(item_ids[i], cosine_similarities[0,i]) for i in similar_indices], key=lambda x: -x[1])
        return similar_items
        
    def recommend_items(self, user_id, items_to_ignore=[], topn=10, verbose=False):
        similar_items = self._get_similar_items_to_user_profile(user_id)
        #Ignores items the user has already interacted
        similar_items_filtered = list(filter(lambda x: x[0] not in items_to_ignore, similar_items))
        
        recommendations_df = pd.DataFrame(similar_items_filtered, columns=['art_id', 'recStrength']) \
                                    .head(topn)

        if verbose:
            if self.items_df is None:
                raise Exception('"items_df" is required in verbose mode')

            recommendations_df = recommendations_df.merge(self.items_df, how = 'left', 
                                                          left_on = 'nzz_id', 
                                                          right_on = 'nzz_id')[
                [
                    "recStrength",
                    "nzz_id",
                    "catchline",
                    "content",
                    "department",
                    "lead_text",
                    "pub_date",
                    "content_len"
                ]
            ]


        return recommendations_df

## Ewaluator

In [94]:
# topn accuracy metrics consts
EVAL_RANDOM_SAMPLE_NON_INTERACTED_ITEMS = 100

class ModelEvaluator:


    def get_not_interacted_items_sample(self, person_id, sample_size, seed=42):
        interacted_items = get_items_interacted(person_id, interactions_total_ind)
        all_items = set(articles['nzz_id'])
        non_interacted_items = all_items - interacted_items

        random.seed(seed)
        non_interacted_items_sample = random.sample(non_interacted_items, sample_size)
        return set(non_interacted_items_sample)

    def _verify_hit_top_n(self, item_id, recommended_items, topn):        
            try:
                index = next(i for i, c in enumerate(recommended_items) if c == item_id)
            except:
                index = -1
            hit = int(index in range(0, topn))
            return hit, index

    def evaluate_model_for_user(self, model, person_id):
        #Getting the items in test set
        interacted_values_testset = interactions_test_ind.loc[person_id]
        if type(interacted_values_testset['art_id']) == pd.Series:
            person_interacted_items_testset = set(interacted_values_testset['art_id'])
        else:
            person_interacted_items_testset = set([(interacted_values_testset['art_id'])])  
        interacted_items_count_testset = len(person_interacted_items_testset) 

        #Getting a ranked recommendation list from a model for a given user
        person_recs_df = model.recommend_items(person_id, 
                                               items_to_ignore=get_items_interacted(person_id, 
                                                                                    interactions_train_ind), 
                                               topn=10000000000)

        hits_at_5_count = 0
        hits_at_10_count = 0
        #For each item the user has interacted in test set
        for item_id in person_interacted_items_testset:
            #Getting a random sample (100) items the user has not interacted 
            #(to represent items that are assumed to be no relevant to the user)
            non_interacted_items_sample = self.get_not_interacted_items_sample(person_id, 
                                                                          sample_size=EVAL_RANDOM_SAMPLE_NON_INTERACTED_ITEMS, 
                                                                          seed=123)

            #Combining the current interacted item with the 100 random items
            items_to_filter_recs = non_interacted_items_sample.union(set([item_id]))

            #Filtering only recommendations that are either the interacted item or from a random sample of 100 non-interacted items
            valid_recs_df = person_recs_df[person_recs_df['art_id'].isin(items_to_filter_recs)]                    
            valid_recs = valid_recs_df['art_id'].values
            #Verifying if the current interacted item is among the Top-N recommended items
            hit_at_5, index_at_5 = self._verify_hit_top_n(item_id, valid_recs, 5)
            hits_at_5_count += hit_at_5
            hit_at_10, index_at_10 = self._verify_hit_top_n(item_id, valid_recs, 10)
            hits_at_10_count += hit_at_10

        #Recall is the rate of the interacted items that are ranked among the Top-N recommended items, 
        #when mixed with a set of non-relevant items
        recall_at_5 = hits_at_5_count / float(interacted_items_count_testset)
        recall_at_10 = hits_at_10_count / float(interacted_items_count_testset)

        person_metrics = {'hits@5_count':hits_at_5_count, 
                          'hits@10_count':hits_at_10_count, 
                          'interacted_count': interacted_items_count_testset,
                          'recall@5': recall_at_5,
                          'recall@10': recall_at_10}
        return person_metrics

    def evaluate_model(self, model):
        #print('Running evaluation for users')
        people_metrics = []
        for idx, person_id in enumerate(list(interactions_test_ind.index.unique().values)):
            #if idx % 100 == 0 and idx > 0:
            #    print('%d users processed' % idx)
            person_metrics = self.evaluate_model_for_user(model, person_id)  
            person_metrics['_person_id'] = person_id
            people_metrics.append(person_metrics)
        print('%d users processed' % idx)

        detailed_results_df = pd.DataFrame(people_metrics) \
                            .sort_values('interacted_count', ascending=False)
        
        global_recall_at_5 = detailed_results_df['hits@5_count'].sum() / float(detailed_results_df['interacted_count'].sum())
        global_recall_at_10 = detailed_results_df['hits@10_count'].sum() / float(detailed_results_df['interacted_count'].sum())
        
        global_metrics = {'modelName': model.get_model_name(),
                          'recall@5': global_recall_at_5,
                          'recall@10': global_recall_at_10}    
        return global_metrics, detailed_results_df

In [95]:
content_based_recommender_model = ContentBasedRecommender(articles, tfidf_profiles,tfidf_matrix)
model_evaluator = ModelEvaluator()  

In [96]:
print('Evaluating Content-Based Filtering model...')
cb_global_metrics, cb_detailed_results_df = model_evaluator.evaluate_model(content_based_recommender_model)
print('\nGlobal metrics:\n%s' % cb_global_metrics)
cb_detailed_results_df.head(10)

Evaluating Content-Based Filtering model...
999 users processed

Global metrics:
{'modelName': 'Content-Based', 'recall@5': 0.11380362591994256, 'recall@10': 0.12439418416801293}


Unnamed: 0,hits@5_count,hits@10_count,interacted_count,recall@5,recall@10,_person_id
358,2,2,10,0.2,0.2,907
721,1,1,10,0.1,0.1,938
510,2,2,10,0.2,0.2,64
86,1,2,10,0.1,0.2,887
484,2,2,10,0.2,0.2,397
472,1,2,10,0.1,0.2,218
259,3,3,10,0.3,0.3,273
264,0,0,10,0.0,0.0,803
797,1,2,10,0.1,0.2,865
276,2,2,10,0.2,0.2,443


In [97]:
content_based_recommender_model_2 = ContentBasedRecommender(articles, bag_profiles, bag_matrix)
model_evaluator = ModelEvaluator()

In [98]:
print('Evaluating Content-Based Filtering model...')
cb_global_metrics, cb_detailed_results_df = model_evaluator.evaluate_model(content_based_recommender_model_2)
print('\nGlobal metrics:\n%s' % cb_global_metrics)
cb_detailed_results_df.head(10)

Evaluating Content-Based Filtering model...
999 users processed

Global metrics:
{'modelName': 'Content-Based', 'recall@5': 0.09639203015616586, 'recall@10': 0.10482857655717107}


Unnamed: 0,hits@5_count,hits@10_count,interacted_count,recall@5,recall@10,_person_id
358,2,3,10,0.2,0.3,907
721,1,1,10,0.1,0.1,938
510,1,1,10,0.1,0.1,64
86,1,1,10,0.1,0.1,887
484,1,2,10,0.1,0.2,397
472,1,3,10,0.1,0.3,218
259,1,1,10,0.1,0.1,273
264,0,0,10,0.0,0.0,803
797,1,2,10,0.1,0.2,865
276,2,2,10,0.2,0.2,443
