In [1]:
import math
import os
import random
import re

import fasttext
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import scipy
import sklearn
from nltk.corpus import stopwords
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

In [2]:
from content_based_model import ContentBasedRecommender
from evaluator import ModelEvaluator

In [3]:
# Load the cleaned and the lemmatized article tables from the local data folder.
# NOTE(review): `articles_lemma` is not referenced by any other cell shown here.
articles = pd.read_csv('./data/articles_clean.csv')
articles_lemma = pd.read_csv('./data/articles_lemmatized.csv')

## Vectorization

In [4]:
# German stop words from NLTK, passed to the TF-IDF vectorizer below.
# TODO: check whether the articles also contain text in other languages.
stopwords_list = stopwords.words('german')

### TFIDF

In [5]:

# TF-IDF over word n-grams; the vectorizer is only configured here and fitted later.
tfidf_vectorizer = TfidfVectorizer(analyzer='word',
                     ngram_range=(1, 3), # unigrams, bigrams and trigrams (not just bigrams)
                     min_df=0.01, # float => proportion: drop terms found in fewer than 1% of documents
                     max_df=0.7, # float => proportion: drop terms found in more than 70% of documents
                     max_features=5000, # upper bound on the vocabulary size
                     stop_words=stopwords_list)


### FastText

In [6]:
%%time
fasttext_model = fasttext.load_model(r'C:\Users\a814810\Downloads\wiki.de\wiki.de.bin')

Wall time: 52 s


In [7]:
%%time
for i in range(len(articles)):
    row = re.sub(r'\r\n', '', articles.loc[i,'content'])
    embedding = fasttext_model.get_sentence_vector(row)
    if i == 0 : fast_matrix = embedding
    else: fast_matrix = np.vstack([fast_matrix, embedding])

Wall time: 6min 56s


In [8]:
# Sanity check: the row count should equal the number of articles.
fast_matrix.shape

(22019, 300)

In [9]:
def vectorize(vectorizer, data): # only meant for tfidf/count vectorizers (?)
    """Fit ``vectorizer`` on ``data['content']`` and return the matrix and vocabulary.

    Parameters
    ----------
    vectorizer
        Object exposing ``fit_transform`` and either ``get_feature_names_out``
        (current scikit-learn) or the legacy ``get_feature_names``.
    data
        Mapping or DataFrame whose ``'content'`` entry holds the documents.

    Returns
    -------
    tuple
        ``(matrix, feature_names)``: whatever ``fit_transform`` returned, and
        the feature names (a list when ``get_feature_names_out`` is available).
    """
    matrix = vectorizer.fit_transform(data['content'])
    # `get_feature_names` was removed from scikit-learn; prefer its replacement
    # and fall back to the old name only on installations that lack it.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = list(vectorizer.get_feature_names_out())
    else:
        feature_names = vectorizer.get_feature_names()
    return matrix, feature_names

In [10]:
# Fit the TF-IDF vectorizer on articles['content'] and keep the matrix and vocabulary.
# NOTE(review): vectorize_and_build_profiles later calls fit_transform again on the
# same vectorizer and the same column, so this fit is repeated there.
tfidf_matrix, tfidf_features = vectorize(tfidf_vectorizer, articles)

### User profiles (?)

In [11]:
# Load the reader/article interaction table.
# NOTE(review): this file is read from the parent directory, unlike the article
# files under ./data/ — confirm the location is intended.
readers = pd.read_csv('./../readers.csv')

In [12]:
# Preview the first rows of the interaction table.
readers.head()

Unnamed: 0,id,art_id
0,1,ld.154103
1,1,ld.142559
2,1,1.18331199
3,1,ld.144819
4,1,ld.1293110


In [13]:
# Count the distinct articles per reader: the first groupby collapses repeated
# (id, art_id) pairs, the second counts the remaining pairs for each id.
users_interactions_count_df = readers.groupby(['id', 'art_id']).size().groupby('id').size()
print('# users: %d' % len(users_interactions_count_df))
# Keep only the ids of readers with at least 5 distinct articles.
users_with_enough_interactions_df = users_interactions_count_df[users_interactions_count_df >= 5].reset_index()[['id']]
print('# users with at least 5 interactions: %d' % len(users_with_enough_interactions_df))

# users: 1000
# users with at least 5 interactions: 1000


In [14]:
# All readers have at least 5 interactions, so the split is done directly on `readers`.
# Stratifying on 'id' keeps about 80% / 20% of each reader's interactions in
# train / test; random_state pins the shuffle so the split is repeatable.
interactions_train, interactions_test = train_test_split(readers,
                                   stratify=readers['id'], 
                                   test_size=0.20,
                                   random_state=123)

print('# interactions on Train set: %d' % len(interactions_train))
print('# interactions on Test set: %d' % len(interactions_test))

# interactions on Train set: 22284
# interactions on Test set: 5571


In [15]:
def vectorize_and_build_profiles(vectorizer,data):
    """Vectorize the article texts and aggregate them into one profile per reader.

    Besides its arguments the function reads two notebook-level frames:
    ``articles`` (column ``'nzz_id'``) and ``interactions_train`` (columns
    ``'id'`` and ``'art_id'``). Row ``k`` of the fitted matrix is taken to be
    the article at position ``k`` of ``articles``, so ``data`` must be in the
    same row order as ``articles``.

    Parameters
    ----------
    vectorizer
        Object exposing ``fit_transform`` and either ``get_feature_names_out``
        or the legacy ``get_feature_names``.
    data
        Iterable of documents, one per row of ``articles``.

    Returns
    -------
    tuple
        ``(matrix, features, user_profiles)``: the fitted document matrix, the
        feature names, and a dict mapping each reader id to the L2-normalised
        sum of the matrix rows of the articles that reader interacted with in
        the training split (shape ``(1, n_features)``).
    """
    # Imported here because a bare `import sklearn` does not by itself
    # guarantee that the `preprocessing` submodule has been loaded.
    from sklearn.preprocessing import normalize

    def vectorize(vectorizer, data):
        """Fit the vectorizer and return (matrix, feature names)."""
        matrix = vectorizer.fit_transform(data)
        # `get_feature_names` was removed from scikit-learn; prefer its
        # replacement and fall back only on installations that lack it.
        if hasattr(vectorizer, 'get_feature_names_out'):
            feature_names = list(vectorizer.get_feature_names_out())
        else:
            feature_names = vectorizer.get_feature_names()
        return matrix, feature_names

    def build_users_profiles(matrix):
        """Sum and normalise the article rows of every reader in the train split."""
        # Build the id -> row position lookup once instead of rebuilding and
        # scanning a list for every article of every reader. `setdefault`
        # keeps the first position of a repeated id, like `list.index` did.
        row_of_article = {}
        for position, article_id in enumerate(articles['nzz_id']):
            row_of_article.setdefault(article_id, position)

        # Interactions with articles that are missing from `articles` are ignored.
        known = interactions_train[interactions_train['art_id'].isin(articles['nzz_id'])]

        user_profiles = {}
        # Iterating the groupby always yields a Series of article ids, even for
        # a reader with a single interaction (where `.loc[id, 'art_id']` would
        # return a bare string that was then iterated character by character).
        for person_id, article_ids in known.groupby('id', sort=False)['art_id']:
            rows = [row_of_article[article_id] for article_id in article_ids]
            # Summing sparse rows may yield an np.matrix, which recent
            # scikit-learn rejects; hand `normalize` a plain 2-D ndarray.
            summed = np.asarray(matrix[rows].sum(axis=0)).reshape(1, -1)
            user_profiles[person_id] = normalize(summed)
        return user_profiles

    matrix, features = vectorize(vectorizer, data)
    user_profiles = build_users_profiles(matrix)
    return matrix, features, user_profiles

In [16]:
# Fit TF-IDF on the article texts and build one profile per reader from the training split.
matrix, feat_names, user_profiles = vectorize_and_build_profiles(tfidf_vectorizer, articles['content'])

In [17]:
# Inspect the profile of reader 1: pair every feature name with its weight
# and show the ten tokens with the highest weight.
myprofile = user_profiles[1]
print(myprofile.shape)
pd.DataFrame(sorted(zip(feat_names, 
                        myprofile.flatten().tolist()), key=lambda x: -x[1])[:10],
             columns=['token', 'relevance'])

(1, 2282)


Unnamed: 0,token,relevance
0,insel,0.248814
1,papier,0.234971
2,frauen,0.227556
3,mal,0.216718
4,überall,0.210696
5,de,0.179554
6,leicht,0.177821
7,verkaufen,0.174356
8,zudem,0.162986
9,wasser,0.161277


## Evaluator

In [18]:
# Wire the project-local recommender (content_based_model.py) to the articles,
# the reader profiles and the TF-IDF matrix, and create the evaluator (evaluator.py).
content_based_recommender_model = ContentBasedRecommender(articles, user_profiles, matrix)
model_evaluator = ModelEvaluator()  

In [19]:
# Evaluate the content-based model on the train/test split.
# evaluate_model lives in evaluator.py (not shown); judging by how the results
# are used below it returns the global metrics, a per-reader results frame and
# a list of per-reader recommendation frames — TODO confirm against evaluator.py.
cb_global_metrics, cb_detailed_results_df, people_recs = model_evaluator.evaluate_model(content_based_recommender_model, articles, readers, interactions_train, interactions_test)

999 users processed


In [43]:
def get_recs_for_user(user_id, recs_list=None):
    """Return the recommendation frame that belongs to ``user_id``.

    Parameters
    ----------
    user_id
        Value compared against the ``'_person_id'`` column.
    recs_list
        Optional sequence of DataFrames, each with a ``'_person_id'`` column.
        Defaults to the notebook-level ``people_recs``.

    Returns
    -------
    pandas.DataFrame or None
        The first frame whose first row carries ``user_id``, or ``None`` when
        no frame matches.
    """
    if recs_list is None:
        recs_list = people_recs
    for user_recs in recs_list:
        # Read the first row by position: a lookup by label 1 raises KeyError
        # for frames that have fewer than two rows or a non-default index.
        # Assumes all rows of one frame share the same '_person_id' — TODO confirm.
        if len(user_recs) and user_recs['_person_id'].iloc[0] == user_id:
            return user_recs
    return None

In [47]:
# Look up the recommendation frame of reader 411 and display it.
recs = get_recs_for_user(411)
recs

Unnamed: 0,art_id,recStrength,_person_id
0,ld.138364,0.458265,411
1,ld.139822,0.419848,411
2,ld.139143,0.409991,411
3,ld.141102,0.404845,411
4,ld.140381,0.400806,411
...,...,...,...
977,ld.141173,0.248292,411
978,ld.142119,0.248191,411
979,ld.145887,0.248153,411
980,ld.151375,0.248139,411


In [68]:
# Article id at position 1 (the second entry) of this reader's recommendations.
list(recs['art_id'])[1]

'ld.139822'

In [87]:
# First five recommended article ids, used below to inspect how similar they are to each other.
list_for_test = list(recs['art_id'])[:5]
list_for_test

['ld.138364', 'ld.139822', 'ld.139143', 'ld.141102', 'ld.140381']

In [89]:
# Index labels in `articles` of the selected ids.
# NOTE(review): the labels come back in `articles` row order, not in recommendation
# order, and they are used below as row positions of tfidf_matrix — that matches
# only while `articles` keeps its default 0..n-1 index.
indices = articles[articles.nzz_id.isin(list_for_test)].index.tolist()

In [90]:
# Pairwise cosine similarity between the TF-IDF rows of the selected articles.
cosine_similarity(tfidf_matrix[indices])

array([[1.        , 0.49271318, 0.51491629, 0.42187818, 0.51711057],
       [0.49271318, 1.        , 0.51904078, 0.4635894 , 0.6443268 ],
       [0.51491629, 0.51904078, 1.        , 0.52035234, 0.62926822],
       [0.42187818, 0.4635894 , 0.52035234, 1.        , 0.48208584],
       [0.51711057, 0.6443268 , 0.62926822, 0.48208584, 1.        ]])

In [85]:
# Summarise the similarity matrix: describe() each column, then average every
# statistic across the columns. The diagonal (each article against itself,
# 1.0) is included, so 'max' is 1 and the averaged mean is pulled upwards.
# NOTE(review): the saved output below reports count 982, which does not match
# the 5-element `indices` built above — it presumably comes from an earlier run
# with a different `indices`; re-run the cell to refresh it.
np.mean(pd.DataFrame(cosine_similarity(tfidf_matrix[indices])).describe(), axis=1)

count    982.000000
mean       0.136259
std        0.088651
min        0.006644
25%        0.069576
50%        0.117573
75%        0.188088
max        1.000000
dtype: float64

In [27]:
# Report the aggregate metrics and show the first ten rows of the per-reader results.
print('Evaluating Content-Based Filtering model...')
print('\nGlobal metrics:\n%s' % cb_global_metrics)
cb_detailed_results_df.head(10)

Evaluating Content-Based Filtering model...

Global metrics:
{'modelName': 'Content-Based', 'recall@5': 0.11236761802189912, 'recall@10': 0.12475318614252379}


Unnamed: 0,hits@5_count,hits@10_count,interacted_count,recall@5,recall@10,_person_id
358,2,2,10,0.2,0.2,907
721,1,1,10,0.1,0.1,938
510,2,2,10,0.2,0.2,64
86,2,2,10,0.2,0.2,887
484,2,2,10,0.2,0.2,397
472,2,2,10,0.2,0.2,218
259,2,3,10,0.2,0.3,273
264,0,0,10,0.0,0.0,803
797,2,2,10,0.2,0.2,865
276,2,2,10,0.2,0.2,443
