In [1]:
import numpy as np
import scipy
import pandas as pd
import math
import matplotlib.pyplot as plt
import random
import sklearn

import fasttext
import nltk
from nltk.corpus import stopwords

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from scipy.sparse.linalg import svds
from scipy.sparse import csr_matrix

In [22]:
from content_based_model import ContentBasedRecommender
from evaluator import ModelEvaluator

In [2]:
# Load the two article tables from ./data: the cleaned texts and a lemmatized variant.
# NOTE(review): presumably both files describe the same articles in the same row order — confirm.
articles = pd.read_csv('./data/articles_clean.csv')
articles_lemma = pd.read_csv('./data/articles_lemmatized.csv')

### Vectorization

In [3]:
# German stop words from NLTK, later handed to the TF-IDF vectorizer.
# On a fresh environment the corpus may not be installed yet, in which case
# `stopwords.words` raises LookupError; fetch it once and retry so the notebook
# survives "Restart & Run All".
# TODO: check whether the articles contain other languages that need their own stop words.
try:
    stopwords_list = stopwords.words('german')
except LookupError:
    nltk.download('stopwords')
    stopwords_list = stopwords.words('german')

In [7]:

# TF-IDF configuration for the article texts (parameter meanings per the
# scikit-learn TfidfVectorizer documentation).
tfidf_vectorizer = TfidfVectorizer(
    stop_words=stopwords_list,  # stop-word list prepared in the previous cell
    analyzer='word',
    ngram_range=(1, 3),         # unigrams up to trigrams
    min_df=0.01,                # float: minimum share of documents a term must occur in
    max_df=1.0,                 # float: no upper document-frequency cut-off
    max_features=5000,          # cap on the vocabulary size
)


### User interactions, train/test split and user profiles

In [12]:
# Interaction log; the cells below use its `id` (user) and `art_id` (article) columns.
# NOTE(review): this file is read from the parent directory, unlike the article CSVs under ./data — confirm the path.
readers = pd.read_csv('./../readers.csv')

In [13]:
# Number of distinct articles each user interacted with:
# first collapse repeated (user, article) pairs, then count the pairs per user.
users_interactions_count_df = (
    readers
    .groupby(['id', 'art_id']).size()
    .groupby('id').size()
)
print('# users: %d' % len(users_interactions_count_df))

# Keep only the ids of users with at least 5 distinct articles.
users_with_enough_interactions_df = (
    users_interactions_count_df
    .loc[users_interactions_count_df >= 5]
    .reset_index()[['id']]
)
print('# users with at least 5 interactions: %d' % len(users_with_enough_interactions_df))

# users: 1000
# users with at least 5 interactions: 1000


In [14]:
# Every user passed the ">= 5 interactions" filter above, so the split is done
# directly on `readers`. Stratifying on the user id puts each user in both
# the train and the test partition.
interactions_train, interactions_test = train_test_split(
    readers,
    test_size=0.20,
    random_state=123,
    stratify=readers['id'],
)

print('# interactions on Train set: %d' % len(interactions_train))
print('# interactions on Test set: %d' % len(interactions_test))

# interactions on Train set: 22284
# interactions on Test set: 5571


In [None]:
def vectorize_and_build_profiles(vectorizer,data):
    """Vectorize the article texts and aggregate them into one profile per user.

    Besides its arguments, this function reads two notebook-level globals:
    ``articles`` (its ``nzz_id`` column maps an article id to a row of the
    fitted matrix, so ``data`` has to be row-aligned with ``articles``) and
    ``interactions_train`` (columns ``id`` and ``art_id``).

    Parameters
    ----------
    vectorizer
        Object exposing ``fit_transform`` and ``get_feature_names_out``
        (or the older ``get_feature_names``).
    data
        Documents handed to ``vectorizer.fit_transform``.

    Returns
    -------
    tuple
        ``(matrix, feature_names, user_profiles)``: the matrix returned by
        ``fit_transform``, the feature names as a list, and a dict mapping
        each user id to the output of ``sklearn.preprocessing.normalize``
        applied to the summed rows of the articles that user interacted with
        (a 2-D array with a single row).
    """

    def vectorize(vectorizer, data):
        matrix = vectorizer.fit_transform(data)
        # get_feature_names() was removed in scikit-learn 1.2; prefer the
        # replacement and fall back only for old releases.
        names_getter = getattr(vectorizer, 'get_feature_names_out', None)
        if names_getter is None:
            names_getter = vectorizer.get_feature_names
        return matrix, list(names_getter())

    def index_articles():
        # Build the id -> row lookup once instead of scanning a list per
        # interaction. setdefault keeps the first row for a duplicated id,
        # matching list.index().
        row_of_article = {}
        for row, article_id in enumerate(articles['nzz_id'].tolist()):
            row_of_article.setdefault(article_id, row)
        return row_of_article

    def build_users_profiles(matrix):
        row_of_article = index_articles()
        # Interactions with articles that have no row in the matrix are ignored.
        known = interactions_train[interactions_train['art_id'].isin(articles['nzz_id'])]
        user_profiles = {}
        # groupby always yields a Series per user, so users with exactly one
        # interaction are handled like everyone else; sort=False keeps the
        # users in order of first appearance.
        for person_id, article_ids in known.groupby('id', sort=False)['art_id']:
            rows = [row_of_article[article_id] for article_id in article_ids]
            # Summing a sparse matrix yields np.matrix, which recent
            # scikit-learn rejects; pass a plain 2-D ndarray instead.
            summed = np.asarray(matrix[rows].sum(axis=0)).reshape(1, -1)
            user_profiles[person_id] = sklearn.preprocessing.normalize(summed)
        return user_profiles

    matrix, features = vectorize(vectorizer, data)
    user_profiles = build_users_profiles(matrix)
    return matrix, features, user_profiles

In [26]:
# Fit the TF-IDF vectorizer on the `content` column of `articles` and build the per-user profiles.
# NOTE(review): `articles_lemma` is loaded earlier but not used in the visible cells — confirm that the
# non-lemmatized text is the intended input here.
matrix, feat_names, user_profiles = vectorize_and_build_profiles(tfidf_vectorizer, articles['content'])

### Modeling/evaluation

In [27]:
# Hand the article table, the user profiles and the TF-IDF matrix to the recommender, and create the evaluator.
# NOTE(review): both classes come from project modules not visible here; the argument order
# (articles, user_profiles, matrix) is assumed to match ContentBasedRecommender's signature — confirm.
content_based_recommender_model = ContentBasedRecommender(articles, user_profiles, matrix)
model_evaluator = ModelEvaluator()

In [28]:
# Run the evaluator on the content-based recommender, print the aggregate
# metrics and show the first ten rows of the detailed results.
print('Evaluating Content-Based Filtering model...')
cb_global_metrics, cb_detailed_results_df = model_evaluator.evaluate_model(
    content_based_recommender_model,
    articles,
    readers,
    interactions_train,
    interactions_test,
)
print('\nGlobal metrics:\n%s' % cb_global_metrics)
cb_detailed_results_df.head(10)

Evaluating Content-Based Filtering model...
999 users processed

Global metrics:
{'modelName': 'Content-Based', 'recall@5': 0.11326512295817627, 'recall@10': 0.12457368515526836}


Unnamed: 0,hits@5_count,hits@10_count,interacted_count,recall@5,recall@10,_person_id
358,2,2,10,0.2,0.2,907
721,1,1,10,0.1,0.1,938
510,0,2,10,0.0,0.2,64
86,2,2,10,0.2,0.2,887
484,2,2,10,0.2,0.2,397
472,2,2,10,0.2,0.2,218
259,3,3,10,0.3,0.3,273
264,0,0,10,0.0,0.0,803
797,2,2,10,0.2,0.2,865
276,2,2,10,0.2,0.2,443
