In [20]:
import numpy as np
import scipy
import pandas as pd
import math
import matplotlib.pyplot as plt
import random
import sklearn
import spacy
nlp = spacy.load('de_core_news_md')
import fasttext
import gensim 

import re
from bs4 import BeautifulSoup
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler

from scipy.sparse.linalg import svds
from scipy.sparse import csr_matrix



In [2]:
from content_based_model import ContentBasedRecommender
from evaluator import ModelEvaluator

In [3]:
articles = pd.read_csv('./data/articles_clean.csv')
articles_lemma = pd.read_csv('./data/articles_lemmatized.csv')

## Wektoryzacja

In [4]:
stopwords_list = stopwords.words('german') # are there other languages in text

### Bag of words

In [5]:
bag_vectorizer = CountVectorizer(analyzer='word',
                     ngram_range=(1, 3), # bi - gram (?)
                     min_df=0.01,
                     max_df=0.7,
                     max_features=5000,
                     stop_words=stopwords_list)


### TFIDF

In [6]:

tfidf_vectorizer = TfidfVectorizer(analyzer='word',
                     ngram_range=(1, 3), # bi - gram (?)
                     min_df=0.01,
                     max_df=0.7,
                     max_features=5000,
                     stop_words=stopwords_list)


### Word2vec

In [18]:
tokens = pd.read_csv('./data/tokens.csv')

In [7]:
# to do 

### FastText

In [4]:
# to do 
model = fasttext.load_model('model.bin')
vect = model.get_sentence_vector("some string") # 1 sentence
vect2 = [model.get_sentence_vector(el.replace('\n', '')) for el in text] # for text



ValueError: model.bin cannot be opened for loading!

In [7]:
def vectorize(vectorizer, data): # tylko do tfidf/count (?)
    matrix  = vectorizer.fit_transform(data['content'])
    feature_names = vectorizer.get_feature_names()
    return matrix, feature_names

In [8]:
item_ids = articles['nzz_id'].tolist() # w lemma te same 

In [9]:
tfidf_matrix, tfidf_features = vectorize(tfidf_vectorizer, articles)
# bag_matrix, bag_features = vectorize(bag_vectorizer, articles)


In [11]:
tfidf_matrix_lemma, tfidf_features_lemma = vectorize(tfidf_vectorizer, articles_lemma)
# bag_matrix_lemma, bag_features_lemma = vectorize(bag_vectorizer, articles_lemma)

### User profiles (?)

In [10]:
readers = pd.read_csv('./../readers.csv')

In [11]:
users_interactions_count_df = readers.groupby(['id', 'art_id']).size().groupby('id').size()
print('# users: %d' % len(users_interactions_count_df))
users_with_enough_interactions_df = users_interactions_count_df[users_interactions_count_df >= 5].reset_index()[['id']]
print('# users with at least 5 interactions: %d' % len(users_with_enough_interactions_df))

# users: 1000
# users with at least 5 interactions: 1000


In [12]:
# all users have enough interactions (min 5) so train-test-split basically on readers dataset
interactions_train, interactions_test = train_test_split(readers,
                                   stratify=readers['id'], 
                                   test_size=0.20,
                                   random_state=123)

print('# interactions on Train set: %d' % len(interactions_train))
print('# interactions on Test set: %d' % len(interactions_test))

# interactions on Train set: 22284
# interactions on Test set: 5571


In [13]:
def get_one_article_profile(item_id: str, matrix):
    idx = item_ids.index(item_id)
    item_profile = matrix[idx:idx+1]
    return item_profile

def get_articles_profiles(ids,matrix):
    item_profiles_list = [get_one_article_profile(x,matrix) for x in ids]
    item_profiles = scipy.sparse.vstack(item_profiles_list)
    return item_profiles

def build_users_profiles(matrix): 
    interactions = interactions_train[interactions_train['art_id'].isin(articles['nzz_id'])].set_index('id')
    user_profiles = {}
    for person_id in interactions.index.unique():
        user_item_profiles = get_articles_profiles(interactions.loc[person_id,'art_id'],matrix)
        user_profiles[person_id] = sklearn.preprocessing.normalize(np.sum(user_item_profiles, axis=0))
    return user_profiles

In [14]:
tfidf_profiles = build_users_profiles(tfidf_matrix)
#bag_profiles = build_users_profiles(bag_matrix)
len(tfidf_profiles)

1000

In [18]:
tfidf_profiles[1].shape

(1, 2282)

In [21]:
tfidf_profiles_lemma = build_users_profiles(tfidf_matrix_lemma)
# bag_profiles_lemma = build_users_profiles(bag_matrix_lemma)


In [22]:
myprofile = tfidf_profiles[1]
print(myprofile.shape)
pd.DataFrame(sorted(zip(tfidf_features, 
                        myprofile.flatten().tolist()), key=lambda x: -x[1])[:20],
             columns=['token', 'relevance'])

(1, 2282)


Unnamed: 0,token,relevance
0,insel,0.248814
1,papier,0.234971
2,frauen,0.227556
3,mal,0.216718
4,überall,0.210696
5,de,0.179554
6,leicht,0.177821
7,verkaufen,0.174356
8,zudem,0.162986
9,wasser,0.161277


## Ewaluator

In [15]:
content_based_recommender_model = ContentBasedRecommender(item_ids,articles, tfidf_profiles,tfidf_matrix)
model_evaluator = ModelEvaluator()  

In [16]:
print('Evaluating Content-Based Filtering model...')
cb_global_metrics, cb_detailed_results_df = model_evaluator.evaluate_model(content_based_recommender_model, articles, readers, interactions_train, interactions_test)
print('\nGlobal metrics:\n%s' % cb_global_metrics)
cb_detailed_results_df.head(10)

Evaluating Content-Based Filtering model...
999 users processed

Global metrics:
{'modelName': 'Content-Based', 'recall@5': 0.10375157063363849, 'recall@10': 0.12457368515526836}


Unnamed: 0,hits@5_count,hits@10_count,interacted_count,recall@5,recall@10,_person_id
358,2,2,10,0.2,0.2,907
721,0,1,10,0.0,0.1,938
510,0,2,10,0.0,0.2,64
86,2,2,10,0.2,0.2,887
484,2,2,10,0.2,0.2,397
472,1,2,10,0.1,0.2,218
259,2,3,10,0.2,0.3,273
264,0,0,10,0.0,0.0,803
797,2,2,10,0.2,0.2,865
276,2,2,10,0.2,0.2,443


In [136]:
content_based_recommender_model_3 = ContentBasedRecommender(articles_lemma, bag_profiles_lemma, bag_matrix_lemma)
model_evaluator = ModelEvaluator()

In [137]:
print('Evaluating Content-Based Filtering model...')
cb_global_metrics, cb_detailed_results_df = model_evaluator.evaluate_model(content_based_recommender_model_3)
print('\nGlobal metrics:\n%s' % cb_global_metrics)
cb_detailed_results_df.head(10)

Evaluating Content-Based Filtering model...
999 users processed

Global metrics:
{'modelName': 'Content-Based', 'recall@5': 0.10716208939149165, 'recall@10': 0.11541913480524144}


Unnamed: 0,hits@5_count,hits@10_count,interacted_count,recall@5,recall@10,_person_id
358,2,2,10,0.2,0.2,907
721,1,1,10,0.1,0.1,938
510,2,2,10,0.2,0.2,64
86,1,1,10,0.1,0.1,887
484,2,2,10,0.2,0.2,397
472,1,3,10,0.1,0.3,218
259,3,3,10,0.3,0.3,273
264,0,0,10,0.0,0.0,803
797,0,3,10,0.0,0.3,865
276,1,1,10,0.1,0.1,443
