In [1]:
import numpy as np
import scipy
import pandas as pd
import math
import matplotlib.pyplot as plt
import random
import sklearn
import spacy
nlp = spacy.load('de_core_news_md')
import fasttext
import gensim 
from gensim.models.doc2vec import Doc2Vec, Word2Vec

import re
from bs4 import BeautifulSoup
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from scipy.sparse.linalg import svds
from scipy.sparse import csr_matrix



In [2]:
from content_based_model import ContentBasedRecommender
from evaluator import ModelEvaluator

In [3]:
articles = pd.read_csv('./data/articles_clean.csv')
articles_lemma = pd.read_csv('./data/articles_lemmatized.csv')

## Wektoryzacja

In [4]:
stopwords_list = stopwords.words('german') # are there other languages in text

### Bag of words

In [5]:
bag_vectorizer = CountVectorizer(analyzer='word',
                     ngram_range=(1, 3), # bi - gram (?)
                     min_df=0.01,
                     max_df=0.7,
                     max_features=5000,
                     stop_words=stopwords_list)


### TFIDF

In [6]:

tfidf_vectorizer = TfidfVectorizer(analyzer='word',
                     ngram_range=(1, 3), # bi - gram (?)
                     min_df=0.01,
                     max_df=0.7,
                     max_features=5000,
                     stop_words=stopwords_list)


### Word2vec

In [7]:
tokens = pd.read_csv('./data/tokens.csv')

In [8]:
tokens.head()

Unnamed: 0,content
0,"['obligationenfonds', 'fixer', 'laufzeit', 'gi..."
1,[]
2,"['e', 'banking', 'ausfall', 'postfinance', 'kä..."
3,"['terror', 'frankreich', 'louvre', 'macheten',..."
4,"['unglück', 'panama', 'bus', 'prallt', 'mauer'..."


In [9]:
tokens_to_list = []
for i in range(len(tokens)):
    tokens_to_list.append(re.sub('[,\'\[\]]', '', tokens.loc[i][0]).split(" "))

In [10]:
 wv_model = gensim.models.Word2Vec(vector_size=300,min_count=5,workers=10,epochs=1)

In [13]:
wv_model.build_vocab(tokens_to_list) 
wv_model.train(tokens_to_list, total_examples=wv_model.corpus_count, epochs=wv_model.epochs)

(3821522, 4225581)

In [14]:
wv_model.wv.most_similar(positive='institut')

[('steigt', 0.9997395277023315),
 ('zählt', 0.9996887445449829),
 ('performance', 0.9996780157089233),
 ('inklusive', 0.9996732473373413),
 ('bundesamt', 0.9996673464775085),
 ('volle', 0.9996650815010071),
 ('beinahe', 0.9996550679206848),
 ('stück', 0.9996546506881714),
 ('dank', 0.9996422529220581),
 ('kategorie', 0.999623715877533)]

In [15]:
### Doc2Vec

### FastText

In [16]:
# from gensim.models.fasttext import load_facebook_model

# wv = load_facebook_model(r'C:\Users\a814810\Downloads\wiki.de\wiki.de.bin')

In [17]:
%%time
fasttext_model = fasttext.load_model(r'C:\Users\a814810\Downloads\wiki.de\wiki.de.bin')

Wall time: 54.3 s


In [None]:
articles_lemma

In [44]:
for i in range(len(articles)):
    row = re.sub(r'\r\n', '', articles.loc[i,'content'])
    embedding = fasttext_model.get_sentence_vector(row)
    if i == 0 : fast_matrix = embedding
    else: fast_matrix = np.vstack([fast_matrix, embedding])

In [45]:
fast_matrix.shape

(22019, 300)

In [46]:
# def cosine_similarity(embedding_1, embedding_2):
#     # Calculate the cosine similarity of the two embeddings.
#     sim = 1 - cosine(embedding_1, embedding_2)
#     print('Cosine similarity: {:.2}'.format(sim))

#     # compare the embeddings
# cosine_similarity(embedding_1, embedding_2)
#     # compare the embeddings
# cosine_similarity(embedding_1, embedding_3)

In [47]:
def vectorize(vectorizer, data): # tylko do tfidf/count (?)
    matrix  = vectorizer.fit_transform(data['content'])
    feature_names = vectorizer.get_feature_names()
    return matrix, feature_names

In [48]:
item_ids = articles['nzz_id'].tolist() # w lemma te same 

In [49]:
tfidf_matrix, tfidf_features = vectorize(tfidf_vectorizer, articles)
# bag_matrix, bag_features = vectorize(bag_vectorizer, articles)


In [50]:
tfidf_matrix_lemma, tfidf_features_lemma = vectorize(tfidf_vectorizer, articles_lemma)
# bag_matrix_lemma, bag_features_lemma = vectorize(bag_vectorizer, articles_lemma)

### User profiles (?)

In [51]:
readers = pd.read_csv('./../readers.csv')

In [52]:
users_interactions_count_df = readers.groupby(['id', 'art_id']).size().groupby('id').size()
print('# users: %d' % len(users_interactions_count_df))
users_with_enough_interactions_df = users_interactions_count_df[users_interactions_count_df >= 5].reset_index()[['id']]
print('# users with at least 5 interactions: %d' % len(users_with_enough_interactions_df))

# users: 1000
# users with at least 5 interactions: 1000


In [53]:
# all users have enough interactions (min 5) so train-test-split basically on readers dataset
interactions_train, interactions_test = train_test_split(readers,
                                   stratify=readers['id'], 
                                   test_size=0.20,
                                   random_state=123)

print('# interactions on Train set: %d' % len(interactions_train))
print('# interactions on Test set: %d' % len(interactions_test))

# interactions on Train set: 22284
# interactions on Test set: 5571


In [54]:
def get_one_article_profile(item_id: str, matrix):
    idx = item_ids.index(item_id)
    item_profile = matrix[idx:idx+1]
    return item_profile

def get_articles_profiles(ids,matrix):
    item_profiles_list = [get_one_article_profile(x,matrix) for x in ids]
    item_profiles = scipy.sparse.vstack(item_profiles_list)
    return item_profiles

def build_users_profiles(matrix): 
    interactions = interactions_train[interactions_train['art_id'].isin(articles['nzz_id'])].set_index('id')
    user_profiles = {}
    for person_id in interactions.index.unique():
        user_item_profiles = get_articles_profiles(interactions.loc[person_id,'art_id'],matrix)
        user_profiles[person_id] = sklearn.preprocessing.normalize(np.sum(user_item_profiles, axis=0))
    return user_profiles

In [55]:
tfidf_profiles = build_users_profiles(tfidf_matrix)
#bag_profiles = build_users_profiles(bag_matrix)
len(tfidf_profiles)

1000

In [57]:
tfidf_matrix

<22019x2282 sparse matrix of type '<class 'numpy.float64'>'
	with 1402142 stored elements in Compressed Sparse Row format>

In [62]:
fast_matrix_2 = csr_matrix(fast_matrix)

In [63]:
fast_matrix_2

<22019x300 sparse matrix of type '<class 'numpy.float32'>'
	with 4470300 stored elements in Compressed Sparse Row format>

In [64]:
fast_profiles = build_users_profiles(fast_matrix_2)

In [18]:
tfidf_profiles[1].shape

(1, 2282)

In [21]:
tfidf_profiles_lemma = build_users_profiles(tfidf_matrix_lemma)
# bag_profiles_lemma = build_users_profiles(bag_matrix_lemma)


In [22]:
myprofile = tfidf_profiles[1]
print(myprofile.shape)
pd.DataFrame(sorted(zip(tfidf_features, 
                        myprofile.flatten().tolist()), key=lambda x: -x[1])[:20],
             columns=['token', 'relevance'])

(1, 2282)


Unnamed: 0,token,relevance
0,insel,0.248814
1,papier,0.234971
2,frauen,0.227556
3,mal,0.216718
4,überall,0.210696
5,de,0.179554
6,leicht,0.177821
7,verkaufen,0.174356
8,zudem,0.162986
9,wasser,0.161277


## Ewaluator

In [65]:
content_based_recommender_model = ContentBasedRecommender(item_ids,articles, fast_profiles,fast_matrix_2)
model_evaluator = ModelEvaluator()  

In [66]:
print('Evaluating Content-Based Filtering model...')
cb_global_metrics, cb_detailed_results_df = model_evaluator.evaluate_model(content_based_recommender_model, articles, readers, interactions_train, interactions_test)
print('\nGlobal metrics:\n%s' % cb_global_metrics)
cb_detailed_results_df.head(10)

Evaluating Content-Based Filtering model...
999 users processed

Global metrics:
{'modelName': 'Content-Based', 'recall@5': 0.07700592353257943, 'recall@10': 0.08741698079339437}


Unnamed: 0,hits@5_count,hits@10_count,interacted_count,recall@5,recall@10,_person_id
358,1,1,10,0.1,0.1,907
721,1,1,10,0.1,0.1,938
510,0,0,10,0.0,0.0,64
86,0,1,10,0.0,0.1,887
484,0,0,10,0.0,0.0,397
472,1,1,10,0.1,0.1,218
259,2,2,10,0.2,0.2,273
264,0,0,10,0.0,0.0,803
797,0,1,10,0.0,0.1,865
276,1,2,10,0.1,0.2,443


In [136]:
content_based_recommender_model_3 = ContentBasedRecommender(articles_lemma, bag_profiles_lemma, bag_matrix_lemma)
model_evaluator = ModelEvaluator()

In [137]:
print('Evaluating Content-Based Filtering model...')
cb_global_metrics, cb_detailed_results_df = model_evaluator.evaluate_model(content_based_recommender_model_3)
print('\nGlobal metrics:\n%s' % cb_global_metrics)
cb_detailed_results_df.head(10)

Evaluating Content-Based Filtering model...
999 users processed

Global metrics:
{'modelName': 'Content-Based', 'recall@5': 0.10716208939149165, 'recall@10': 0.11541913480524144}


Unnamed: 0,hits@5_count,hits@10_count,interacted_count,recall@5,recall@10,_person_id
358,2,2,10,0.2,0.2,907
721,1,1,10,0.1,0.1,938
510,2,2,10,0.2,0.2,64
86,1,1,10,0.1,0.1,887
484,2,2,10,0.2,0.2,397
472,1,3,10,0.1,0.3,218
259,3,3,10,0.3,0.3,273
264,0,0,10,0.0,0.0,803
797,0,3,10,0.0,0.3,865
276,1,1,10,0.1,0.1,443
