In [1]:
import numpy as np
import scipy
import pandas as pd
import math
import matplotlib.pyplot as plt

import fasttext
from sklearn.metrics.pairwise import cosine_similarity

from scipy import sparse

In [2]:
from content_based_model_testing import ContentBasedRecommender

In [3]:
articles = pd.read_csv('./data/articles_clean.csv')
articles_lemma = pd.read_csv('./data/articles_lemmatized.csv')

In [4]:
matrix = sparse.load_npz("./vectorized/vec_matrix.npz")
matrix_lemma = sparse.load_npz("./vectorized/vec_matrix_lemma.npz")

In [5]:
feature_names = np.load('./vectorized/vec_names.npy') 
feature_names_lemma = np.load('./vectorized/vec_names_lemma.npy')

In [6]:
readers = pd.read_csv('./../readers.csv')

In [7]:
fasttext_model = fasttext.load_model(r'C:\Users\a814810\Downloads\wiki.de\wiki.de.bin')



In [8]:
content_based_recommender_model = ContentBasedRecommender(articles_db=articles, user_db= readers)#, matrix= matrix)
# model_evaluator = ModelEvaluator([5,10], filter_similar=True, vec_matrix=matrix, vec_names=feat_names, articles=articles, model=fasttext_model)  

In [9]:
%%time
person_recs = content_based_recommender_model.recommend(300, ignored=True, limit=10, return_list=False)

user id
 300
Wall time: 1min 27s


In [10]:
person_recs

Unnamed: 0,nzz_id,recStrength
0,ld.143344,0.35835
1,ld.142230,0.353695
2,ld.146325,0.330961
3,ld.151224,0.329285
4,ld.141346,0.328803
5,ld.151729,0.321342
6,ld.139971,0.316963
7,ld.138296,0.315328
8,ld.1294359,0.314695
9,ld.149419,0.314287


In [12]:
list(person_recs['nzz_id'])

['ld.143344',
 'ld.142230',
 'ld.146325',
 'ld.151224',
 'ld.141346',
 'ld.151729',
 'ld.139971',
 'ld.138296',
 'ld.1294359',
 'ld.149419']

In [13]:
list(person_recs['recStrength'])

[0.3583498594433194,
 0.35369474748062035,
 0.33096129112481837,
 0.3292851082697844,
 0.3288033113770856,
 0.3213423180378651,
 0.31696257127634486,
 0.3153275679955797,
 0.3146951130418842,
 0.3142870032570457]

In [10]:
person_recs # 300

Unnamed: 0,nzz_id,recStrength
0,ld.143344,0.35835
1,ld.142230,0.353695
2,ld.146325,0.330961
3,ld.151224,0.329285
4,ld.141346,0.328803
5,ld.151729,0.321342
6,ld.139971,0.316963
7,ld.138296,0.315328
8,ld.1294359,0.314695
9,ld.149419,0.314287


In [13]:
person_recs # 300

Unnamed: 0,nzz_id,recStrength
0,ld.143344,0.35835
1,ld.142230,0.353695
2,ld.146325,0.330961
3,ld.151224,0.329285
4,ld.141346,0.328803
5,ld.151729,0.321342
6,ld.139971,0.316963
7,ld.138296,0.315328
8,ld.1294359,0.314695
9,ld.149419,0.314287


In [11]:
person_recs

Unnamed: 0,nzz_id,recStrength
0,ld.138364,0.442184
1,ld.139822,0.426568
2,ld.137099,0.401655
3,ld.141993,0.396713
4,ld.141102,0.39341
5,ld.144162,0.391064
6,ld.143746,0.391
7,ld.146206,0.390971
8,ld.140381,0.390264
9,ld.139143,0.387521


In [10]:
person_recs

Unnamed: 0,nzz_id,recStrength
0,ld.1292982,0.443032
1,ld.1293234,0.43401
2,ld.1292986,0.430789
3,ld.1294555,0.430447
4,ld.1292913,0.418571
5,ld.1293549,0.411674
6,ld.1293186,0.294282
7,ld.1293199,0.293256
8,ld.1292834,0.29146
9,ld.1288599,0.288053


In [23]:
id_list = list(person_recs['nzz_id']) 
# extracting indices of those articles (in articles_db)
indices = articles[articles.nzz_id.isin(id_list)].index.tolist()

# extracting pairs of similar articles based on cosine similarities between vectors
matrix_lower = np.tril(cosine_similarity(matrix[indices]))
np.fill_diagonal(matrix_lower, 0)
similar_pairs = np.where(matrix_lower>=0.5)
similar_df = pd.DataFrame(np.column_stack(similar_pairs),columns=['first_art','second_art'])

# extracting keywords for each pair and computing similrity between them 
for i in range(len(similar_df)):
    id_1 = similar_df.loc[i,'first_art']
    sorted_1 = np.argsort(matrix[id_1].data)[:-(5+1):-1]
    key_1 = np.array(feature_names)[matrix[id_1].indices[sorted_1]]
    id_2 = similar_df.loc[i,'second_art']
    sorted_2 = np.argsort(matrix[id_2].data)[:-(5+1):-1]
    key_2 = np.array(feature_names)[matrix[id_2].indices[sorted_2]]
    print(key_1)
    print(key_2)
    # vectorization of keywords
    key_vec_1 = [fasttext_model.get_word_vector(x) for x in key_1]
    key_vec_2 = [fasttext_model.get_word_vector(x) for x in key_2]
    
    # handling occurences of empty articles
    if not key_vec_1: key_vec_1 = [fasttext_model.get_word_vector('') for i in range(5)]
    if not key_vec_2: key_vec_2 = [fasttext_model.get_word_vector('') for i in range(5)]

    print(len(key_vec_1))
    key_vec_1_sum = [0]*300
    key_vec_2_sum = [0]*300
    for j in range(5):
        key_vec_1_sum += key_vec_1[j]
        key_vec_2_sum += key_vec_2[j]
    print(len(key_vec_1_sum))
    
    # computing similarity
    #cos_matrix = [[cosine_similarity(x.reshape(1,-1),y.reshape(1,-1)) for x in key_vec_1] for y in key_vec_2]
    similarity = cosine_similarity(key_vec_1_sum.reshape(1,-1), key_vec_2_sum.reshape(1,-1))
    print(similarity)
    similar_df.loc[i,'similarity'] = similarity[0][0]
    print(similar_df)
        # filtering the recommendations
new_indices = [x for i, x in enumerate(indices) if i not in list(similar_df.loc[similar_df.similarity >= 0.4, 'first_art'])]
filtered_recs_ind = articles.loc[new_indices, 'nzz_id']

new_recs = person_recs.loc[person_recs.nzz_id.isin(list(filtered_recs_ind))]
new_recs

[]
['museum' 'angriff' 'soldaten' 'verletzte' 'täter']
5
300
[[0.]]
   first_art  second_art  similarity
0          7           3         0.0
1          7           4         NaN
2          8           7         NaN
3          9           3         NaN
4          9           4         NaN
5          9           7         NaN
[]
['personen' 'ums leben gekommen' 'leben gekommen' 'familien'
 'unterstützen']
5
300
[[0.]]
   first_art  second_art  similarity
0          7           3         0.0
1          7           4         0.0
2          8           7         NaN
3          9           3         NaN
4          9           4         NaN
5          9           7         NaN
['islamischen' 'nals' 'aufmerksamkeit' 'organisation' 'steht']
[]
5
300
[[0.]]
   first_art  second_art  similarity
0          7           3         0.0
1          7           4         0.0
2          8           7         0.0
3          9           3         NaN
4          9           4         NaN
5          9       

Unnamed: 0,nzz_id,recStrength
0,ld.138364,0.442184
1,ld.139822,0.426568
2,ld.137099,0.401655
3,ld.141993,0.396713
4,ld.141102,0.39341
5,ld.144162,0.391064
6,ld.143746,0.391
7,ld.146206,0.390971
8,ld.140381,0.390264


In [None]:
# 9 - ['us präsident' 'präsident donald trump' 'präsident donald' 'vorerst' 'usa']
# 4 - ['personen' 'ums leben gekommen' 'leben gekommen' 'familien' 'unterstützen']
# 3 - ['museum' 'angriff' 'soldaten' 'verletzte' 'täter']

In [31]:
articles.loc[articles['nzz_id'] =='ld.141993'] 

Unnamed: 0,nzz_id,author,catchline,content,department,lead_text,pub_date,title,content_len
1455,ld.141993,"Alain Zucker, Davos",WEF 2017,wef die welt hat keine führung mehr ...,NZZaS,"Ian Bremmer, Experte für Geopolitik, hält es e...",2017-01-26 12:22:34.000,«Die Welt hat keine Führung mehr»,1505


In [32]:
articles.loc[articles['nzz_id']== 'ld.141102']

Unnamed: 0,nzz_id,author,catchline,content,department,lead_text,pub_date,title,content_len
13708,ld.141102,syc./ela. / Agenturen,Donald Trumps erste Amtstage,donald trumps erste amtstage us präsident ...,International,Während Hunderttausende von Frauen in ganz Ame...,2017-01-22 14:08:00.000,US-Präsident sieht sich «im Krieg mit den Medien»,2619


In [33]:
articles.loc[articles['nzz_id']== 'ld.139143']

Unnamed: 0,nzz_id,author,catchline,content,department,lead_text,pub_date,title,content_len
20763,ld.139143,Unknown,Der neue Präsident und seine Geschäfte,der neue präsident und seine geschäfte tr...,Wirtschaft,Donald Trump und sein Firmenimperium sind eng ...,2017-01-11 12:35:00.000,Trump wird Interessenkonflikte nicht los,1312


In [11]:
content_based_recommender_model.filter_out_similar(person_recs, feature_names = feature_names, model=fasttext_model, article_similarity=0.5, 
                                                    keyword_similarity=0.3)

list as input
list_returned


['ld.138364',
 'ld.139822',
 'ld.137099',
 'ld.141993',
 'ld.141102',
 'ld.144162',
 'ld.143746',
 'ld.146206',
 'ld.140381']

In [45]:
recs = build_profiles(readers, matrix, articles)

In [41]:
len(recs)

1000

In [42]:
recs[154]

array([[0.01371982, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])