In [98]:
import pandas as pd
import math
import scipy
import numpy as np
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from numpy.linalg import norm
from surprise import Dataset, Reader
from surprise.model_selection import cross_validate
from surprise import NormalPredictor, BaselineOnly, KNNBasic, KNNWithMeans, KNNWithZScore, KNNBaseline, SVD, NMF

In [3]:
articles = pd.read_csv('shared_articles.csv')
articles = articles[articles['eventType']=='CONTENT SHARED']
articles.shape

(3047, 13)

In [4]:
articles.head(1)

Unnamed: 0,timestamp,eventType,contentId,authorPersonId,authorSessionId,authorUserAgent,authorRegion,authorCountry,contentType,url,title,text,lang
1,1459193988,CONTENT SHARED,-4110354420726924665,4340306774493623681,8940341205206233829,,,,HTML,http://www.nytimes.com/2016/03/28/business/dea...,"Ethereum, a Virtual Currency, Enables Transact...",All of this work is still very early. The firs...,en


In [5]:
iteractions = pd.read_csv('users_interactions.csv')
iteractions.shape

(72312, 8)

In [6]:
iteractions.head()

Unnamed: 0,timestamp,eventType,contentId,personId,sessionId,userAgent,userRegion,userCountry
0,1465413032,VIEW,-3499919498720038879,-8845298781299428018,1264196770339959068,,,
1,1465412560,VIEW,8890720798209849691,-1032019229384696495,3621737643587579081,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2...,NY,US
2,1465416190,VIEW,310515487419366995,-1130272294246983140,2631864456530402479,,,
3,1465413895,FOLLOW,310515487419366995,344280948527967603,-3167637573980064150,,,
4,1465412290,VIEW,-7820640624231356730,-445337111692715325,5611481178424124714,,,


In [7]:
iteractions.personId.unique().shape

(1895,)

In [8]:
iteractions.contentId.unique().shape

(2987,)

In [9]:
iteractions.eventType.unique()

array(['VIEW', 'FOLLOW', 'BOOKMARK', 'LIKE', 'COMMENT CREATED'],
      dtype=object)

In [10]:
event_type_strength = {
   'VIEW': 1.0,
   'LIKE': 2.0, 
   'BOOKMARK': 2.5, 
   'FOLLOW': 3.0,
   'COMMENT CREATED': 4.0,  
}

iteractions.eventType = iteractions.eventType.apply(lambda x:event_type_strength[x])
iteractions.head(1)

Unnamed: 0,timestamp,eventType,contentId,personId,sessionId,userAgent,userRegion,userCountry
0,1465413032,1.0,-3499919498720038879,-8845298781299428018,1264196770339959068,,,


In [11]:
# iteraction_count_by_user = iteractions.groupby('personId').size()
# users_with_enough_count = iteraction_count_by_user[iteraction_count_by_user>=5]
# users_with_enough_count = users_with_enough_count.reset_index()
# users_with_enough_count.personId.unique().shape

In [12]:
iteractions = iteractions.groupby(['personId', 'contentId'])['eventType'].sum().apply(lambda x:math.log(1+x, 2)).reset_index()

In [13]:
activity_count = iteractions.groupby('personId').size().reset_index()
active_user = activity_count[activity_count.iloc[:, 1]>=5]
iteractions = pd.merge(iteractions, active_user, how = 'right', on='personId').drop(0, axis = 1)

In [14]:
iteractions.head()

Unnamed: 0,personId,contentId,eventType
0,-9223121837663643404,-8949113594875411859,1.0
1,-9223121837663643404,-8377626164558006982,1.0
2,-9223121837663643404,-8208801367848627943,1.0
3,-9223121837663643404,-8187220755213888616,1.0
4,-9223121837663643404,-7423191370472335463,3.169925


In [15]:
iteractions.contentId.unique().shape

(2984,)

In [16]:
iteractions.personId.unique().shape

(1140,)

In [105]:
reader = Reader(rating_scale=(1, 4))
data = Dataset.load_from_df(iteractions[['personId', 'contentId', 'eventType']], reader)

In [106]:
cross_validate(NormalPredictor(), data, cv=3, verbose=True)
cross_validate(BaselineOnly(), data, cv=3, verbose=True)
cross_validate(KNNBasic(), data, cv=3, verbose=True)
cross_validate(KNNWithMeans(), data, cv=3, verbose=True)
cross_validate(KNNWithZScore(), data, cv=3, verbose=True)
cross_validate(KNNBaseline(), data, cv=3, verbose=True)
cross_validate(SVD(), data, cv=3, verbose=True)
cross_validate(NMF(), data, cv=3, verbose=True)

Evaluating RMSE, MAE of algorithm NormalPredictor on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.8826  0.8753  0.8767  0.8782  0.0032  
MAE (testset)     0.6526  0.6515  0.6479  0.6507  0.0020  
Fit time          0.07    0.04    0.04    0.05    0.01    
Test time         0.10    0.17    0.14    0.14    0.03    
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Evaluating RMSE, MAE of algorithm BaselineOnly on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.6224  0.6207  0.6280  0.6237  0.0031  
MAE (testset)     0.4668  0.4664  0.4704  0.4679  0.0018  
Fit time          0.10    0.08    0.08    0.09    0.01    
Test time         0.12    0.15    0.07    0.11    0.03    
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Don

{'fit_time': (2.2954018115997314, 2.104573965072632, 2.0997989177703857),
 'test_mae': array([0.44181879, 0.4443098 , 0.43522508]),
 'test_rmse': array([0.6678841 , 0.67542534, 0.65663805]),
 'test_time': (0.08130717277526855, 0.07795286178588867, 0.07904696464538574)}

In [107]:
trainset = data.build_full_trainset()
algo = NMF()
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.NMF at 0x116a3bc88>

In [108]:
from collections import defaultdict
ratings = defaultdict(list)
for contentId in iteractions.contentId.unique():
    ratings['contentId'].append(contentId)
    ratings['rating'].append(algo.predict(-9223121837663643404, contentId).est)
ratings = pd.DataFrame(ratings)
ratings = ratings.sort_values(by = 'rating', ascending = False)
print(ratings.shape)

user_read = iteractions[iteractions.personId == -9223121837663643404]
print(user_read.shape)
ratings = ratings[~ratings.contentId.isin(user_read.contentId)]
print(ratings.shape)
read_list = pd.merge(user_read, articles, how='left', on = 'contentId')
recom_list = pd.merge(ratings, articles, how='left', on = 'contentId')

(2984, 2)
(43, 3)
(2941, 2)


In [109]:
read_list

Unnamed: 0,personId,contentId,eventType_x,timestamp,eventType_y,authorPersonId,authorSessionId,authorUserAgent,authorRegion,authorCountry,contentType,url,title,text,lang
0,-9223121837663643404,-8949113594875411859,1.0,1462448930,CONTENT SHARED,1895326251577378793,6242109617183539580,,,,HTML,http://m.folha.uol.com.br/mercado/2016/05/1766...,"No Brasil, '25% dos celulares ainda são 'Burro...","Divulgação O ex-jogador de futebol Tostão, don...",pt
1,-9223121837663643404,-8377626164558006982,1.0,1473870933,CONTENT SHARED,-5527145562136413747,5228941642832000454,,,,HTML,https://hbr.org/2016/09/bad-writing-is-destroy...,Bad Writing Is Destroying Your Company's Produ...,A hidden source of friction is slowing your co...,en
2,-9223121837663643404,-8208801367848627943,1.0,1469678235,CONTENT SHARED,-3390049372067052505,2045534933671019150,,,,HTML,http://www.geekwire.com/2016/ray-kurzweil-worl...,Ray Kurzweil: The world isn't getting worse - ...,"Ray Kurzweil, the author, inventor, computer s...",en
3,-9223121837663643404,-8187220755213888616,1.0,1467823363,CONTENT SHARED,1895326251577378793,6337372998984359835,,,,HTML,http://www.mckinsey.com/industries/high-tech/o...,Organizing for digital acceleration: Making a ...,By adopting a digital product management model...,en
4,-9223121837663643404,-7423191370472335463,3.169925,1478623412,CONTENT SHARED,-4465926797008424436,-4234938118093547320,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,SP,BR,HTML,https://medium.com/android-dev-br/espresso-int...,"Espresso Intents: não é magia, é tecnologia! -...",Se você leu meu último artigo sobre Testes uni...,pt
5,-9223121837663643404,-7331393944609614247,1.0,1463301395,CONTENT SHARED,-1032019229384696495,3697390468148831479,,,,HTML,http://www.businessinsider.com/amit-singh-jump...,Here's proof that Google is getting serious ab...,"Business Insider/Julie Bort Amit Singh, who ho...",en
6,-9223121837663643404,-6872546942144599345,1.0,1487597538,CONTENT SHARED,-1393866732742189886,-6350745898785551312,Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...,MG,BR,HTML,https://medium.com/@husayn.hakeem/my-experienc...,My experience with Google's Associate Android ...,In this article I talk about my personal exper...,en
7,-9223121837663643404,-6728844082024523434,1.0,1485194633,CONTENT SHARED,801895594717772308,7194441186926042361,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_2...,MG,BR,HTML,http://merowing.info/2017/01/seniority/,Seniority,People use different words to classify Enginee...,en
8,-9223121837663643404,-6590819806697898649,1.0,1485179372,CONTENT SHARED,-4465926797008424436,-7541317811547099244,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,SP,BR,HTML,https://medium.com/android-dev-br/listas-com-r...,Listas com RecyclerView - Android Dev BR,Lista simples O primeiro passo é colocar o Rec...,pt
9,-9223121837663643404,-6558712014192834002,1.584963,1464966720,CONTENT SHARED,-1443636648652872475,-6631146006024713322,,,,HTML,http://arstechnica.com/tech-policy/2016/06/goo...,Google's fair use victory is good for open source,Pamela Samuelson is a longtime professor of IP...,en


In [110]:
recom_list

Unnamed: 0,contentId,rating,timestamp,eventType,authorPersonId,authorSessionId,authorUserAgent,authorRegion,authorCountry,contentType,url,title,text,lang
0,-4262964715347041233,2.950712,1.467681e+09,CONTENT SHARED,3.609194e+18,1.709336e+18,,,,HTML,https://marketing.knect365.com/intrapreneur,Corporate Intrapreneur Summit,"The 2016 Corporate Intrapreneur Summit, produc...",en
1,-2402157201203242566,2.826674,1.466519e+09,CONTENT SHARED,3.609194e+18,-7.072205e+18,,,,HTML,https://techcrunch.com/2016/06/21/number26-rai...,Number26 raises another $40 million for its vi...,Berlin-based startup Number26 just raised a $4...,en
2,3739926497176994524,2.737724,1.480423e+09,CONTENT SHARED,8.676130e+18,-2.273279e+18,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,SP,BR,HTML,http://meiobit.com/355936/a-chegada-arrival-re...,Resenha - A Chegada,Supondo que haja vida inteligente no Universo ...,pt
3,4909630283317268181,2.696768,1.486125e+09,CONTENT SHARED,3.609194e+18,-5.689940e+18,Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebK...,SP,BR,HTML,https://startupi.com.br/2017/02/como-o-banco-d...,Como o Banco do Brasil está inovando e incenti...,"Acontece esse semana em São Paulo, a décima ed...",pt
4,3801064060797809024,2.635400,1.462365e+09,CONTENT SHARED,8.676130e+18,1.142017e+18,,,,HTML,http://www.imdb.com/title/tt0033729/,How Green Was My Valley (1941),Storyline Life is hard in a Welsh mining town ...,en
5,-340541651592032197,2.574802,1.484310e+09,CONTENT SHARED,3.609194e+18,-4.642144e+18,Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebK...,SP,BR,HTML,https://software.intel.com/en-us/blogs/2015/11...,Relating a Problem Definition to IoT Architect...,"Lets face it, the largest function within IoT ...",en
6,-4228415104574264137,2.465933,1.465842e+09,CONTENT SHARED,3.302556e+18,-4.935618e+18,,,,HTML,https://medium.com/google-developers/up-your-a...,Up your app's sharing game with DirectShare - ...,Up your app's sharing game with Direct Share A...,en
7,9136323715291453594,2.458278,1.465919e+09,CONTENT SHARED,-7.711052e+18,4.374244e+18,,,,HTML,http://www.fluentu.com/japanese/blog/how-to-im...,How to Improve 8 Major Problem Areas for Japan...,Have you hit a big ol' wall while learning Jap...,en
8,-3351652027149912881,2.413429,1.478348e+09,CONTENT SHARED,3.829785e+18,4.598987e+18,Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...,SP,BR,HTML,https://blog.rstudio.org/2016/03/29/feather/,Feather: A Fast On-Disk Format for Data Frames...,"Wes McKinney, Software Engineer, Cloudera Hadl...",en
9,2415534163825594672,2.366363,1.461339e+09,CONTENT SHARED,-2.979881e+18,4.821078e+18,,,,HTML,http://hintjens.com/blog:115,Hintjens.com,First: blog:1 Elegant Little Pieces Edited: by...,en
