Imports:

In [1]:
import pandas as pd
import scipy.sparse as sparse
import numpy as np
import random
import implicit
from sklearn.preprocessing import MinMaxScaler
from sklearn import metrics

Read data:

In [2]:
articles_df = pd.read_csv('shared_articles.csv')
interactions_df = pd.read_csv('users_interactions.csv')

In [5]:
articles_df.shape

(3122, 13)

In [6]:
articles_df.columns

Index(['timestamp', 'eventType', 'contentId', 'authorPersonId',
       'authorSessionId', 'authorUserAgent', 'authorRegion', 'authorCountry',
       'contentType', 'url', 'title', 'text', 'lang'],
      dtype='object')

In [7]:
interactions_df.shape

(72312, 8)

In [8]:
interactions_df.columns

Index(['timestamp', 'eventType', 'contentId', 'personId', 'sessionId',
       'userAgent', 'userRegion', 'userCountry'],
      dtype='object')

In [9]:
articles_df.drop(['authorUserAgent', 'authorRegion', 'authorCountry'], axis=1, inplace=True)
interactions_df.drop(['userAgent', 'userRegion', 'userCountry'], axis=1, inplace=True)

In [10]:
articles_df.head()

Unnamed: 0,timestamp,eventType,contentId,authorPersonId,authorSessionId,contentType,url,title,text,lang
0,1459192779,CONTENT REMOVED,-6451309518266745024,4340306774493623681,8940341205206233829,HTML,http://www.nytimes.com/2016/03/28/business/dea...,"Ethereum, a Virtual Currency, Enables Transact...",All of this work is still very early. The firs...,en
1,1459193988,CONTENT SHARED,-4110354420726924665,4340306774493623681,8940341205206233829,HTML,http://www.nytimes.com/2016/03/28/business/dea...,"Ethereum, a Virtual Currency, Enables Transact...",All of this work is still very early. The firs...,en
2,1459194146,CONTENT SHARED,-7292285110016212249,4340306774493623681,8940341205206233829,HTML,http://cointelegraph.com/news/bitcoin-future-w...,Bitcoin Future: When GBPcoin of Branson Wins O...,The alarm clock wakes me at 8:00 with stream o...,en
3,1459194474,CONTENT SHARED,-6151852268067518688,3891637997717104548,-1457532940883382585,HTML,https://cloudplatform.googleblog.com/2016/03/G...,Google Data Center 360° Tour,We're excited to share the Google Data Center ...,en
4,1459194497,CONTENT SHARED,2448026894306402386,4340306774493623681,8940341205206233829,HTML,https://bitcoinmagazine.com/articles/ibm-wants...,"IBM Wants to ""Evolve the Internet"" With Blockc...",The Aite Group projects the blockchain market ...,en


In [11]:
interactions_df.head()

Unnamed: 0,timestamp,eventType,contentId,personId,sessionId
0,1465413032,VIEW,-3499919498720038879,-8845298781299428018,1264196770339959068
1,1465412560,VIEW,8890720798209849691,-1032019229384696495,3621737643587579081
2,1465416190,VIEW,310515487419366995,-1130272294246983140,2631864456530402479
3,1465413895,FOLLOW,310515487419366995,344280948527967603,-3167637573980064150
4,1465412290,VIEW,-7820640624231356730,-445337111692715325,5611481178424124714


In [12]:
articles_df['eventType'].value_counts()

CONTENT SHARED     3047
CONTENT REMOVED      75
Name: eventType, dtype: int64

In [13]:
articles_df = articles_df[articles_df['eventType'] == 'CONTENT SHARED']
articles_df.drop('eventType', axis=1, inplace=True)

In [14]:
articles_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3047 entries, 1 to 3121
Data columns (total 9 columns):
timestamp          3047 non-null int64
contentId          3047 non-null int64
authorPersonId     3047 non-null int64
authorSessionId    3047 non-null int64
contentType        3047 non-null object
url                3047 non-null object
title              3047 non-null object
text               3047 non-null object
lang               3047 non-null object
dtypes: int64(4), object(5)
memory usage: 238.0+ KB


In [16]:
interactions_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72312 entries, 0 to 72311
Data columns (total 5 columns):
timestamp    72312 non-null int64
eventType    72312 non-null object
contentId    72312 non-null int64
personId     72312 non-null int64
sessionId    72312 non-null int64
dtypes: int64(4), object(1)
memory usage: 2.8+ MB


In [17]:
df = pd.merge(interactions_df[['contentId','personId', 'eventType']], articles_df[['contentId', 'title']], 
              how = 'inner', on = 'contentId')

In [22]:
df.head(10)

Unnamed: 0,contentId,personId,eventType,title
0,-3499919498720038879,-8845298781299428018,VIEW,Hiri wants to fix the workplace email problem
1,-3499919498720038879,-8845298781299428018,VIEW,Hiri wants to fix the workplace email problem
2,-3499919498720038879,-108842214936804958,VIEW,Hiri wants to fix the workplace email problem
3,-3499919498720038879,-1443636648652872475,VIEW,Hiri wants to fix the workplace email problem
4,-3499919498720038879,-1443636648652872475,VIEW,Hiri wants to fix the workplace email problem
5,-3499919498720038879,-1443636648652872475,VIEW,Hiri wants to fix the workplace email problem
6,-3499919498720038879,-8020832670974472349,VIEW,Hiri wants to fix the workplace email problem
7,-3499919498720038879,-8020832670974472349,VIEW,Hiri wants to fix the workplace email problem
8,-3499919498720038879,-9009798162809551896,LIKE,Hiri wants to fix the workplace email problem
9,-3499919498720038879,-9009798162809551896,VIEW,Hiri wants to fix the workplace email problem


In [23]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 72269 entries, 0 to 72268
Data columns (total 4 columns):
contentId    72269 non-null int64
personId     72269 non-null int64
eventType    72269 non-null object
title        72269 non-null object
dtypes: int64(2), object(2)
memory usage: 2.8+ MB


In [24]:
df['eventType'].value_counts()

VIEW               61043
LIKE                5745
BOOKMARK            2463
COMMENT CREATED     1611
FOLLOW              1407
Name: eventType, dtype: int64

In [25]:
event_type_strength = {
   'VIEW': 1.0,
   'LIKE': 2.0, 
   'BOOKMARK': 3.0, 
   'FOLLOW': 4.0,
   'COMMENT CREATED': 5.0,  
}

df['eventStrength'] = df['eventType'].apply(lambda x: event_type_strength[x])

In [26]:
df.head(10)

Unnamed: 0,contentId,personId,eventType,title,eventStrength
0,-3499919498720038879,-8845298781299428018,VIEW,Hiri wants to fix the workplace email problem,1.0
1,-3499919498720038879,-8845298781299428018,VIEW,Hiri wants to fix the workplace email problem,1.0
2,-3499919498720038879,-108842214936804958,VIEW,Hiri wants to fix the workplace email problem,1.0
3,-3499919498720038879,-1443636648652872475,VIEW,Hiri wants to fix the workplace email problem,1.0
4,-3499919498720038879,-1443636648652872475,VIEW,Hiri wants to fix the workplace email problem,1.0
5,-3499919498720038879,-1443636648652872475,VIEW,Hiri wants to fix the workplace email problem,1.0
6,-3499919498720038879,-8020832670974472349,VIEW,Hiri wants to fix the workplace email problem,1.0
7,-3499919498720038879,-8020832670974472349,VIEW,Hiri wants to fix the workplace email problem,1.0
8,-3499919498720038879,-9009798162809551896,LIKE,Hiri wants to fix the workplace email problem,2.0
9,-3499919498720038879,-9009798162809551896,VIEW,Hiri wants to fix the workplace email problem,1.0


In [27]:
df = df.drop_duplicates()
grouped_df = df.groupby(['personId', 'contentId', 'title']).sum().reset_index()

In [28]:
grouped_df.sample(10)

Unnamed: 0,personId,contentId,title,eventStrength
29948,3636910968448833585,5313335392004163852,Salesforce Architect Journey,1.0
22275,801895594717772308,6245165134513326654,ASSISTA: show une 100 drones e uma orquestra s...,1.0
24701,1895326251577378793,18738895644634365,Bot do Facebook Messenger simula entrevista pa...,1.0
40540,9148269800512008413,-133139342397538859,"Novo workaholic trabalha, pratica esportes e t...",1.0
30783,3891637997717104548,-4673235524420943843,Blog | Niantic,1.0
33874,5497189205340943824,4815632823882298534,"Por dentro do Nubank, conheça os segredos da f...",1.0
4825,-6946355789336786528,679005777543560737,Mastercard libera APIs para quem quer desenvol...,5.0
21849,692689608292948411,2765063319512128208,360 million reasons to destroy all passwords -...,1.0
34728,5907617845968952831,-133139342397538859,"Novo workaholic trabalha, pratica esportes e t...",1.0
25612,2318971825420092215,-8208801367848627943,Ray Kurzweil: The world isn't getting worse - ...,3.0


In [30]:
grouped_df.dtypes

personId           int64
contentId          int64
title             object
eventStrength    float64
dtype: object

In [31]:
grouped_df['title'] = grouped_df['title'].astype("category")
grouped_df['personId'] = grouped_df['personId'].astype("category")
grouped_df['contentId'] = grouped_df['contentId'].astype("category")
grouped_df['person_id'] = grouped_df['personId'].cat.codes
grouped_df['content_id'] = grouped_df['contentId'].cat.codes

In [34]:
grouped_df.head(10)

Unnamed: 0,personId,contentId,title,eventStrength,person_id,content_id
0,-9223121837663643404,-8949113594875411859,"No Brasil, '25% dos celulares ainda são 'Burro...",1.0,0,65
1,-9223121837663643404,-8377626164558006982,Bad Writing Is Destroying Your Company's Produ...,1.0,0,159
2,-9223121837663643404,-8208801367848627943,Ray Kurzweil: The world isn't getting worse - ...,1.0,0,187
3,-9223121837663643404,-8187220755213888616,Organizing for digital acceleration: Making a ...,1.0,0,195
4,-9223121837663643404,-7423191370472335463,"Espresso Intents: não é magia, é tecnologia! -...",1.0,0,313
5,-9223121837663643404,-7331393944609614247,Here's proof that Google is getting serious ab...,1.0,0,327
6,-9223121837663643404,-6872546942144599345,My experience with Google's Associate Android ...,1.0,0,385
7,-9223121837663643404,-6728844082024523434,Seniority,1.0,0,416
8,-9223121837663643404,-6590819806697898649,Listas com RecyclerView - Android Dev BR,1.0,0,442
9,-9223121837663643404,-6558712014192834002,Google's fair use victory is good for open source,1.0,0,450


In [35]:
sparse_content_person = sparse.csr_matrix((grouped_df['eventStrength'].astype(float), (grouped_df['content_id'], 
                                                                                       grouped_df['person_id'])))
sparse_person_content = sparse.csr_matrix((grouped_df['eventStrength'].astype(float), (grouped_df['person_id'], 
                                                                                       grouped_df['content_id'])))

In [36]:
model = implicit.als.AlternatingLeastSquares(factors=20, regularization=0.1, iterations=50)



In [37]:
alpha = 15
data = (sparse_content_person * alpha).astype('double')

# Fit the model
model.fit(data)

100%|██████████| 50.0/50 [00:02<00:00, 20.74it/s]


In [44]:
content_id = 450
n_similar = 10

person_vecs = model.user_factors
content_vecs = model.item_factors

content_norms = np.sqrt((content_vecs * content_vecs).sum(axis=1))

scores = content_vecs.dot(content_vecs[content_id]) / content_norms
top_idx = np.argpartition(scores, -n_similar)[-n_similar:]
similar = sorted(zip(top_idx, scores[top_idx] / content_norms[content_id]), key=lambda x: -x[1])

In [45]:
for content in similar:
    idx, score = content
    print(grouped_df.title.loc[grouped_df.content_id == idx].iloc[0])

Webinar do AEM para ajuda na preparação para fazer a prova de certificação.
Conselho da SABMiller aceita proposta de compra da AB InBev
Fintechs can help incumbents, not just disrupt them
Visa e Swatch lançam relógio para pagamentos contactless no Brasil - Startupi
Startup uses deep learning to let you shop for items by snapping photos
Peter Doig sued for disavowing 40-year-old painting
How Starbucks became so successful at coaxing consumers to pay with smartphones
A lição de um grande mestre de xadrez para resolver problemas
How Google sets goals: OKRs - GV Library
Deep-learning neural network creates its own interpretive dance | ExtremeTech


In [46]:
def recommend(person_id, sparse_person_content, person_vecs, content_vecs, num_contents=10):
    # Get the interactions scores from the sparse person content matrix
    person_interactions = sparse_person_content[person_id,:].toarray()
    # Add 1 to everything, so that articles with no interaction yet become equal to 1
    person_interactions = person_interactions.reshape(-1) + 1
    # Make articles already interacted zero
    person_interactions[person_interactions > 1] = 0
    # Get dot product of person vector and all content vectors
    rec_vector = person_vecs[person_id,:].dot(content_vecs.T).toarray()
    
    # Scale this recommendation vector between 0 and 1
    min_max = MinMaxScaler()
    rec_vector_scaled = min_max.fit_transform(rec_vector.reshape(-1,1))[:,0]
    # Content already interacted have their recommendation multiplied by zero
    recommend_vector = person_interactions * rec_vector_scaled
    # Sort the indices of the content into order of best recommendations
    content_idx = np.argsort(recommend_vector)[::-1][:num_contents]
    
    # Start empty list to store titles and scores
    titles = []
    scores = []

    for idx in content_idx:
        # Append titles and scores to the list
        titles.append(grouped_df.title.loc[grouped_df.content_id == idx].iloc[0])
        scores.append(recommend_vector[idx])

    recommendations = pd.DataFrame({'title': titles, 'score': scores})

    return recommendations

In [47]:
# Get the trained person and content vectors. We convert them to csr matrices
person_vecs = sparse.csr_matrix(model.user_factors)
content_vecs = sparse.csr_matrix(model.item_factors)

# Create recommendations for person with id 50
person_id = 50

recommendations = recommend(person_id, sparse_person_content, person_vecs, content_vecs)

print(recommendations)

                                               title  score
0  Deep-learning neural network creates its own i...    NaN
1     Deck describing how MSFT plans to use Linkedin    NaN
2  Fintechs can help incumbents, not just disrupt...    NaN
3  Visa e Swatch lançam relógio para pagamentos c...    NaN
4  Conselho da SABMiller aceita proposta de compr...    NaN
5  Peter Doig sued for disavowing 40-year-old pai...    NaN
6  Webinar do AEM para ajuda na preparação para f...    NaN
7  A lição de um grande mestre de xadrez para res...    NaN
8           How Google sets goals: OKRs - GV Library    NaN
9  How Starbucks became so successful at coaxing ...    NaN


  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)


In [48]:
grouped_df.loc[grouped_df['person_id'] == 50].sort_values(by=['eventStrength'], ascending=False)[['title', 'person_id', 'eventStrength']].head(10)

Unnamed: 0,title,person_id,eventStrength
1727,Acquia Engage 2016: Day One,50,3.0
1791,Um bilhão de arquivos mostram quem vence a dis...,50,3.0
1781,Acquia Engage Awards Finalists Announced,50,3.0
1778,Sharing innovation with your competitors - Dri...,50,3.0
1769,Don't document your code. Code your documentat...,50,3.0
1747,Who sponsors Drupal development? | Dries Buytaert,50,3.0
1768,Johnson & Johnson comprará grupo suíço por US$...,50,1.0
1767,Slack and Google announce partnership focused ...,50,1.0
1770,Rating the English Proficiency of Countries an...,50,1.0
1766,Infográfico: Algoritmos para Aprendizado de Má...,50,1.0
