# Collaborative filtering modeling with the Implicit library

reference: https://towardsdatascience.com/building-a-collaborative-filtering-recommender-system-with-clickstream-data-dffc86c8c65

In [2]:
!pip install implicit==0.4.4

Collecting implicit==0.4.4
  Downloading implicit-0.4.4.tar.gz (1.1 MB)
[?25l[K     |▎                               | 10 kB 23.4 MB/s eta 0:00:01[K     |▋                               | 20 kB 28.8 MB/s eta 0:00:01[K     |▉                               | 30 kB 26.0 MB/s eta 0:00:01[K     |█▏                              | 40 kB 19.6 MB/s eta 0:00:01[K     |█▌                              | 51 kB 15.5 MB/s eta 0:00:01[K     |█▊                              | 61 kB 11.3 MB/s eta 0:00:01[K     |██                              | 71 kB 12.4 MB/s eta 0:00:01[K     |██▍                             | 81 kB 13.4 MB/s eta 0:00:01[K     |██▋                             | 92 kB 14.3 MB/s eta 0:00:01[K     |███                             | 102 kB 13.4 MB/s eta 0:00:01[K     |███▎                            | 112 kB 13.4 MB/s eta 0:00:01[K     |███▌                            | 122 kB 13.4 MB/s eta 0:00:01[K     |███▉                            | 133 kB 13.4 MB/s eta 0:

In [3]:
import pandas as pd
import numpy as np
import implicit
import scipy.sparse as sparse

In [4]:
# Data: Deskdrop's log data
# source: https://www.kaggle.com/gspmoreira/articles-sharing-reading-from-cit-deskdrop

# Load the shared-articles table; the path is relative, so the CSV must sit in
# the current working directory.
a_df = pd.read_csv('shared_articles.csv')
# Preview the first rows
a_df.head()

Unnamed: 0,timestamp,eventType,contentId,authorPersonId,authorSessionId,authorUserAgent,authorRegion,authorCountry,contentType,url,title,text,lang
0,1459192779,CONTENT REMOVED,-6451309518266745024,4340306774493623681,8940341205206233829,,,,HTML,http://www.nytimes.com/2016/03/28/business/dea...,"Ethereum, a Virtual Currency, Enables Transact...",All of this work is still very early. The firs...,en
1,1459193988,CONTENT SHARED,-4110354420726924665,4340306774493623681,8940341205206233829,,,,HTML,http://www.nytimes.com/2016/03/28/business/dea...,"Ethereum, a Virtual Currency, Enables Transact...",All of this work is still very early. The firs...,en
2,1459194146,CONTENT SHARED,-7292285110016212249,4340306774493623681,8940341205206233829,,,,HTML,http://cointelegraph.com/news/bitcoin-future-w...,Bitcoin Future: When GBPcoin of Branson Wins O...,The alarm clock wakes me at 8:00 with stream o...,en
3,1459194474,CONTENT SHARED,-6151852268067518688,3891637997717104548,-1457532940883382585,,,,HTML,https://cloudplatform.googleblog.com/2016/03/G...,Google Data Center 360° Tour,We're excited to share the Google Data Center ...,en
4,1459194497,CONTENT SHARED,2448026894306402386,4340306774493623681,8940341205206233829,,,,HTML,https://bitcoinmagazine.com/articles/ibm-wants...,"IBM Wants to ""Evolve the Internet"" With Blockc...",The Aite Group projects the blockchain market ...,en


In [5]:
# Load the user-interaction log (relative path, so the CSV must sit in the
# current working directory) and preview the first rows.
i_df = pd.read_csv('users_interactions.csv')
i_df.head()

Unnamed: 0,timestamp,eventType,contentId,personId,sessionId,userAgent,userRegion,userCountry
0,1465413032,VIEW,-3499919498720038879,-8845298781299428018,1264196770339959068,,,
1,1465412560,VIEW,8890720798209849691,-1032019229384696495,3621737643587579081,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2...,NY,US
2,1465416190,VIEW,310515487419366995,-1130272294246983140,2631864456530402479,,,
3,1465413895,FOLLOW,310515487419366995,344280948527967603,-3167637573980064150,,,
4,1465412290,VIEW,-7820640624231356730,-445337111692715325,5611481178424124714,,,


In [6]:
# Show the event-type counts first, then keep only the 'CONTENT SHARED' rows
# (rows with any other event type, e.g. 'CONTENT REMOVED', are filtered out).

print(a_df['eventType'].value_counts())
a_df = a_df.loc[a_df['eventType'].eq('CONTENT SHARED')]

CONTENT SHARED     3047
CONTENT REMOVED      75
Name: eventType, dtype: int64


In [7]:
# Inner-join the interaction log with the article titles on contentId, so only
# interactions whose article is present in a_df are kept.
interaction_cols = ['contentId', 'personId', 'eventType']
article_cols = ['contentId', 'title']
df = i_df[interaction_cols].merge(a_df[article_cols], on='contentId', how='inner')
df

Unnamed: 0,contentId,personId,eventType,title
0,-3499919498720038879,-8845298781299428018,VIEW,Hiri wants to fix the workplace email problem
1,-3499919498720038879,-8845298781299428018,VIEW,Hiri wants to fix the workplace email problem
2,-3499919498720038879,-108842214936804958,VIEW,Hiri wants to fix the workplace email problem
3,-3499919498720038879,-1443636648652872475,VIEW,Hiri wants to fix the workplace email problem
4,-3499919498720038879,-1443636648652872475,VIEW,Hiri wants to fix the workplace email problem
...,...,...,...,...
72264,-7108012586837980940,-4028919343899978105,VIEW,The Ultimate Digital Clean-Up Checklist: Are Y...
72265,7526977287801930517,-3643155458357242906,VIEW,Renewing Medium's focus
72266,-282629989972409543,5660542693104786364,VIEW,Santander Brasil gera 21% do resultado da matr...
72267,-6468782714472551646,5660542693104786364,VIEW,"Santander Brasil tem lucro gerencial de R$ 1,9..."


In [8]:
# Number of interactions per event type in the merged interaction/article frame
df['eventType'].value_counts()

VIEW               61043
LIKE                5745
BOOKMARK            2463
COMMENT CREATED     1611
FOLLOW              1407
Name: eventType, dtype: int64

In [9]:
# Give each event type a weight according to its importance
weights = {'VIEW': 1, 'LIKE': 2, 'BOOKMARK': 3, 'FOLLOW': 4, 'COMMENT CREATED': 5}

# Series.map performs the dictionary lookup directly, without calling a Python
# lambda once per row.
event_weight = df['eventType'].map(weights)

# map() yields NaN for event types missing from `weights`; fail loudly with a
# KeyError (as the per-row dictionary lookup did) instead of letting NaN
# weights flow into the aggregation below.
unknown_events = df.loc[event_weight.isna(), 'eventType'].unique()
if len(unknown_events) > 0:
    raise KeyError('no weight defined for event types: {}'.format(list(unknown_events)))

df['weight'] = event_weight
df

Unnamed: 0,contentId,personId,eventType,title,weight
0,-3499919498720038879,-8845298781299428018,VIEW,Hiri wants to fix the workplace email problem,1
1,-3499919498720038879,-8845298781299428018,VIEW,Hiri wants to fix the workplace email problem,1
2,-3499919498720038879,-108842214936804958,VIEW,Hiri wants to fix the workplace email problem,1
3,-3499919498720038879,-1443636648652872475,VIEW,Hiri wants to fix the workplace email problem,1
4,-3499919498720038879,-1443636648652872475,VIEW,Hiri wants to fix the workplace email problem,1
...,...,...,...,...,...
72264,-7108012586837980940,-4028919343899978105,VIEW,The Ultimate Digital Clean-Up Checklist: Are Y...,1
72265,7526977287801930517,-3643155458357242906,VIEW,Renewing Medium's focus,1
72266,-282629989972409543,5660542693104786364,VIEW,Santander Brasil gera 21% do resultado da matr...,1
72267,-6468782714472551646,5660542693104786364,VIEW,"Santander Brasil tem lucro gerencial de R$ 1,9...",1


In [10]:
# Remove rows that are exact duplicates of an earlier row (first occurrence is kept),
# printing the shape before and after plus the rows that get removed.

print(df.shape)
duplicate_rows = df.duplicated(keep='first')
print(df[duplicate_rows])
df = df[~duplicate_rows]
print(df.shape)

(72269, 5)
                 contentId  ...  weight
1     -3499919498720038879  ...       1
4     -3499919498720038879  ...       1
5     -3499919498720038879  ...       1
7     -3499919498720038879  ...       1
10    -3499919498720038879  ...       1
...                    ...  ...     ...
72224  4675505028897335428  ...       1
72225  4675505028897335428  ...       1
72226  4675505028897335428  ...       1
72231  4675505028897335428  ...       1
72258  4644184613269860655  ...       1

[21359 rows x 5 columns]
(50910, 5)


In [11]:
# Total interaction weight per (person, article).
# Only the numeric 'weight' column is aggregated: a bare .sum() would also try
# to sum the string column 'eventType', which older pandas silently dropped but
# pandas 2.x concatenates into an extra, meaningless column.
df1 = df.groupby(['personId', 'contentId', 'title'], as_index=False)[['weight']].sum()
df1

Unnamed: 0,personId,contentId,title,weight
0,-9223121837663643404,-8949113594875411859,"No Brasil, '25% dos celulares ainda são 'Burro...",1
1,-9223121837663643404,-8377626164558006982,Bad Writing Is Destroying Your Company's Produ...,1
2,-9223121837663643404,-8208801367848627943,Ray Kurzweil: The world isn't getting worse - ...,1
3,-9223121837663643404,-8187220755213888616,Organizing for digital acceleration: Making a ...,1
4,-9223121837663643404,-7423191370472335463,"Espresso Intents: não é magia, é tecnologia! -...",1
...,...,...,...,...
40677,9210530975708218054,8477804012624580461,Gartner: em 2020 não haverá mais aplicativos,10
40678,9210530975708218054,8526042588044002101,Cloud Native Part 1: Definition,1
40679,9210530975708218054,8856169137131817223,Para inovar é preciso mais do que boas ideias ...,1
40680,9210530975708218054,8869347744613364434,Java Garbage Collection Essencial,1


# Modeling - Alternating Least Squares

In [12]:
# Encode the raw person/content identifiers as consecutive integer codes so
# they can be used as row/column positions of a sparse matrix.

df1['person_id'] = pd.Categorical(df1['personId']).codes
df1['content_id'] = pd.Categorical(df1['contentId']).codes

In [13]:
# Inspect the encoded person codes
df1['person_id']

0           0
1           0
2           0
3           0
4           0
         ... 
40677    1894
40678    1894
40679    1894
40680    1894
40681    1894
Name: person_id, Length: 40682, dtype: int16

In [15]:
# Build the two sparse interaction matrices with
# scipy's csr_matrix((data, (row_ind, col_ind))) constructor, where
# a_ij = sum(data) over all entries given for position (i, j).

interaction_weights = df1['weight']
content_codes = df1['content_id']
person_codes = df1['person_id']

# rows = content, columns = persons
content_user = sparse.csr_matrix((interaction_weights, (content_codes, person_codes)))
# rows = persons, columns = content
user_content = sparse.csr_matrix((interaction_weights, (person_codes, content_codes)))

In [16]:
# Inspect the matrix summary (rows are content_id codes, columns are person_id codes)
content_user

<2979x1895 sparse matrix of type '<class 'numpy.longlong'>'
	with 40682 stored elements in Compressed Sparse Row format>

In [19]:
# Sparsity = share of (content, user) cells that hold no stored interaction.
# The dimensions and the stored-element count are read from the matrix itself,
# so the figure stays correct if the input data changes
# (about 99.3% of the cells are empty for this dataset).
n_rows, n_cols = content_user.shape
n_cells = n_rows * n_cols
print('sparsity:', (n_cells - content_user.nnz) / n_cells)

sparsity: 0.9927935300843813


In [20]:
# A latent-factor method is used because the number of articles is large.
# The Implicit library is used because the dataset is implicit.
# Algorithm: Alternating Least Squares

model = implicit.als.AlternatingLeastSquares(
    factors=32,
    regularization=0.1,
    iterations=50,
    random_state=42,
)



In [21]:
# Scale the interaction weights by alpha to obtain the confidence matrix,
# stored as double-precision floats.

alpha = 15
data_conf = (alpha * content_user).astype(np.float64)

In [22]:
# Fit the model
# NOTE(review): data_conf is derived from content_user (rows = content, columns = persons);
# confirm this orientation is what fit() expects in the pinned implicit version.

model.fit(data_conf)

  0%|          | 0/50 [00:00<?, ?it/s]

In [23]:
# Find similar articles
# NOTE(review): content_id is presumably the encoded code from df1['content_id'],
# not the raw contentId from the CSV — confirm.

content_id = 450
n_similar = 10

# Displays (item, score) pairs for the requested item
model.similar_items(content_id, n_similar)

[(450, 1.0000001),
 (2746, 0.8830807),
 (2102, 0.8827647),
 (2913, 0.8799732),
 (2301, 0.8759437),
 (129, 0.8735974),
 (535, 0.8735723),
 (1398, 0.8687787),
 (1288, 0.86873996),
 (1518, 0.8672425)]

In [42]:
# define a function to find similar articles if content_id is given
def find_similar_articles(content_id, n_similar=10, als_model=None, articles=None):
    """Print the title of an article and the titles of the articles most similar to it.

    Parameters
    ----------
    content_id : int
        Encoded article code, matched against the ``content_id`` column of ``articles``.
    n_similar : int, default 10
        Number of items requested from ``als_model.similar_items``.
    als_model : object, optional
        Object exposing ``similar_items(item, n)`` that returns an iterable of
        ``(item, score)`` pairs. Defaults to the notebook-level ``model``.
    articles : pandas.DataFrame, optional
        Frame with ``content_id`` and ``title`` columns. Defaults to the
        notebook-level ``df1``.

    Returns
    -------
    None
        The result is printed, not returned.

    Raises
    ------
    IndexError
        If ``content_id`` (or an item returned by the model) has no row in ``articles``.
    """
    # Fall back to the notebook globals so existing calls such as
    # find_similar_articles(450) keep working unchanged.
    als_model = model if als_model is None else als_model
    articles = df1 if articles is None else articles

    def title_of(code):
        # Title of the first row carrying this encoded content id.
        return articles.loc[articles['content_id'] == code, 'title'].iloc[0]

    print('Target articles:', title_of(content_id))
    similar = als_model.similar_items(content_id, n_similar)
    print('---------------------------------------------------------------------------------------------')
    print('Recommendations (The first article is very likely to be the target article):')
    for idx, score in similar:
        # The body of the loop must be indented; it prints one line per similar item.
        print('title:', title_of(idx), '(score:{:.4f})'.format(score))

In [49]:
# Print the articles most similar to the article with encoded content_id 450
# (n_similar is left at its default of 10)
find_similar_articles(450)

Target articles: Google's fair use victory is good for open source
---------------------------------------------------------------------------------------------
Recommendations (The first article is very likely to be the target article):
title: Google's fair use victory is good for open source (score:1.0000)
title: Google will show AMP URLs before App deep link URLs in mobile results (score:0.8831)
title: Atlassian launches Bitbucket Pipelines (score:0.8828)
title: How to forecast demand with Google BigQuery, public datasets and TensorFlow | Google Cloud Big Data and Machine Learning Blog (score:0.8800)
title: Engineers Shouldn't Write ETL: A Guide to Building a High Functioning Data Science Department (score:0.8759)
title: Apple Has Created 'Detailed Mockups' of iMessage for Android (score:0.8736)
title: Robô da IBM substitui 34 funcionários de empresa no Japão (score:0.8736)
title: O IPO é a saída para a Netshoes? (score:0.8688)
title: Kaol Porfírio luta como uma garota! - Think Olga