### Import dependencies

In [1]:
import os, sys, inspect
import numpy as np
import pandas as pd
import math
import random
import sklearn
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse.linalg import svds
import matplotlib.pyplot as plt 

### Get/Insert current and project root path

In [2]:
# Resolve the notebook's working directory and its parent (the project root) so
# that the local `rcpy` package can be imported by the cells below.
# os.getcwd() replaces inspect.getfile(inspect.currentframe()): inside a notebook
# the frame's "file" is a synthetic cell name (or a temporary file on newer
# ipykernel releases), so deriving a directory from it is unreliable.
currdir = os.getcwd()
rootdir = os.path.dirname(currdir)
# Guard the insert so that re-running the cell does not stack duplicate entries.
if rootdir not in sys.path:
    sys.path.insert(0, rootdir)

In [3]:
from rcpy.model import Popularity
from rcpy.model import ContentFilter
from rcpy.model import CollaborativeFilter
from rcpy.model.evaluator import RecallTopN

### Load data

In [4]:
# Read the shared-articles table shipped under <rootdir>/rcpy/data/ext.
articles_df = pd.read_csv(
    os.path.join(rootdir, "rcpy", "data", "ext", "shared_articles.csv")
)
articles_df.head()

Unnamed: 0,timestamp,eventType,contentId,authorPersonId,authorSessionId,authorUserAgent,authorRegion,authorCountry,contentType,url,title,text,lang
0,1459192779,CONTENT REMOVED,-6451309518266745024,4340306774493623681,8940341205206233829,,,,HTML,http://www.nytimes.com/2016/03/28/business/dea...,"Ethereum, a Virtual Currency, Enables Transact...",All of this work is still very early. The firs...,en
1,1459193988,CONTENT SHARED,-4110354420726924665,4340306774493623681,8940341205206233829,,,,HTML,http://www.nytimes.com/2016/03/28/business/dea...,"Ethereum, a Virtual Currency, Enables Transact...",All of this work is still very early. The firs...,en
2,1459194146,CONTENT SHARED,-7292285110016212249,4340306774493623681,8940341205206233829,,,,HTML,http://cointelegraph.com/news/bitcoin-future-w...,Bitcoin Future: When GBPcoin of Branson Wins O...,The alarm clock wakes me at 8:00 with stream o...,en
3,1459194474,CONTENT SHARED,-6151852268067518688,3891637997717104548,-1457532940883382585,,,,HTML,https://cloudplatform.googleblog.com/2016/03/G...,Google Data Center 360° Tour,We're excited to share the Google Data Center ...,en
4,1459194497,CONTENT SHARED,2448026894306402386,4340306774493623681,8940341205206233829,,,,HTML,https://bitcoinmagazine.com/articles/ibm-wants...,"IBM Wants to ""Evolve the Internet"" With Blockc...",The Aite Group projects the blockchain market ...,en


In [5]:
# Read the user-interaction log shipped under <rootdir>/rcpy/data/ext.
interactions_df = pd.read_csv(
    os.path.join(rootdir, "rcpy", "data", "ext", "users_interactions.csv")
)
interactions_df.head()

Unnamed: 0,timestamp,eventType,contentId,personId,sessionId,userAgent,userRegion,userCountry
0,1465413032,VIEW,-3499919498720038879,-8845298781299428018,1264196770339959068,,,
1,1465412560,VIEW,8890720798209849691,-1032019229384696495,3621737643587579081,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2...,NY,US
2,1465416190,VIEW,310515487419366995,-1130272294246983140,2631864456530402479,,,
3,1465413895,FOLLOW,310515487419366995,344280948527967603,-3167637573980064150,,,
4,1465412290,VIEW,-7820640624231356730,-445337111692715325,5611481178424124714,,,


### Perform data munging
Associate a weight/rank to each event type according to its strength.

In [6]:
# Distinct event types, in order of first appearance in the interaction log.
event_type_keys = interactions_df["eventType"].unique()
event_type_keys

array(['VIEW', 'FOLLOW', 'BOOKMARK', 'LIKE', 'COMMENT CREATED'],
      dtype=object)

In [7]:
# Interaction weights; these are paired positionally with `event_type_keys`
# when the lookup dictionary is built.
event_type_values = [
    1.0,
    2.0,
    2.5,
    3.0,
    4.0,
]
event_type_values

[1.0, 2.0, 2.5, 3.0, 4.0]

In [8]:
# Explicit event-type -> strength mapping.
# Zipping `unique()` output with a hand-ordered weight list depends on the row
# order of the CSV: if a different event type happened to appear first, every
# weight would silently shift. Spelling the mapping out keeps it stable, and an
# unknown event type now fails loudly (KeyError) at lookup time.
event_type_dict = {
    "VIEW": 1.0,
    "FOLLOW": 2.0,
    "BOOKMARK": 2.5,
    "LIKE": 3.0,
    "COMMENT CREATED": 4.0,
}

In [9]:
# Attach the numeric strength of each event; an unmapped event type raises KeyError.
interactions_df["eventStrength"] = interactions_df["eventType"].map(
    lambda event_type: event_type_dict[event_type]
)

To mitigate the user cold-start problem, keep only users who have interacted with at least 5 distinct items.

In [10]:
# Number of distinct items each user interacted with, as a two-column frame
# (personId, interactionsCount).
interactions_count_df = (
    interactions_df
    .groupby("personId")["contentId"]
    .nunique()
    .reset_index(name="interactionsCount")
)

In [11]:
# Keep the users whose distinct-item count reaches the threshold of 5.
users_with_enough_interactions = interactions_count_df.loc[
    interactions_count_df["interactionsCount"] >= 5
]
users_with_enough_interactions.head()

Unnamed: 0,personId,interactionsCount
0,-9223121837663643404,43
1,-9212075797126931087,5
2,-9207251133131336884,7
3,-9199575329909162940,11
4,-9196668942822132778,7


In [12]:
# Restrict the interaction log to the retained users by joining on personId;
# the join also brings in each user's interactionsCount column.
print('# of interactions: %d' % interactions_df.shape[0])
selected_interactions_df = pd.merge(
    interactions_df,
    users_with_enough_interactions,
    how="right",
    on="personId",
)
print('# of interactions from users with at least 5 interactions: %d' % selected_interactions_df.shape[0])
selected_interactions_df.head()

# of interactions: 72312
# of interactions from users with at least 5 interactions: 69868


Unnamed: 0,timestamp,eventType,contentId,personId,sessionId,userAgent,userRegion,userCountry,eventStrength,interactionsCount
0,1465413032,VIEW,-3499919498720038879,-8845298781299428018,1264196770339959068,,,,1.0,130
1,1465413046,VIEW,-3499919498720038879,-8845298781299428018,1264196770339959068,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5...,SP,BR,1.0,130
2,1464190235,VIEW,6437568358552101410,-8845298781299428018,-1157447994463607871,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4...,SP,BR,1.0,130
3,1459429221,VIEW,-4760639635023250284,-8845298781299428018,-5149610736659242149,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4...,SP,BR,1.0,130
4,1459274156,VIEW,-6142462826726347616,-8845298781299428018,-6283148774987755959,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4...,SP,BR,1.0,130


Aggregate all of a user's interactions with an item into a single score: the sum of the weighted interaction strengths, smoothed by a log transformation, $\log_2(1 + \text{sum})$.

In [13]:


# Collapse every (user, item) pair to one score: sum the event strengths, then
# smooth with log2(1 + total).
# np.log2 is applied to the whole Series at once instead of calling
# math.log(1 + x, 2) per element: it is vectorised and avoids the rounding error
# of the two-argument math.log, which computes log(x) / log(2).
full_interactions_df = (
    selected_interactions_df
    .groupby(["personId", "contentId"])["eventStrength"]
    .sum()
    .pipe(lambda total_strength: np.log2(1 + total_strength))
    .reset_index()
)
full_interactions_df.head()

Unnamed: 0,personId,contentId,eventStrength
0,-9223121837663643404,-8949113594875411859,1.0
1,-9223121837663643404,-8377626164558006982,1.0
2,-9223121837663643404,-8208801367848627943,1.0
3,-9223121837663643404,-8187220755213888616,1.0
4,-9223121837663643404,-7423191370472335463,3.169925


### Build evaluation data

In [14]:
# Number of latent factors requested from the truncated SVD.
n_factors = 15

# Column names shared by the cells below.
score_column = "eventStrength"
content_column = "contentId"
user_column = "personId"

In [15]:
# 80/20 hold-out split, stratified on the user column so that each user's
# interactions are divided between train and test; the seed is fixed for
# reproducibility.
train_interactions_df, test_interactions_df = train_test_split(
    full_interactions_df,
    test_size=0.20,
    random_state=42,
    stratify=full_interactions_df[user_column],
)

In [16]:
# Report how many (user, item) rows ended up on each side of the split.
print('# interactions on Train set: %d' % train_interactions_df.shape[0])
print('# interactions on Test set: %d' % test_interactions_df.shape[0])

# interactions on Train set: 31284
# interactions on Test set: 7822


In [17]:
# Index all three interaction frames by user id.
# Each assignment is guarded so the cell is idempotent: the frames overwrite
# themselves, and an unguarded second run would raise KeyError because the user
# column has already been moved into the index.
# To undo, call .reset_index() on each frame.
if full_interactions_df.index.name != user_column:
    full_interactions_df = full_interactions_df.set_index(user_column)
if train_interactions_df.index.name != user_column:
    train_interactions_df = train_interactions_df.set_index(user_column)
if test_interactions_df.index.name != user_column:
    test_interactions_df = test_interactions_df.set_index(user_column)

In [18]:
# Build the user-item score table: one row per user (the current index), one
# column per item, with 0 where the training set has no interaction.
scores_df = (
    train_interactions_df
    .pivot(columns=content_column, values=score_column)
    .fillna(0)
)
print("user-item scores data frame: ({:d}, {:d})".format(*scores_df.shape))
scores_df.head(10)


user-item scores data frame: (1140, 2926)


contentId,-9222795471790223670,-9216926795620865886,-9194572880052200111,-9192549002213406534,-9190737901804729417,-9189659052158407108,-9176143510534135851,-9172673334835262304,-9171475473795142532,-9166778629773133902,...,9191014301634017491,9207286802575546269,9208127165664287660,9209629151177723638,9209886322932807692,9213260650272029784,9215261273565326920,9217155070834564627,9220445660318725468,9222265156747237864
personId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-9223121837663643404,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-9212075797126931087,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-9207251133131336884,0.0,2.321928,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-9199575329909162940,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-9196668942822132778,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-9188188261933657343,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-9172914609055320039,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-9156344805277471150,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-9120685872592674274,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-9109785559521267180,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
# Pull out the underlying NumPy array of the user-item score table.
scores_matrix = scores_df.values
print("user-item scores matrix: ({:d}, {:d})".format(*scores_matrix.shape))

user-item scores matrix: (1140, 2926)


In [20]:
# Truncated SVD of the user-item matrix, keeping `n_factors` singular triplets.
U, sigma, Vt = svds(scores_matrix, k=n_factors)
print("U: ({:d}, {:d})".format(*U.shape))
print("sigma: ({:d})".format(sigma.shape[0]))
print("Vt: ({:d}, {:d})".format(*Vt.shape))

U: (1140, 15)
sigma: (15)
Vt: (15, 2926)


In [21]:
# Low-rank reconstruction U . diag(sigma) . Vt; scaling the columns of U by
# sigma is the same product as multiplying by the explicit diagonal matrix.
scores_predicted_matrix = (U * sigma) @ Vt

In [22]:
# Wrap the reconstructed scores in an item-by-user data frame: items on the
# rows, users on the columns, labelled from the original score table.
scores_predicted_df = pd.DataFrame(
    scores_predicted_matrix.T,
    index=scores_df.columns,
    columns=scores_df.index,
)
print("item-user predicted data frame: ({:d}, {:d})".format(*scores_predicted_df.shape))
scores_predicted_df.head(10)

item-user predicted data frame: (2926, 1140)


personId,-9223121837663643404,-9212075797126931087,-9207251133131336884,-9199575329909162940,-9196668942822132778,-9188188261933657343,-9172914609055320039,-9156344805277471150,-9120685872592674274,-9109785559521267180,...,9105269044962898535,9109075639526981934,9135582630122950040,9137372837662939523,9148269800512008413,9165571805999894845,9187866633451383747,9191849144618614467,9199170757466086545,9210530975708218054
contentId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-9222795471790223670,0.00485,-0.000456,-0.010169,0.058784,-0.00978,-0.004324,-0.004453,0.045504,-0.001874,-0.01894,...,-0.004182,-0.105448,0.000979,0.01413,0.031989,-0.008898,-0.018234,-0.025583,-0.020703,-0.020562
-9216926795620865886,0.000292,-0.00023,0.00773,-0.000269,0.000119,0.000272,0.000109,0.00072,0.002665,0.001426,...,-0.000224,0.014717,0.000548,0.001504,0.016964,0.001024,0.002117,0.009375,0.004455,0.003716
-9194572880052200111,-0.02304,-0.001931,-0.009119,-0.003649,0.024948,-0.00172,0.030106,-0.015879,-0.022432,0.004695,...,0.011473,0.036327,0.011388,0.020062,0.176805,0.021631,0.025802,-0.023756,0.009584,0.141702
-9192549002213406534,0.038676,0.000897,-0.035799,-0.006191,0.018377,0.0046,0.014409,0.054316,0.049604,0.004401,...,0.016444,0.294301,0.006416,-0.010134,0.038508,0.012372,0.009922,-0.018816,0.022117,0.106348
-9190737901804729417,0.019836,-0.00645,0.011248,0.007434,-0.003242,0.002913,0.00235,0.004552,-0.030522,0.020258,...,0.002669,0.011687,0.002925,1.2e-05,-0.047209,-0.001835,-0.000264,0.009215,0.005795,-0.022221
-9189659052158407108,0.005947,0.006662,0.002814,-0.003675,0.008263,-0.001208,0.003291,0.002198,0.022154,0.02083,...,0.023496,0.088919,-0.001883,0.002615,0.098591,0.007871,0.014248,0.017763,0.021165,0.067955
-9176143510534135851,0.040836,0.007425,0.013606,0.011935,-0.003528,-0.004621,0.015342,0.023478,0.066618,0.058955,...,-0.001988,0.063974,-0.017215,0.002521,0.006707,-0.001435,0.004142,0.04024,0.029514,0.050995
-9172673334835262304,0.004451,0.00065,0.002204,-0.000824,0.000413,0.000522,0.001259,0.002071,-0.002745,0.003111,...,0.004687,0.018105,0.000342,-0.000935,-0.00655,0.000222,0.002559,0.002473,0.006003,-0.001903
-9171475473795142532,0.030142,-0.000801,0.000428,-0.006268,0.002805,0.001227,0.011491,-0.00314,-0.011298,0.026195,...,0.012625,0.027591,0.000414,1.6e-05,-0.027984,0.002073,0.01067,0.000198,0.016424,-0.007493
-9166778629773133902,0.009544,-0.002217,-0.014063,-0.002275,0.005068,-4e-05,0.001485,0.004248,0.012544,-0.003614,...,0.001668,0.062148,0.0009,-0.003938,0.00983,0.003576,0.002912,-0.008748,0.0002,0.021179


In [23]:
# Manual check for a single user: take that user's column of predicted scores,
# order it from highest to lowest and keep the first five items.
user_id = -9223121837663643404
items_to_ignore = []
user_scores_df = (
    scores_predicted_df[user_id]
    .sort_values(ascending=False)
    .reset_index()
    .rename(columns={user_id: score_column})
    .head(5)
)

In [24]:
# Drop any items listed in `items_to_ignore` from the user's top scores.
user_scores_df.loc[~user_scores_df[content_column].isin(items_to_ignore)]

Unnamed: 0,contentId,eventStrength
0,-8208801367848627943,0.405222
1,943818026930898372,0.298944
2,2072448887839540892,0.264123
3,3149164017776669829,0.261815
4,-4029704725707465084,0.23679


In [25]:
# Same filter as above, then order the remaining rows by descending score.
(
    user_scores_df
    .loc[~user_scores_df[content_column].isin(items_to_ignore)]
    .sort_values(by=score_column, ascending=False)
)

Unnamed: 0,contentId,eventStrength
0,-8208801367848627943,0.405222
1,943818026930898372,0.298944
2,2072448887839540892,0.264123
3,3149164017776669829,0.261815
4,-4029704725707465084,0.23679


In [26]:
# Instantiate the collaborative-filter model from the predicted item-by-user
# scores and the articles table.
collaborative_filter = CollaborativeFilter(
    scores_df=scores_predicted_df,
    items_df=articles_df,
)
collaborative_filter

<rcpy.model.CollaborativeFilter at 0x7f72319521d0>

In [27]:
# Ask the model for recommendations for one specific user id.
collaborative_filter.recommend(-1479311724257856983, content_column, score_column)

Unnamed: 0,contentId,eventStrength
204,-8085935119790093311,1.047726
1965,3269302169678465882,0.975446
1612,1005751836898964351,0.935801
407,-6727357771678896471,0.933358
156,-8377626164558006982,0.886154
645,-5253644367331262405,0.877353
2632,7395435905985567130,0.87218
189,-8190931845319543363,0.868841
1696,1549650080907932816,0.857511
2262,5092635400707338872,0.832094


### Perform recommendations

In [28]:
# Instantiate RecallTopN with the full/train/test interaction frames and the
# articles table, then evaluate the collaborative-filter model with it.
# NOTE(review): the recorded output of this cell is
# "TypeError: __init__() got an unexpected keyword argument 'content_id'",
# so the keyword names passed here do not match RecallTopN.__init__ as installed.
# Confirm the expected parameter names in rcpy.model.evaluator before re-running;
# the two preview cells that follow cannot run until this call succeeds.
model_evaluator = RecallTopN(
    full_interactions_df=full_interactions_df,
    test_interactions_df=test_interactions_df,
    train_interactions_df=train_interactions_df,
    items_df=articles_df,
    content_id=content_column,
    score_id=score_column,
    sample_size=100)

overall_results_df, detailed_results_df = model_evaluator.evaluate(collaborative_filter)

TypeError: __init__() got an unexpected keyword argument 'content_id'

In [None]:
# Preview the first ten rows of the overall evaluation results.
overall_results_df.head(n=10)

In [None]:
# Preview the first ten rows of the detailed evaluation results.
detailed_results_df.head(n=10)