### Import dependencies

In [1]:
import os, sys, inspect
import numpy as np
import pandas as pd
import math
import random
import sklearn
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse.linalg import svds
import matplotlib.pyplot as plt 

### Resolve the current directory and project root, and add the root to `sys.path`

In [2]:
# Resolve the notebook's working directory and its parent (the project root).
# os.getcwd() is used instead of inspect.getfile(inspect.currentframe()):
# inside a notebook the current frame's "file" is a synthetic per-cell name
# (on recent ipykernel versions, a path under the system temp directory), so
# directories derived from it do not reliably point at the notebook's folder.
# NOTE: assumes the kernel's working directory is the notebook's own directory
# (Jupyter's default) -- confirm if the kernel is launched differently.
currdir = os.getcwd()
rootdir = os.path.dirname(currdir)
# Put the project root first on sys.path so the local `rcpy` package is importable.
sys.path.insert(0, rootdir)

In [3]:
from rcpy.model import Popularity
from rcpy.model.evaluator import RecallTopN

### Load data

In [4]:
# Catalogue of shared articles, read from rcpy/data/ext under the project root.
articles_csv_path = os.path.join(rootdir, "rcpy", "data", "ext", "shared_articles.csv")
articles_df = pd.read_csv(articles_csv_path)
articles_df.head(5)

Unnamed: 0,timestamp,eventType,contentId,authorPersonId,authorSessionId,authorUserAgent,authorRegion,authorCountry,contentType,url,title,text,lang
0,1459192779,CONTENT REMOVED,-6451309518266745024,4340306774493623681,8940341205206233829,,,,HTML,http://www.nytimes.com/2016/03/28/business/dea...,"Ethereum, a Virtual Currency, Enables Transact...",All of this work is still very early. The firs...,en
1,1459193988,CONTENT SHARED,-4110354420726924665,4340306774493623681,8940341205206233829,,,,HTML,http://www.nytimes.com/2016/03/28/business/dea...,"Ethereum, a Virtual Currency, Enables Transact...",All of this work is still very early. The firs...,en
2,1459194146,CONTENT SHARED,-7292285110016212249,4340306774493623681,8940341205206233829,,,,HTML,http://cointelegraph.com/news/bitcoin-future-w...,Bitcoin Future: When GBPcoin of Branson Wins O...,The alarm clock wakes me at 8:00 with stream o...,en
3,1459194474,CONTENT SHARED,-6151852268067518688,3891637997717104548,-1457532940883382585,,,,HTML,https://cloudplatform.googleblog.com/2016/03/G...,Google Data Center 360° Tour,We're excited to share the Google Data Center ...,en
4,1459194497,CONTENT SHARED,2448026894306402386,4340306774493623681,8940341205206233829,,,,HTML,https://bitcoinmagazine.com/articles/ibm-wants...,"IBM Wants to ""Evolve the Internet"" With Blockc...",The Aite Group projects the blockchain market ...,en


In [5]:
# Log of user/article interaction events, read from rcpy/data/ext under the project root.
interactions_csv_path = os.path.join(rootdir, "rcpy", "data", "ext", "users_interactions.csv")
interactions_df = pd.read_csv(interactions_csv_path)
interactions_df.head(5)

Unnamed: 0,timestamp,eventType,contentId,personId,sessionId,userAgent,userRegion,userCountry
0,1465413032,VIEW,-3499919498720038879,-8845298781299428018,1264196770339959068,,,
1,1465412560,VIEW,8890720798209849691,-1032019229384696495,3621737643587579081,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2...,NY,US
2,1465416190,VIEW,310515487419366995,-1130272294246983140,2631864456530402479,,,
3,1465413895,FOLLOW,310515487419366995,344280948527967603,-3167637573980064150,,,
4,1465412290,VIEW,-7820640624231356730,-445337111692715325,5611481178424124714,,,


### Perform data munging
Associate a weight/rank to each event type according to its strength.

In [6]:
# Distinct event types present in the interaction log
# (unique() lists them in order of first appearance).
event_type_keys = interactions_df["eventType"].unique()
event_type_keys

array(['VIEW', 'FOLLOW', 'BOOKMARK', 'LIKE', 'COMMENT CREATED'],
      dtype=object)

In [7]:
# Interaction-strength weights, listed in ascending order (weakest to strongest).
event_type_values = [1.0, 2.0, 2.5, 3.0, 4.0]
event_type_values

[1.0, 2.0, 2.5, 3.0, 4.0]

In [8]:
# Weight of each event type, keyed explicitly by event name.
# The weights are bound to names here rather than by zipping event_type_keys
# with event_type_values: unique() returns event types in order of first
# appearance in the CSV, so a positional pairing would silently assign
# different weights if the file were re-sorted or refreshed. The values below
# reproduce the pairing obtained with the current data set.
event_type_dict = {
    "VIEW": 1.0,
    "FOLLOW": 2.0,
    "BOOKMARK": 2.5,
    "LIKE": 3.0,
    "COMMENT CREATED": 4.0,
}

In [9]:
# Attach the weight of each row's event type as a new column;
# an event type missing from event_type_dict raises KeyError.
interactions_df["eventStrength"] = interactions_df["eventType"].apply(
    lambda event_type: event_type_dict[event_type])

To mitigate the user cold-start problem, drop users who have interacted with fewer than 5 distinct articles.

In [10]:
# Number of distinct articles each user interacted with,
# as a two-column frame: personId, interactionsCount.
interactions_count_df = (
    interactions_df
    .groupby(["personId"])["contentId"]
    .nunique()
    .reset_index(name="interactionsCount")
)

In [11]:
# Keep the users whose distinct-article count reaches the threshold of 5.
has_enough_interactions = interactions_count_df["interactionsCount"] >= 5
users_with_enough_interactions = interactions_count_df[has_enough_interactions]
users_with_enough_interactions.head()

Unnamed: 0,personId,interactionsCount
0,-9223121837663643404,43
1,-9212075797126931087,5
2,-9207251133131336884,7
3,-9199575329909162940,11
4,-9196668942822132778,7


In [12]:
print('# of interactions: %d' % len(interactions_df))
# Right join on personId: every user in users_with_enough_interactions is kept
# together with all of that user's interaction rows, and the interactionsCount
# column is carried along.
selected_interactions_df = interactions_df.merge(
    users_with_enough_interactions,
    on="personId",
    how="right")
n_selected_interactions = len(selected_interactions_df)
print('# of interactions from users with at least 5 interactions: %d' % n_selected_interactions)
selected_interactions_df.head()

# of interactions: 72312
# of interactions from users with at least 5 interactions: 69868


Unnamed: 0,timestamp,eventType,contentId,personId,sessionId,userAgent,userRegion,userCountry,eventStrength,interactionsCount
0,1465413032,VIEW,-3499919498720038879,-8845298781299428018,1264196770339959068,,,,1.0,130
1,1465413046,VIEW,-3499919498720038879,-8845298781299428018,1264196770339959068,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5...,SP,BR,1.0,130
2,1464190235,VIEW,6437568358552101410,-8845298781299428018,-1157447994463607871,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4...,SP,BR,1.0,130
3,1459429221,VIEW,-4760639635023250284,-8845298781299428018,-5149610736659242149,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4...,SP,BR,1.0,130
4,1459274156,VIEW,-6142462826726347616,-8845298781299428018,-6283148774987755959,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4...,SP,BR,1.0,130


Aggregate all of a user's interactions with an article into a single score: the sum of the weighted interaction strengths, smoothed by a log transformation.

In [13]:
def _log2_smooth(total_strength):
    """Return log2(1 + total_strength)."""
    return math.log(1 + total_strength, 2)


# Total event strength for every (user, article) pair ...
strength_per_user_item = (
    selected_interactions_df
    .groupby(["personId", "contentId"])["eventStrength"]
    .sum()
)
# ... smoothed with log2(1 + x) and flattened back into ordinary columns.
full_interactions_df = strength_per_user_item.apply(_log2_smooth).reset_index()
full_interactions_df.head()

Unnamed: 0,personId,contentId,eventStrength
0,-9223121837663643404,-8949113594875411859,1.0
1,-9223121837663643404,-8377626164558006982,1.0
2,-9223121837663643404,-8208801367848627943,1.0
3,-9223121837663643404,-8187220755213888616,1.0
4,-9223121837663643404,-7423191370472335463,3.169925


### Build evaluation data

In [14]:
# Hold out 20% of the (user, article) rows as the test set. The split is
# stratified on personId and uses a fixed seed so it is repeatable.
split_options = dict(
    test_size=0.20,
    random_state=42,
    stratify=full_interactions_df["personId"],
)
train_interactions_df, test_interactions_df = train_test_split(
    full_interactions_df, **split_options)

In [15]:
# Report how many rows landed on each side of the split.
n_train_interactions = len(train_interactions_df)
n_test_interactions = len(test_interactions_df)
print('# interactions on Train set: %d' % n_train_interactions)
print('# interactions on Test set: %d' % n_test_interactions)

# interactions on Train set: 31284
# interactions on Test set: 7822


In [16]:
def _indexed_by_person(frame):
    """Return `frame` indexed by personId, unchanged if it already is.

    The guard makes this cell safe to re-run: calling set_index("personId")
    on a frame whose personId column has already been moved into the index
    raises KeyError.
    """
    if frame.index.name == "personId":
        return frame
    return frame.set_index("personId")


# Index all three interaction frames by user id.
full_interactions_df = _indexed_by_person(full_interactions_df)
train_interactions_df = _indexed_by_person(train_interactions_df)
test_interactions_df = _indexed_by_person(test_interactions_df)

In [17]:
# Sanity check: show the class of the de-duplicated index of full_interactions_df.
type(full_interactions_df.index.unique())

pandas.core.indexes.numeric.Int64Index

### Perform recommendations

#### Popularity

In [18]:
# Popularity model built from the interaction scores and the article catalogue.
popularity_recommender = Popularity(full_interactions_df, articles_df)

# Ask for the top 5 recommendations for one sample user.
# NOTE(review): full=True presumably attaches the article columns to the
# result -- confirm against rcpy.model.Popularity.
sample_user_id = -9223121837663643404
popularity_recommender.recommend(
    user_id=sample_user_id,
    content_id="contentId",
    score_id="eventStrength",
    topn=5,
    full=True,
)

Unnamed: 0,contentId,eventStrength,timestamp,eventType,authorPersonId,authorSessionId,authorUserAgent,authorRegion,authorCountry,contentType,url,title,text,lang
0,-4029704725707465084,319.508482,1487246811,CONTENT SHARED,6013226412048763966,-6569695881431984742,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,SP,BR,HTML,http://www.cnbc.com/2016/12/21/former-google-c...,Former Google career coach shares a visual tri...,"If you want 2017 to be an exciting year, desig...",en
1,-6783772548752091658,239.587417,1468867647,CONTENT SHARED,4918484843075254252,-8995217520473210153,,,,HTML,http://www.caroli.org/livro-retrospectivas-div...,Livro: Retrospectivas Divertidas,"Neste livro, nós fornecemos um conjunto de fer...",pt
2,-133139342397538859,234.52182,1467813728,CONTENT SHARED,4918484843075254252,-5701227433817087697,,,,HTML,http://gq.globo.com/Prazeres/Poder/Carreira/no...,"Novo workaholic trabalha, pratica esportes e t...",Novo workaholic não abre mão do esporte e da f...,pt
3,-8208801367848627943,202.259434,1469678235,CONTENT SHARED,-3390049372067052505,2045534933671019150,,,,HTML,http://www.geekwire.com/2016/ray-kurzweil-worl...,Ray Kurzweil: The world isn't getting worse - ...,"Ray Kurzweil, the author, inventor, computer s...",en
4,-6843047699859121724,198.878826,1461629452,CONTENT SHARED,7527226129639571966,-1297230017812472163,,,,HTML,https://medium.com/@jeffersoncn/ganhe-6-meses-...,"Ganhe 6 meses de acesso ao Pluralsight, maior ...","Ganhe 6 meses de acesso ao Pluralsight, maior ...",pt


In [19]:
# Settings for the recall@N evaluator: the three interaction frames, the item
# catalogue, the names of the item-id and score columns, and the sample size.
evaluator_settings = {
    "full_interactions_df": full_interactions_df,
    "test_interactions_df": test_interactions_df,
    "train_interactions_df": train_interactions_df,
    "items_df": articles_df,
    "content_id": "contentId",
    "score_id": "eventStrength",
    "sample_size": 100,
}
model_evaluator = RecallTopN(**evaluator_settings)

In [20]:
# Evaluate the popularity model; evaluate() returns a pair that is unpacked into
# an overall summary and a per-user breakdown (both displayed in later cells).
overall_results_df, detailed_results_df = model_evaluator.evaluate(popularity_recommender)

user:  -830175562779396891 	recall@5:  0.4 	recall@10:  0.4
user:  -7267769888748948232 	recall@5:  0.24242424242424243 	recall@10:  0.36363636363636365
user:  -3535274684588209118 	recall@5:  0.3076923076923077 	recall@10:  0.38461538461538464
user:  3094513233385472738 	recall@5:  0.4375 	recall@10:  0.4375
user:  -3390049372067052505 	recall@5:  0.18604651162790697 	recall@10:  0.27906976744186046
user:  -3500661007957156229 	recall@5:  0.058823529411764705 	recall@10:  0.11764705882352941
user:  -8399605302938582500 	recall@5:  0.2 	recall@10:  0.36
user:  -7990997793599977496 	recall@5:  0.3333333333333333 	recall@10:  0.4666666666666667
user:  3681539658619684640 	recall@5:  0.3333333333333333 	recall@10:  0.6666666666666666
user:  5328649570833345873 	recall@5:  0.0 	recall@10:  0.6666666666666666
user:  -1443636648652872475 	recall@5:  0.03418803418803419 	recall@10:  0.06837606837606838
user:  -1251984896177895077 	recall@5:  0.1111111111111111 	recall@10:  0.3333333333333333


user:  -5527145562136413747 	recall@5:  0.14285714285714285 	recall@10:  0.21428571428571427
user:  -4465926797008424436 	recall@5:  0.36363636363636365 	recall@10:  0.5454545454545454
user:  -534549863526737439 	recall@5:  0.1875 	recall@10:  0.25
user:  -331066625167168067 	recall@5:  0.25 	recall@10:  0.4375
user:  -9016528795238256703 	recall@5:  0.18840579710144928 	recall@10:  0.2898550724637681
user:  7890134385692540512 	recall@5:  0.2631578947368421 	recall@10:  0.3157894736842105
user:  8676130229735483748 	recall@5:  0.12 	recall@10:  0.28
user:  1895326251577378793 	recall@5:  0.16 	recall@10:  0.28
user:  5327824330504712485 	recall@5:  0.5714285714285714 	recall@10:  0.7142857142857143
user:  -3474152259371895547 	recall@5:  0.09090909090909091 	recall@10:  0.2727272727272727
user:  7293029244157928140 	recall@5:  0.5 	recall@10:  0.625
user:  -5918600103053825191 	recall@5:  0.25 	recall@10:  0.5
user:  6971525809430309144 	recall@5:  0.13333333333333333 	recall@10:  0.4

user:  -867171338823652769 	recall@5:  0.3333333333333333 	recall@10:  0.4444444444444444
user:  7869282721414305216 	recall@5:  0.0 	recall@10:  0.3333333333333333
user:  434517017614495946 	recall@5:  0.2727272727272727 	recall@10:  0.45454545454545453
user:  -9188188261933657343 	recall@5:  0.0 	recall@10:  0.0
user:  4537318193229625884 	recall@5:  0.0 	recall@10:  0.0
user:  670878778036881163 	recall@5:  0.42105263157894735 	recall@10:  0.5789473684210527
user:  908938729693125261 	recall@5:  0.0 	recall@10:  0.2
user:  4417246933621591943 	recall@5:  0.2 	recall@10:  0.2
user:  -108842214936804958 	recall@5:  0.07407407407407407 	recall@10:  0.12962962962962962
user:  3938645257702379823 	recall@5:  0.18181818181818182 	recall@10:  0.36363636363636365
user:  -7898880382889684604 	recall@5:  0.5 	recall@10:  0.75
user:  599868086167624974 	recall@5:  0.14285714285714285 	recall@10:  0.2857142857142857
user:  8874741321583329336 	recall@5:  0.0 	recall@10:  0.0
user:  -58735620083

user:  7316935598591113280 	recall@5:  0.3333333333333333 	recall@10:  0.6666666666666666
user:  5621833459783231486 	recall@5:  0.06451612903225806 	recall@10:  0.25806451612903225
user:  6686431125336194142 	recall@5:  0.0 	recall@10:  0.0
user:  1962800348704770740 	recall@5:  0.3333333333333333 	recall@10:  0.6666666666666666
user:  -4467650312287951120 	recall@5:  0.0 	recall@10:  0.0
user:  -8607239111818252463 	recall@5:  0.75 	recall@10:  1.0
user:  -9172914609055320039 	recall@5:  0.0 	recall@10:  0.2
user:  -229539536136014922 	recall@5:  0.3333333333333333 	recall@10:  0.6666666666666666
user:  -4712939006281398894 	recall@5:  1.0 	recall@10:  1.0
user:  1716230101302949859 	recall@5:  0.0 	recall@10:  0.0
user:  -6023498443312206944 	recall@5:  0.0 	recall@10:  0.0
user:  2002083152443772042 	recall@5:  0.125 	recall@10:  0.125
user:  -7035562184657775493 	recall@5:  0.5 	recall@10:  1.0
user:  -8853658195208337106 	recall@5:  0.34782608695652173 	recall@10:  0.521739130434

user:  -8781306637602263252 	recall@5:  0.1875 	recall@10:  0.375
user:  674947025087085832 	recall@5:  0.06666666666666667 	recall@10:  0.26666666666666666
user:  -8436122021634018264 	recall@5:  0.0 	recall@10:  0.25
user:  -6234126753805415756 	recall@5:  0.5 	recall@10:  0.5
user:  -3978390417378743562 	recall@5:  0.0 	recall@10:  0.0
user:  -6939883833521991898 	recall@5:  0.5 	recall@10:  0.5
user:  -588447608033313001 	recall@5:  0.3333333333333333 	recall@10:  0.4444444444444444
user:  6935578859207277054 	recall@5:  0.0 	recall@10:  0.0
user:  2318971825420092215 	recall@5:  0.8 	recall@10:  0.8
user:  1671960877913596679 	recall@5:  0.3333333333333333 	recall@10:  0.5
user:  3118792477913513242 	recall@5:  0.3333333333333333 	recall@10:  0.3333333333333333
user:  6960073744377754728 	recall@5:  0.3 	recall@10:  0.5
user:  -3696538850054062306 	recall@5:  0.3333333333333333 	recall@10:  0.3333333333333333
user:  5746645399823844475 	recall@5:  0.2 	recall@10:  0.4
user:  -2615

user:  3592784031357342393 	recall@5:  0.5 	recall@10:  0.5
user:  8813266398846460512 	recall@5:  0.5 	recall@10:  0.5
user:  1771843275312644574 	recall@5:  1.0 	recall@10:  1.0
user:  8239286975497580612 	recall@5:  0.3333333333333333 	recall@10:  0.4
user:  -9199575329909162940 	recall@5:  0.5 	recall@10:  0.5
user:  -48161796606086482 	recall@5:  0.23076923076923078 	recall@10:  0.46153846153846156
user:  -41506117647079716 	recall@5:  0.3333333333333333 	recall@10:  0.6666666666666666
user:  5316031233920901237 	recall@5:  0.0 	recall@10:  0.5
user:  6408246065272657336 	recall@5:  0.0 	recall@10:  0.25
user:  -9120685872592674274 	recall@5:  0.2 	recall@10:  0.2
user:  5565776571801586229 	recall@5:  0.5 	recall@10:  1.0
user:  1929981821863611546 	recall@5:  0.0 	recall@10:  0.0
user:  6967147874415599925 	recall@5:  0.0 	recall@10:  0.5
user:  6646901263143710184 	recall@5:  0.5 	recall@10:  1.0
user:  3028038382445447853 	recall@5:  0.5 	recall@10:  0.5
user:  707237333451159

user:  -444330148331768170 	recall@5:  0.3333333333333333 	recall@10:  0.6666666666666666
user:  -5140211912313628591 	recall@5:  0.0 	recall@10:  0.0
user:  6860472782621321785 	recall@5:  0.3333333333333333 	recall@10:  0.6666666666666666
user:  1538315580625707373 	recall@5:  0.0 	recall@10:  0.5
user:  6136591349108383474 	recall@5:  0.0 	recall@10:  0.5
user:  -3359156065199608672 	recall@5:  0.0 	recall@10:  0.125
user:  4672375507299551161 	recall@5:  0.6666666666666666 	recall@10:  0.6666666666666666
user:  2217968793137479020 	recall@5:  0.0 	recall@10:  0.0
user:  4160581188559697212 	recall@5:  0.5 	recall@10:  0.6
user:  -2411675934925653835 	recall@5:  0.0 	recall@10:  0.0
user:  1200132673411527362 	recall@5:  1.0 	recall@10:  1.0
user:  5404470824348262768 	recall@5:  0.6666666666666666 	recall@10:  0.6666666666666666
user:  8531798655176011897 	recall@5:  0.5 	recall@10:  1.0
user:  -6533304626998890151 	recall@5:  0.5 	recall@10:  1.0
user:  3472075810981614387 	recall

user:  -5349775063975483956 	recall@5:  0.0 	recall@10:  0.5
user:  1915826476552544131 	recall@5:  0.5 	recall@10:  0.5
user:  8236260669729422750 	recall@5:  0.0 	recall@10:  0.0
user:  3168283140501778164 	recall@5:  0.0 	recall@10:  0.0
user:  -6226728612139235550 	recall@5:  0.0 	recall@10:  0.0
user:  -6874849957985527250 	recall@5:  0.0 	recall@10:  0.0
user:  8745559424774781326 	recall@5:  0.0 	recall@10:  0.0
user:  4965319960651517505 	recall@5:  1.0 	recall@10:  1.0
user:  1104652557829563522 	recall@5:  0.0 	recall@10:  0.0
user:  8940614478925413056 	recall@5:  1.0 	recall@10:  1.0
user:  1178253609400958886 	recall@5:  1.0 	recall@10:  1.0
user:  -2455286848766388141 	recall@5:  0.0 	recall@10:  0.0
user:  -7753289154322622303 	recall@5:  0.0 	recall@10:  0.0
user:  -4154712816784766009 	recall@5:  0.6666666666666666 	recall@10:  1.0
user:  -8620763856232712186 	recall@5:  0.5 	recall@10:  0.5
user:  -2762344524254945371 	recall@5:  0.0 	recall@10:  0.0
user:  1693920335

user:  -2484779020675533629 	recall@5:  0.6666666666666666 	recall@10:  1.0
user:  -8424644554119645763 	recall@5:  0.5 	recall@10:  0.5
user:  1720358903345561796 	recall@5:  1.0 	recall@10:  1.0
user:  -5648501732106474447 	recall@5:  0.0 	recall@10:  0.0
user:  -5388083602161519863 	recall@5:  0.0 	recall@10:  0.0
user:  -8430239657678001264 	recall@5:  1.0 	recall@10:  1.0
user:  -8223836278784533306 	recall@5:  0.0 	recall@10:  0.0
user:  -710135072571868067 	recall@5:  0.0 	recall@10:  0.0
user:  -9196668942822132778 	recall@5:  0.0 	recall@10:  0.0
user:  -4767444107153363126 	recall@5:  0.0 	recall@10:  0.0
user:  -3160777155871929764 	recall@5:  0.0 	recall@10:  0.0
user:  6590718530693270591 	recall@5:  0.0 	recall@10:  0.0
user:  -6631461711538275356 	recall@5:  0.0 	recall@10:  0.5
user:  8562738815734604501 	recall@5:  0.0 	recall@10:  0.0
user:  -485491685721852725 	recall@5:  0.0 	recall@10:  0.0
user:  -2545236939408275407 	recall@5:  0.0 	recall@10:  0.5
user:  -830641

user:  -3424085881110247816 	recall@5:  0.0 	recall@10:  0.0
user:  7563882320191729616 	recall@5:  0.0 	recall@10:  0.0
user:  -5150600065224095351 	recall@5:  0.0 	recall@10:  0.0
user:  -8854674432071487111 	recall@5:  0.0 	recall@10:  0.0
user:  -4167415844282118439 	recall@5:  0.0 	recall@10:  0.0
user:  -5474198417649529244 	recall@5:  0.0 	recall@10:  0.0
user:  1037187242736800310 	recall@5:  1.0 	recall@10:  1.0
user:  3258998879061034753 	recall@5:  0.0 	recall@10:  0.0
user:  -5579363687688515285 	recall@5:  0.0 	recall@10:  0.0
user:  -5761633761733198864 	recall@5:  0.0 	recall@10:  0.0
user:  4791643529800239468 	recall@5:  0.0 	recall@10:  0.0
user:  753551422329466662 	recall@5:  0.0 	recall@10:  0.0
user:  3648848852703752699 	recall@5:  0.0 	recall@10:  0.0
user:  8389544657102033065 	recall@5:  0.0 	recall@10:  0.0
user:  1306495113310433590 	recall@5:  0.0 	recall@10:  0.0


In [21]:
# Display the overall evaluation summary for the model.
overall_results_df

Unnamed: 0,model_name,recall_top5,recall_top10
0,popularity,0.217464,0.361033


In [22]:
# Display the first 10 rows of the per-user evaluation results.
detailed_results_df.head(10)

Unnamed: 0,nm_hits_top10,nm_hits_top5,nm_interacted_items,recall_top10,recall_top5,user_id
76,35,16,192,0.182292,0.083333,3609194402293569455
17,22,6,134,0.164179,0.044776,-2626634673110551643
16,25,15,130,0.192308,0.115385,-1032019229384696495
10,8,4,117,0.068376,0.034188,-1443636648652872475
82,29,20,88,0.329545,0.227273,-2979881261169775358
161,20,10,80,0.25,0.125,-3596626804281480007
65,28,15,73,0.383562,0.205479,1116121227607581999
81,18,10,69,0.26087,0.144928,692689608292948411
106,20,13,69,0.289855,0.188406,-9016528795238256703
52,25,15,68,0.367647,0.220588,3636910968448833585
