In [1]:
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
import random

In [2]:
shared_articles_df = pd.read_csv('shared_articles.csv')
shared_articles_df = shared_articles_df[shared_articles_df['eventType'] == 'CONTENT SHARED']
shared_articles_df.head(2)

Unnamed: 0,timestamp,eventType,contentId,authorPersonId,authorSessionId,authorUserAgent,authorRegion,authorCountry,contentType,url,title,text,lang
1,1459193988,CONTENT SHARED,-4110354420726924665,4340306774493623681,8940341205206233829,,,,HTML,http://www.nytimes.com/2016/03/28/business/dea...,"Ethereum, a Virtual Currency, Enables Transact...",All of this work is still very early. The firs...,en
2,1459194146,CONTENT SHARED,-7292285110016212249,4340306774493623681,8940341205206233829,,,,HTML,http://cointelegraph.com/news/bitcoin-future-w...,Bitcoin Future: When GBPcoin of Branson Wins O...,The alarm clock wakes me at 8:00 with stream o...,en


In [3]:
interactions_df = pd.read_csv('users_interactions.csv')
interactions_df.head(2)

Unnamed: 0,timestamp,eventType,contentId,personId,sessionId,userAgent,userRegion,userCountry
0,1465413032,VIEW,-3499919498720038879,-8845298781299428018,1264196770339959068,,,
1,1465412560,VIEW,8890720798209849691,-1032019229384696495,3621737643587579081,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2...,NY,US


In [4]:
print('Number of interactions:', len(interactions_df))

Number of interactions: 72312


In [5]:
popularity = {'VIEW': 1.0,'LIKE': 4.0, 'BOOKMARK': 100, 'FOLLOW': 25.0,'COMMENT CREATED': 10.0}
interactions_df['popularity'] = interactions_df['eventType'].apply(lambda x: popularity[x])
interactions_df.head()

Unnamed: 0,timestamp,eventType,contentId,personId,sessionId,userAgent,userRegion,userCountry,popularity
0,1465413032,VIEW,-3499919498720038879,-8845298781299428018,1264196770339959068,,,,1.0
1,1465412560,VIEW,8890720798209849691,-1032019229384696495,3621737643587579081,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2...,NY,US,1.0
2,1465416190,VIEW,310515487419366995,-1130272294246983140,2631864456530402479,,,,1.0
3,1465413895,FOLLOW,310515487419366995,344280948527967603,-3167637573980064150,,,,25.0
4,1465412290,VIEW,-7820640624231356730,-445337111692715325,5611481178424124714,,,,1.0


In [6]:
distinct_users_interactions = interactions_df.groupby(['personId', 'contentId']).size().groupby('personId').size()
len(distinct_users_interactions)

1895

In [7]:
#Avoiding cold-start
users_wt_cold_strt = distinct_users_interactions[distinct_users_interactions >5].reset_index()[['personId']]
len(users_wt_cold_strt)

1041

In [8]:
min_5_interactions_df = interactions_df.merge(users_wt_cold_strt, how = 'right', left_on = 'personId', right_on = 'personId')
print('Number of interactions from users with more than 5 interactions:', len(min_5_interactions_df))

Number of interactions from users with more than 5 interactions: 69081


<B>aggregate all the interactions the user has performed in an item

In [9]:
def logarithmic_interaction(x):
    return math.log(1+x, 2)
    
grouped_interactions_df = min_5_interactions_df.groupby(['personId', 'contentId'])['popularity'].sum()
print(grouped_interactions_df[:10])
grouped_interactions_df = min_5_interactions_df.groupby(['personId', 'contentId'])['popularity'].sum().apply(logarithmic_interaction).reset_index()
print('Number of unique user/item interactions:',len(grouped_interactions_df))
grouped_interactions_df.head(10)

personId              contentId           
-9223121837663643404  -8949113594875411859    1.0
                      -8377626164558006982    1.0
                      -8208801367848627943    1.0
                      -8187220755213888616    1.0
                      -7423191370472335463    8.0
                      -7331393944609614247    1.0
                      -6872546942144599345    1.0
                      -6728844082024523434    1.0
                      -6590819806697898649    1.0
                      -6558712014192834002    2.0
Name: popularity, dtype: float64
Number of unique user/item interactions: 38611


Unnamed: 0,personId,contentId,popularity
0,-9223121837663643404,-8949113594875411859,1.0
1,-9223121837663643404,-8377626164558006982,1.0
2,-9223121837663643404,-8208801367848627943,1.0
3,-9223121837663643404,-8187220755213888616,1.0
4,-9223121837663643404,-7423191370472335463,3.169925
5,-9223121837663643404,-7331393944609614247,1.0
6,-9223121837663643404,-6872546942144599345,1.0
7,-9223121837663643404,-6728844082024523434,1.0
8,-9223121837663643404,-6590819806697898649,1.0
9,-9223121837663643404,-6558712014192834002,1.584963


In [10]:
from sklearn.model_selection import train_test_split
interactions_train, interactions_test = train_test_split(grouped_interactions_df, test_size=0.20,random_state=42)

<B><P>To Speed up search we set an Index

In [11]:
grouped_interactions_df = grouped_interactions_df.set_index('personId')
interactions_train = interactions_train.set_index('personId')
interactions_test = interactions_test.set_index('personId')

<B><P> Sparse pivot table with PersonID in rows and ContentID in columns

In [12]:
#Pivot
pivot_person_content = interactions_train.pivot(columns='contentId', values='popularity').fillna(0)
pivot_person_content.head(5)

contentId,-9222795471790223670,-9216926795620865886,-9194572880052200111,-9192549002213406534,-9190737901804729417,-9189659052158407108,-9184137057748005562,-9176143510534135851,-9172673334835262304,-9171475473795142532,...,9191014301634017491,9207286802575546269,9208127165664287660,9209629151177723638,9209886322932807692,9213260650272029784,9215261273565326920,9217155070834564627,9220445660318725468,9222265156747237864
personId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-9223121837663643404,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-9207251133131336884,0.0,2.584963,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-9199575329909162940,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-9196668942822132778,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-9188188261933657343,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
pivot_person_content_val = pivot_person_content.values
pivot_person_content_val[:10]

array([[0.       , 0.       , 0.       , ..., 0.       , 0.       ,
        0.       ],
       [0.       , 2.5849625, 0.       , ..., 0.       , 0.       ,
        0.       ],
       [0.       , 0.       , 0.       , ..., 0.       , 0.       ,
        0.       ],
       ...,
       [0.       , 0.       , 0.       , ..., 0.       , 0.       ,
        0.       ],
       [0.       , 0.       , 0.       , ..., 0.       , 0.       ,
        0.       ],
       [0.       , 0.       , 0.       , ..., 0.       , 0.       ,
        0.       ]])

In [14]:
#get distinct user_ids
users_ids = list(pivot_person_content.index)
users_ids[:10]

[-9223121837663643404,
 -9207251133131336884,
 -9199575329909162940,
 -9196668942822132778,
 -9188188261933657343,
 -9172914609055320039,
 -9156344805277471150,
 -9120685872592674274,
 -9109785559521267180,
 -9063420486253202900]

In [15]:
from scipy.sparse import csr_matrix
pivot_person_content_val_sparse = csr_matrix(pivot_person_content_val)
pivot_person_content_val_sparse.shape

(1041, 2915)

In [16]:
from scipy.sparse.linalg import svds
#The number of factors in matrix factorization
NUMBER_OF_FACTORS_MF = 15
#matrix factorization with svd
U, S, Vt = svds(pivot_person_content_val_sparse, k = NUMBER_OF_FACTORS_MF)

In [17]:
U.shape#nxr

(1041, 15)

In [18]:
S = np.diag(S)#rxr
S.shape

(15, 15)

In [19]:
Vt.shape#rxk

(15, 2915)

In [20]:
person_ratings_dense_matrix = np.dot(np.dot(U, S), Vt)
person_ratings_dense_matrix_std = (person_ratings_dense_matrix - person_ratings_dense_matrix.min()) / (person_ratings_dense_matrix.max() - person_ratings_dense_matrix.min())
person_ratings_dense_matrix_std.shape

(1041, 2915)

In [21]:
#Putting the matrix in dataframe
person_ratings_preds_df = pd.DataFrame(person_ratings_dense_matrix_std, columns = pivot_person_content.columns, index=users_ids).transpose()
person_ratings_preds_df.head(10)

Unnamed: 0_level_0,-9223121837663643404,-9207251133131336884,-9199575329909162940,-9196668942822132778,-9188188261933657343,-9172914609055320039,-9156344805277471150,-9120685872592674274,-9109785559521267180,-9063420486253202900,...,9102085903669288476,9105269044962898535,9109075639526981934,9135582630122950040,9137372837662939523,9148269800512008413,9187866633451383747,9191849144618614467,9199170757466086545,9210530975708218054
contentId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-9222795471790223670,0.212038,0.208293,0.209888,0.207824,0.20797,0.207924,0.206895,0.208301,0.210813,0.206495,...,0.211267,0.209755,0.209424,0.208593,0.208662,0.202049,0.209813,0.208378,0.210671,0.206989
-9216926795620865886,0.208347,0.208956,0.208215,0.20842,0.208287,0.208449,0.208308,0.207797,0.208555,0.208669,...,0.208386,0.208346,0.209973,0.208365,0.20832,0.210778,0.209168,0.208552,0.208847,0.20813
-9194572880052200111,0.208901,0.201133,0.208099,0.211323,0.207633,0.211291,0.206622,0.201559,0.211515,0.210019,...,0.207086,0.207055,0.214796,0.208863,0.209822,0.22503,0.208778,0.204618,0.20811,0.230149
-9192549002213406534,0.208757,0.205917,0.207863,0.208719,0.208654,0.20907,0.211054,0.211067,0.208068,0.21025,...,0.20845,0.208757,0.215897,0.208686,0.208509,0.217842,0.211018,0.206243,0.20932,0.214268
-9190737901804729417,0.208421,0.210605,0.208254,0.208475,0.2083,0.207939,0.208564,0.20652,0.208342,0.208507,...,0.208917,0.208368,0.209766,0.208382,0.208186,0.208722,0.20838,0.209,0.208584,0.211546
-9189659052158407108,0.209871,0.209918,0.207687,0.209801,0.208283,0.208495,0.208647,0.209754,0.208455,0.209632,...,0.207881,0.208013,0.217435,0.208242,0.208387,0.213539,0.20778,0.209689,0.208765,0.223727
-9184137057748005562,0.208381,0.208459,0.208327,0.208334,0.20832,0.208344,0.208331,0.208262,0.208388,0.208356,...,0.208339,0.208341,0.208494,0.208325,0.208323,0.208399,0.208366,0.208401,0.208388,0.208332
-9176143510534135851,0.210444,0.208897,0.208921,0.208264,0.20825,0.209183,0.210127,0.206974,0.209764,0.207535,...,0.208701,0.208839,0.208362,0.208115,0.208121,0.207059,0.208533,0.208506,0.208937,0.20746
-9172673334835262304,0.208036,0.207752,0.208216,0.20817,0.208361,0.20834,0.208459,0.208353,0.20815,0.208508,...,0.208362,0.208486,0.208618,0.208356,0.208269,0.209758,0.209473,0.207948,0.208783,0.205707
-9171475473795142532,0.208938,0.209354,0.208583,0.208683,0.208428,0.208597,0.208014,0.207668,0.209499,0.210434,...,0.20856,0.208557,0.209383,0.208403,0.208457,0.209073,0.207784,0.208756,0.2082,0.211234


In [22]:
len(person_ratings_preds_df.columns)

1041

In [23]:
def interacted_content(person_id, interactions_df):
    interacted_items = interactions_df.loc[person_id]['contentId']
    return set(interacted_items if type(interacted_items) == pd.Series else [interacted_items])

In [24]:
class giv_Recommendation:
        
    def __init__(self, person_ratings_preds_df, content_df):
        self.person_ratings_preds_df = person_ratings_preds_df
        self.content_df = pd.DataFrame()
        
    def recommend_content(self, person_id, already_interacted=[], topN=10):
        sorted_predictions = self.person_ratings_preds_df[person_id].sort_values(ascending=False).reset_index().rename(columns={person_id: 'rec_recall'})
        recommendations = sorted_predictions[~sorted_predictions['contentId'].isin(already_interacted)].sort_values('rec_recall', ascending = False)[:topN]

        return recommendations

In [36]:
recommendations = giv_Recommendation(person_ratings_preds_df, shared_articles_df)

<B><P> Recall@N evaluation - Top-N accuracy metrics

In [26]:
def not_interacted_content( person_id, sample_size, seed=42):
        random.seed(seed)
        interacted_items = interacted_content(person_id, grouped_interactions_df)
        all_items = set(shared_articles_df['contentId'])
        non_interacted_items = all_items - interacted_items        
        non_interacted_items_sample = random.sample(non_interacted_items, sample_size)
        return set(non_interacted_items_sample)

In [27]:
def hits( contentId, recommendation, topN):        
    try:
        index = next(i for i, c in enumerate(recommendation) if (c == contentId))
    except:
        index = -1
    hit = int(index in range(0, topN))
    return hit

In [28]:
def evaluate_model_for_user(recommendations, person_id):
    interaction_count = 0        
    interacted_values_testset = interactions_test.loc[person_id]
    person_interacted_items_testset = set(interacted_values_testset['contentId'] if type(interacted_values_testset['contentId']) == pd.Series else [int(interacted_values_testset['contentId'])])
    tot_interactions_testset = len(person_interacted_items_testset)
    already_interacted=interacted_content(person_id, interactions_train)
    person_recs_df = recommendations.recommend_content(person_id, already_interacted, topN=10000000000)

    for content_id in person_interacted_items_testset:
        non_interacted_items_sample = not_interacted_content(person_id, sample_size=50, seed=1000)
        items_to_filter_recs = non_interacted_items_sample.union(set([content_id]))
        valid_recs_df = person_recs_df[person_recs_df['contentId'].isin(items_to_filter_recs)]                    
        valid_recs = valid_recs_df['contentId'].values
        interaction= hits(content_id, valid_recs, 10)
        interaction_count += interaction

    recall = interaction_count / float(tot_interactions_testset)
    person_metrics = {'hits':interaction_count, 'tot_interactions_testset': tot_interactions_testset, 'recall': recall}
    return person_metrics

In [29]:
def evaluate_model(recommendations):
    all_persons = []
    person_metrics = {}
    for count, person_id in enumerate(list(interactions_test.index.unique().values)):
        person_metrics['person_id'] = person_id
        person_metrics = evaluate_model_for_user(recommendations, person_id)
        all_persons.append(person_metrics)

    person_details_df = pd.DataFrame(all_persons).sort_values('tot_interactions_testset', ascending=False).reset_index()
    recall = person_details_df['hits'].sum()/float(person_details_df['tot_interactions_testset'].sum())
    return recall, person_details_df,count   

<B><P> Validation

In [30]:
metrics_measurement, person_details_df,count = evaluate_model(recommendations)
print('For',count, 'user data combined recall Value:', metrics_measurement)
person_details_df.head(10)

For 960 user data combined recall Value: 0.5708921403599637


Unnamed: 0,index,hits,tot_interactions_testset,recall,person_id
0,42,54,183,0.295082,4.227774e+18
1,53,88,154,0.571429,-2.901997e+18
2,67,76,128,0.59375,3.891638e+18
3,142,59,109,0.541284,6.464364e+18
4,70,51,87,0.586207,-8.051903e+18
5,26,49,74,0.662162,7.703285e+18
6,7,29,73,0.39726,-5.527146e+18
7,57,20,70,0.285714,9.210531e+18
8,102,58,68,0.852941,3.576138e+18
9,1,39,68,0.573529,5.030175e+18


<B><P> Test

In [41]:
def already_interacted_content(person_id):
    return interactions_train.loc[person_id].merge(shared_articles_df, how = 'left').sort_values('popularity', ascending = False).dropna(1).reset_index()

In [42]:
already_interacted_content(-9188188261933657343)

Unnamed: 0,index,contentId,popularity,timestamp,eventType,authorPersonId,authorSessionId,contentType,url,title,text,lang
0,0,-7992053163122498177,2.584963,1461780616,CONTENT SHARED,5127372011815639401,2698416824809687808,HTML,https://nodejs.org/en/blog/announcements/v6-re...,World's Fastest Growing Open Source Platform P...,"New ""Current"" version line focuses on performa...",en
1,5,3579921471626387620,1.584963,1466867555,CONTENT SHARED,-6895155480127642372,6182198327666594200,HTML,http://www.segfoco.com.br/mercado/segures/,Dicionário de Segurês,O Mercado de Seguros é prodigioso no que se re...,pt
2,1,1468327003955810686,1.0,1463693988,CONTENT SHARED,-1032019229384696495,-3222296078930623200,HTML,https://medium.com/javascript-scene/what-is-we...,What is WebAssembly? The Dawn of a New Era - J...,It's much harder to get real work done when yo...,en
3,2,4609121753781446855,1.0,1467602804,CONTENT SHARED,-3203894957285229214,1887206186402623984,HTML,http://produto.mercadolivre.com.br/MLB-7659820...,Lava-louças Brastemp Ative! 8 Serviços Blf08ab...,R$ 800 00 Entrega a combinar com o vendedor Ja...,pt
4,3,2672238531812965181,1.0,1462312977,CONTENT SHARED,-6153009241569363021,1994471746758391985,HTML,https://medium.com/@hackupstate/improving-angu...,Improving Angular performance with 1 line of code,"So I thought to myself, ""Genius dot com raised...",en
5,4,174707786647990372,1.0,1475171859,CONTENT SHARED,-8020832670974472349,4319478661599559970,HTML,https://cloudplatform.googleblog.com/2016/09/b...,Bringing Pokémon GO to life on Google Cloud,"Throughout my career as an engineer, I've had ...",en


In [43]:
recommendations.recommend_content(-9188188261933657343, topN=10)

Unnamed: 0,contentId,rec_recall
0,4084131344684656470,0.212765
1,-5756697018315640725,0.212283
2,-1111518890369033396,0.212171
3,3906974906788964502,0.211977
4,-615912190028612956,0.211827
5,1356221992133852808,0.211758
6,-5315378314308323942,0.211724
7,6583734846225935852,0.211616
8,6807042796917367736,0.211529
9,-7681408188643141872,0.211489
