In [1]:
import numpy as np
import scipy
import pandas as pd
import math
import random
import sklearn
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse.linalg import svds
import matplotlib.pyplot as plt

In [3]:
articles_df = pd.read_csv('C:\\Users\\ashok.kumar\\Documents\\rec systems\\shared_articles.csv\\shared_articles.csv')

In [4]:
articles_df = articles_df[articles_df['eventType'] == 'CONTENT SHARED']
# articles_df.head(5)

In [5]:
interactions_df = pd.read_csv('C:/Users/ashok.kumar/Documents/rec systems/users_interactions.csv/users_interactions.csv')
# interactions_df.head(10)
# print(interactions_df['eventType'].unique())
# interactions_df.dtypes

##Data munging
As there are different interactions types, we associate them with a weight or strength, assuming that, for example, a comment in an article indicates a higher interest of the user on the item than a like, or than a simple view.

In [6]:
event_type_strength = {
    'VIEW' : 1.0,
    'FOLLOW' : 2.0,
    'BOOKMARK' : 2.5,
    'LIKE' : 3.0,
    'COMMENT CREATED' : 4.0,
}

In [7]:
# event_type_strength.keys(), event_type_strength.values()

In [8]:
interactions_df['eventStrength'] = interactions_df['eventType'].apply(lambda x : event_type_strength[x])

In [9]:
# interactions_df.head(5)

In [10]:
users_interactions_count_df = interactions_df.groupby(['personId', 'contentId'])\
                                .size().groupby('personId').size()

In [11]:
users_interactions_count_df = interactions_df.groupby(['personId', 'contentId']).size().groupby('personId').size()
print('# users: %d' % len(users_interactions_count_df))
users_with_enough_interactions_df = users_interactions_count_df[users_interactions_count_df >= 5].reset_index()[['personId']]
print('# users with at least 5 interactions: %d' % len(users_with_enough_interactions_df))

# users: 1895
# users with at least 5 interactions: 1140


In [12]:
# users_with_enough_interactions_df.head()

In [13]:
print('#number of interactions: %d' % len(interactions_df))
interactions_from_selected_users_df = interactions_df.merge(users_with_enough_interactions_df, 
               how = 'right',
               left_on = 'personId',
               right_on = 'personId')
print('# number of interactions from users with at least 5 interactions: %d'
      % len(interactions_from_selected_users_df))

#number of interactions: 72312
# number of interactions from users with at least 5 interactions: 69868


In [14]:
len(interactions_from_selected_users_df)

69868

In [15]:
def smooth_user_preference(x):
    return math.log(1+x, 2)
    
interactions_full_df = interactions_from_selected_users_df \
                    .groupby(['personId', 'contentId'])['eventStrength'].sum() \
                    .apply(smooth_user_preference).reset_index()
print('#number of unique user/item interactions: %d' % len(interactions_full_df))
interactions_full_df.head(10)

#number of unique user/item interactions: 39106


Unnamed: 0,personId,contentId,eventStrength
0,-9223121837663643404,-8949113594875411859,1.0
1,-9223121837663643404,-8377626164558006982,1.0
2,-9223121837663643404,-8208801367848627943,1.0
3,-9223121837663643404,-8187220755213888616,1.0
4,-9223121837663643404,-7423191370472335463,3.169925
5,-9223121837663643404,-7331393944609614247,1.0
6,-9223121837663643404,-6872546942144599345,1.0
7,-9223121837663643404,-6728844082024523434,1.0
8,-9223121837663643404,-6590819806697898649,1.0
9,-9223121837663643404,-6558712014192834002,1.584963


In [16]:
    interactions_train_df, interactions_test_df = train_test_split(interactions_full_df,
                                   stratify=interactions_full_df['personId'], 
                                   test_size=0.20,
                                   random_state=42)

print('# interactions on Train set: %d' % len(interactions_train_df))
print('# interactions on Test set: %d' % len(interactions_test_df))

# interactions on Train set: 31284
# interactions on Test set: 7822


In [17]:
#Indexing by personId to speed up the searches during evaluation
interactions_full_indexed_df = interactions_full_df.set_index('personId')
interactions_train_indexed_df = interactions_train_df.set_index('personId')
interactions_test_indexed_df = interactions_test_df.set_index('personId')

In [18]:
def get_items_interacted(person_id, interactions_df):
    # Get the user's data and merge in the journal information.
    interacted_items = interactions_df.loc[person_id]['contentId']
    return set(interacted_items if type(interacted_items) == pd.Series else [interacted_items])

In [19]:
#Top-N accuracy metrics consts
EVAL_RANDOM_SAMPLE_NON_INTERACTED_ITEMS = 100

class ModelEvaluator:


    def get_not_interacted_items_sample(self, person_id, sample_size, seed=42):
        interacted_items = get_items_interacted(person_id, interactions_full_indexed_df)
        all_items = set(articles_df['contentId'])
        non_interacted_items = all_items - interacted_items

        random.seed(seed)
        non_interacted_items_sample = random.sample(non_interacted_items, sample_size)
        return set(non_interacted_items_sample)

    def _verify_hit_top_n(self, item_id, recommended_items, topn):        
            try:
                index = next(i for i, c in enumerate(recommended_items) if c == item_id)
            except:
                index = -1
            hit = int(index in range(0, topn))
            return hit, index

    def evaluate_model_for_user(self, model, person_id):
        #Getting the items in test set
        interacted_values_testset = interactions_test_indexed_df.loc[person_id]
        if type(interacted_values_testset['contentId']) == pd.Series:
            person_interacted_items_testset = set(interacted_values_testset['contentId'])
        else:
            person_interacted_items_testset = set([int(interacted_values_testset['contentId'])])  
        interacted_items_count_testset = len(person_interacted_items_testset) 

        #Getting a ranked recommendation list from a model for a given user
        person_recs_df = model.recommend_items(person_id, 
                                               items_to_ignore=get_items_interacted(person_id, 
                                                interactions_train_indexed_df), 
                                               topn=10000000000)

        hits_at_5_count = 0
        hits_at_10_count = 0
        #For each item the user has interacted in test set
        for item_id in person_interacted_items_testset:
            #Getting a random sample (100) items the user has not interacted 
            #(to represent items that are assumed to be no relevant to the user)
            non_interacted_items_sample = self.get_not_interacted_items_sample(person_id, 
                                                                          sample_size=EVAL_RANDOM_SAMPLE_NON_INTERACTED_ITEMS, 
                                                                          seed=item_id%(2**32))

            #Combining the current interacted item with the 100 random items
            items_to_filter_recs = non_interacted_items_sample.union(set([item_id]))

            #Filtering only recommendations that are either the interacted item or from a random sample of 100 non-interacted items
            valid_recs_df = person_recs_df[person_recs_df['contentId'].isin(items_to_filter_recs)]                    
            valid_recs = valid_recs_df['contentId'].values
            #Verifying if the current interacted item is among the Top-N recommended items
            hit_at_5, index_at_5 = self._verify_hit_top_n(item_id, valid_recs, 5)
            hits_at_5_count += hit_at_5
            hit_at_10, index_at_10 = self._verify_hit_top_n(item_id, valid_recs, 10)
            hits_at_10_count += hit_at_10

        #Recall is the rate of the interacted items that are ranked among the Top-N recommended items, 
        #when mixed with a set of non-relevant items
        recall_at_5 = hits_at_5_count / float(interacted_items_count_testset)
        recall_at_10 = hits_at_10_count / float(interacted_items_count_testset)

        person_metrics = {'hits@5_count':hits_at_5_count, 
                          'hits@10_count':hits_at_10_count, 
                          'interacted_count': interacted_items_count_testset,
                          'recall@5': recall_at_5,
                          'recall@10': recall_at_10}
        return person_metrics

    def evaluate_model(self, model):
        #print('Running evaluation for users')
        people_metrics = []
        for idx, person_id in enumerate(list(interactions_test_indexed_df.index.unique().values)):
            #if idx % 100 == 0 and idx > 0:
            #    print('%d users processed' % idx)
            person_metrics = self.evaluate_model_for_user(model, person_id)  
            person_metrics['_person_id'] = person_id
            people_metrics.append(person_metrics)
        print('%d users processed' % idx)

        detailed_results_df = pd.DataFrame(people_metrics) \
                            .sort_values('interacted_count', ascending=False)
        
        global_recall_at_5 = detailed_results_df['hits@5_count'].sum() / float(detailed_results_df['interacted_count'].sum())
        global_recall_at_10 = detailed_results_df['hits@10_count'].sum() / float(detailed_results_df['interacted_count'].sum())
        
        global_metrics = {'modelName': model.get_model_name(),
                          'recall@5': global_recall_at_5,
                          'recall@10': global_recall_at_10}    
        return global_metrics, detailed_results_df
    
model_evaluator = ModelEvaluator()    


#Popularity model


In [20]:
#computes the most popular items
item_popularity_df = interactions_full_df.groupby('contentId')['eventStrength'].sum().sort_values (ascending = False).reset_index()
item_popularity_df.head(10)

Unnamed: 0,contentId,eventStrength
0,-4029704725707465084,319.508482
1,-6783772548752091658,239.587417
2,-133139342397538859,234.52182
3,-8208801367848627943,202.259434
4,-6843047699859121724,198.878826
5,8224860111193157980,197.136422
6,-2358756719610361882,187.107751
7,2581138407738454418,184.46451
8,7507067965574797372,180.922283
9,1469580151036142903,174.659316


In [21]:
class PopularityRecommender:
    
    MODEL_NAME = 'Popularity'
    
    def __init__(self, popularity_df, items_df=None):
        self.popularity_df = popularity_df
        self.items_df = items_df
        
    def get_model_name(self):
        return self.MODEL_NAME
        
    def recommend_items(self, user_id, items_to_ignore=[], topn=10, verbose=False):
        # Recommend the more popular items that the user hasn't seen yet.
        recommendations_df = self.popularity_df[~self.popularity_df['contentId'].isin(items_to_ignore)] \
                               .sort_values('eventStrength', ascending = False) \
                               .head(topn)

        if verbose:
            if self.items_df is None:
                raise Exception('"items_df" is required in verbose mode')

            recommendations_df = recommendations_df.merge(self.items_df, how = 'left', 
                                                          left_on = 'contentId', 
                                                          right_on = 'contentId')[['eventStrength', 'contentId', 'title']]


        return recommendations_df
    
popularity_model = PopularityRecommender(item_popularity_df, articles_df)

In [22]:
print('Evaluating Popularity recommendation model...')
pop_global_metrics, pop_detailed_results_df = model_evaluator.evaluate_model(popularity_model)
print('\nGlobal metrics:\n%s' % pop_global_metrics)
pop_detailed_results_df.head(10)

Evaluating Popularity recommendation model...
1139 users processed

Global metrics:
{'modelName': 'Popularity', 'recall@5': 0.2412426489388903, 'recall@10': 0.37215545896190233}


Unnamed: 0,_person_id,hits@10_count,hits@5_count,interacted_count,recall@10,recall@5
76,3609194402293569455,50,25,192,0.260417,0.130208
17,-2626634673110551643,24,12,134,0.179104,0.089552
16,-1032019229384696495,24,13,130,0.184615,0.1
10,-1443636648652872475,10,5,117,0.08547,0.042735
82,-2979881261169775358,38,27,88,0.431818,0.306818
161,-3596626804281480007,19,12,80,0.2375,0.15
65,1116121227607581999,34,19,73,0.465753,0.260274
81,692689608292948411,23,18,69,0.333333,0.26087
106,-9016528795238256703,20,14,69,0.289855,0.202899
52,3636910968448833585,29,22,68,0.426471,0.323529



Here we perform the evaluation of the Popularity model, according to the method described above.
It achieved the Recall@5 of 0.2417, which means that about 24% of interacted items in test set were ranked by Popularity model among the top-5 items (from lists with 100 random items). And Recall@10 was even higher (37%), as expected.
It might be surprising to you that usually Popularity models could perform so well!

#Testing Popularity model

In [23]:
def inspect_interactions(person_id, test_set=True):
    if test_set:
        interactions_df = interactions_test_indexed_df
    else:
        interactions_df = interactions_train_indexed_df
    return interactions_df.loc[person_id].merge(articles_df, how = 'left', 
                                                      left_on = 'contentId', 
                                                      right_on = 'contentId') \
                          .sort_values('eventStrength', ascending = False)[['eventStrength', 
                                                                          'contentId',
                                                                          'title']]

In [24]:
inspect_interactions(-1479311724257856983, test_set=False).head(20)

Unnamed: 0,eventStrength,contentId,title
115,4.285402,7342707578347442862,"At eBay, Machine Learning is Driving Innovativ..."
38,4.129283,621816023396605502,AI Is Here to Help You Write Emails People Wil...
116,4.044394,-7959318068735027467,Auto-scaling scikit-learn with Spark
8,4.044394,-4460374799273064357,"Deep Learning for Chatbots, Part 1 - Introduction"
10,3.807355,2589533162305407436,6 reasons why I like KeystoneML
113,3.754888,-6467708104873171151,5 reasons your employees aren't sharing their ...
6,3.70044,-398780385766545248,10 Stats About Artificial Intelligence That Wi...
42,3.643856,-4944551138301474550,Algorithms and architecture for job recommenda...
28,3.584963,5258604889412591249,Machine Learning Is No Longer Just for Experts
41,3.584963,444378495316508239,How to choose algorithms for Microsoft Azure M...


In [25]:
# pop.to_csv(r'C:\Users\ashok.kumar\Documents\movie lensl-100k\pop.csv')

In [26]:
popularity_model.recommend_items(-1479311724257856983, topn=20, verbose=True)
# pop_ourmodel.to_csv(r'C:\Users\ashok.kumar\Documents\movie lensl-100k\pop1.csv')

Unnamed: 0,eventStrength,contentId,title
0,319.508482,-4029704725707465084,Former Google career coach shares a visual tri...
1,239.587417,-6783772548752091658,Livro: Retrospectivas Divertidas
2,234.52182,-133139342397538859,"Novo workaholic trabalha, pratica esportes e t..."
3,202.259434,-8208801367848627943,Ray Kurzweil: The world isn't getting worse - ...
4,198.878826,-6843047699859121724,"Ganhe 6 meses de acesso ao Pluralsight, maior ..."
5,197.136422,8224860111193157980,Psicóloga de Harvard diz que as pessoas julgam...
6,187.107751,-2358756719610361882,Custo do Erro - Cinco motivos para investir em...
7,184.46451,2581138407738454418,10 Modern Software Over-Engineering Mistakes
8,180.922283,7507067965574797372,Um bilhão de arquivos mostram quem vence a dis...
9,174.659316,1469580151036142903,Don't document your code. Code your documentat...


In [27]:
# Content-Based Filtering model

In [28]:
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ashok.kumar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [29]:
#Ignoring stopwords (words with no semantics) from English and Portuguese (as we have a corpus with mixed languages)
stopwords_list = stopwords.words('english') + stopwords.words('portuguese')

#Trains a model whose vectors size is 5000, composed by the main unigrams and bigrams found in the corpus, ignoring stopwords
vectorizer = TfidfVectorizer(analyzer='word',
                     ngram_range=(1, 2),
                     min_df=0.003,
                     max_df=0.5,
                     max_features=5000,
                     stop_words=stopwords_list)

item_ids = articles_df['contentId'].tolist()
tfidf_matrix = vectorizer.fit_transform(articles_df['title'] + "" + articles_df['text'])
tfidf_feature_names = vectorizer.get_feature_names()
tfidf_matrix

<3047x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 638928 stored elements in Compressed Sparse Row format>


To model the user profile, we take all the item profiles the user has interacted and average them. The average is weighted by the interaction strength, in other words, the articles the user has interacted the most (eg. liked or commented) will have a higher strength in the final user profile.

In [30]:
def get_item_profile(item_id):
    idx = item_ids.index(item_id)
    item_profile = tfidf_matrix[idx:idx+1]
    return item_profile

def get_item_profiles(ids):
    item_profiles_list = [get_item_profile(x) for x in ids]
    item_profiles = scipy.sparse.vstack(item_profiles_list)
    return item_profiles

def build_users_profile(person_id, interactions_indexed_df):
    interactions_person_df = interactions_indexed_df.loc[person_id]
    user_item_profiles = get_item_profiles(interactions_person_df['contentId'])
    
    user_item_strengths = np.array(interactions_person_df['eventStrength']).reshape(-1,1)
    #Weighted average of item profiles by the interactions strength
    user_item_strengths_weighted_avg = np.sum(user_item_profiles.multiply(user_item_strengths), axis=0) / np.sum(user_item_strengths)
    user_profile_norm = sklearn.preprocessing.normalize(user_item_strengths_weighted_avg)
    return user_profile_norm

def build_users_profiles(): 
    interactions_indexed_df = interactions_full_df[interactions_full_df['contentId'] \
                                                   .isin(articles_df['contentId'])].set_index('personId')
    user_profiles = {}
    for person_id in interactions_indexed_df.index.unique():
        user_profiles[person_id] = build_users_profile(person_id, interactions_indexed_df)
    return user_profiles

In [31]:
user_profiles = build_users_profiles()
len(user_profiles)

1140

Let's take a look in the profile. It is a unit vector of 5000 length. The value in each position represents how relevant is a token (unigram or bigram) for me.
Looking below profile, it appears that the top relevant tokens really represent his professional interests in machine learning, deep learning, artificial intelligence and google cloud platform! So we might expect good recommendations here!

In [32]:
myprofile = user_profiles[-1479311724257856983]
print(myprofile.shape)
pd.DataFrame(sorted(zip(tfidf_feature_names, 
                        user_profiles[-1479311724257856983].flatten().tolist()), key=lambda x: -x[1])[:20],
             columns=['token', 'relevance'])

(1, 5000)


Unnamed: 0,token,relevance
0,learning,0.306565
1,machine learning,0.256911
2,machine,0.247312
3,google,0.208566
4,data,0.171981
5,ai,0.137154
6,algorithms,0.10198
7,graph,0.09697
8,like,0.096702
9,language,0.083541


In [71]:
class ContentBasedRecommender:
    
    MODEL_NAME = 'Content-Based'
    
    def __init__(self, items_df=None):
        self.item_ids = item_ids
        self.items_df = items_df
        
    def get_model_name(self):
        return self.MODEL_NAME
        
    def _get_similar_items_to_user_profile(self, person_id, topn=1000):
        #Computes the cosine similarity between the user profile and all item profiles
        cosine_similarities = cosine_similarity(user_profiles[person_id], tfidf_matrix)
        #Gets the top similar items
        similar_indices = cosine_similarities.argsort().flatten()[-topn:]
        #Sort the similar items by similarity
        similar_items = sorted([(item_ids[i], cosine_similarities[0,i]) for i in similar_indices], key=lambda x: -x[1])
        return similar_items
        
    def recommend_items(self, user_id, items_to_ignore=[], topn=10, verbose=False):
        similar_items = self._get_similar_items_to_user_profile(user_id)
        #Ignores items the user has already interacted
        similar_items_filtered = list(filter(lambda x: x[0] not in items_to_ignore, similar_items))
        
        recommendations_df = pd.DataFrame(similar_items_filtered, columns=['contentId', 'recStrength']) \
                                    .head(topn)

        if verbose:
            if self.items_df is None:
                raise Exception('"items_df" is required in verbose mode')

            recommendations_df = recommendations_df.merge(self.items_df, how = 'left', 
                                                          left_on = 'contentId', 
                                                          right_on = 'contentId')[['recStrength', 'contentId', 'title', 'url', 'lang']]


        return recommendations_df
    
content_based_recommender_model = ContentBasedRecommender(articles_df)

In [34]:
content_based_recommender_model.recommend_items(-1479311724257856983,verbose=True)

Unnamed: 0,recStrength,title
0,0.687853,"How Google is Remaking Itself as a ""Machine Le..."
1,0.6858,"How Google is Remaking Itself as a ""Machine Le..."
2,0.628616,Machine Learning for Designers
3,0.589478,Machine Learning Is No Longer Just for Experts
4,0.581298,How real businesses are using machine learning
5,0.570813,5 Skills You Need to Become a Machine Learning...
6,0.570047,Building AI Is Hard-So Facebook Is Building AI...
7,0.560628,Is machine learning the next commodity?
8,0.558866,Machine Learning as a Service: How Data Scienc...
9,0.552942,Google's Cloud Machine Learning service is now...


Yay! With personalized recommendations of content-based filtering model, we have a jump on Recall@5 to about 0.4145, which means that about 41% of interacted items in test set were ranked by this model among the top-5 items (from lists with 100 random items).
And Recall@10 was 0.5241 (52%).

In [35]:
print('Evaluating Content-Based Filtering model...')
cb_global_metrics, cb_detailed_results_df = model_evaluator.evaluate_model(content_based_recommender_model)
print('\nGlobal metrics:\n%s' % cb_global_metrics)
cb_detailed_results_df.head(10)

Evaluating Content-Based Filtering model...
1139 users processed

Global metrics:
{'modelName': 'Content-Based', 'recall@5': 0.41447200204551266, 'recall@10': 0.5236512400920481}


Unnamed: 0,_person_id,hits@10_count,hits@5_count,interacted_count,recall@10,recall@5
76,3609194402293569455,26,16,192,0.135417,0.083333
17,-2626634673110551643,35,21,134,0.261194,0.156716
16,-1032019229384696495,35,22,130,0.269231,0.169231
10,-1443636648652872475,54,34,117,0.461538,0.290598
82,-2979881261169775358,15,8,88,0.170455,0.090909
161,-3596626804281480007,23,14,80,0.2875,0.175
65,1116121227607581999,15,10,73,0.205479,0.136986
81,692689608292948411,20,11,69,0.289855,0.15942
106,-9016528795238256703,10,4,69,0.144928,0.057971
52,3636910968448833585,11,3,68,0.161765,0.044118


Collaborative Filtering (CF) has two main implementation strategies: 
Memory-based: This approach uses the memory of previous users interactions to compute users similarities based on items they've interacted (user-based approach) or compute items similarities based on the users that have interacted with them (item-based approach).
A typical example of this approach is User Neighbourhood-based CF, in which the top-N similar users (usually computed using Pearson correlation) for a user are selected and used to recommend items those similar users liked, but the current user have not interacted yet. This approach is very simple to implement, but usually do not scale well for many users. 
Model-based: This approach, models are developed using different machine learning algorithms to recommend items to users. There are many model-based CF algorithms, like neural networks, bayesian networks, clustering models, and latent factor models such as Singular Value Decomposition (SVD) and, probabilistic latent semantic analysis.

Latent factor models compress user-item matrix into a low-dimensional representation in terms of latent factors. One advantage of using this approach is that instead of having a high dimensional matrix containing abundant number of missing values we will be dealing with a much smaller matrix in lower-dimensional space.
A reduced presentation could be utilized for either user-based or item-based neighborhood algorithms that are presented in the previous section. There are several advantages with this paradigm. It handles the sparsity of the original matrix better than memory based ones. Also comparing similarity on the resulting matrix is much more scalable especially in dealing with large sparse datasets. 

In [51]:
#Creating a sparse pivot table with users in rows and items in columns
users_items_pivot_matrix_df = interactions_train_df.pivot(index='personId', 
                                                          columns='contentId', 
                                                          values='eventStrength').fillna(0)

users_items_pivot_matrix_df.head(10)

contentId,-9222795471790223670,-9216926795620865886,-9194572880052200111,-9192549002213406534,-9190737901804729417,-9189659052158407108,-9176143510534135851,-9172673334835262304,-9171475473795142532,-9166778629773133902,...,9191014301634017491,9207286802575546269,9208127165664287660,9209629151177723638,9209886322932807692,9213260650272029784,9215261273565326920,9217155070834564627,9220445660318725468,9222265156747237864
personId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-9223121837663643404,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-9212075797126931087,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-9207251133131336884,0.0,2.321928,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-9199575329909162940,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-9196668942822132778,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-9188188261933657343,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-9172914609055320039,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-9156344805277471150,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-9120685872592674274,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-9109785559521267180,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [52]:
users_items_pivot_matrix = users_items_pivot_matrix_df.as_matrix()
users_items_pivot_matrix[:10]

  """Entry point for launching an IPython kernel.


array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 2.32192809, 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [53]:
users_ids = list(users_items_pivot_matrix_df.index)
users_ids[:10]

[-9223121837663643404,
 -9212075797126931087,
 -9207251133131336884,
 -9199575329909162940,
 -9196668942822132778,
 -9188188261933657343,
 -9172914609055320039,
 -9156344805277471150,
 -9120685872592674274,
 -9109785559521267180]

In [54]:
#The number of factors to factor the user-item matrix.
NUMBER_OF_FACTORS_MF = 15
#Performs matrix factorization of the original user item matrix
U, sigma, Vt = svds(users_items_pivot_matrix, k = NUMBER_OF_FACTORS_MF)

In [55]:
U.shape

(1140, 15)

In [56]:
Vt.shape

(15, 2926)

In [57]:
sigma = np.diag(sigma)
sigma.shape

(15, 15)

After the factorization, we try to to reconstruct the original matrix by multiplying its factors. The resulting matrix is not sparse any more. It was generated predictions for items the user have not yet interaction, which we will exploit for recommendations.

In [58]:
all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt) 
all_user_predicted_ratings

array([[ 0.00484953,  0.00029194, -0.02303978, ...,  0.00139588,
         0.00969391,  0.00409157],
       [-0.00045612, -0.00023018, -0.0019307 , ...,  0.00246956,
         0.00157459, -0.0019201 ],
       [-0.01016898,  0.0077295 , -0.0091188 , ...,  0.0038174 ,
        -0.01362765,  0.01276193],
       ...,
       [-0.02558305,  0.00937477, -0.02375579, ..., -0.01204404,
        -0.00590814,  0.01292988],
       [-0.02070299,  0.00445527,  0.00958406, ...,  0.00206279,
        -0.00276032, -0.00489968],
       [-0.02056192,  0.00371639,  0.14170166, ..., -0.01236381,
         0.06547245,  0.01149071]])

In [59]:
#Converting the reconstructed matrix back to a Pandas dataframe
cf_preds_df = pd.DataFrame(all_user_predicted_ratings, columns = users_items_pivot_matrix_df.columns, index=users_ids).transpose()
cf_preds_df.head(10)

Unnamed: 0_level_0,-9223121837663643404,-9212075797126931087,-9207251133131336884,-9199575329909162940,-9196668942822132778,-9188188261933657343,-9172914609055320039,-9156344805277471150,-9120685872592674274,-9109785559521267180,...,9105269044962898535,9109075639526981934,9135582630122950040,9137372837662939523,9148269800512008413,9165571805999894845,9187866633451383747,9191849144618614467,9199170757466086545,9210530975708218054
contentId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-9222795471790223670,0.00485,-0.000456,-0.010169,0.058784,-0.00978,-0.004324,-0.004453,0.045504,-0.001874,-0.01894,...,-0.004182,-0.105448,0.000979,0.01413,0.031989,-0.008898,-0.018234,-0.025583,-0.020703,-0.020562
-9216926795620865886,0.000292,-0.00023,0.00773,-0.000269,0.000119,0.000272,0.000109,0.00072,0.002665,0.001426,...,-0.000224,0.014717,0.000548,0.001504,0.016964,0.001024,0.002117,0.009375,0.004455,0.003716
-9194572880052200111,-0.02304,-0.001931,-0.009119,-0.003649,0.024948,-0.00172,0.030106,-0.015879,-0.022432,0.004695,...,0.011473,0.036327,0.011388,0.020062,0.176805,0.021631,0.025802,-0.023756,0.009584,0.141702
-9192549002213406534,0.038676,0.000897,-0.035799,-0.006191,0.018377,0.0046,0.014409,0.054316,0.049604,0.004401,...,0.016444,0.294301,0.006416,-0.010134,0.038508,0.012372,0.009922,-0.018816,0.022117,0.106348
-9190737901804729417,0.019836,-0.00645,0.011248,0.007434,-0.003242,0.002913,0.00235,0.004552,-0.030522,0.020258,...,0.002669,0.011687,0.002925,1.2e-05,-0.047209,-0.001835,-0.000264,0.009215,0.005795,-0.022221
-9189659052158407108,0.005947,0.006662,0.002814,-0.003675,0.008263,-0.001208,0.003291,0.002198,0.022154,0.02083,...,0.023496,0.088919,-0.001883,0.002615,0.098591,0.007871,0.014248,0.017763,0.021165,0.067955
-9176143510534135851,0.040836,0.007425,0.013606,0.011935,-0.003528,-0.004621,0.015342,0.023478,0.066618,0.058955,...,-0.001988,0.063974,-0.017215,0.002521,0.006707,-0.001435,0.004142,0.04024,0.029514,0.050995
-9172673334835262304,0.004451,0.00065,0.002204,-0.000824,0.000413,0.000522,0.001259,0.002071,-0.002745,0.003111,...,0.004687,0.018105,0.000342,-0.000935,-0.00655,0.000222,0.002559,0.002473,0.006003,-0.001903
-9171475473795142532,0.030142,-0.000801,0.000428,-0.006268,0.002805,0.001227,0.011491,-0.00314,-0.011298,0.026195,...,0.012625,0.027591,0.000414,1.6e-05,-0.027984,0.002073,0.01067,0.000198,0.016424,-0.007493
-9166778629773133902,0.009544,-0.002217,-0.014063,-0.002275,0.005068,-4e-05,0.001485,0.004248,0.012544,-0.003614,...,0.001668,0.062148,0.0009,-0.003938,0.00983,0.003576,0.002912,-0.008748,0.0002,0.021179


In [60]:
len(cf_preds_df.columns)

1140

In [61]:
class CFRecommender:
    
    MODEL_NAME = 'Collaborative Filtering'
    
    def __init__(self, cf_predictions_df, items_df=None):
        self.cf_predictions_df = cf_predictions_df
        self.items_df = items_df
        
    def get_model_name(self):
        return self.MODEL_NAME
        
    def recommend_items(self, user_id, items_to_ignore=[], topn=10, verbose=False):
        # Get and sort the user's predictions
        sorted_user_predictions = self.cf_predictions_df[user_id].sort_values(ascending=False) \
                                    .reset_index().rename(columns={user_id: 'recStrength'})

        # Recommend the highest predicted rating movies that the user hasn't seen yet.
        recommendations_df = sorted_user_predictions[~sorted_user_predictions['contentId'].isin(items_to_ignore)] \
                               .sort_values('recStrength', ascending = False) \
                               .head(topn)

        if verbose:
            if self.items_df is None:
                raise Exception('"items_df" is required in verbose mode')

            recommendations_df = recommendations_df.merge(self.items_df, how = 'left', 
                                                          left_on = 'contentId', 
                                                          right_on = 'contentId')[['recStrength', 'contentId', 'title', 'url', 'lang']]


        return recommendations_df
    
cf_recommender_model = CFRecommender(cf_preds_df, articles_df)

In [62]:
print('Evaluating Collaborative Filtering (SVD Matrix Factorization) model...')
cf_global_metrics, cf_detailed_results_df = model_evaluator.evaluate_model(cf_recommender_model)
print('\nGlobal metrics:\n%s' % cf_global_metrics)
cf_detailed_results_df.head(10)

Evaluating Collaborative Filtering (SVD Matrix Factorization) model...
1139 users processed

Global metrics:
{'modelName': 'Collaborative Filtering', 'recall@5': 0.33252365124009203, 'recall@10': 0.4656098184607517}


Unnamed: 0,_person_id,hits@10_count,hits@5_count,interacted_count,recall@10,recall@5
76,3609194402293569455,41,19,192,0.213542,0.098958
17,-2626634673110551643,55,30,134,0.410448,0.223881
16,-1032019229384696495,34,15,130,0.261538,0.115385
10,-1443636648652872475,47,40,117,0.401709,0.34188
82,-2979881261169775358,50,39,88,0.568182,0.443182
161,-3596626804281480007,33,21,80,0.4125,0.2625
65,1116121227607581999,31,25,73,0.424658,0.342466
81,692689608292948411,23,15,69,0.333333,0.217391
106,-9016528795238256703,27,20,69,0.391304,0.289855
52,3636910968448833585,32,22,68,0.470588,0.323529


Evaluating the Collaborative Filtering model (SVD matrix factorization), we observe that we got Recall@5 (33%) and Recall@10 (46%) values higher than Popularity model, but lower than Content-Based model.
It appears that for this dataset, Content-Based approach is being benefited by the rich item attributes (text) for a better modeling of users preferences.


# Hybrid Recommender¶

What if we combine Collaborative Filtering and Content-Based Filtering approaches?
Would that provide us with more accurate recommendations?
In fact, hybrid methods have performed better than individual approaches in many studies and have being extensively used by researchers and practioners.
Let's build a simple hybridization method, by only multiply the CF score with the Content-Based score, and ranking by resulting score.

In [74]:
class HybridRecommender:
    
    MODEL_NAME = 'Hybrid'
    
    def __init__(self, cb_rec_model, cf_rec_model, items_df):
        self.cb_rec_model = cb_rec_model
        self.cf_rec_model = cf_rec_model
        self.items_df = items_df
        
    def get_model_name(self):
        return self.MODEL_NAME
        
    def recommend_items(self, user_id, items_to_ignore=[], topn=10, verbose=False):
        #Getting the top-1000 Content-based filtering recommendations
        cb_recs_df = self.cb_rec_model.recommend_items(user_id, items_to_ignore=items_to_ignore, verbose=verbose,
                                                           topn=1000).rename(columns={'recStrength': 'recStrengthCB'})
        
        #Getting the top-1000 Collaborative filtering recommendations
        cf_recs_df = self.cf_rec_model.recommend_items(user_id, items_to_ignore=items_to_ignore, verbose=verbose, 
                                                           topn=1000).rename(columns={'recStrength': 'recStrengthCF'})
        
        #Combining the results by contentId
        recs_df = cb_recs_df.merge(cf_recs_df,
                                   how = 'inner', 
                                   left_on = 'contentId', 
                                   right_on = 'contentId')
        
        #Computing a hybrid recommendation score based on CF and CB scores
        recs_df['recStrengthHybrid'] = recs_df['recStrengthCB'] * recs_df['recStrengthCF']
        
        #Sorting recommendations by hybrid score
        recommendations_df = recs_df.sort_values('recStrengthHybrid', ascending=False).head(topn)

        if verbose:
            if self.items_df is None:
                raise Exception('"items_df" is required in verbose mode')

            recommendations_df = recommendations_df.merge(self.items_df, how = 'left', 
                                                          left_on = 'contentId', 
                                                          right_on = 'contentId')[['recStrengthHybrid', 'contentId', 'title', 'url', 'lang']]


        return recommendations_df
    
hybrid_recommender_model = HybridRecommender(content_based_recommender_model, cf_recommender_model, articles_df)

In [75]:
print('Evaluating Hybrid model...')
hybrid_global_metrics, hybrid_detailed_results_df = model_evaluator.evaluate_model(hybrid_recommender_model)
print('\nGlobal metrics:\n%s' % hybrid_global_metrics)
hybrid_detailed_results_df.head(10)

Evaluating Hybrid model...
1139 users processed

Global metrics:
{'modelName': 'Hybrid', 'recall@5': 0.43300946049603684, 'recall@10': 0.5370749169010484}


Unnamed: 0,_person_id,hits@10_count,hits@5_count,interacted_count,recall@10,recall@5
76,3609194402293569455,41,25,192,0.213542,0.130208
17,-2626634673110551643,58,38,134,0.432836,0.283582
16,-1032019229384696495,36,26,130,0.276923,0.2
10,-1443636648652872475,54,36,117,0.461538,0.307692
82,-2979881261169775358,31,25,88,0.352273,0.284091
161,-3596626804281480007,29,20,80,0.3625,0.25
65,1116121227607581999,23,16,73,0.315068,0.219178
81,692689608292948411,20,13,69,0.289855,0.188406
106,-9016528795238256703,20,13,69,0.289855,0.188406
52,3636910968448833585,19,17,68,0.279412,0.25



Our simple hybrid approach surpasses Content-Based filtering with its combination with Collaborative Filtering. Now we have a Recall@5 of 43% and Recall@10 of 53%


# Comparing the methods


In [76]:
global_metrics_df = pd.DataFrame([pop_global_metrics, cf_global_metrics, cb_global_metrics, hybrid_global_metrics]) \
                        .set_index('modelName')
global_metrics_df

Unnamed: 0_level_0,recall@10,recall@5
modelName,Unnamed: 1_level_1,Unnamed: 2_level_1
Popularity,0.372155,0.241243
Collaborative Filtering,0.46561,0.332524
Content-Based,0.523651,0.414472
Hybrid,0.537075,0.433009


In [77]:
inspect_interactions(-1479311724257856983, test_set=False).head(20)

Unnamed: 0,eventStrength,contentId,title
115,4.285402,7342707578347442862,"At eBay, Machine Learning is Driving Innovativ..."
38,4.129283,621816023396605502,AI Is Here to Help You Write Emails People Wil...
116,4.044394,-7959318068735027467,Auto-scaling scikit-learn with Spark
8,4.044394,-4460374799273064357,"Deep Learning for Chatbots, Part 1 - Introduction"
10,3.807355,2589533162305407436,6 reasons why I like KeystoneML
113,3.754888,-6467708104873171151,5 reasons your employees aren't sharing their ...
6,3.70044,-398780385766545248,10 Stats About Artificial Intelligence That Wi...
42,3.643856,-4944551138301474550,Algorithms and architecture for job recommenda...
28,3.584963,5258604889412591249,Machine Learning Is No Longer Just for Experts
41,3.584963,444378495316508239,How to choose algorithms for Microsoft Azure M...


In [78]:
content_based_recommender_model.recommend_items(-1479311724257856983,verbose=True)

Unnamed: 0,recStrength,contentId,title,url,lang
0,0.687853,5250363310227021277,"How Google is Remaking Itself as a ""Machine Le...",https://backchannel.com/how-google-is-remaking...,en
1,0.6858,-7126520323752764957,"How Google is Remaking Itself as a ""Machine Le...",https://backchannel.com/how-google-is-remaking...,en
2,0.628616,638282658987724754,Machine Learning for Designers,https://www.oreilly.com/learning/machine-learn...,en
3,0.589478,5258604889412591249,Machine Learning Is No Longer Just for Experts,https://hbr.org/2016/10/machine-learning-is-no...,en
4,0.581298,-8068727428160395745,How real businesses are using machine learning,https://techcrunch.com/2016/03/19/how-real-bus...,en
5,0.570813,2220561310072186802,5 Skills You Need to Become a Machine Learning...,http://blog.udacity.com/2016/04/5-skills-you-n...,en
6,0.570047,-229081393244987789,Building AI Is Hard-So Facebook Is Building AI...,http://www.wired.com/2016/05/facebook-trying-c...,en
7,0.560628,54678605145828343,Is machine learning the next commodity?,http://readwrite.com/2016/04/18/machine-learni...,en
8,0.558866,-4571929941432664145,Machine Learning as a Service: How Data Scienc...,http://www.huffingtonpost.com/laura-dambrosio/...,en
9,0.552942,-9033211547111606164,Google's Cloud Machine Learning service is now...,https://techcrunch.com/2016/09/29/googles-clou...,en


In [79]:
cf_recommender_model.recommend_items(-1479311724257856983, topn=20, verbose=True)

Unnamed: 0,recStrength,contentId,title,url,lang
0,1.047726,-8085935119790093311,Graph Capabilities with the Elastic Stack,https://www.elastic.co/webinars/sneak-peek-of-...,en
1,0.975446,3269302169678465882,The barbell effect of machine learning.,http://techcrunch.com/2016/06/02/the-barbell-e...,en
2,0.935801,1005751836898964351,Seria Stranger Things uma obra de arte do algo...,https://www.linkedin.com/pulse/seria-stranger-...,pt
3,0.933358,-6727357771678896471,This Super Accurate Portrait Selection Tech Us...,http://petapixel.com/2016/06/29/super-accurate...,en
4,0.886154,-8377626164558006982,Bad Writing Is Destroying Your Company's Produ...,https://hbr.org/2016/09/bad-writing-is-destroy...,en
5,0.877353,-5253644367331262405,"Hello, TensorFlow!",https://www.oreilly.com/learning/hello-tensorflow,en
6,0.87218,7395435905985567130,The AI business landscape,https://www.oreilly.com/ideas/the-ai-business-...,en
7,0.868841,-8190931845319543363,Machine Learning Is At The Very Peak Of Its Hy...,https://arc.applause.com/2016/08/17/gartner-hy...,en
8,0.857511,1549650080907932816,Spark comparison: AWS vs. GCP,https://www.oreilly.com/ideas/spark-comparison...,en
9,0.832094,5092635400707338872,Power to the People: How One Unknown Group of ...,https://medium.com/@atduskgreg/power-to-the-pe...,en


In [80]:
hybrid_recommender_model.recommend_items(-1479311724257856983, topn=20, verbose=True)

Unnamed: 0,recStrengthHybrid,contentId,title,url,lang
0,0.480138,3269302169678465882,The barbell effect of machine learning.,http://techcrunch.com/2016/06/02/the-barbell-e...,en
1,0.421791,5092635400707338872,Power to the People: How One Unknown Group of ...,https://medium.com/@atduskgreg/power-to-the-pe...,en
2,0.407698,5258604889412591249,Machine Learning Is No Longer Just for Experts,https://hbr.org/2016/10/machine-learning-is-no...,en
3,0.370506,-9033211547111606164,Google's Cloud Machine Learning service is now...,https://techcrunch.com/2016/09/29/googles-clou...,en
4,0.347416,5250363310227021277,"How Google is Remaking Itself as a ""Machine Le...",https://backchannel.com/how-google-is-remaking...,en
5,0.321695,-5756697018315640725,Being A Developer After 40 - Free Code Camp,https://medium.freecodecamp.com/being-a-develo...,en
6,0.312273,1415230502586719648,Machine Learning Is Redefining The Enterprise ...,http://www.forbes.com/sites/louiscolumbus/2016...,en
7,0.310754,7395435905985567130,The AI business landscape,https://www.oreilly.com/ideas/the-ai-business-...,en
8,0.302504,-7126520323752764957,"How Google is Remaking Itself as a ""Machine Le...",https://backchannel.com/how-google-is-remaking...,en
9,0.289275,-8190931845319543363,Machine Learning Is At The Very Peak Of Its Hy...,https://arc.applause.com/2016/08/17/gartner-hy...,en
