In [1]:
# import important libraries 
import numpy as np
import scipy
import pandas as pd
import math
import random
import sklearn
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse.linalg import svds
import matplotlib.pyplot as plt

In [2]:
# Load the shared-articles table.
# NOTE(review): this is an absolute, machine-specific path; the notebook will not
# run elsewhere until it is made configurable.
articles_df = pd.read_csv(
    filepath_or_buffer='C:\\Users\\ashok.kumar\\Documents\\rec systems\\shared_articles.csv\\shared_articles.csv'
)

In [3]:
# Keep only the rows whose eventType is exactly 'CONTENT SHARED'.
articles_df = articles_df.loc[articles_df['eventType'].eq('CONTENT SHARED')]

0

In [4]:
# Load the user-interaction log.
# NOTE(review): absolute, machine-specific path; make it configurable before sharing.
interactions_df = pd.read_csv(
    filepath_or_buffer='C:/Users/ashok.kumar/Documents/rec systems/users_interactions.csv/users_interactions.csv'
)
# interactions_df.head(10)

In [9]:
# Count the distinct values found in each column of the interaction log.
interactions_df.nunique(axis=0)

timestamp      68772
eventType          5
contentId       2987
personId        1895
sessionId      28740
userAgent       1090
userRegion        71
userCountry       23
dtype: int64

In [5]:
# print(interactions_df['eventType'].unique())
# interactions_df.dtypes
# print(interactions_df.isnull().sum())
# len(articles_df),len(interactions_df)


## Data munging
Because there are different interaction types, we associate each of them with a weight (strength), assuming that, for example, a comment on an article indicates a higher interest of the user in the item than a like or a simple view.

In [6]:
# Weight given to each interaction type; a larger number is treated as a
# stronger signal of interest.
event_type_strength = dict([
    ('VIEW', 1.0),
    ('FOLLOW', 2.0),
    ('BOOKMARK', 2.5),
    ('LIKE', 3.0),
    ('COMMENT CREATED', 4.0),
])

In [7]:
# event_type_strength.keys(), event_type_strength.values()
def _strength_for_event(event_type):
    """Return the weight registered for `event_type`; unknown types raise KeyError."""
    return event_type_strength[event_type]


# Attach the numeric weight of each row's eventType as a new column.
interactions_df['eventStrength'] = interactions_df['eventType'].apply(_strength_for_event)
# interactions_df.head(5)

In [8]:
# Per user, count the distinct articles interacted with: the first size() collapses
# repeated (personId, contentId) rows, the second counts the remaining pairs per person.
# NOTE(review): the following cell recomputes this same value.
users_interactions_count_df = (
    interactions_df
    .groupby(['personId', 'contentId']).size()
    .groupby(level='personId').size()
)

In [9]:
# Number of distinct articles each user interacted with.
users_interactions_count_df = (
    interactions_df
    .groupby(['personId', 'contentId']).size()
    .groupby(level='personId').size()
)
# print('# users: %d' % len(users_interactions_count_df))
# Keep only the ids of users with at least 5 distinct articles, as a one-column frame.
users_with_enough_interactions_df = (
    users_interactions_count_df
    .loc[users_interactions_count_df.ge(5)]
    .reset_index()
    .loc[:, ['personId']]
)
# print('# users with at least 5 interactions: %d' % len(users_with_enough_interactions_df))

In [10]:
# users_with_enough_interactions_df.head(5)

In [11]:
# print('#number of interactions: %d' % len(interactions_df))
# Right-join the interaction log onto the selected user ids, so that only rows
# belonging to those users are kept.
interactions_from_selected_users_df = pd.merge(
    interactions_df,
    users_with_enough_interactions_df,
    how='right',
    on='personId',
)
# print('# number of interactions from users with at least 5 interactions: %d'
#       % len(interactions_from_selected_users_df))

In [12]:
def smooth_user_preference(x):
    """Compress an accumulated interaction strength with a base-2 logarithm.

    Args:
        x: Numeric strength; must be greater than -1.

    Returns:
        float: ``log2(1 + x)``, so a strength of 0 maps to 0.0.

    Raises:
        ValueError: If ``1 + x`` is not positive (math domain error).
    """
    # math.log2 is more accurate than math.log(value, 2), which divides two
    # natural logarithms and can be off by one ULP even for exact powers of two.
    return math.log2(1 + x)
    
# Total the event strengths of every (personId, contentId) pair, then pass each
# total through smooth_user_preference and flatten the result back into columns.
interactions_full_df = (
    interactions_from_selected_users_df
    .groupby(['personId', 'contentId'])['eventStrength']
    .sum()
    .apply(smooth_user_preference)
    .reset_index()
)
# print('#number of unique user/item interactions: %d' % len(interactions_full_df))
# interactions_full_df.head(10)

### Popularity model
A common (and usually hard-to-beat) baseline approach is the Popularity model. This model is not actually personalized: it simply recommends to a user the most popular items that the user has not previously consumed. Because popularity reflects the "wisdom of the crowds", it usually provides good recommendations that are interesting to most people. P.S. The main objective of a recommender system is to surface long-tail items for users with very specific interests, which goes far beyond this simple technique.

In [13]:
# Rank items by their summed eventStrength over all users, most popular first.
item_popularity_df = (
    interactions_full_df
    .groupby('contentId')['eventStrength']
    .sum()
    .sort_values(ascending=False)
    .reset_index()
)
# item_popularity_df.head(10)

In [None]:
# FIXME(review): this statement was left unfinished (nothing follows the trailing
# dot) and is a SyntaxError as written. It is commented out so the notebook can be
# run from top to bottom; complete the expression before re-enabling it.
# item_popularity_view_df = interactions_full_df.

In [19]:
class PopularityRecommender:
    """Non-personalised baseline that recommends the most popular items.

    Args:
        popularity_df: DataFrame with 'contentId' and 'eventStrength' columns.
        items_df: Optional DataFrame with 'contentId' and 'title' columns. It is
            only read when recommendations are requested with ``verbose=True``.
    """

    MODEL_NAME = 'Popularity'

    def __init__(self, popularity_df, items_df=None):
        self.popularity_df = popularity_df
        self.items_df = items_df

    def get_model_name(self):
        """Return the display name of this model."""
        return self.MODEL_NAME

    def recommend_items(self, user_id, items_to_ignore=[], topn=10, verbose=False):
        """Return the `topn` most popular items not listed in `items_to_ignore`.

        Args:
            user_id: Accepted for interface compatibility; it is not used
                because the ranking is the same for every user.
            items_to_ignore: contentId values to exclude. The default list is
                only read, never mutated.
            topn: Maximum number of rows to return.
            verbose: When True, join the item metadata and return only the
                'title' and 'eventStrength' columns.

        Returns:
            DataFrame sorted by 'eventStrength' in descending order. In
            non-verbose mode it keeps the columns of `popularity_df`.

        Raises:
            ValueError: If `verbose` is True but no `items_df` was supplied.
        """
        if verbose and self.items_df is None:
            raise ValueError('"items_df" is required in verbose mode')

        # Drop the ignored items, then keep the strongest `topn` that remain.
        is_candidate = ~self.popularity_df['contentId'].isin(items_to_ignore)
        recommendations_df = self.popularity_df[is_candidate] \
                                 .sort_values('eventStrength', ascending=False) \
                                 .head(topn)

        if verbose:
            # A left join keeps the ranking order and adds the item titles.
            recommendations_df = recommendations_df.merge(
                self.items_df, how='left', on='contentId'
            )[['title', 'eventStrength']]

        return recommendations_df
    
# Build the popularity baseline from the ranked items, with article metadata
# attached so that verbose recommendations can show titles.
popularity_model = PopularityRecommender(
    popularity_df=item_popularity_df,
    items_df=articles_df,
)

In [None]:
# def inspect_interactions(person_id, test_set=True):
#     if test_set:
#         interactions_df = interactions_test_indexed_df
#     else:
#         interactions_df = interactions_train_indexed_df
#     return interactions_df.loc[person_id].merge(articles_df, how = 'left', 
#                                                       left_on = 'contentId', 
#                                                       right_on = 'contentId') \
#                           .sort_values('eventStrength', ascending = False)[['eventStrength', 
#                                                                           'contentId',
#                                                                           'title', 'url', 'lang']]

In [26]:
# Show the type of object that recommend_items returns in verbose mode.
type(popularity_model.recommend_items(user_id=96, topn=20, verbose=True))

pandas.core.frame.DataFrame

In [27]:
# Keep the top-20 verbose recommendations for inspection in the cells below.
typeof = popularity_model.recommend_items(
    user_id=96,
    topn=20,
    verbose=True,
)

In [35]:
# Dump the recommendations as a nested {column: {row label: value}} mapping.
typeof.to_dict(orient='dict')

{'title': {0: 'Former Google career coach shares a visual trick for figuring out what to do with your life',
  1: 'Livro: Retrospectivas Divertidas',
  2: 'Novo workaholic trabalha, pratica esportes e tem tempo para a família. Conheça',
  3: "Ray Kurzweil: The world isn't getting worse - our information is getting better",
  4: 'Ganhe 6 meses de acesso ao Pluralsight, maior plataforma de treinamento online',
  5: 'Psicóloga de Harvard diz que as pessoas julgam você em segundos por esses critérios | Jornal do Empreendedor',
  6: 'Custo do Erro - Cinco motivos para investir em automação de testes',
  7: '10 Modern Software Over-Engineering Mistakes',
  8: 'Um bilhão de arquivos mostram quem vence a disputa tabs vs. espaços entre programadores',
  9: "Don't document your code. Code your documentation.",
  10: 'Seja esperto no trabalho: Melhore a comunicação na empresa com 12 robôs',
  11: 'A minha viagem à Maternidade #tetodomundo',
  12: 'UX ou UI?',
  13: 'Pull request first - Practical

In [41]:
# Title stored under row label 0 of the recommendations.
typeof.loc[0, 'title']


'Former Google career coach shares a visual trick for figuring out what to do with your life'