Files exploration code
```
# Walk the Kaggle input directory recursively and print the full path of every file found.
# NOTE(review): `os` is only imported in a later cell of this notebook — this snippet
# presumably has to run after that import; confirm.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
```

In [2]:
# !pip install pandas
# !pip install numpy
# !pip install scipy
# !pip install implicit

In [3]:
import pandas as pd
import numpy as np
import scipy.sparse as sparse
import implicit
import os

In [4]:
# -*- coding: utf-8 -*-
"""
@author: issbas
"""
import random  

# Limit DataFrame display to at most 10 columns.
pd.set_option('display.max_columns',10)

# Set the MKL / OpenBLAS thread-count environment variables to 1.
# NOTE(review): numpy is imported in an earlier cell, so an already-loaded BLAS may not
# pick these values up; they are still visible to any later check of os.environ — confirm.
os.environ['MKL_NUM_THREADS'] = '1'   # to prevent a warning message
os.environ['OPENBLAS_NUM_THREADS'] = '1'
# map the user and item names to contiguous integers and also return the maps
def maptrans(trans):
    """Replace raw user / item identifiers with contiguous 0-based integer indices.

    Parameters
    ----------
    trans : pandas.DataFrame
        Must contain the columns 'user' and 'item'. The frame is modified in
        place: both columns are overwritten with their integer index.

    Returns
    -------
    tuple
        ``(trans, umap, imap)`` where ``trans`` is the same (mutated) frame,
        ``umap`` maps raw user id -> index and ``imap`` maps raw item id -> index.
        Indices follow the sorted order of the distinct raw ids.
    """
    uniqueusers = np.sort(trans['user'].unique())
    uniqueitems = np.sort(trans['item'].unique())
    umap = dict(zip(uniqueusers, range(len(uniqueusers))))  # raw user id -> index
    imap = dict(zip(uniqueitems, range(len(uniqueitems))))  # raw item id -> index
    # Column-wise lookup: avoids materialising a Series per row as
    # DataFrame.apply(axis=1) does, which is very slow on large interaction logs.
    trans['user'] = trans['user'].map(umap)
    trans['item'] = trans['item'].map(imap)
    return (trans, umap, imap)

#return list of similar items, use the item-properties matrix (Q) to do nearest neighbour using cosine similarity
def findsimilaritems(item, item_vecs, n_similar=10):
    """Return the items most similar to ``item`` by cosine similarity of their factor vectors.

    Parameters
    ----------
    item : int
        Row index of the query item in ``item_vecs``.
    item_vecs : numpy.ndarray
        2-D array with one row of latent factors per item (the Q matrix).
    n_similar : int, default 10
        Maximum number of neighbours to return. Values larger than the number
        of items are clamped; values <= 0 give an empty list.

    Returns
    -------
    list of (index, score) tuples
        Sorted by descending cosine similarity. The query item itself is
        included (it is its own nearest neighbour).
    """
    # Vector length of every item.
    item_norms = np.sqrt((item_vecs * item_vecs).sum(axis=1))
    # An all-zero vector has norm 0; dividing by it yields NaN, and NaN is ranked
    # highest by argpartition. Its dot product with anything is 0, so dividing by 1
    # instead gives such items a similarity of 0.
    safe_norms = item_norms.copy()
    safe_norms[safe_norms == 0] = 1

    # cosine sim = A.B / (norm(A) * norm(B)). norm(B) of the query item is the same
    # for every candidate A, so it is only applied to the final top-k scores.
    simscores = item_vecs.dot(item_vecs[item]) / safe_norms

    # Clamp k: argpartition raises when k exceeds the array length, and a slice
    # of [-0:] would return every item instead of none.
    k = min(max(int(n_similar), 0), simscores.shape[0])
    if k == 0:
        return []

    # Unordered top-k, then an explicit sort of just those k entries.
    top_idx = np.argpartition(simscores, -k)[-k:]
    similar = sorted(zip(top_idx, simscores[top_idx] / safe_norms[item]), key=lambda x: -x[1])
    return similar

#return the top 10 recommendations chosen based on the person / content vectors 
#for contents never interacted with for any given person.
def recommend(user, sparse_user_item, userprefs, itemprops, num_items=10):
    """Recommend items the user has not interacted with yet, ranked by predicted score.

    Parameters
    ----------
    user : int
        Row index of the target user.
    sparse_user_item : scipy.sparse matrix
        User x item rating matrix; non-zero entries mark items already rated.
    userprefs : numpy.ndarray
        User factor matrix (P), one row per user.
    itemprops : numpy.ndarray
        Item factor matrix (Q), one row per item.
    num_items : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    list of (item_index, predicted_score) tuples
        In descending order of predicted score. Contains fewer than
        ``num_items`` entries when the user has fewer unrated items than that.
    """
    # True for every item the user already has a rating for.
    already_rated = sparse_user_item[user, :].toarray().reshape(-1) != 0

    # Predicted score for every item ~ P[user] * transpose(Q)
    predrats = userprefs[user, :].dot(itemprops.T)

    # Rank rated items strictly last. Zeroing their score (multiplying by 0) is not
    # enough: unrated items with a negative prediction would then rank below them.
    ranking_scores = np.where(already_rated, -np.inf, predrats)
    itemids = np.argsort(ranking_scores)[::-1][:num_items]

    # Drop any rated item that still made the cut because too few unrated items exist.
    return [(item, predrats[item]) for item in itemids if not already_rated[item]]


def implicit_testusers(testset, userprefs, itemprops, debug=False):
    """Compute the absolute prediction error for every (user, item, rating) test row.

    Parameters
    ----------
    testset : pandas.DataFrame
        Exactly three columns, in order: user index, item index, actual rating.
    userprefs : numpy.ndarray
        User factor matrix (P), one row per user.
    itemprops : numpy.ndarray
        Item factor matrix (Q), one row per item.
    debug : bool, default False
        When True, print one '.' per processed row as a progress indicator.

    Returns
    -------
    list
        ``abs(P[user] . Q[item] - rating)`` for each row, in row order.
    """
    errs = []
    # itertuples keeps each column's dtype. iterrows would turn every row into one
    # Series, upcasting the integer indices to float next to a float rating, and a
    # float cannot be used to index the factor matrices.
    for uname, iname, rating in testset.itertuples(index=False, name=None):
        if debug:
            print('.', end='')
        err = abs(userprefs[int(uname), :].dot(itemprops[int(iname), :]) - rating)
        errs.append(err)
    return errs
    
    
#def ahead(arr,r=7,c=7):
#    with np.printoptions(threshold=np.inf):
#        print(arr[0:r,0:c])

#def sparsity(arr):
#    return np.isnan(arr).sum()/np.prod(arr.shape)
#    #1.0 - ( count_nonzero(arr) / float(arr.size) )
    
##################################################  

In [5]:
# Directory that contains the `Deskdrop/` dataset folder.
# The location is machine-specific, so it can be overridden with the
# RECOMMENDER_DATA_DIR environment variable; the original path is kept as the default.
path = os.environ.get(
    'RECOMMENDER_DATA_DIR',
    '/home/mobasshir/recommendation_engine_lab/NUS-Artificial-Intelligence-Training/recommender/Datasets',
)
# The relative paths used when loading the CSV files resolve against this directory.
os.chdir(path)

In [6]:
# Load the Deskdrop user-interaction log and the shared-article metadata.
# Both paths are relative, so they resolve against the current working directory.
interactions_df = pd.read_csv('Deskdrop/users_interactions.csv')
articles_df = pd.read_csv('Deskdrop/shared_articles.csv')

In [7]:
# Preview the first three rows of the interaction log.
interactions_df.head(3)

Unnamed: 0,timestamp,eventType,contentId,personId,sessionId,userAgent,userRegion,userCountry
0,1465413032,VIEW,-3.49992e+18,-8.8453e+18,1.2642e+18,,,
1,1465412560,VIEW,8.89072e+18,-1.03202e+18,3.62174e+18,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2...,NY,US
2,1465416190,VIEW,3.10515e+17,-1.13027e+18,2.63186e+18,,,


In [8]:
# Preview the first two rows of the article metadata.
articles_df.head(2)

Unnamed: 0,timestamp,eventType,contentId,authorPersonId,authorSessionId,...,contentType,url,title,text,lang
0,1459192779,CONTENT REMOVED,-6.45131e+18,4.34031e+18,8.94034e+18,...,HTML,http://www.nytimes.com/2016/03/28/business/dea...,"Ethereum, a Virtual Currency, Enables Transact...",All of this work is still very early. The firs...,en
1,1459193988,CONTENT SHARED,-4.11035e+18,4.34031e+18,8.94034e+18,...,HTML,http://www.nytimes.com/2016/03/28/business/dea...,"Ethereum, a Virtual Currency, Enables Transact...",All of this work is still very early. The firs...,en


In [9]:
# interaction events for individual users, eventType ~ view, like, bookmark, follow, comment
# Drop the client / geo columns, which are not used further down.
# Re-assigning (rather than inplace=True) together with errors='ignore' keeps this
# cell safe to re-run: a second run would otherwise raise KeyError on the missing columns.
interactions_df = interactions_df.drop(
    columns=['userAgent', 'userRegion', 'userCountry'], errors='ignore'
)
interactions_df.head(3)

Unnamed: 0,timestamp,eventType,contentId,personId,sessionId
0,1465413032,VIEW,-3.49992e+18,-8.8453e+18,1.2642e+18
1,1465412560,VIEW,8.89072e+18,-1.03202e+18,3.62174e+18
2,1465416190,VIEW,3.10515e+17,-1.13027e+18,2.63186e+18


In [10]:
# load article info so we can obtain the article titles
# Every step re-assigns articles_df instead of mutating it in place and tolerates
# columns that are already gone, so re-running the cell does not raise.
articles_df = articles_df.drop(
    columns=['authorUserAgent', 'authorRegion', 'authorCountry'], errors='ignore'
)
if 'eventType' in articles_df.columns:
    # Keep only rows for shared articles; afterwards eventType is constant, so drop it.
    articles_df = articles_df[articles_df['eventType'] == 'CONTENT SHARED'].drop(columns='eventType')
articles_df.head(2)

Unnamed: 0,timestamp,contentId,authorPersonId,authorSessionId,contentType,url,title,text,lang
1,1459193988,-4.11035e+18,4.34031e+18,8.94034e+18,HTML,http://www.nytimes.com/2016/03/28/business/dea...,"Ethereum, a Virtual Currency, Enables Transact...",All of this work is still very early. The firs...,en
2,1459194146,-7.29229e+18,4.34031e+18,8.94034e+18,HTML,http://cointelegraph.com/news/bitcoin-future-w...,Bitcoin Future: When GBPcoin of Branson Wins O...,The alarm clock wakes me at 8:00 with stream o...,en


In [11]:
# Attach each article's title to its interactions. The inner join on contentId keeps
# only interactions whose article is present in articles_df.
trans = interactions_df[['contentId', 'personId', 'eventType']].merge(
    articles_df[['contentId', 'title']],
    on='contentId',
    how='inner',
)

In [12]:
# Derive an implicit rating, eventStrength, from the kind of interaction with the article.
# The assumption is that e.g. a bookmark signals more interest than a like,
# so every eventType is given a weight through a lookup dictionary.
print(trans['eventType'].value_counts())

event_type_strength = dict([
    ('VIEW', 1.0),
    ('LIKE', 2.0),
    ('BOOKMARK', 3.0),
    ('FOLLOW', 4.0),
    ('COMMENT CREATED', 5.0),
])
# Element-wise lookup; an eventType missing from the dictionary raises KeyError.
trans['eventStrength'] = trans['eventType'].map(lambda event: event_type_strength[event])

VIEW               61188
LIKE                5755
BOOKMARK            2470
COMMENT CREATED     1616
FOLLOW              1412
Name: eventType, dtype: int64


In [13]:
# Preview the joined interactions with their eventStrength column.
trans.head()

Unnamed: 0,contentId,personId,eventType,title,eventStrength
0,-3.49992e+18,-8.8453e+18,VIEW,Hiri wants to fix the workplace email problem,1.0
1,-3.49992e+18,-8.8453e+18,VIEW,Hiri wants to fix the workplace email problem,1.0
2,-3.49992e+18,-1.08842e+17,VIEW,Hiri wants to fix the workplace email problem,1.0
3,-3.49992e+18,-1.44364e+18,VIEW,Hiri wants to fix the workplace email problem,1.0
4,-3.49992e+18,-1.44364e+18,VIEW,Hiri wants to fix the workplace email problem,1.0


In [14]:
# if a user has multiple interactions on the same content then sum the strengths
# Group eventStrength together with person and content.
# Exact duplicate rows (same user, article and event type) are collapsed first,
# so a repeated identical event counts only once.
trans = trans.drop_duplicates()
# Rename by column name instead of assigning a positional list: this does not depend
# on the column order and is a no-op if the cell is run again.
trans = trans.rename(columns={'contentId': 'item', 'personId': 'user', 'eventStrength': 'rating'})
print(trans.head())
# Sum only the numeric rating. Without the explicit column selection the string
# eventType column is either silently dropped or concatenated, depending on the
# pandas version.
trans = trans.groupby(['user', 'item', 'title'])['rating'].sum().reset_index()
print(trans.sample(5))

           item          user eventType  \
0 -3.499920e+18 -8.845300e+18      VIEW   
2 -3.499920e+18 -1.088420e+17      VIEW   
3 -3.499920e+18 -1.443640e+18      VIEW   
6 -3.499920e+18 -8.020830e+18      VIEW   
8 -3.499920e+18 -9.009800e+18      LIKE   

                                           title  rating  
0  Hiri wants to fix the workplace email problem     1.0  
2  Hiri wants to fix the workplace email problem     1.0  
3  Hiri wants to fix the workplace email problem     1.0  
6  Hiri wants to fix the workplace email problem     1.0  
8  Hiri wants to fix the workplace email problem     2.0  
               user          item  \
29158  3.609190e+18 -1.338960e+18   
16572 -1.443640e+18  4.761910e+18   
10558 -3.596630e+18 -7.264220e+18   
38389  7.983650e+18  6.031950e+18   
35787  6.644120e+18 -2.250490e+18   

                                                   title  rating  
29158              Qual é o país mais inovador do mundo?     6.0  
16572                         

In [15]:
# map to contiguous int ranges (note that the raw user and item ids are very long integers, often negative)
# umap / imap hold the raw-id -> index dictionaries returned by maptrans.
# NOTE(review): running this cell a second time feeds the already-mapped indices back
# into maptrans, which would overwrite umap / imap with index -> index dictionaries —
# confirm the cell is only executed once per kernel session.
trans,umap,imap = maptrans(trans)
trans.head(10)

Unnamed: 0,user,item,title,rating
0,0,65,"No Brasil, '25% dos celulares ainda são 'Burro...",1.0
1,0,159,Bad Writing Is Destroying Your Company's Produ...,1.0
2,0,187,Ray Kurzweil: The world isn't getting worse - ...,1.0
3,0,195,Organizing for digital acceleration: Making a ...,1.0
4,0,313,"Espresso Intents: não é magia, é tecnologia! -...",1.0
5,0,327,Here's proof that Google is getting serious ab...,1.0
6,0,385,My experience with Google's Associate Android ...,1.0
7,0,416,Seniority,1.0
8,0,442,Listas com RecyclerView - Android Dev BR,1.0
9,0,450,Google's fair use victory is good for open source,1.0


In [16]:
#Create two matrices, one for fitting the model (content-person) and one for recommendations (person-content)
#Create using sparse.csr_matrix((data,(row,column)))
# sparse_item_user: rows = item index, columns = user index.
sparse_item_user = sparse.csr_matrix((trans['rating'].astype(float), (trans['item'],trans['user'])))
# sparse_user_item: the same ratings with rows = user index, columns = item index.
sparse_user_item = sparse.csr_matrix((trans['rating'].astype(float), (trans['user'],trans['item'])))

In [17]:
#Initialize the Alternating Least Squares (ALS) recommendation model.
# Configured with factors=20, regularization=0.1 and iterations=50.
# NOTE(review): no seed is passed here, and the fitting cell notes that results differ
# between fits — presumably due to random initialisation; consider passing a seed if the
# installed implicit version supports one (TODO confirm).
model = implicit.als.AlternatingLeastSquares(factors=20, regularization=0.1, iterations=50)

In [18]:
# Set matrix to double for the ALS function to run properly.
# note that each time the model is fitted may result in slightly different results (diff factor matrices)
# alpha multiplies every stored rating before the matrix is handed to fit().
alpha = 15
# The item x user orientation is the one passed to fit().
# NOTE(review): which orientation fit() expects appears to depend on the implicit
# release — confirm against the installed version.
data = (sparse_item_user * alpha).astype('double')
model.fit(data)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=50.0), HTML(value='')))




In [19]:
###############################################
# Use the trained item properties to find the top 10 most similar articles for content_id = 450, 
# this article title=“Google’s fair use victory is good for open source”, it talks about Google and open source.
#################################################

# Index (after maptrans) of the article used as the similarity query.
item_id = 450
# NOTE(review): this bare expression is not the last statement of the cell, so its
# value is not displayed.
trans.title[trans.item == item_id]

 # use the implicit library built-in
similar = model.similar_items(item_id)
# For each (item index, score) pair print the score and the first title stored for that item.
for item, score in similar: print(score,'\t',trans.title.loc[trans.item == item].iloc[0])

0.9999999 	 Google's fair use victory is good for open source
0.7845119 	 Up your DevOps chops with this online Kubernetes class
0.75798047 	 Meet Mycroft, the open source AI who wants to rival Siri, Cortana, and Alexa | ZDNet
0.73890924 	 Google's Cloud Dataflow stomps on Apache Spark in new benchmark tests
0.7363337 	 Understanding User Psychology: Thinking Like a Game Designer
0.7300129 	 Building immutable entities into Google Cloud Datastore
0.7208221 	 Google lags behind Amazon and Microsoft's cloud in one important area
0.7026284 	 5 Unique Features Of Google Compute Engine That No IaaS Provider Could Match
0.6935131 	 How to Embrace Constant Change in Growth
0.69236255 	 Announcing pricing for Google Stackdriver


In [20]:
# FYI - we can do the calc ourselves (should get identical results)
# we use the item-properties matrix (Q) to compute nearest neighbours using cosine similarity
# Pull the learned factor matrices out of the fitted model and print their shapes.
user_vecs = model.user_factors; print(user_vecs.shape)  # user preferences (the P matrix)
item_vecs = model.item_factors; print(item_vecs.shape)  # item properties (the Q matrix)

# Query the same item_id through the hand-written findsimilaritems helper.
similar = findsimilaritems(item_id, item_vecs)
# Print each score next to the first title stored for that item index.
for item, score in similar: print(score,'\t',trans.title.loc[trans.item == item].iloc[0])

(1892, 20)
(2976, 20)
0.9999999 	 Google's fair use victory is good for open source
0.784512 	 Up your DevOps chops with this online Kubernetes class
0.75798047 	 Meet Mycroft, the open source AI who wants to rival Siri, Cortana, and Alexa | ZDNet
0.7389093 	 Google's Cloud Dataflow stomps on Apache Spark in new benchmark tests
0.7363337 	 Understanding User Psychology: Thinking Like a Game Designer
0.7300129 	 Building immutable entities into Google Cloud Datastore
0.7208221 	 Google lags behind Amazon and Microsoft's cloud in one important area
0.7026283 	 5 Unique Features Of Google Compute Engine That No IaaS Provider Could Match
0.6935131 	 How to Embrace Constant Change in Growth
0.69236255 	 Announcing pricing for Google Stackdriver


In [21]:
###################################################
# Make recommendations for specific users
###################################################

# Index (after maptrans) of the user to recommend for.
user_id = 50

# use the implicit library built-in
# filter_already_liked_items=True is passed so items the user already has are left out.
recommendations = model.recommend(user_id, sparse_user_item, filter_already_liked_items=True)
for item, score in recommendations: 
    print(f'{score:0.5f}','\t',trans.title.loc[trans.item == item].iloc[0])
    
# Blank lines to separate the two result lists in the output.
print('\n\n\n')
    
# use own function (do the matrix calculations ourselves, should get identical results)
recommendations = recommend(user_id, sparse_user_item, user_vecs, item_vecs)
for item, score in recommendations: print(f'{score:0.5f}','\t',trans.title[trans.item == item].iloc[0])

1.45375 	 Custo do Erro - Cinco motivos para investir em automação de testes
1.43659 	 Ray Kurzweil: The world isn't getting worse - our information is getting better
1.24999 	 Do You Suffer From Deployment Anxiety? - DZone DevOps
1.19906 	 Former Google career coach shares a visual trick for figuring out what to do with your life
1.07891 	 Microservices testing
1.07600 	 How I built an app with 500,000 users in 5 days on a $100 server
1.07461 	 The technology behind preview photos
1.05011 	 Novo workaholic trabalha, pratica esportes e tem tempo para a família. Conheça
1.02281 	 'The Simpsons' celebrates 600 episodes with a VR couch gag
0.97802 	 BDD Best Practices and Guidelines - Testing Excellence




1.45375 	 Custo do Erro - Cinco motivos para investir em automação de testes
1.43659 	 Ray Kurzweil: The world isn't getting worse - our information is getting better
1.24999 	 Do You Suffer From Deployment Anxiety? - DZone DevOps
1.19906 	 Former Google career coach shares a visual tr

In [22]:
# Do these recommendations make sense? Examine the top 10 articles this person has rated.
# NOTE(review): this expression is not the last statement of the cell, so its result is
# not displayed; only the final expression of the cell is rendered.
trans[trans.user == user_id].sort_values(by=['rating'], ascending=False)[['rating', 'title']].head(10)

# try another person
user_id = 1
# Recommendations from the hand-written recommend() helper, one "score <tab> title" line each.
recommendations = recommend(user_id, sparse_user_item, user_vecs, item_vecs)
for item, score in recommendations: print(f'{score:0.5f}','\t',trans.title[trans.item == item].iloc[0])
# This user's own highest-rated articles (at most 10), shown as the cell output for comparison.
trans[trans.user == user_id].sort_values(by=['rating'], ascending=False)[['rating', 'title']].head(10)

0.48214 	 Como são escrita as risadas em japonês? - Suki Desu
0.45562 	 Livro: Retrospectivas Divertidas
0.44794 	 Jenkins 2.0 is here!
0.42300 	 Former Google career coach shares a visual trick for figuring out what to do with your life
0.41326 	 ITA está oferecendo 10 cursos gratuitos a distância - Engenharia é:
0.40681 	 Programação Reativa Funcional com RxJava
0.40255 	 Aposta na inovação
0.38291 	 Don't document your code. Code your documentation.
0.36911 	 Request lesson : How and when to use はず(=hazu) | Maggie Sensei
0.36726 	 Getting Started with Activity & Fragment Transitions (part 1)


Unnamed: 0,rating,title
44,3.0,Learn Hiragana: The Ultimate Guide
43,1.0,Firebase Test Lab for Android
45,1.0,"Fresco, sim! - Android Dev BR"
46,1.0,Japanese for dummies
47,1.0,Firebase and Google Cloud: better together
