### This notebook uses the `implicit` library to fit a latent factor model (matrix factorization with the alternating least squares approach) and applies it to a user-user system. It is based on the paper http://yifanhu.net/PUB/cf.pdf

In [37]:
import numpy as np
import pandas as pd
from datetime import datetime
import re
import itertools
import time
import implicit
from scipy import sparse
from scipy.sparse import csr_matrix

In [2]:
# Load the raw session log (first CSV column is the row index) and normalise
# the session-id column name to 'SessionId'.
sessions = (
    pd.read_csv('sessions.csv', index_col=0)
    .rename(columns={'SessionID': 'SessionId'})
)
sessions.shape

(112256, 6)

**From the documentation**: *TimeStamp – long integer value specifying the UTC date and time of the arrival of a request, coded as the number of 100-nanosecond intervals that have elapsed since 00:00:00 UTC on 1st January, 1 A.D.*

In [3]:
# TimeStamp is stored as 100-nanosecond ticks counted from 0001-01-01 00:00:00 UTC.
# `seconds` is the offset between that origin and the Unix epoch.
seconds = (datetime(1970, 1, 1) - datetime(1, 1, 1)).total_seconds()
# Keep the offset as an exact integer: subtracting a float would promote the
# int64 tick column to float64, which cannot represent every tick at this magnitude.
ticks = int(seconds) * 10**7
# Guard so that re-running this cell on already converted data is a no-op
# instead of an error.
if not pd.api.types.is_datetime64_any_dtype(sessions['TimeStamp']):
    # (ticks since the Unix epoch) * 100 = nanoseconds since the Unix epoch.
    sessions['TimeStamp'] = pd.to_datetime((sessions['TimeStamp'] - ticks) * 100, unit='ns')
sessions.head()

Unnamed: 0,SessionId,IpId,TimeStamp,Event,Action,Product
0,0,100095PL,2020-01-13 21:35:36,Leave,,
1,1,100095PL,2020-04-18 16:14:40,Arrive,,
2,1,100095PL,2020-04-18 16:15:26,,add_to_cart,p-9967
3,1,100095PL,2020-04-18 16:33:07,,order,
4,1,100095PL,2020-04-18 16:34:25,Leave,,


In [4]:
# Chronological hold-out: everything up to and including the cutoff is training
# data, everything strictly after it is test data.
train_cutoff = datetime(2020, 3, 31, 23, 59, 59)
session_time = sessions['TimeStamp']
train = sessions[session_time <= train_cutoff]
test = sessions[session_time > train_cutoff]

In [7]:
# Distinct products referenced in the training data (rows without a product are skipped).
train_products = train['Product'].dropna()
items = pd.unique(train_products)
print(items.shape)
items

(4038,)


array(['p-7414', 'p-6769', 'p-4466', ..., 'p-10522', 'p-2788', 'p-8202'],
      dtype=object)

In [18]:
# sessionFlags column name -> value of 'Action' that raises the flag.
flag_actions = {
    'Add': 'add_to_cart',
    'Remove': 'remove_from_cart',
    'Delete': 'delete_cart',
    'Order': 'order',
    'Change': 'change_order',
}
# One boolean column per flag, indexed by SessionId: True when at least one row
# of the session carries the corresponding action. The comparison is vectorised,
# so no Python-level lambda runs per session.
sessionFlags = pd.DataFrame({
    flag: train['Action'].eq(action).groupby(train['SessionId']).any()
    for flag, action in flag_actions.items()
})

Since we are going to use the `implicit` package, we need to create a sparse item-user matrix. Each matrix entry is the confidence we have in the user's preference for that item. Preferences are binary and are based on implicit feedback; in our case, they are derived from the 'action' information.

In [24]:
def userRating(x):
    """Score one session from its boolean action flags.

    Parameters
    ----------
    x : mapping (e.g. a DataFrame row)
        Must provide truthy/falsy values under the keys 'Add', 'Order',
        'Remove', 'Delete' and 'Change'.

    Returns
    -------
    int
        0 - nothing was added to the cart
        1 - something was added but no order was placed
        2 - added and ordered, but the cart was deleted at some point
        3 - added and ordered, cart never deleted, but an item was removed
        4 - added and ordered, nothing removed or deleted, but the order was changed
        5 - added and ordered with no removal, deletion or change
    """
    if not x['Add']:
        # Explicit "no evidence" score; previously this case fell through and
        # returned None, which surfaced as NaN in the Rating column.
        return 0
    if not x['Order']:
        return 1
    if x['Delete']:
        return 2
    if x['Remove']:
        return 3
    if x['Change']:
        return 4
    return 5

In [47]:
# Build one rated row per distinct (SessionId, IpId, Product) combination:
#   1. keep only rows that reference a product and drop the per-event columns,
#   2. attach the per-session action flags,
#   3. turn the flags into a rating and discard the flags again.
flag_columns = ['Add', 'Remove', 'Delete', 'Order', 'Change']
user_session_item = (
    train.loc[train['Product'].notna()]
    .drop(columns=['TimeStamp', 'Event', 'Action'])
    .drop_duplicates()
    .merge(sessionFlags, how='left', on='SessionId')
    .assign(Rating=lambda frame: frame.apply(userRating, axis=1))
    .drop(columns=flag_columns)
)
user_session_item

Unnamed: 0,SessionId,IpId,Product,Rating
0,12,10010PL,p-7414,1
1,19,100180PL,p-6769,1
2,19,100180PL,p-4466,1
3,19,100180PL,p-8068,1
4,20,100180PL,c-34_307,1
...,...,...,...,...
15712,40281,99972PL,p-6637,5
15713,40281,99972PL,p-8755,5
15714,40281,99972PL,c-65,5
15715,40281,99972PL,p-8990,5


In [51]:
# Sum the session ratings per (user, product) pair and turn the total into a
# confidence value: confidence = 1 + 40 * total rating.
# NOTE(review): 40 presumably plays the role of the confidence scaling factor
# from the referenced paper - confirm before tuning.
user_item = (
    user_session_item
    .drop(columns='SessionId')
    .groupby(['IpId', 'Product'])
    .sum()
    .assign(Confidence=lambda totals: 1 + 40 * totals['Rating'])
    .drop(columns='Rating')
)
# Show only the pairs whose confidence exceeds 201.
user_item.loc[user_item['Confidence'] > 201]

Unnamed: 0_level_0,Unnamed: 1_level_0,Confidence
IpId,Product,Unnamed: 2_level_1
101074PL,p-5997,401
101293PL,c-35_316,401
102234PL,p-3362,241
102259PL,p-8808,241
104703PL,p-2592,241
...,...,...
9381PL,p-7502,241
95332PL,p-3618,241
95861PL,c-21,241
9593PL,p-9900,241


In [56]:
# Pivot the (IpId, Product) -> Confidence table into a Product x IpId table;
# pairs that never occurred become NaN.
item_user = user_item.unstack(0)
# Build the CSR matrix straight from the table with missing pairs set to 0, so
# only observed confidences are stored. This avoids the pandas sparse accessor:
# pd.SparseDtype() defaults to a NaN fill value, which newer pandas releases
# reject in DataFrame.sparse.to_coo().
item_user_sparse = csr_matrix(item_user.fillna(0).to_numpy(dtype=np.float64))

In [57]:
# Alternating-least-squares factorisation with 10 latent factors, fitted on the
# item-user confidence matrix.
n_factors = 10
model = implicit.als.AlternatingLeastSquares(factors=n_factors)
model.fit(item_user_sparse)



HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=15.0), HTML(value='')))




In [80]:
# Position -> IpId lookup, in the same order as the columns of item_user
# (and therefore of the sparse matrix).
user_ids = pd.Series(item_user.columns.get_level_values('IpId'))
# Matrix position of one example user.
user_ids.index[(user_ids == '100180PL').to_numpy()][0]

1

In [81]:
# Matrix position of the example user.
user_id = user_ids.index[(user_ids == '100180PL').to_numpy()][0]
# recommend() is handed the user-item orientation, i.e. the transpose of the
# item-user matrix the model was fitted on.
user_item_ = item_user_sparse.transpose().tocsr()
recommendations = model.recommend(user_id, user_item_)

In [95]:
# Position -> product id lookup, in the row order of item_user.
product_ids = item_user.index.to_series().reset_index(drop=True)

In [104]:
# Product id stored under index label 4.
product_ids.loc[4]

'c-24_200'

In [106]:
# Translate each recommended (item index, score) pair into (product id, score).
[(product_ids.loc[item_idx], score) for item_idx, score in recommendations]

[('c-66_274', 1.6412758),
 ('p-6119', 1.3442116),
 ('p-5325', 1.270266),
 ('p-7043', 1.2395097),
 ('p-10150', 1.120496),
 ('p-5326', 1.0641696),
 ('p-7646', 0.98180425),
 ('p-2101', 0.9340297),
 ('p-4291', 0.90439177),
 ('c-24_291', 0.8917385)]