In [128]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import linalg

import re
import mailbox

from datetime import datetime as dt
from dateutil import tz

from sklearn.neighbors import NearestNeighbors

In [93]:
# Load the raw event log; skipinitialspace drops blanks that follow each delimiter.
df = pd.read_csv(filepath_or_buffer='datafile.csv', skipinitialspace=True)

In [94]:
# Summarize the loaded frame: row count, column dtypes and memory footprint.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9999998 entries, 0 to 9999997
Data columns (total 8 columns):
Unnamed: 0       int64
event_time       object
event_type       object
product_id       int64
category_code    object
brand            object
price            float64
user_id          int64
dtypes: float64(1), int64(3), object(4)
memory usage: 610.4+ MB


In [95]:
# Preview the first five rows to sanity-check the parsed columns.
df.head()

Unnamed: 0.1,Unnamed: 0,event_time,event_type,product_id,category_code,brand,price,user_id
0,3967559,2019-10-04 08:11:58 UTC,view,1001588,electronics.smartphone,meizu,128.26,525199888
1,2628431,2019-10-03 05:53:24 UTC,view,1001588,electronics.smartphone,meizu,128.28,518720832
2,51776,2019-10-01 03:35:11 UTC,view,1001588,electronics.smartphone,meizu,128.31,554199941
3,2628354,2019-10-03 05:53:19 UTC,view,1001588,electronics.smartphone,meizu,128.28,518720832
4,4701726,2019-10-04 16:41:49 UTC,view,1001588,electronics.smartphone,meizu,128.25,556792578


We already know which products each customer has interacted with, because every browsing event records the ID of the product involved. Next, we want to use this behavior to predict additional products that each customer is likely to be interested in, and finally generate recommendations for each user.  
First, I will group the data by user_id and product_id and count how many times each event occurs for every (user, product) pair.

   Of the 3 event types, 'view' accounts for most of the records, so I will focus on 'view' events for the analysis.

In [96]:
# Keep only the 'view' events, i.e. drop the 'cart' and 'purchase' rows of df.
m = df.loc[df['event_type'].eq('view')]
m

Unnamed: 0.1,Unnamed: 0,event_time,event_type,product_id,category_code,brand,price,user_id
0,3967559,2019-10-04 08:11:58 UTC,view,1001588,electronics.smartphone,meizu,128.26,525199888
1,2628431,2019-10-03 05:53:24 UTC,view,1001588,electronics.smartphone,meizu,128.28,518720832
2,51776,2019-10-01 03:35:11 UTC,view,1001588,electronics.smartphone,meizu,128.31,554199941
3,2628354,2019-10-03 05:53:19 UTC,view,1001588,electronics.smartphone,meizu,128.28,518720832
4,4701726,2019-10-04 16:41:49 UTC,view,1001588,electronics.smartphone,meizu,128.25,556792578
...,...,...,...,...,...,...,...,...
9999993,2237675,2019-11-02 12:26:57 UTC,view,61700012,non-avaliable,atlant,180.18,540807712
9999994,2671287,2019-11-02 16:57:35 UTC,view,61700012,non-avaliable,atlant,180.18,549358123
9999995,4845232,2019-11-04 05:29:29 UTC,view,61700012,non-avaliable,atlant,180.18,529390009
9999996,4524169,2019-11-03 20:55:42 UTC,view,61700012,non-avaliable,atlant,180.18,514022872


In [97]:
# Count how many times each user viewed each product.
# `m` contains only 'view' rows, so counting `event_type` per (user_id, product_id)
# pair gives the view count. The result is indexed by (user_id, product_id).
# NOTE: the former `user_df.reset_index('user_id')` statement was removed.
# reset_index returns a new frame and the result was discarded, so the call
# changed nothing and only cost an extra copy of a large table.
user_df = pd.pivot_table(
    m,
    index=['user_id', 'product_id'],
    values='event_type',
    aggfunc='count',
    fill_value=0,
)
user_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,event_type
user_id,product_id,Unnamed: 2_level_1
183503497,22200103,1
184265397,6902133,2
184265397,6902303,2
208669541,16200119,1
208669541,26000270,1


In [98]:
# The full table is too large to process here, so keep only the first 10,000
# (user_id, product_id) rows to make the rest of the analysis feasible.
# NOTE(review): this is a leading slice in index order, not a random sample.
user_df_ = user_df.iloc[:10000]
user_df_

Unnamed: 0_level_0,Unnamed: 1_level_0,event_type
user_id,product_id,Unnamed: 2_level_1
183503497,22200103,1
184265397,6902133,2
184265397,6902303,2
208669541,16200119,1
208669541,26000270,1
...,...,...
483785194,1004505,1
483806238,4804056,1
483823174,1004751,3
483823174,1004870,4


In [99]:
# Build the user x product matrix of view counts; pairs that never occur become 0.
user_view = (
    pd.pivot_table(
        user_df_,
        values='event_type',
        index='user_id',
        columns='product_id',
    )
    .fillna(0)
)
user_view

product_id,1002098,1002099,1002101,1002102,1002398,1002484,1002524,1002528,1002531,1002532,...,54900012,54900014,54900015,55900001,57100062,57600033,57800002,58300010,60000009,60000015
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
183503497,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
184265397,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
208669541,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
214470341,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
216064734,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
483782965,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
483785194,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
483806238,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
483823174,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [159]:
# Decompose the user x product matrix with a full SVD: user_view = U @ diag(sigma) @ Vh
U, sigma, Vh = linalg.svd(user_view.values)
print(*(factor.shape for factor in (U, sigma, Vh)))

(3252, 3252) (3252,) (6164, 6164)


In [161]:
# Show the matrix dimensions: rows are users, columns are products.
print(user_view.shape)

(3252, 6164)


In [162]:
# KNN over products in the SVD latent space.
# linalg.svd returns Vh with one singular vector per ROW and one product
# (a column of user_view) per COLUMN, so product vectors are the columns of Vh.
# Fitting on the full square Vh is also degenerate: it is an orthogonal matrix,
# so every pairwise cosine distance equals 1 and the neighbours are arbitrary.
# Keep only the leading latent factors and weight them by their singular values.
N_FACTORS = min(50, len(sigma))  # number of latent dimensions kept; tune as needed
# Row i of product_factors is the embedding of product user_view.columns[i].
product_factors = Vh[:N_FACTORS].T * sigma[:N_FACTORS]
product_knn = NearestNeighbors(algorithm='brute', metric='cosine', n_neighbors=20)
product_knn.fit(product_factors)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
                 metric_params=None, n_jobs=None, n_neighbors=20, p=2,
                 radius=1.0)

In [147]:
# Print product recommendations produced by our own model
def make_recommendtions(self, fav_product, n_recommendations):
    """Print the recommendations found for ``fav_product``.

    ``self`` must provide ``_prep_data()``, ``_inference(...)`` and ``model``;
    none of them is defined in this cell.

    Parameters
    ----------
    fav_product
        Product to base the recommendations on; forwarded to ``self._inference``.
    n_recommendations
        Forwarded to ``self._inference`` (presumably the number of results wanted).

    Returns
    -------
    None
        The ranked results are printed, not returned.
    """
    # Prepared matrix plus the mapping whose values are the indices used by the model.
    matrix, name_to_idx = self._prep_data()
    neighbours = self._inference(
        self.model, matrix, name_to_idx, fav_product, n_recommendations
    )
    # Invert the mapping so each returned index can be translated back to its key.
    idx_to_name = dict((idx, name) for name, idx in name_to_idx.items())
    print('Recommendations for {}:'.format(fav_product))
    for rank, (idx, dist) in enumerate(neighbours, start=1):
        print('{0}: {1}, with distance of {2}'.format(rank, idx_to_name[idx], dist))