In [2]:
# Package installation
!pip install sklearn
!pip install turicreate

Collecting turicreate
[?25l  Downloading https://files.pythonhosted.org/packages/ea/63/00ef7586c56a1aa48a7acabe20e1118cd82794785a56137e190fc6fb049c/turicreate-6.4.1-cp36-cp36m-manylinux1_x86_64.whl (92.0MB)
[K     |████████████████████████████████| 92.0MB 51kB/s 
Collecting resampy==0.2.1
[?25l  Downloading https://files.pythonhosted.org/packages/14/b6/66a06d85474190b50aee1a6c09cdc95bb405ac47338b27e9b21409da1760/resampy-0.2.1.tar.gz (322kB)
[K     |████████████████████████████████| 327kB 40.6MB/s 
Collecting tensorflow<2.1.0,>=2.0.0
[?25l  Downloading https://files.pythonhosted.org/packages/87/5e/254f5baeb331a7c9065b796d786cd07c5db79f2d76a496b74891a6521e25/tensorflow-2.0.3-cp36-cp36m-manylinux2010_x86_64.whl (86.4MB)
[K     |████████████████████████████████| 86.4MB 82kB/s 
Collecting coremltools==3.3
[?25l  Downloading https://files.pythonhosted.org/packages/77/19/611916d1ef326d38857d93af5ba184f6ad7491642e0fa4f9082e7d82f034/coremltools-3.3-cp36-none-manylinux1_x86_64.whl (3.4MB)

In [3]:
# Import dependencies
import numpy as np
import pandas as pd
import time
from sklearn.model_selection import train_test_split
import turicreate as tc

**Input Data**

In [6]:
# Upload csv file from a local drive
from google.colab import files
uploaded = files.upload()

Saving uk_data.csv to uk_data (1).csv


In [7]:
# Import csv file into a dataframe
import io
df = pd.read_csv(io.BytesIO(uploaded['uk_data.csv']), encoding='unicode_escape')

In [8]:
print(df.shape)
df.head()

(541909, 8)


Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 8:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/2010 8:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/2010 8:26,3.39,17850.0,United Kingdom


**Data Preparation**

In [9]:
# Get data for product/customer matrix
data = df[['CustomerID', 'StockCode', 'Quantity']]
print(data.shape)
data.head()

(541909, 3)


Unnamed: 0,CustomerID,StockCode,Quantity
0,17850.0,85123A,6
1,17850.0,71053,6
2,17850.0,84406B,8
3,17850.0,84029G,6
4,17850.0,84029E,6


**Create Data with User, Product and Purchase Count**

In [11]:
# Group by to get purchase quantity by customer ID and stock code - CHECK why rename script didn't work
data = data.groupby(['CustomerID','StockCode']) \
    .agg({'Quantity':'sum'}) \
    .rename(columns={'StockCode':'ProductID'}) \
    .reset_index()
print(data.shape)
data.head()

(267615, 3)


Unnamed: 0,CustomerID,StockCode,Quantity
0,12346.0,23166,0
1,12347.0,16008,24
2,12347.0,17021,36
3,12347.0,20665,6
4,12347.0,20719,40


In [12]:
# Question - get rid of negative values? - Decision: Get rid!
data[data['CustomerID']==17850]

Unnamed: 0,CustomerID,StockCode,Quantity
251481,17850.0,15056BL,24
251482,17850.0,20679,42
251483,17850.0,21068,96
251484,17850.0,21071,90
251485,17850.0,21169,-2
251486,17850.0,21730,102
251487,17850.0,21871,83
251488,17850.0,21874,-1
251489,17850.0,22411,60
251490,17850.0,22632,102


In [13]:
# Filter out quantity with negative values
data = data[data['Quantity']>0]
print(data.shape)
data.head()

(265220, 3)


Unnamed: 0,CustomerID,StockCode,Quantity
1,12347.0,16008,24
2,12347.0,17021,36
3,12347.0,20665,6
4,12347.0,20719,40
5,12347.0,20780,12


In [14]:
# Format CustomerID from integer to string
data['CustomerID'] = data['CustomerID'].astype(int).astype(str)
data.dtypes

CustomerID    object
StockCode     object
Quantity       int64
dtype: object

**Create Dummy Dataset**

In [15]:
# Create dummy
def create_data_dummy(data):
    data_dummy = data.copy()
    data_dummy['purchase_dummy'] = 1
    return data_dummy
data_dummy = create_data_dummy(data)
data_dummy

Unnamed: 0,CustomerID,StockCode,Quantity,purchase_dummy
1,12347,16008,24,1
2,12347,17021,36,1
3,12347,20665,6,1
4,12347,20719,40,1
5,12347,20780,12,1
...,...,...,...,...
267610,18287,84920,4,1
267611,18287,85039A,96,1
267612,18287,85039B,120,1
267613,18287,85040A,48,1


**Normalize Item Values Across Users**

In [16]:
# Function to normalize item values across users
def normalize_data(data):
    df_matrix = pd.pivot_table(data, values='Quantity', index='CustomerID', columns='StockCode')
    df_matrix_norm = (df_matrix-df_matrix.min())/(df_matrix.max()-df_matrix.min())
    d = df_matrix_norm.reset_index()
    d.index.names = ['scaled_purchase_freq']
    return pd.melt(d, id_vars=['CustomerID'], value_name='scaled_purchase_freq').dropna()

In [17]:
# Execute function
norm_data = normalize_data(data)

In [18]:
print(norm_data.shape)
norm_data.head()

(264838, 3)


Unnamed: 0,CustomerID,StockCode,scaled_purchase_freq
85,12451,10002,0.04
128,12510,10002,0.083636
186,12583,10002,0.170909
231,12637,10002,0.04
262,12673,10002,0.0


In [19]:
# Import drive first before exporting as csv file
from google.colab import drive
drive.mount('drive')

Mounted at drive


In [None]:
# Save pandas dataframe as csv file
norm_data.to_csv('uk_data_scaled_freq.csv')
!cp uk_data_scaled_freq.csv "drive/My Drive/PREWORK_JT/"

**Split Train and Test Datasets**

In [20]:
# Declare function to split train and test data
def split_data(data):
    '''
    Splits dataset into training and test set.
    
    Args:
        data (pandas.DataFrame)
        
    Returns
        train_data (tc.SFrame)
        test_data (tc.SFrame)
    '''
    train, test = train_test_split(data, test_size = .2)
    train_data = tc.SFrame(train)
    test_data = tc.SFrame(test)
    return train_data, test_data

In [21]:
# Split normalised scaled frequency data
train_data_norm, test_data_norm = split_data(norm_data)

In [22]:
# Split dummy data
train_data_dummy, test_data_dummy = split_data(data_dummy)

In [23]:
# Split purchase count data
train_data, test_data = split_data(data)

**Define Models Using Turicreate Library**

In [24]:
# Define variables for field names
user_id = 'CustomerID'
item_id = 'StockCode'
users_to_recommend = list(data['CustomerID'])
n_rec = 10 # number of items to recommend
n_display = 30 # to display the first few rows in an output dataset

In [25]:
# Declare function for all models
def model(train_data, name, user_id, item_id, target, users_to_recommend, n_rec, n_display):
    if name == 'popularity':
        model = tc.popularity_recommender.create(train_data, 
                                                    user_id=user_id, 
                                                    item_id=item_id, 
                                                    target=target)
    elif name == 'cosine':
        model = tc.item_similarity_recommender.create(train_data, 
                                                    user_id=user_id, 
                                                    item_id=item_id, 
                                                    target=target, 
                                                    similarity_type='cosine')
    elif name == 'pearson':
        model = tc.item_similarity_recommender.create(train_data, 
                                                    user_id=user_id, 
                                                    item_id=item_id, 
                                                    target=target, 
                                                    similarity_type='pearson')
        
    recom = model.recommend(users=users_to_recommend, k=n_rec)
    recom.print_rows(n_display)
    return model

**Popularity Model as Baseline**

In [26]:
# Using Purchase Count
name = 'popularity'
target = 'Quantity'
popularity = model(train_data, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+-----------+--------------------+------+
| CustomerID | StockCode |       score        | rank |
+------------+-----------+--------------------+------+
|   12347    |   47556B  |       2600.0       |  1   |
|   12347    |   84826   | 558.4166666666666  |  2   |
|   12347    |   84568   |       499.2        |  3   |
|   12347    |   21897   |       402.0        |  4   |
|   12347    |   84598   |       240.0        |  5   |
|   12347    |   16014   | 238.42857142857142 |  6   |
|   12347    |   17096   |       237.2        |  7   |
|   12347    |   22053   | 233.44444444444446 |  8   |
|   12347    |   16033   |       210.0        |  9   |
|   12347    |   17084R  |       180.0        |  10  |
|   12347    |   47556B  |       2600.0       |  1   |
|   12347    |   84826   | 558.4166666666666  |  2   |
|   12347    |   84568   |       499.2        |  3   |
|   12347    |   21897   |       402.0        |  4   |
|   12347    |   84598   |       240.0        |  5   |
|   12347 

In [27]:
# Using purchase dummy
name = 'popularity'
target = 'purchase_dummy'
pop_dummy = model(train_data_dummy, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+-----------+-------+------+
| CustomerID | StockCode | score | rank |
+------------+-----------+-------+------+
|   12347    |   22847   |  1.0  |  1   |
|   12347    |   21401   |  1.0  |  2   |
|   12347    |   22382   |  1.0  |  3   |
|   12347    |   22662   |  1.0  |  4   |
|   12347    |   22734   |  1.0  |  5   |
|   12347    |   35967   |  1.0  |  6   |
|   12347    |   22178   |  1.0  |  7   |
|   12347    |   21181   |  1.0  |  8   |
|   12347    |   23289   |  1.0  |  9   |
|   12347    |   22644   |  1.0  |  10  |
|   12347    |   22847   |  1.0  |  1   |
|   12347    |   21401   |  1.0  |  2   |
|   12347    |   22382   |  1.0  |  3   |
|   12347    |   22662   |  1.0  |  4   |
|   12347    |   22734   |  1.0  |  5   |
|   12347    |   35967   |  1.0  |  6   |
|   12347    |   22178   |  1.0  |  7   |
|   12347    |   21181   |  1.0  |  8   |
|   12347    |   23289   |  1.0  |  9   |
|   12347    |   22644   |  1.0  |  10  |
|   12347    |   22847   |  1.0  |

In [28]:
# Using scaled purchase count
name = 'popularity'
target = 'scaled_purchase_freq'
pop_norm = model(train_data_norm, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+-----------+-------+------+
| CustomerID | StockCode | score | rank |
+------------+-----------+-------+------+
|   12347    |   90129A  |  1.0  |  1   |
|   12347    |   90141C  |  1.0  |  2   |
|   12347    |   20906   |  1.0  |  3   |
|   12347    |   90151   |  1.0  |  4   |
|   12347    |   84598   |  1.0  |  5   |
|   12347    |   23446   |  1.0  |  6   |
|   12347    |   90188   |  1.0  |  7   |
|   12347    |   90019C  |  1.0  |  8   |
|   12347    |   90185A  |  1.0  |  9   |
|   12347    |   35597D  |  1.0  |  10  |
|   12347    |   90129A  |  1.0  |  1   |
|   12347    |   90141C  |  1.0  |  2   |
|   12347    |   20906   |  1.0  |  3   |
|   12347    |   90151   |  1.0  |  4   |
|   12347    |   84598   |  1.0  |  5   |
|   12347    |   23446   |  1.0  |  6   |
|   12347    |   90188   |  1.0  |  7   |
|   12347    |   90019C  |  1.0  |  8   |
|   12347    |   90185A  |  1.0  |  9   |
|   12347    |   35597D  |  1.0  |  10  |
|   12347    |   90129A  |  1.0  |

In [36]:
train_data.head()

CustomerID,StockCode,Quantity
13098,22714,24
16313,22274,6
13137,22964,12
12508,21452,3
16370,47590B,1
14625,22558,12
16801,22051,25
14499,23215,3
12826,20712,40
15228,23012,4


In [46]:
# Baseline summary
import turicreate.aggregate as agg
train_data.groupby(key_column_names='StockCode', operations={'mean_qty': agg.MEAN('Quantity')}).sort('mean_qty', ascending = False).head(20)

StockCode,mean_qty
47556B,2600.0
84826,558.4166666666666
84568,499.2
21897,402.0
84598,240.0
16014,238.42857142857144
17096,237.2
22053,233.44444444444449
16033,210.0
17084R,180.0


**Collaborative Filtering Model**

Cosine similarity

In [47]:
# Using purchase count
name = 'cosine'
target = 'Quantity'
cos = model(train_data, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+-----------+--------------------+------+
| CustomerID | StockCode |       score        | rank |
+------------+-----------+--------------------+------+
|   12347    |   22326   | 6.867530511086246  |  1   |
|   12347    |   23345   | 6.8559121582881515 |  2   |
|   12347    |   22629   |  6.83385133384222  |  3   |
|   12347    |   23292   | 6.7768469445676685 |  4   |
|   12347    |   22399   | 6.757206121122981  |  5   |
|   12347    |   22551   | 6.7479429072644335 |  6   |
|   12347    |   35961   |  6.74569798377623  |  7   |
|   12347    |   22554   | 6.521932647888919  |  8   |
|   12347    |   22243   | 6.386506120842624  |  9   |
|   12347    |   22595   | 6.384640544293875  |  10  |
|   12347    |   22326   | 6.867530511086246  |  1   |
|   12347    |   23345   | 6.8559121582881515 |  2   |
|   12347    |   22629   |  6.83385133384222  |  3   |
|   12347    |   23292   | 6.7768469445676685 |  4   |
|   12347    |   22399   | 6.757206121122981  |  5   |
|   12347 

In [48]:
# Using purchase dummy
name = 'cosine'
target = 'purchase_dummy'
cos_dummy = model(train_data_dummy, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+-----------+----------------------+------+
| CustomerID | StockCode |        score         | rank |
+------------+-----------+----------------------+------+
|   12347    |   21212   | 0.07031899396284128  |  1   |
|   12347    |   23209   |  0.0538472032841341  |  2   |
|   12347    |   22629   | 0.052828196390175525 |  3   |
|   12347    |   23245   | 0.052347149377987706 |  4   |
|   12347    |   23307   | 0.04951141277949015  |  5   |
|   12347    |   20725   | 0.04887274901072184  |  6   |
|   12347    |   22382   | 0.048278385474358074 |  7   |
|   12347    |   23207   | 0.04794377603648621  |  8   |
|   12347    |   21977   | 0.04677223864896798  |  9   |
|   12347    |   23206   | 0.04577277104059855  |  10  |
|   12347    |   21212   | 0.07031899396284128  |  1   |
|   12347    |   23209   |  0.0538472032841341  |  2   |
|   12347    |   22629   | 0.052828196390175525 |  3   |
|   12347    |   23245   | 0.052347149377987706 |  4   |
|   12347    |   23307   | 0.04

In [49]:
# Using scaled purchase count
name = 'cosine' 
target = 'scaled_purchase_freq' 
cos_norm = model(train_data_norm, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+-----------+----------------------+------+
| CustomerID | StockCode |        score         | rank |
+------------+-----------+----------------------+------+
|   12347    |   16206B  | 0.022151305757719893 |  1   |
|   12347    |   90205C  | 0.022151305757719893 |  2   |
|   12347    |   90072   | 0.02206183096458172  |  3   |
|   12347    |   90141C  | 0.02206183096458172  |  4   |
|   12347    |   90103   | 0.02206183096458172  |  5   |
|   12347    |   22769   | 0.02206183096458172  |  6   |
|   12347    |   72225C  | 0.02206183096458172  |  7   |
|   12347    |   90141A  | 0.021985142395414155 |  8   |
|   12347    |   23315   | 0.02171040814498375  |  9   |
|   12347    |   90101   | 0.02069691748454653  |  10  |
|   12347    |   16206B  | 0.022151305757719893 |  1   |
|   12347    |   90205C  | 0.022151305757719893 |  2   |
|   12347    |   90072   | 0.02206183096458172  |  3   |
|   12347    |   90141C  | 0.02206183096458172  |  4   |
|   12347    |   90103   | 0.02

Pearson similarity

In [50]:
# Using purchase count
name = 'pearson'
target = 'Quantity'
pear = model(train_data, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+-----------+--------------------+------+
| CustomerID | StockCode |       score        | rank |
+------------+-----------+--------------------+------+
|   12347    |   47556B  |       2600.0       |  1   |
|   12347    |   84826   | 558.4166666666666  |  2   |
|   12347    |   84568   |       499.2        |  3   |
|   12347    |   21897   |       402.0        |  4   |
|   12347    |   84598   | 239.9978961154639  |  5   |
|   12347    |   16014   | 238.42857142857142 |  6   |
|   12347    |   17096   | 237.20000000000002 |  7   |
|   12347    |   22053   | 233.44444444444443 |  8   |
|   12347    |   16033   | 210.2373179570738  |  9   |
|   12347    |   17084R  |       180.0        |  10  |
|   12347    |   47556B  |       2600.0       |  1   |
|   12347    |   84826   | 558.4166666666666  |  2   |
|   12347    |   84568   |       499.2        |  3   |
|   12347    |   21897   |       402.0        |  4   |
|   12347    |   84598   | 239.9978961154639  |  5   |
|   12347 

In [51]:
# Using purchase dummy
name = 'pearson'
target = 'purchase_dummy'
pear_dummy = model(train_data_dummy, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+-----------+-------+------+
| CustomerID | StockCode | score | rank |
+------------+-----------+-------+------+
|   12347    |   22847   |  0.0  |  1   |
|   12347    |   21401   |  0.0  |  2   |
|   12347    |   22382   |  0.0  |  3   |
|   12347    |   22662   |  0.0  |  4   |
|   12347    |   22734   |  0.0  |  5   |
|   12347    |   35967   |  0.0  |  6   |
|   12347    |   22178   |  0.0  |  7   |
|   12347    |   21181   |  0.0  |  8   |
|   12347    |   23289   |  0.0  |  9   |
|   12347    |   22644   |  0.0  |  10  |
|   12347    |   22847   |  0.0  |  1   |
|   12347    |   21401   |  0.0  |  2   |
|   12347    |   22382   |  0.0  |  3   |
|   12347    |   22662   |  0.0  |  4   |
|   12347    |   22734   |  0.0  |  5   |
|   12347    |   35967   |  0.0  |  6   |
|   12347    |   22178   |  0.0  |  7   |
|   12347    |   21181   |  0.0  |  8   |
|   12347    |   23289   |  0.0  |  9   |
|   12347    |   22644   |  0.0  |  10  |
|   12347    |   22847   |  0.0  |

In [53]:
# Using scaled purchase count
name = 'pearson'
target = 'scaled_purchase_freq'
pear_norm = model(train_data_norm, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+-----------+-------+------+
| CustomerID | StockCode | score | rank |
+------------+-----------+-------+------+
|   12347    |   79192A  |  1.0  |  1   |
|   12347    |   90141C  |  1.0  |  2   |
|   12347    |   20906   |  1.0  |  3   |
|   12347    |   90151   |  1.0  |  4   |
|   12347    |   84598   |  1.0  |  5   |
|   12347    |   23446   |  1.0  |  6   |
|   12347    |   90188   |  1.0  |  7   |
|   12347    |   90019C  |  1.0  |  8   |
|   12347    |   90185A  |  1.0  |  9   |
|   12347    |   35597D  |  1.0  |  10  |
|   12347    |   79192A  |  1.0  |  1   |
|   12347    |   90141C  |  1.0  |  2   |
|   12347    |   20906   |  1.0  |  3   |
|   12347    |   90151   |  1.0  |  4   |
|   12347    |   84598   |  1.0  |  5   |
|   12347    |   23446   |  1.0  |  6   |
|   12347    |   90188   |  1.0  |  7   |
|   12347    |   90019C  |  1.0  |  8   |
|   12347    |   90185A  |  1.0  |  9   |
|   12347    |   35597D  |  1.0  |  10  |
|   12347    |   79192A  |  1.0  |

**Model Evaluation**

In [54]:
# Declare initial callable variables for model evaluation
models_w_counts = [popularity, cos, pear]
models_w_dummy = [pop_dummy, cos_dummy, pear_dummy]
models_w_norm = [pop_norm, cos_norm, pear_norm]
names_w_counts = ['Popularity Model on Purchase Counts', 'Cosine Similarity on Purchase Counts', 'Pearson Similarity on Purchase Counts']
names_w_dummy = ['Popularity Model on Purchase Dummy', 'Cosine Similarity on Purchase Dummy', 'Pearson Similarity on Purchase Dummy']
names_w_norm = ['Popularity Model on Scaled Purchase Counts', 'Cosine Similarity on Scaled Purchase Counts', 'Pearson Similarity on Scaled Purchase Counts']

In [62]:
models_w_counts

[Class                            : PopularityRecommender
 
 Schema
 ------
 User ID                          : CustomerID
 Item ID                          : StockCode
 Target                           : Quantity
 Additional observation features  : 0
 User side features               : []
 Item side features               : []
 
 Statistics
 ----------
 Number of observations           : 212176
 Number of users                  : 4303
 Number of items                  : 3603
 
 Training summary
 ----------------
 Training time                    : 0.0208
 
 Model Parameters
 ----------------
 Model class                      : PopularityRecommender,
 Class                            : ItemSimilarityRecommender
 
 Schema
 ------
 User ID                          : CustomerID
 Item ID                          : StockCode
 Target                           : Quantity
 Additional observation features  : 0
 User side features               : []
 Item side features               : []
 
 Stat

In [57]:
# Compare all the models based on RMSE and precision-recall characteristics
# Using purchase counts models
eval_counts = tc.recommender.util.compare_models(test_data, models_w_counts, model_names=names_w_counts)

PROGRESS: Evaluate model Popularity Model on Purchase Counts



Precision and recall summary statistics by cutoff
+--------+-----------------------+-----------------------+
| cutoff |     mean_precision    |      mean_recall      |
+--------+-----------------------+-----------------------+
|   1    |          0.0          |          0.0          |
|   2    | 0.0006177415369409433 | 5.360612630924177e-05 |
|   3    | 0.0006589243060703406 | 7.894936885040861e-05 |
|   4    | 0.0004941932295527553 | 7.894936885040873e-05 |
|   5    |  0.000395354583642204 |  7.89493688504087e-05 |
|   6    | 0.0008648381517173216 |  0.000578521304845326 |
|   7    | 0.0008118888771223834 | 0.0005928991488045363 |
|   8    | 0.0008648381517173219 | 0.0006966483354261605 |
|   9    | 0.0007687450237487302 | 0.0006966483354261598 |
|   10   |  0.000864838151717321 | 0.0007214580142372462 |
+--------+-----------------------+-----------------------+
[10 rows x 3 columns]


Overall RMSE: 109.89703950981225

Per User RMSE (best)
+------------+------+-------+
| CustomerID |


Precision and recall summary statistics by cutoff
+--------+----------------------+----------------------+
| cutoff |    mean_precision    |     mean_recall      |
+--------+----------------------+----------------------+
|   1    | 0.058314801087225136 | 0.009689687001145114 |
|   2    | 0.048678033110946387 | 0.015575229125919875 |
|   3    | 0.04101803805287866  | 0.018050218186380074 |
|   4    | 0.036508524833209766 | 0.020558688056413518 |
|   5    | 0.033555720286632174 | 0.02342211821419828  |
|   6    | 0.030845894077917806 | 0.025305231616196553 |
|   7    | 0.028663207314059858 | 0.027256861288184674 |
|   8    | 0.02668643439584878  | 0.028930564885438888 |
|   9    | 0.02517639952777105  | 0.031204841845972428 |
|   10   | 0.02377069434148748  | 0.03255414598330161  |
+--------+----------------------+----------------------+
[10 rows x 3 columns]


Overall RMSE: 112.41423838595747

Per User RMSE (best)
+------------+---------------------+-------+
| CustomerID |         rmse


Precision and recall summary statistics by cutoff
+--------+------------------------+-----------------------+
| cutoff |     mean_precision     |      mean_recall      |
+--------+------------------------+-----------------------+
|   1    |          0.0           |          0.0          |
|   2    |  0.000617741536940944  | 5.360612630924175e-05 |
|   3    | 0.0006589243060703397  | 7.894936885040861e-05 |
|   4    | 0.0004941932295527547  | 7.894936885040858e-05 |
|   5    | 0.00039535458364220406 | 7.894936885040865e-05 |
|   6    | 0.0008648381517173203  |  0.000578521304845326 |
|   7    | 0.0008118888771223827  | 0.0005928991488045362 |
|   8    | 0.0008648381517173214  | 0.0006966483354261606 |
|   9    | 0.0007687450237487308  | 0.0006966483354261602 |
|   10   |  0.000864838151717323  | 0.0007214580142372454 |
+--------+------------------------+-----------------------+
[10 rows x 3 columns]


Overall RMSE: 109.35960988414185

Per User RMSE (best)
+------------+----------------

In [58]:
# Using dummy models
eval_dummy = tc.recommender.util.compare_models(test_data_dummy, models_w_dummy, model_names=names_w_dummy)

PROGRESS: Evaluate model Popularity Model on Purchase Dummy



Precision and recall summary statistics by cutoff
+--------+-----------------------+------------------------+
| cutoff |     mean_precision    |      mean_recall       |
+--------+-----------------------+------------------------+
|   1    | 0.0014869888475836435 | 7.843375717157473e-05  |
|   2    | 0.0016109045848822798 | 0.00024167406911077124 |
|   3    |  0.001652209830648492 |  0.000380059767228895  |
|   4    | 0.0019826517967781888 | 0.0005514173882277905  |
|   5    | 0.0018835192069392827 |  0.000608624758684594  |
|   6    |  0.002023957042544405 | 0.0008674991048595759  |
|   7    | 0.0019472473004071523 | 0.0009801565582961632  |
|   8    | 0.0018897149938042139 | 0.0012935359823665238  |
|   9    |  0.001817430813713341 | 0.0013560456094008472  |
|   10   |  0.001858736059479554 | 0.0014510084178915392  |
+--------+-----------------------+------------------------+
[10 rows x 3 columns]


Overall RMSE: 0.0

Per User RMSE (best)
+------------+------+-------+
| CustomerID | 


Precision and recall summary statistics by cutoff
+--------+---------------------+----------------------+
| cutoff |    mean_precision   |     mean_recall      |
+--------+---------------------+----------------------+
|   1    | 0.19900867410161094 | 0.032699731812515655 |
|   2    |  0.1759603469640645 | 0.052825306935902455 |
|   3    | 0.16034696406443627 | 0.06850768740493836  |
|   4    | 0.14702602230483278 | 0.08104569605143867  |
|   5    | 0.13804213135068144 |  0.0924783494167599  |
|   6    |  0.1308963238331268 |  0.1029352270590789  |
|   7    | 0.12373871481678161 | 0.11137955908639346  |
|   8    | 0.11908302354399006 | 0.12031588874777618  |
|   9    | 0.11507641470466752 |  0.1299709327412758  |
|   10   | 0.11147459727385381 | 0.13840480317947676  |
+--------+---------------------+----------------------+
[10 rows x 3 columns]


Overall RMSE: 0.978517124506073

Per User RMSE (best)
+------------+--------------------+-------+
| CustomerID |        rmse        | count |


Precision and recall summary statistics by cutoff
+--------+-----------------------+------------------------+
| cutoff |     mean_precision    |      mean_recall       |
+--------+-----------------------+------------------------+
|   1    | 0.0014869888475836444 | 7.843375717157475e-05  |
|   2    |  0.00161090458488228  | 0.00024167406911077162 |
|   3    | 0.0016522098306484929 | 0.00038005976722889503 |
|   4    |  0.001982651796778192 | 0.0005514173882277905  |
|   5    | 0.0018835192069392823 |  0.000608624758684594  |
|   6    | 0.0020239570425444036 |  0.000867499104859576  |
|   7    | 0.0019472473004071506 |  0.000980156558296163  |
|   8    | 0.0018897149938042123 | 0.0012935359823665236  |
|   9    |  0.001817430813713339 |  0.001356045609400846  |
|   10   | 0.0018587360594795484 | 0.0014510084178915386  |
+--------+-----------------------+------------------------+
[10 rows x 3 columns]


Overall RMSE: 1.0

Per User RMSE (best)
+------------+------+-------+
| CustomerID | 

In [59]:
# Using normalized purchase count models
eval_norm = tc.recommender.util.compare_models(test_data_norm, models_w_norm, model_names=names_w_norm)

PROGRESS: Evaluate model Popularity Model on Scaled Purchase Counts



Precision and recall summary statistics by cutoff
+--------+------------------------+------------------------+
| cutoff |     mean_precision     |      mean_recall       |
+--------+------------------------+------------------------+
|   1    | 0.0004961548002976922  | 2.8029524432402102e-05 |
|   2    | 0.00024807740014884616 | 2.802952443240214e-05  |
|   3    | 0.0001653849334325643  | 2.802952443240214e-05  |
|   4    | 0.00018605805011163504 | 2.9089684262098028e-05 |
|   5    | 0.0002480774001488463  | 3.950982663133308e-05  |
|   6    | 0.00024807740014884616 | 7.051950164993887e-05  |
|   7    | 0.00024807740014884654 | 0.00010595913024263123 |
|   8    | 0.0002790870751674523  | 0.0001458287124094101  |
|   9    | 0.00027564155572094043 | 0.00015203064741313115 |
|   10   | 0.00027288514016373105 | 0.0001568949101611478  |
+--------+------------------------+------------------------+
[10 rows x 3 columns]


Overall RMSE: 0.15177638259176848

Per User RMSE (best)
+------------+-


Precision and recall summary statistics by cutoff
+--------+----------------------+----------------------+
| cutoff |    mean_precision    |     mean_recall      |
+--------+----------------------+----------------------+
|   1    | 0.048623170429173905 | 0.008783952706831903 |
|   2    | 0.03584718432150835  | 0.01230650788740156  |
|   3    | 0.028859670883982416 | 0.014433534893173226 |
|   4    | 0.02524187546514512  |  0.0165954082867696  |
|   5    | 0.022227735053336733 | 0.018356327653584165 |
|   6    | 0.020218308112130975 | 0.01991599237689818  |
|   7    | 0.018853882411312365 | 0.02100705545896235  |
|   8    | 0.01745844703547508  | 0.022440313703706027 |
|   9    | 0.016400672565396006 | 0.023394897918964916 |
|   10   | 0.015554452989332725 | 0.024312679767764914 |
+--------+----------------------+----------------------+
[10 rows x 3 columns]


Overall RMSE: 0.1782601128694322

Per User RMSE (best)
+------------+------+-------+
| CustomerID | rmse | count |
+-----------


Precision and recall summary statistics by cutoff
+--------+------------------------+------------------------+
| cutoff |     mean_precision     |      mean_recall       |
+--------+------------------------+------------------------+
|   1    |          0.0           |          0.0           |
|   2    | 0.00024807740014884643 | 8.939726131489961e-05  |
|   3    | 0.0002480774001488463  |  9.42615240629162e-05  |
|   4    | 0.0003100967501860581  | 0.00010450973574997682 |
|   5    | 0.0002976928801786157  | 0.00011336964289814987 |
|   6    | 0.00028942363350698756 | 0.00014437931791675584 |
|   7    | 0.00028351702874153883 | 0.00017981894650944807 |
|   8    | 0.0002790870751674523  | 0.00018137918173051005 |
|   9    | 0.0002756415557209406  | 0.00021238885674911568 |
|   10   | 0.0002976928801786157  | 0.00022801451117407547 |
+--------+------------------------+------------------------+
[10 rows x 3 columns]


Overall RMSE: 0.15109937730070905

Per User RMSE (best)
+------------+-

**Evaluation Summary**

RMSE Outcome


1.   Popularity on purchase counts: 109.89703950981225
2.   Cosine similarity on purchase counts: 112.41423838595747
3.   Pearson simlarity on puchase counts: 109.35960988414185

4.   Popularity on purchase dummy: 0.0
5.   Cosine similarity on purchase dummy: 0.978517124506073
6.   Pearson simlarity on puchase dummy: 1.0

7.   Popularity on scaled purchase counts: 0.15177638259176848
8.   Cosine similarity on scaled purchase counts: 0.1782601128694322
9.   Pearson simlarity on scaled puchase counts: 0.15109937730070905

*   Popularity v Collaborative Filtering: Collaborative Filtering alloes personalization and popularity recommends popular options. Both models will be used as different features.
*   RMSE: RMSE for collaborative filtering using pearson distance is lower than cosine. We select the model with the smaller mean squared errors being the pearson model. 
*   Precision and recall: Dummy purchase count




**Final Output**

Collaborative filtering

In [None]:
# Rerun the model using the whole dataset, as we came to a final model using train data and evaluated with test set
final_model = tc.item_similarity_recommender.create(tc.SFrame(test_data_dummy), 
                                            user_id=user_id, 
                                            item_id=item_id, 
                                            target='purchase_dummy', similarity_type='cosine')
recom = final_model.recommend(users=users_to_recommend, k=n_rec)
recom.print_rows(n_display)

In [None]:
# Transfer Turicreate model to pandas dataframe
df_rec = recom.to_dataframe()
print(df_rec.shape)
df_rec.head()

In [None]:
# Declare a function to create a desired output and export dataframe to csv
def create_output(model, users_to_recommend, n_rec, print_csv=True):
    recomendation = model.recommend(users=users_to_recommend, k=n_rec)
    df_rec = recomendation.to_dataframe()
    df_rec['recommendedProducts'] = df_rec.groupby([user_id])[item_id] \
        .transform(lambda x: '|'.join(x.astype(str)))
    df_output = df_rec[['CustomerId', 'recommendedProducts']].drop_duplicates() \
        .sort_values('CustomerId').set_index('CustomerId')
    if print_csv:
        df_output.to_csv('../output/option1_recommendation.csv')
        print("An output file can be found in 'output' folder with name 'option1_recommendation.csv'")
    return df_output

In [None]:
# Execute function on whole dataset  
df_output = create_output(pear_norm, users_to_recommend, n_rec, print_csv=True)
print(df_output.shape)
df_output.head()

Popularity