**Model A**

This model purchase count is normalised for each product interaction by calculating an interaction score for each customer as follows:

interactionScore = (interactionScore-product.minScore)/(product.maxScore-product.minScore)

In this model, dummy data is all 1.

Output:
Recommend 10 products based on CustomerID input

In [1]:
# Package installation
!pip install sklearn
!pip install turicreate

Collecting turicreate
[?25l  Downloading https://files.pythonhosted.org/packages/ea/63/00ef7586c56a1aa48a7acabe20e1118cd82794785a56137e190fc6fb049c/turicreate-6.4.1-cp36-cp36m-manylinux1_x86_64.whl (92.0MB)
[K     |████████████████████████████████| 92.0MB 72kB/s 
Collecting resampy==0.2.1
[?25l  Downloading https://files.pythonhosted.org/packages/14/b6/66a06d85474190b50aee1a6c09cdc95bb405ac47338b27e9b21409da1760/resampy-0.2.1.tar.gz (322kB)
[K     |████████████████████████████████| 327kB 46.7MB/s 
Collecting tensorflow<2.1.0,>=2.0.0
[?25l  Downloading https://files.pythonhosted.org/packages/87/5e/254f5baeb331a7c9065b796d786cd07c5db79f2d76a496b74891a6521e25/tensorflow-2.0.3-cp36-cp36m-manylinux2010_x86_64.whl (86.4MB)
[K     |████████████████████████████████| 86.4MB 51kB/s 
Collecting coremltools==3.3
[?25l  Downloading https://files.pythonhosted.org/packages/77/19/611916d1ef326d38857d93af5ba184f6ad7491642e0fa4f9082e7d82f034/coremltools-3.3-cp36-none-manylinux1_x86_64.whl (3.4MB)

In [2]:
# Import dependencies

# For data manipulation and EDA
import numpy as np
import pandas as pd
import time

# For splitting data into train and test sets
from sklearn.model_selection import train_test_split

# For machine learning modelling, model evaluation and selection
import turicreate as tc
import turicreate.aggregate as agg

# **Input Data**

In [3]:
from google.colab import auth
auth.authenticate_user()
print('Authenticated')

Authenticated


Query from public data

In [None]:
project_id = 'cosmic-antenna-281600'

from google.cloud import bigquery

client = bigquery.Client(project=project_id)

sample_count = 2000
df = client.query('''SELECT store_number, item_number, sum(volume_sold_liters) AS volume
FROM `bigquery-public-data.iowa_liquor_sales.sales`
WHERE data > date'2020-01-01' 
GROUP BY store_number, item_number''').to_dataframe()

print(df.shape)
df.head()

Query from own table stored in personal database

In [4]:
project_id = 'cosmic-antenna-281600'

from google.cloud import bigquery

client = bigquery.Client(project=project_id)

sample_count = 2000
df = client.query('''SELECT *
  FROM `cosmic-antenna-281600.iowa_2020.iowa_liquor_2020`''').to_dataframe()

print(df.shape)
df.head()

(446139, 3)


Unnamed: 0,store_number,item_number,volume
0,4829,27066,360.0
1,4311,43031,1.35
2,2627,43040,26.25
3,2500,27175,60.0
4,5054,5037,5.0


Option to open via file upload from gdrive

In [None]:
# Import drive first before exporting as csv file - !need to change to read from Git/BigQuery!
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
# Use pandas to read into data
df = pd.read_csv('gdrive/My Drive/bq-iowa-liquor/bq-iowa-liquor.csv')
print(df.shape)
df.head()

# **Exploratory Data Analysis**

In [5]:
df.describe()

Unnamed: 0,volume
count,446139.0
mean,40.015801
std,247.456766
min,0.02
25%,3.0
50%,9.0
75%,27.0
max,79852.5


In [6]:
print( "Total unique customers: ", df.groupby('store_number').store_number.nunique().count())
print( "Total unique stock items: ", df.groupby('item_number').item_number.nunique().count())

Total unique customers:  1841
Total unique stock items:  3846


In [7]:
df['store_number'].astype('str').describe()

count     446139
unique      1841
top         2190
freq        1792
Name: store_number, dtype: object

In [8]:
df['item_number'].astype('str').describe()

count     446139
unique      3846
top        11788
freq        1611
Name: item_number, dtype: object

In [9]:
cust_count = df.groupby('CustomerID').nunique().sort_values(by='StockCode', ascending=False)
cust_count = cust_count.reset_index()


KeyError: ignored

In [10]:

cust_count[cust_count['CustomerID']==2106]

NameError: ignored

In [11]:
df.groupby('CustomerID').StockCode.nunique()

KeyError: ignored

In [None]:
df.groupby('item_number').describe().sort_values(by='item_number', ascending=False)

In [None]:
df.groupby('CustomerID').describe().sortby(ascending=False)

In [None]:
# Most frequently bought items

In [None]:
# Highest purchase volume items 

In [None]:
# Top customers by purchase frequency

In [None]:
# Top customers by purchase volume

In [None]:
data_count = data.groupby(['CustomerID']).count().sort_values(by='Quantity', ascending=False)
print(data_count.shape)
data_count.head(10)

# **Data Preparation**

In [14]:
# Get rename data for product/customer matrix
df.rename(columns={'store_number': 'CustomerID', \
                   'item_number': 'StockCode', \
                   'volume': 'Quantity'}, inplace=True)
data = df
print(data.shape)
data.head()

(446139, 3)


Unnamed: 0,CustomerID,StockCode,Quantity
0,4829,27066,360.0
1,4311,43031,1.35
2,2627,43040,26.25
3,2500,27175,60.0
4,5054,5037,5.0


**Create Data with User, Product and Purchase Count**

In [15]:
# Group by to get purchase quantity by customer ID and stock code
data = data.groupby(['CustomerID','StockCode']) \
    .agg({'Quantity':'sum'}) \
    .reset_index().sort_values(by=['Quantity'], ascending=False)
print(data.shape)
data.head()

(446139, 3)


Unnamed: 0,CustomerID,StockCode,Quantity
160410,3814,936600,79852.5
255205,4677,936600,41580.0
160413,3814,962094,31185.0
387312,5666,936600,28350.0
267831,4829,38177,26280.0


In [16]:
# Check to ensure no negative purchase order quantity
data[data['Quantity']<0]

Unnamed: 0,CustomerID,StockCode,Quantity


In [17]:
# Check data types for current dataframe. CustomerID must be string
data.dtypes

CustomerID     object
StockCode      object
Quantity      float64
dtype: object

*Outcome: Need to change CustomerID from integer to string*

In [18]:
# Format CustomerID from integer to string
data['CustomerID'] = data['CustomerID'].astype(str)
data.dtypes

CustomerID     object
StockCode      object
Quantity      float64
dtype: object

**Create Dummy Dataset**

In [19]:
# Create dummy
def create_data_dummy(data):
    data_dummy = data.copy()
    data_dummy['purchase_dummy'] = 1
    return data_dummy
data_dummy = create_data_dummy(data)
print(data_dummy.shape)
data_dummy.head()

(446139, 4)


Unnamed: 0,CustomerID,StockCode,Quantity,purchase_dummy
160410,3814,936600,79852.5,1
255205,4677,936600,41580.0,1
160413,3814,962094,31185.0,1
387312,5666,936600,28350.0,1
267831,4829,38177,26280.0,1


**Normalize Item Values Across Users**

In [20]:
# Function to normalize item values across users
def normalize_data(data):
    df_matrix = pd.pivot_table(data, values='Quantity', index='CustomerID', columns='StockCode')
    df_matrix_norm = (df_matrix-df_matrix.min())/(df_matrix.max()-df_matrix.min())
    d = df_matrix_norm.reset_index()
    d.index.names = ['scaled_purchase_freq']
    return pd.melt(d, id_vars=['CustomerID'], value_name='scaled_purchase_freq').dropna()

In [21]:
# Execute function
norm_data = normalize_data(data)
print(norm_data.shape)
norm_data.head()

(443400, 3)


Unnamed: 0,CustomerID,StockCode,scaled_purchase_freq
18,2502,100015,0.482759
20,2506,100015,0.137931
27,2515,100015,0.482759
35,2528,100015,0.586207
39,2536,100015,0.482759


# **Split Train and Test Datasets**

In [22]:
# Declare function to split train and test data
def split_data(data):
    '''
    Splits dataset into training and test set.
    
    Args:
        data (pandas.DataFrame)
        
    Returns
        train_data (tc.SFrame)
        test_data (tc.SFrame)
    '''
    train, test = train_test_split(data, test_size = .2)
    train_data = tc.SFrame(train)
    test_data = tc.SFrame(test)
    return train_data, test_data

In [23]:
# Split normalised scaled frequency data
train_data_norm, test_data_norm = split_data(norm_data)

In [24]:
# Split dummy data
train_data_dummy, test_data_dummy = split_data(data_dummy)

In [25]:
# Split purchase count data
train_data, test_data = split_data(data)

# **Define Models Using Turicreate Library**

In [26]:
# Define variables for field names
user_id = 'CustomerID'
item_id = 'StockCode'
users_to_recommend = list(data['CustomerID'])
n_rec = 10 # number of items to recommend
n_display = 10 # to display the first few rows in an output dataset

In [27]:
# Declare function for all models
def model(train_data, name, user_id, item_id, target, users_to_recommend, n_rec, n_display):
    if name == 'popularity':
        model = tc.popularity_recommender.create(train_data, 
                                                    user_id=user_id, 
                                                    item_id=item_id, 
                                                    target=target)
    elif name == 'cosine':
        model = tc.item_similarity_recommender.create(train_data, 
                                                    user_id=user_id, 
                                                    item_id=item_id, 
                                                    target=target, 
                                                    similarity_type='cosine')
    elif name == 'pearson':
        model = tc.item_similarity_recommender.create(train_data, 
                                                    user_id=user_id, 
                                                    item_id=item_id, 
                                                    target=target, 
                                                    similarity_type='pearson')
        
    recom = model.recommend(users=users_to_recommend, k=n_rec)
    recom.print_rows(n_display)
    return model

# **Popularity Model as Baseline**

**Model based on Purchase Count**

In [28]:
# Using Purchase Count
name = 'popularity'
target = 'Quantity'
popularity = model(train_data, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+-----------+--------------------+------+
| CustomerID | StockCode |       score        | rank |
+------------+-----------+--------------------+------+
|    3814    |   936859  |     9800.4375      |  1   |
|    3814    |   962476  |       3612.0       |  2   |
|    3814    |   946574  |       2576.0       |  3   |
|    3814    |   77487   | 1464.921568627451  |  4   |
|    3814    |   988063  |       1368.0       |  5   |
|    3814    |   75087   | 1155.2380952380952 |  6   |
|    3814    |   926875  |       1134.0       |  7   |
|    3814    |   900138  |       1075.2       |  8   |
|    3814    |   988547  |       984.0        |  9   |
|    3814    |   926684  |       814.5        |  10  |
+------------+-----------+--------------------+------+
[4461390 rows x 4 columns]



**Model based on Purchase Dummy**

In [29]:
# Using purchase dummy
name = 'popularity'
target = 'purchase_dummy'
pop_dummy = model(train_data_dummy, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+-----------+-------+------+
| CustomerID | StockCode | score | rank |
+------------+-----------+-------+------+
|    3814    |   41161   |  1.0  |  1   |
|    3814    |    5325   |  1.0  |  2   |
|    3814    |   34188   |  1.0  |  3   |
|    3814    |   23827   |  1.0  |  4   |
|    3814    |   64870   |  1.0  |  5   |
|    3814    |   86886   |  1.0  |  6   |
|    3814    |   24728   |  1.0  |  7   |
|    3814    |   86888   |  1.0  |  8   |
|    3814    |   65257   |  1.0  |  9   |
|    3814    |   68340   |  1.0  |  10  |
+------------+-----------+-------+------+
[4461390 rows x 4 columns]



**Model based on Scaled Purchase Count**

In [30]:
# Using scaled purchase count
name = 'popularity'
target = 'scaled_purchase_freq'
pop_norm = model(train_data_norm, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+-----------+-------+------+
| CustomerID | StockCode | score | rank |
+------------+-----------+-------+------+
|    3814    |   88949   |  1.0  |  1   |
|    3814    |   973721  |  1.0  |  2   |
|    3814    |   951121  |  1.0  |  3   |
|    3814    |   928336  |  1.0  |  4   |
|    3814    |   100795  |  1.0  |  5   |
|    3814    |   977358  |  1.0  |  6   |
|    3814    |   903560  |  1.0  |  7   |
|    3814    |   900899  |  1.0  |  8   |
|    3814    |   988511  |  1.0  |  9   |
|    3814    |   992000  |  1.0  |  10  |
+------------+-----------+-------+------+
[4461390 rows x 4 columns]



**Baseline Summary**

In [31]:
# Calculate mean quantity by stock code
train_data.groupby(key_column_names='StockCode', operations={'mean_qty': agg.MEAN('Quantity')}).sort('mean_qty', ascending = False).head(20)

StockCode,mean_qty
936600,49927.5
962094,21420.0
936859,9800.4375
989291,8568.0
989289,7056.0
908806,6300.0
944650,5292.0
930746,4368.0
989292,3696.0
962476,3612.0


**EDA on Train Data**

In [36]:
train_data.groupby(key_column_names='CustomerID, operations={'count_custid': agg.COUNT_DISTINCT('StockCode')}).sort('count_custid', ascending = False).head(20)

SyntaxError: ignored

In [33]:
# Count by customer id for train_data
train_data.groupby(key_column_names='CustomerID', operations={'count_custid': agg.COUNT()}).sort('count_custid', ascending = False).head(20)

CustomerID,count_custid
2190,1480
4129,1415
2614,1394
2515,1231
2538,1195
4988,1192
2572,1181
2670,1179
5444,1158
5873,1118


# **Collaborative Filtering Model**

## Cosine Similarity

**Model based on Purchase Count**

In [37]:
# Using purchase count
name = 'cosine'
target = 'Quantity'
cos = model(train_data, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+-----------+--------------------+------+
| CustomerID | StockCode |       score        | rank |
+------------+-----------+--------------------+------+
|    3814    |   919494  | 1646.1726269651267 |  1   |
|    3814    |    5349   | 1238.6378663606563 |  2   |
|    3814    |    4356   | 1164.143297432843  |  3   |
|    3814    |   21236   | 992.9686003721366  |  4   |
|    3814    |   22207   |  900.549888046111  |  5   |
|    3814    |    4716   | 889.4122046302941  |  6   |
|    3814    |   26828   | 689.2488082950398  |  7   |
|    3814    |   27102   |  683.192667742907  |  8   |
|    3814    |   17206   | 361.9645007070848  |  9   |
|    3814    |   20920   | 357.5918919444084  |  10  |
+------------+-----------+--------------------+------+
[4461390 rows x 4 columns]



**Model based on Purchase Dummy**

In [38]:
# Using purchase dummy
name = 'cosine'
target = 'purchase_dummy'
cos_dummy = model(train_data_dummy, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+-----------+---------------------+------+
| CustomerID | StockCode |        score        | rank |
+------------+-----------+---------------------+------+
|    3814    |   989292  | 0.18626539707183837 |  1   |
|    3814    |   64776   | 0.17706262767314912 |  2   |
|    3814    |   25608   | 0.15520341793696085 |  3   |
|    3814    |   19486   | 0.15246741473674774 |  4   |
|    3814    |    5006   | 0.14382308224836984 |  5   |
|    3814    |   26828   | 0.14175480802853901 |  6   |
|    3814    |   15940   |  0.1352985123793284 |  7   |
|    3814    |   64136   | 0.13391431868076326 |  8   |
|    3814    |   17766   | 0.13231062392393747 |  9   |
|    3814    |   27605   | 0.12805834412574768 |  10  |
+------------+-----------+---------------------+------+
[4461390 rows x 4 columns]



**Model based on Scaled Purchase Count**

In [39]:
# Using scaled purchase count
name = 'cosine' 
target = 'scaled_purchase_freq' 
cos_norm = model(train_data_norm, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+-----------+---------------------+------+
| CustomerID | StockCode |        score        | rank |
+------------+-----------+---------------------+------+
|    3814    |    5329   |  0.1711694598197937 |  1   |
|    3814    |   18348   | 0.17090979549619886 |  2   |
|    3814    |   19478   | 0.16777102373264455 |  3   |
|    3814    |   28868   | 0.15597338477770487 |  4   |
|    3814    |    5349   | 0.11720618164097821 |  5   |
|    3814    |    4184   | 0.10958753581400271 |  6   |
|    3814    |   21236   | 0.10612083033279136 |  7   |
|    3814    |    4000   | 0.10089187158478631 |  8   |
|    3814    |   930746  |  0.0997303393152025 |  9   |
|    3814    |   944650  | 0.09861796873587149 |  10  |
+------------+-----------+---------------------+------+
[4461390 rows x 4 columns]



## Pearson similarity

**Model based on Purchase Count**

In [40]:
# Using purchase count
name = 'pearson'
target = 'Quantity'
pear = model(train_data, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+-----------+--------------------+------+
| CustomerID | StockCode |       score        | rank |
+------------+-----------+--------------------+------+
|    3814    |   936859  |     9800.4375      |  1   |
|    3814    |   962476  |       3612.0       |  2   |
|    3814    |   946574  |       2576.0       |  3   |
|    3814    |   77487   | 1464.9215686274513 |  4   |
|    3814    |   988063  |       1368.0       |  5   |
|    3814    |   75087   | 1159.2885875510437 |  6   |
|    3814    |   926875  |       1134.0       |  7   |
|    3814    |   900138  |       1075.2       |  8   |
|    3814    |   988547  |       984.0        |  9   |
|    3814    |   926684  |       814.5        |  10  |
+------------+-----------+--------------------+------+
[4461390 rows x 4 columns]



**Model based on Purchase Dummy**

In [41]:
# Using purchase dummy
name = 'pearson'
target = 'purchase_dummy'
pear_dummy = model(train_data_dummy, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+-----------+-------+------+
| CustomerID | StockCode | score | rank |
+------------+-----------+-------+------+
|    3814    |   41161   |  0.0  |  1   |
|    3814    |    5325   |  0.0  |  2   |
|    3814    |   34188   |  0.0  |  3   |
|    3814    |   23827   |  0.0  |  4   |
|    3814    |   64870   |  0.0  |  5   |
|    3814    |   86886   |  0.0  |  6   |
|    3814    |   24728   |  0.0  |  7   |
|    3814    |   86888   |  0.0  |  8   |
|    3814    |   65257   |  0.0  |  9   |
|    3814    |   68340   |  0.0  |  10  |
+------------+-----------+-------+------+
[4461390 rows x 4 columns]



**Model based on Scaled Purchase Count**

In [42]:
# Using scaled purchase count
name = 'pearson'
target = 'scaled_purchase_freq'
pear_norm = model(train_data_norm, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+-----------+-------+------+
| CustomerID | StockCode | score | rank |
+------------+-----------+-------+------+
|    3814    |   88949   |  1.0  |  1   |
|    3814    |   973721  |  1.0  |  2   |
|    3814    |   951121  |  1.0  |  3   |
|    3814    |   928336  |  1.0  |  4   |
|    3814    |   100795  |  1.0  |  5   |
|    3814    |   977358  |  1.0  |  6   |
|    3814    |   903560  |  1.0  |  7   |
|    3814    |   900899  |  1.0  |  8   |
|    3814    |   988511  |  1.0  |  9   |
|    3814    |   992000  |  1.0  |  10  |
+------------+-----------+-------+------+
[4461390 rows x 4 columns]



# **Apply to Trained Models to Test Dataset**

In [44]:
pop_dummy.recommend().print_rows(num_rows=40)

+------------+-----------+-------+------+
| CustomerID | StockCode | score | rank |
+------------+-----------+-------+------+
|    2663    |   49189   |  1.0  |  1   |
|    2663    |    4626   |  1.0  |  2   |
|    2663    |   64510   |  1.0  |  3   |
|    2663    |   41161   |  1.0  |  4   |
|    2663    |    5325   |  1.0  |  5   |
|    2663    |   23827   |  1.0  |  6   |
|    2663    |   64870   |  1.0  |  7   |
|    2663    |   19068   |  1.0  |  8   |
|    2663    |   24728   |  1.0  |  9   |
|    2663    |   65257   |  1.0  |  10  |
|    5153    |   64510   |  1.0  |  1   |
|    5153    |   27783   |  1.0  |  2   |
|    5153    |   41161   |  1.0  |  3   |
|    5153    |    5325   |  1.0  |  4   |
|    5153    |   34188   |  1.0  |  5   |
|    5153    |   23827   |  1.0  |  6   |
|    5153    |   86886   |  1.0  |  7   |
|    5153    |   24728   |  1.0  |  8   |
|    5153    |   86888   |  1.0  |  9   |
|    5153    |   68340   |  1.0  |  10  |
|    5244    |   64510   |  1.0  |

In [45]:
cos_dummy.recommend().print_rows(num_rows=40)

+------------+-----------+---------------------+------+
| CustomerID | StockCode |        score        | rank |
+------------+-----------+---------------------+------+
|    2663    |   41692   |  0.1270528933875018 |  1   |
|    2663    |   18348   | 0.08821413228087895 |  2   |
|    2663    |   28866   | 0.08447796828614491 |  3   |
|    2663    |   89198   |  0.0788739475496493 |  4   |
|    2663    |   15677   | 0.07723761948156735 |  5   |
|    2663    |   67526   | 0.07639445798178357 |  6   |
|    2663    |   65896   | 0.07556783714218832 |  7   |
|    2663    |   19068   | 0.07538529531872017 |  8   |
|    2663    |   42716   | 0.07438384291809791 |  9   |
|    2663    |   77632   |  0.0710803927813624 |  10  |
|    5153    |   41692   | 0.16807477338479296 |  1   |
|    5153    |   41846   | 0.16417938706898452 |  2   |
|    5153    |   43028   | 0.14620334057524653 |  3   |
|    5153    |   11788   |  0.1416821822081462 |  4   |
|    5153    |   27783   | 0.13408415712932548 |

In [46]:
pear_dummy.recommend().print_rows(num_rows=40)

+------------+-----------+-------+------+
| CustomerID | StockCode | score | rank |
+------------+-----------+-------+------+
|    2663    |   49189   |  0.0  |  1   |
|    2663    |    4626   |  0.0  |  2   |
|    2663    |   64510   |  0.0  |  3   |
|    2663    |   41161   |  0.0  |  4   |
|    2663    |    5325   |  0.0  |  5   |
|    2663    |   23827   |  0.0  |  6   |
|    2663    |   64870   |  0.0  |  7   |
|    2663    |   19068   |  0.0  |  8   |
|    2663    |   24728   |  0.0  |  9   |
|    2663    |   65257   |  0.0  |  10  |
|    5153    |   64510   |  0.0  |  1   |
|    5153    |   27783   |  0.0  |  2   |
|    5153    |   41161   |  0.0  |  3   |
|    5153    |    5325   |  0.0  |  4   |
|    5153    |   34188   |  0.0  |  5   |
|    5153    |   23827   |  0.0  |  6   |
|    5153    |   86886   |  0.0  |  7   |
|    5153    |   24728   |  0.0  |  8   |
|    5153    |   86888   |  0.0  |  9   |
|    5153    |   68340   |  0.0  |  10  |
|    5244    |   64510   |  0.0  |

# **Model Evaluation**

In [47]:
# Declare initial callable variables for model evaluation
models_w_counts = [popularity, cos, pear]
models_w_dummy = [pop_dummy, cos_dummy, pear_dummy]
models_w_norm = [pop_norm, cos_norm, pear_norm]
names_w_counts = ['Popularity Model on Purchase Counts', 'Cosine Similarity on Purchase Counts', 'Pearson Similarity on Purchase Counts']
names_w_dummy = ['Popularity Model on Purchase Dummy', 'Cosine Similarity on Purchase Dummy', 'Pearson Similarity on Purchase Dummy']
names_w_norm = ['Popularity Model on Scaled Purchase Counts', 'Cosine Similarity on Scaled Purchase Counts', 'Pearson Similarity on Scaled Purchase Counts']

In [48]:
models_w_counts

[Class                            : PopularityRecommender
 
 Schema
 ------
 User ID                          : CustomerID
 Item ID                          : StockCode
 Target                           : Quantity
 Additional observation features  : 0
 User side features               : []
 Item side features               : []
 
 Statistics
 ----------
 Number of observations           : 356911
 Number of users                  : 1841
 Number of items                  : 3710
 
 Training summary
 ----------------
 Training time                    : 0.0274
 
 Model Parameters
 ----------------
 Model class                      : PopularityRecommender,
 Class                            : ItemSimilarityRecommender
 
 Schema
 ------
 User ID                          : CustomerID
 Item ID                          : StockCode
 Target                           : Quantity
 Additional observation features  : 0
 User side features               : []
 Item side features               : []
 
 Stat

In [49]:
models_w_dummy

[Class                            : PopularityRecommender
 
 Schema
 ------
 User ID                          : CustomerID
 Item ID                          : StockCode
 Target                           : purchase_dummy
 Additional observation features  : 0
 User side features               : []
 Item side features               : []
 
 Statistics
 ----------
 Number of observations           : 356911
 Number of users                  : 1840
 Number of items                  : 3706
 
 Training summary
 ----------------
 Training time                    : 0.0271
 
 Model Parameters
 ----------------
 Model class                      : PopularityRecommender,
 Class                            : ItemSimilarityRecommender
 
 Schema
 ------
 User ID                          : CustomerID
 Item ID                          : StockCode
 Target                           : purchase_dummy
 Additional observation features  : 0
 User side features               : []
 Item side features               

In [50]:
models_w_norm

[Class                            : PopularityRecommender
 
 Schema
 ------
 User ID                          : CustomerID
 Item ID                          : StockCode
 Target                           : scaled_purchase_freq
 Additional observation features  : 0
 User side features               : []
 Item side features               : []
 
 Statistics
 ----------
 Number of observations           : 354720
 Number of users                  : 1840
 Number of items                  : 3039
 
 Training summary
 ----------------
 Training time                    : 0.0288
 
 Model Parameters
 ----------------
 Model class                      : PopularityRecommender,
 Class                            : ItemSimilarityRecommender
 
 Schema
 ------
 User ID                          : CustomerID
 Item ID                          : StockCode
 Target                           : scaled_purchase_freq
 Additional observation features  : 0
 User side features               : []
 Item side features   

## Compare all the models based on RMSE and precision-recall characteristics

**Evaluating Purchase Count Models**

In [51]:
# Using purchase counts models
eval_counts = tc.recommender.util.compare_models(test_data, models_w_counts, model_names=names_w_counts)

PROGRESS: Evaluate model Popularity Model on Purchase Counts



Precision and recall summary statistics by cutoff
+--------+------------------------+-----------------------+
| cutoff |     mean_precision     |      mean_recall      |
+--------+------------------------+-----------------------+
|   1    |          0.0           |          0.0          |
|   2    | 0.0005446623093681918  | 8.728562650131277e-05 |
|   3    | 0.0005446623093681916  | 9.775990168147029e-05 |
|   4    | 0.0004084967320261438  | 9.775990168147029e-05 |
|   5    | 0.0003267973856209149  | 9.775990168147029e-05 |
|   6    | 0.0002723311546840958  | 9.775990168147029e-05 |
|   7    | 0.0002334267040149393  | 9.775990168147029e-05 |
|   8    | 0.0002042483660130719  | 9.775990168147029e-05 |
|   9    | 0.00030259017187121766 |  0.00012178912121242  |
|   10   | 0.00032679738562091485 | 0.0001322633963925775 |
+--------+------------------------+-----------------------+
[10 rows x 3 columns]


Overall RMSE: 155.57789947722696

Per User RMSE (best)
+------------+----------------


Precision and recall summary statistics by cutoff
+--------+---------------------+---------------------+
| cutoff |    mean_precision   |     mean_recall     |
+--------+---------------------+---------------------+
|   1    | 0.46786492374727684 |  0.0150802039926751 |
|   2    |  0.4417211328976032 |  0.0277243715277485 |
|   3    | 0.42519970951343555 | 0.03959238908710528 |
|   4    | 0.40985838779956413 | 0.05031377797224898 |
|   5    | 0.39575163398692825 | 0.06040314092559801 |
|   6    |  0.386165577342048  | 0.07104131516962066 |
|   7    |  0.3784624961095548 | 0.08090321494795971 |
|   8    | 0.36962145969498916 | 0.08993100402367962 |
|   9    |  0.3609900750423625 |  0.0988368080014922 |
|   10   |  0.3517429193899785 | 0.10624771291890017 |
+--------+---------------------+---------------------+
[10 rows x 3 columns]


Overall RMSE: 170.7968390782733

Per User RMSE (best)
+------------+--------------------+-------+
| CustomerID |        rmse        | count |
+------------


Precision and recall summary statistics by cutoff
+--------+------------------------+-----------------------+
| cutoff |     mean_precision     |      mean_recall      |
+--------+------------------------+-----------------------+
|   1    |          0.0           |          0.0          |
|   2    | 0.0005446623093681918  | 8.728562650131277e-05 |
|   3    | 0.0005446623093681916  | 9.775990168147034e-05 |
|   4    | 0.00040849673202614375 | 9.775990168147034e-05 |
|   5    | 0.00032679738562091496 | 9.775990168147033e-05 |
|   6    | 0.0002723311546840958  | 9.775990168147033e-05 |
|   7    | 0.0002334267040149392  | 9.775990168147033e-05 |
|   8    | 0.00020424836601307177 | 9.775990168147033e-05 |
|   9    | 0.00030259017187121777 |  0.00012178912121242  |
|   10   | 0.00032679738562091506 | 0.0001322633963925775 |
+--------+------------------------+-----------------------+
[10 rows x 3 columns]


Overall RMSE: 154.48374594615058

Per User RMSE (best)
+------------+----------------

**Evaluating Purchase Dummy Models**

In [52]:
# Using dummy models
eval_dummy = tc.recommender.util.compare_models(test_data_dummy, models_w_dummy, model_names=names_w_dummy)

PROGRESS: Evaluate model Popularity Model on Purchase Dummy



Precision and recall summary statistics by cutoff
+--------+-----------------------+------------------------+
| cutoff |     mean_precision    |      mean_recall       |
+--------+-----------------------+------------------------+
|   1    |  0.001633097441480675 | 1.6550480828848795e-05 |
|   2    | 0.0027218290691344562 | 4.226400816700887e-05  |
|   3    |  0.002358918526583197 | 5.229969949489437e-05  |
|   4    |  0.002177463255307567 |  6.46096691196399e-05  |
|   5    | 0.0019597169297768105 | 7.032334702305734e-05  |
|   6    |  0.001996007984031936 | 9.705627289892708e-05  |
|   7    | 0.0019441636208103266 | 0.00010695804536253228 |
|   8    |  0.002041371801850846 | 0.00013181378558757976 |
|   9    | 0.0021169781648823563 | 0.00014914165414840958 |
|   10   | 0.0020685900925421882 | 0.00018209575097349735 |
+--------+-----------------------+------------------------+
[10 rows x 3 columns]


Overall RMSE: 0.0

Per User RMSE (best)
+------------+------+-------+
| CustomerID | 


Precision and recall summary statistics by cutoff
+--------+---------------------+----------------------+
| cutoff |    mean_precision   |     mean_recall      |
+--------+---------------------+----------------------+
|   1    |  0.6162221012520411 | 0.026739530390281468 |
|   2    |  0.593358737071312  | 0.04940761897838514  |
|   3    |  0.5672291780076213 | 0.06797146362216665  |
|   4    |  0.5494011976047901 | 0.08628879539109352  |
|   5    |  0.5385955362003264 | 0.10340981637487959  |
|   6    |  0.5282162946833607 | 0.12036424492739035  |
|   7    |  0.5176918889493731 | 0.13626033494980608  |
|   8    |  0.5061921611322806 | 0.14944366502832854  |
|   9    | 0.49845763019415656 |  0.1639375763184381  |
|   10   |  0.4883505715841043 | 0.17630031977956934  |
+--------+---------------------+----------------------+
[10 rows x 3 columns]


Overall RMSE: 0.9429956930942877

Per User RMSE (best)
+------------+---------------------+-------+
| CustomerID |         rmse        | coun


Precision and recall summary statistics by cutoff
+--------+-----------------------+------------------------+
| cutoff |     mean_precision    |      mean_recall       |
+--------+-----------------------+------------------------+
|   1    | 0.0016330974414806745 | 1.0147752927579792e-05 |
|   2    | 0.0016330974414806728 |  2.67081393855548e-05  |
|   3    |  0.001996007984031936 | 8.318422664616385e-05  |
|   4    |  0.002041371801850842 | 0.00010331826913804055 |
|   5    | 0.0027218290691344584 | 0.00017112934210573142 |
|   6    |  0.002631101433496643 | 0.00019226440654704057 |
|   7    |  0.00233299634497239  | 0.00020273297988986544 |
|   8    |  0.002517691888949375 | 0.00024223532914498092 |
|   9    |  0.002540373797858827 | 0.00027357033144652734 |
|   10   | 0.0026129559063690806 | 0.00030953135929431147 |
+--------+-----------------------+------------------------+
[10 rows x 3 columns]


Overall RMSE: 1.0

Per User RMSE (best)
+------------+------+-------+
| CustomerID | 

**Evaluating Scaled Purchase Count Models**

In [53]:
# Using normalized purchase count models
eval_norm = tc.recommender.util.compare_models(test_data_norm, models_w_norm, model_names=names_w_norm)

PROGRESS: Evaluate model Popularity Model on Scaled Purchase Counts



Precision and recall summary statistics by cutoff
+--------+------------------------+------------------------+
| cutoff |     mean_precision     |      mean_recall       |
+--------+------------------------+------------------------+
|   1    |          0.0           |          0.0           |
|   2    | 0.00027233115468409594 | 3.6554517407261154e-06 |
|   3    |  0.000544662309368191  | 8.997614239076445e-06  |
|   4    |  0.000816993464052288  | 1.6673741097931278e-05 |
|   5    | 0.0007625272331154684  | 2.2934227412508195e-05 |
|   6    | 0.0007262164124909224  | 2.459478323375272e-05  |
|   7    | 0.0007780890133831314  | 3.541959293425622e-05  |
|   8    | 0.0007489106753812638  | 3.722910226770532e-05  |
|   9    | 0.0006656983781166788  | 3.722910226770535e-05  |
|   10   | 0.0006535947712418305  | 3.927670493450308e-05  |
+--------+------------------------+------------------------+
[10 rows x 3 columns]


Overall RMSE: 0.14881177931030712

Per User RMSE (best)
+------------+-


Precision and recall summary statistics by cutoff
+--------+---------------------+----------------------+
| cutoff |    mean_precision   |     mean_recall      |
+--------+---------------------+----------------------+
|   1    |  0.5713507625272326 | 0.019627617799047287 |
|   2    |  0.5296840958605661 | 0.03579774912648331  |
|   3    |  0.4981844589687735 | 0.05016410757123417  |
|   4    | 0.47453703703703703 | 0.06275432328413783  |
|   5    |  0.4538126361655774 | 0.07377967534167082  |
|   6    |  0.4379084967320262 | 0.08551212397514507  |
|   7    | 0.42491441020852794 | 0.09612157863052077  |
|   8    |  0.4127178649237473 | 0.10574343333163362  |
|   9    | 0.40081094166061526 | 0.11451100522596387  |
|   10   |  0.3928649237472765 |  0.1247223988696822  |
+--------+---------------------+----------------------+
[10 rows x 3 columns]


Overall RMSE: 0.18954266481637314

Per User RMSE (best)
+------------+-----------------------+-------+
| CustomerID |          rmse         |


Precision and recall summary statistics by cutoff
+--------+-----------------------+------------------------+
| cutoff |     mean_precision    |      mean_recall       |
+--------+-----------------------+------------------------+
|   1    | 0.0005446623093681919 | 3.975637294658332e-06  |
|   2    |  0.000816993464052288 | 9.317799793008657e-06  |
|   3    | 0.0009077705156136529 | 1.451607317328944e-05  |
|   4    | 0.0006808278867102399 | 1.451607317328944e-05  |
|   5    | 0.0008714596949891073 | 2.161025456361777e-05  |
|   6    | 0.0011801016702977472 | 3.565016606487445e-05  |
|   7    | 0.0013227513227513214 | 4.6698942552558585e-05 |
|   8    |  0.001157407407407408 | 4.669894255255857e-05  |
|   9    | 0.0010288065843621404 | 4.6698942552558585e-05 |
|   10   | 0.0009259259259259254 | 4.6698942552558585e-05 |
+--------+-----------------------+------------------------+
[10 rows x 3 columns]


Overall RMSE: 0.1479981125759061

Per User RMSE (best)
+------------+----------------

## Evaluation Output

## Evaluation Summary

# **Final Output**

## Collaborative Filtering

In [54]:
# Rerun the model using the whole dataset, as we came to a final model using train data and evaluated with test set
final_model = tc.item_similarity_recommender.create(tc.SFrame(test_data_dummy), 
                                            user_id=user_id, 
                                            item_id=item_id, 
                                            target='purchase_dummy', similarity_type='cosine')
recom = final_model.recommend(users=users_to_recommend, k=n_rec)
recom.print_rows(n_display)

+------------+-----------+---------------------+------+
| CustomerID | StockCode |        score        | rank |
+------------+-----------+---------------------+------+
|    3814    |   29566   |  0.0599596381187439 |  1   |
|    3814    |   936390  | 0.05773502588272095 |  2   |
|    3814    |   910880  | 0.05773502588272095 |  3   |
|    3814    |   904921  | 0.05773502588272095 |  4   |
|    3814    |   904009  | 0.05773502588272095 |  5   |
|    3814    |   988031  | 0.05773502588272095 |  6   |
|    3814    |   15500   | 0.05773502588272095 |  7   |
|    3814    |   989346  | 0.05773502588272095 |  8   |
|    3814    |   928293  | 0.05773502588272095 |  9   |
|    3814    |   902115  | 0.05773502588272095 |  10  |
+------------+-----------+---------------------+------+
[4461390 rows x 4 columns]



In [55]:
# Save final model for collaborative filtering
final_model.save("drinkupiowa")

In [56]:
# Transfer Turicreate model to pandas dataframe
df_rec = recom.to_dataframe()
print(df_rec.shape)
df_rec.head()

(4461390, 4)


Unnamed: 0,CustomerID,StockCode,score,rank
0,3814,29566,0.05996,1
1,3814,936390,0.057735,2
2,3814,910880,0.057735,3
3,3814,904921,0.057735,4
4,3814,904009,0.057735,5


In [57]:
# Declare a function to create a desired output and export dataframe to csv
def create_output(model, users_to_recommend, n_rec, print_csv=True):
    recomendation = model.recommend(users=users_to_recommend, k=n_rec)
    df_rec = recomendation.to_dataframe()
    df_rec['recommendedProducts'] = df_rec.groupby([user_id])[item_id] \
        .transform(lambda x: '|'.join(x.astype(str)))
    df_output = df_rec[['CustomerID', 'recommendedProducts']].drop_duplicates() \
        .sort_values('CustomerID').set_index('CustomerID')
    if print_csv:
        df_output.to_csv('gdrive\option1_recommendation.csv')
        print("An output file can be found in 'output' folder with name 'option1_recommendation.csv'")
    return df_output

In [58]:
# Execute function on whole dataset  
df_output = create_output(pear_norm, users_to_recommend, n_rec, print_csv=True)
print(df_output.shape)
df_output.head()

An output file can be found in 'output' folder with name 'option1_recommendation.csv'
(1841, 1)


Unnamed: 0_level_0,recommendedProducts
CustomerID,Unnamed: 1_level_1
2106,88949|973721|951121|928336|100795|977358|90356...
2113,88949|973721|951121|928336|100795|977358|90356...
2130,88949|973721|951121|928336|100795|977358|90356...
2178,88949|973721|951121|928336|100795|977358|90356...
2190,946916|37384|926727|88949|973721|100795|977358...


## Popularity