### Ecommerce Analytics - Clustering

In [37]:
# import libraries
import boto3, re, sys, math, json, os, sagemaker, urllib.request
import pickle
import tarfile
from itertools import cycle, islice
from time import gmtime, strftime
from urllib.parse import urlparse

import matplotlib.pyplot as plt
import mxnet as mx
import numpy as np
import pandas as pd
from IPython.display import Image
from IPython.display import display
from sagemaker import get_execution_role
from sagemaker import KMeans
from sagemaker.predictor import csv_serializer
from sklearn.preprocessing import StandardScaler

import utils

#### Set up

In [38]:
# S3 bucket used by this notebook. The name can be overridden through the
# SAGEMAKER_BUCKET environment variable so the notebook can run against another
# account without editing this cell; the original bucket remains the default.
bucket_name = os.environ.get('SAGEMAKER_BUCKET', 'sagemaker-us-east-2-962225948309')

In [39]:
# Resolve the IAM execution role attached to this notebook; it is passed to the
# KMeans estimator further down as `role`.
role = get_execution_role()
# NOTE(review): the printed ARN exposes the AWS account id — consider clearing
# this output before sharing the notebook.
print(role)

arn:aws:iam::560062611886:role/service-role/AmazonSageMaker-ExecutionRole-20190311T034175


In [40]:
# Region of the default boto3 session, printed for reference.
# NOTE(review): presumably None when no region is configured — confirm.
my_region = boto3.session.Session().region_name
print(my_region)

us-east-2


#### Load Data

In [41]:
# Raw input tables, listed in the order they are unpacked below
csv_paths = (
    './customer_reviews_dataset.csv',
    './order_items_dataset.csv',
    './orders_dataset.csv',
    './products_dataset.csv',
    './product_category_name_translation.csv',
)

# Read every CSV with pandas' default parsing options
(customer_review,
 order_item,
 orders,
 products,
 product_name_translate) = map(pd.read_csv, csv_paths)

#### Merge Data

In [42]:
# Start from the reviews and attach the product ids of the reviewed order.
# A left join keeps reviews whose order has no matching item row.
data = pd.merge(
    customer_review[['review_id', 'order_id', 'survey_score']],
    order_item[['order_id', 'product_id']],
    on='order_id',
    how='left',
)

In [43]:
# Attach the customer id of each order (left join on order_id)
data = pd.merge(
    data,
    orders[['order_id', 'customer_id']],
    on='order_id',
    how='left',
)

In [44]:
# Attach the (untranslated) category name of each product (left join on product_id)
data = pd.merge(
    data,
    products[['product_id', 'product_category_name']],
    on='product_id',
    how='left',
)

In [45]:
# Attach every column of the translation table, matched on the original category name
data = pd.merge(
    data,
    product_name_translate,
    on='product_category_name',
    how='left',
)

#### Prepare Data

In [47]:
# Keep only the rows in which every column is populated
data = data.dropna(axis=0, how='any')

In [48]:
# Average survey score per customer and category, pivoted so that each customer
# is one row and each category one column. Customer/category pairs without any
# review are filled with 0.
d1 = (
    data[['customer_id', 'product_category_name_english', 'survey_score']]
    .rename(index=str, columns={'product_category_name_english': 'category'})
    .groupby(['customer_id', 'category'])['survey_score']
    .mean()
    .unstack(fill_value=0)
    .reset_index()
)

In [49]:
# Number of rows per customer and category, pivoted to one row per customer and
# one column per category. Pairs that never occur are filled with 0.
d2 = (
    data[['customer_id', 'product_category_name_english']]
    .groupby(by=['customer_id', 'product_category_name_english'])
    .size()
    .reset_index(name='counts')
    .rename(index=str, columns={'product_category_name_english': 'category'})
    .groupby(['customer_id', 'category'])['counts']
    .sum()
    .unstack(fill_value=0)
    .reset_index()
)

In [50]:
# Combine both per-customer tables. Category columns coming from d1 get the
# suffix "_rate", the ones coming from d2 the suffix "_count".
data = pd.merge(
    d1,
    d2,
    on='customer_id',
    how='left',
    suffixes=('_rate', '_count'),
)

In [51]:
# Split into Train and Test (0.8, 0.2)
# Shuffle once with a fixed seed, then slice by position. Positional slicing
# replaces np.split on a DataFrame, which relies on DataFrame.swapaxes and is
# deprecated in recent pandas releases; the two partitions are the same.
shuffled = data.sample(frac=1, random_state=1729)
n_train = int(0.8 * len(shuffled))
train_data_unscaled = shuffled.iloc[:n_train]
test_data_unscaled = shuffled.iloc[n_train:]

print("Train and test data sizes")
print(train_data_unscaled.shape, test_data_unscaled.shape)

Train and test data sizes
(77804, 143) (19452, 143)


In [52]:
# Feature columns: every column after the leading customer_id column
featureCols = data.columns[1:].tolist()

In [53]:
# Standardise the features for KMeans. The scaler is fitted on the training
# split only and then re-used unchanged for the test split.
scaler = StandardScaler()
train_scaled = scaler.fit_transform(train_data_unscaled[featureCols])
test_scaled = scaler.transform(test_data_unscaled[featureCols])

# Wrap the scaled arrays back into DataFrames with the feature names as columns.
# Row order is unchanged; the index is a fresh default index.
train_data = pd.DataFrame(train_scaled, columns=featureCols)
test_data = pd.DataFrame(test_scaled, columns=featureCols)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


#### Set up KMeans Model

In [17]:
#data_location   = 's3://{}/data'.format(bucket_name)
#output_location = 's3://{}/output'.format(bucket_name)

#print('training data will be uploaded to: {}'.format(data_location))
#print('training artifacts will be uploaded to: {}'.format(output_location))

In [54]:
# Number of clusters to fit
K = 5

# SageMaker KMeans estimator trained on a single ml.m4.xlarge instance
kmeans = KMeans(
    k=K,
    role=role,
    train_instance_type='ml.m4.xlarge',
    train_instance_count=1,
)

#### Fit the model

In [55]:
# Convert the scaled training matrix to float32 and wrap it as a record set
train_records = kmeans.record_set(train_data.values.astype('float32'))

# Launch the training job.
# NOTE(review): fit() presumably returns None, in which case km_model carries no
# model object — confirm before relying on it.
km_model = kmeans.fit(train_records)

INFO:sagemaker:Creating training-job with name: kmeans-2019-04-02-05-17-11-285


2019-04-02 05:17:11 Starting - Starting the training job...
2019-04-02 05:17:12 Starting - Launching requested ML instances......
2019-04-02 05:18:13 Starting - Preparing the instances for training...
2019-04-02 05:18:59 Downloading - Downloading input data
2019-04-02 05:18:59 Training - Downloading the training image....
[31mDocker entrypoint called with argument(s): train[0m
[31m[04/02/2019 05:19:45 INFO 140415369017152] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/resources/default-input.json: {u'_enable_profiler': u'false', u'_tuning_objective_metric': u'', u'_num_gpus': u'auto', u'local_lloyd_num_trials': u'auto', u'_log_level': u'info', u'_kvstore': u'auto', u'local_lloyd_init_method': u'kmeans++', u'force_dense': u'true', u'epochs': u'1', u'init_method': u'random', u'local_lloyd_tol': u'0.0001', u'local_lloyd_max_iter': u'300', u'_disable_wait_to_read': u'false', u'extra_center_factor': u'auto', u'eval_metrics': u'["msd"]', u'_num_kv_se


2019-04-02 05:19:55 Uploading - Uploading generated training model
2019-04-02 05:19:55 Completed - Training job completed
Billable seconds: 65


#### Deploy KMeans Predictors

In [56]:
# Host the trained model behind a real-time endpoint on one ml.t2.medium instance
kmeans_predictor = kmeans.deploy(
    instance_type='ml.t2.medium',
    initial_instance_count=1,
)

INFO:sagemaker:Creating model with name: kmeans-2019-04-02-05-20-23-134
INFO:sagemaker:Creating endpoint with name kmeans-2019-04-02-05-17-11-285


---------------------------------------------------------------------------------------!

#### Apply Model to Train and Test Sets

In [58]:
# Score the training set in batches of at most ~500 rows. A single predict()
# call puts the whole matrix into one request, which for a training set of this
# size is far larger than a real-time endpoint invocation accepts.
train_features = train_data.values.astype('float32')
n_batches = max(1, math.ceil(len(train_features) / 500))

train_result = []
for batch in np.array_split(train_features, n_batches):
    if len(batch):  # array_split yields one empty chunk for empty input
        train_result.extend(kmeans_predictor.predict(batch))

In [None]:
def _predict_in_batches(predictor, features, batch_rows=500):
    """Send `features` to `predictor` in chunks and return all records in row order.

    A single predict() call puts the whole matrix into one request; for the
    data sets used here that is larger than a real-time endpoint invocation
    accepts, so the rows are split into chunks of at most ~`batch_rows` rows.
    """
    n_batches = max(1, math.ceil(len(features) / batch_rows))
    records = []
    for batch in np.array_split(features, n_batches):
        if len(batch):  # array_split yields one empty chunk for empty input
            records.extend(predictor.predict(batch))
    return records


def _cluster_assignments(records):
    """Return an (n, 2) array of [closest_cluster, distance_to_cluster] per record."""
    pairs = [
        [r.label['closest_cluster'].float32_tensor.values[0],
         r.label['distance_to_cluster'].float32_tensor.values[0]]
        for r in records
    ]
    # reshape keeps the result two-dimensional even when there are no records
    return np.array(pairs).reshape(-1, 2)


train_result = _predict_in_batches(kmeans_predictor, train_data.values.astype('float32'))
test_result = _predict_in_batches(kmeans_predictor, test_data.values.astype('float32'))

# Retrieve the closest cluster label and distance to cluster for each sample in training and testing set
train_clusters = _cluster_assignments(train_result)
test_clusters = _cluster_assignments(test_result)

### Distribution of Samples among Clusters

In [None]:
def _plot_cluster_counts(clusters, title):
    """Bar-plot the number of samples per cluster and return the counts.

    Column 0 of `clusters` holds the assigned cluster id of each sample.
    """
    counts = pd.DataFrame(clusters[:, 0])[0].value_counts().sort_index()
    # Place every bar at its real cluster id. Positioning bars at
    # 0..len(counts)-1 shifts them under the wrong id as soon as one cluster
    # received no samples, because value_counts() omits empty clusters.
    cluster_ids = counts.index.astype(int)
    plt.bar(cluster_ids, counts.values)
    plt.xticks(cluster_ids)
    plt.xlabel("Cluster")
    plt.ylabel("Number of samples")
    plt.title(title)
    plt.show()
    return counts


train_cluster_counts = _plot_cluster_counts(train_clusters, "Cluster Counts for Training Set")
test_cluster_counts = _plot_cluster_counts(test_clusters, "Cluster Counts for Test Set")

### Sample Train/Test Predictions

In [None]:
# Concatenate original training/testing dataframe with new columns holding the
# assigned cluster and the distance to that cluster
train_with_clusters = train_data_unscaled[featureCols]
test_with_clusters = test_data_unscaled[featureCols]

# Passing an existing frame to pd.DataFrame together with extra `columns` only
# re-indexes it, which left "prediction" and "distance" entirely NaN. Attach the
# model output explicitly instead. The arrays are assigned by position: the
# scaled matrices sent to the endpoint keep the row order of the unscaled frames.
train_pd = train_with_clusters.assign(
    prediction=train_clusters[:, 0].astype(int),
    distance=train_clusters[:, 1],
)
test_pd = test_with_clusters.assign(
    prediction=test_clusters[:, 0].astype(int),
    distance=test_clusters[:, 1],
)

In [None]:
# Show the first five rows of each frame
print("Assigned Cluster and Distance to Cluster for Sample Train Samples")
display(train_pd.head(5))

print("\nAssigned Cluster and Distance to Cluster for Sample Test Samples")
display(test_pd.head(5))

### Accessing KMeans Model Parameters

In [None]:
# Get location of model on S3. The estimator reports where its training job
# wrote model.tar.gz, so read the location from it instead of assembling a key
# from a fixed prefix and the endpoint name (the estimator above was created
# without an explicit output path).
model_location = urlparse(kmeans.model_data)
model_bucket = model_location.netloc
model_key = model_location.path.lstrip('/')
print(model_key)

# Retrieve tar.gz file from the bucket that holds the artifact
boto3.resource('s3').Bucket(model_bucket).download_file(model_key, 'model.tar.gz')

# Untar model file with the standard library so that a failure raises here
# instead of being silently ignored
with tarfile.open('model.tar.gz', 'r:gz') as archive:
    archive.extractall()

# Best-effort, as before: the exit status of unzip is deliberately not checked
os.system('unzip model_algo-1')

# Load KMeans Model Parameters
kmeans_model_params = mx.ndarray.load('model_algo-1')

# Cluster centroids: one row per cluster, one column per feature
cluster_centroids = pd.DataFrame(kmeans_model_params[0].asnumpy())
cluster_centroids.columns = featureCols

In [None]:
# Print the centroid table
print("Cluster Centroids\n", cluster_centroids)

# Hand the feature names and the raw centroid matrix to the project helper.
# NOTE(review): presumably returns the centroids labelled by feature name —
# confirm against utils.pd_centers.
centroid_matrix = cluster_centroids.values
cluster_centers_named = utils.pd_centers(featureCols, centroid_matrix)