# Model training

This notebook reads a small training and test set in Protobuf format, and 
invokes Sagemaker to train and deploy a model.

In [8]:
import pandas as pd
from collections import Counter
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import f1_score
import boto3
import os.path
from sagemaker import get_execution_role
import scipy

In [9]:
# Bucket with raw data
bucket = 'promo-dataprep'

# input train, test, and header data
trainprefix = 'train-small/train.protobuf'
testprefix = 'train-small/test.protobuf'
headerprefix = 'train-small/headers.csv'
trainlocal = 'train-small.protobuf'
testlocal = 'test-small.protobuf'
headerlocal = 'headers-train.csv'

In [10]:
# set up boto for s3 access
role = get_execution_role()
s3 = boto3.resource('s3')

# download inputs
if os.path.isfile(trainlocal) and os.path.exists(trainlocal):
    print("{0} already exists, skipping".format(trainlocal))
else:
    s3.Bucket(bucket).download_file(trainprefix, trainlocal)
if os.path.isfile(testlocal) and os.path.exists(testlocal):
    print("{0} already exists, skipping".format(testlocal))
else:
    s3.Bucket(bucket).download_file(testprefix, testlocal)
if os.path.isfile(headerlocal) and os.path.exists(headerlocal):
    print("{0} already exists, skipping".format(headerlocal))
else:
    s3.Bucket(bucket).download_file(headerprefix, headerlocal)

train-small.protobuf already exists, skipping
test-small.protobuf already exists, skipping
headers-train.csv already exists, skipping


## Train model

In [11]:
import sagemaker
output_prefix  = 's3://{}/{}/output'.format(bucket, 'train-small')

containers = {'us-east-1': 'dkr.ecr.us-east-1.amazonaws.com/tffmpromo:latest'}
fm = sagemaker.estimator.Estimator(containers[boto3.Session().region_name],
                                   role, 
                                   train_instance_count=1, 
                                   train_instance_type='ml.c4.8xlarge',
                                   output_path=output_prefix,
                                   sagemaker_session=sagemaker.Session())

fm.set_hyperparameters(order='3',
                      rank='7',
                      epochs=50,
                       header_file_bucket=bucket,
                       header_file_prefix='train-small/headers.csv')


In [12]:
fm.fit({'train': "s3://{0}/{1}".format(bucket, trainprefix), 
        'test': "s3://{0}/{1}".format(bucket, testprefix)})


INFO:sagemaker:Creating training-job with name: tffmpromo-2018-04-10-02-17-05-752


...................................................
[31mStarting the training.[0m
[31mNumber of samples: 200000[0m
[31mCreating sparse matrix of shape 40000,62520[0m
[31mX_tr shape: (40000, 62520)[0m
[31m2018-04-10 02:21:14.694903: I tensorflow/core/platform/cpu_feature_guard.cc:137] Your CPU supports instructions that this TensorFlow binary was not compiled to use: SSE4.1 SSE4.2 AVX AVX2 FMA[0m
[31m#015  0%|          | 0/50 [00:00<?, ?epoch/s]#015  2%|2         | 1/50 [00:00<00:12,  3.85epoch/s]#015  8%|8         | 4/50 [00:00<00:04, 10.25epoch/s]#015 14%|#4        | 7/50 [00:00<00:03, 13.73epoch/s]#015 20%|##        | 10/50 [00:00<00:02, 15.92epoch/s]#015 26%|##6       | 13/50 [00:00<00:02, 17.48epoch/s]#015 32%|###2      | 16/50 [00:00<00:01, 18.54epoch/s]#015 38%|###8      | 19/50 [00:00<00:01, 19.29epoch/s]#015 44%|####4     | 22/50 [00:01<00:01, 19.87epoch/s]#015 50%|#####     | 25/50 [00:01<00:01, 20.38epoch/s]#015 56%|#####6    | 28/50 [00:01<00:01, 20.85epoch/s]#015

In [13]:
print(fm.model_data)

s3://promo-dataprep/train-small/output/tffmpromo-2018-04-10-02-17-05-752/output/model.tar.gz


In [14]:
fm_predictor = fm.deploy(initial_instance_count=1,
                         instance_type='ml.m4.xlarge')

INFO:sagemaker:Creating model with name: tffmpromo-2018-04-10-02-23-16-492
INFO:sagemaker:Creating endpoint with name tffmpromo-2018-04-10-02-17-05-752


---------------------------------------------------------------!

## Factorization Machine

For the sake of comparison, let's see how well the native SageMaker FM algorithm does.

In [15]:
output_prefix  = 's3://{}/{}/output'.format(bucket, 'train-fm')

containers = {'us-west-2': '174872318107.dkr.ecr.us-west-2.amazonaws.com/factorization-machines:latest',
              'us-east-1': '382416733822.dkr.ecr.us-east-1.amazonaws.com/factorization-machines:latest',
              'us-east-2': '404615174143.dkr.ecr.us-east-2.amazonaws.com/factorization-machines:latest',
              'eu-west-1': '438346466558.dkr.ecr.eu-west-1.amazonaws.com/factorization-machines:latest'}
sfm = sagemaker.estimator.Estimator(containers[boto3.Session().region_name],
                                   role, 
                                   train_instance_count=1, 
                                   train_instance_type='ml.c4.8xlarge',
                                   output_path=output_prefix,
                                   sagemaker_session=sagemaker.Session())

sfm.set_hyperparameters(feature_dim=62520,
                      predictor_type='regressor',
                      num_factors=20,
                      epochs=20)

In [16]:
sfm.fit({'train': "s3://{0}/{1}".format(bucket, trainprefix), 
        'test': "s3://{0}/{1}".format(bucket, testprefix)})

INFO:sagemaker:Creating training-job with name: factorization-machines-2018-04-10-02-29-53-695


................................................................
[31mDocker entrypoint called with argument(s): train[0m
[31m[04/10/2018 02:35:09 INFO 140269428983616] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/resources/default-conf.json: {u'factors_lr': u'0.0001', u'linear_init_sigma': u'0.01', u'epochs': 1, u'_wd': u'1.0', u'_num_kv_servers': u'auto', u'use_bias': u'true', u'factors_init_sigma': u'0.001', u'_log_level': u'info', u'bias_init_method': u'normal', u'linear_init_method': u'normal', u'linear_lr': u'0.001', u'factors_init_method': u'normal', u'bias_wd': u'0.01', u'use_linear': u'true', u'_speedometer_period': u'500', u'bias_lr': u'0.1', u'mini_batch_size': u'1000', u'_use_full_symbolic': u'true', u'bias_init_sigma': u'0.01', u'_num_gpus': u'auto', u'_data_format': u'record', u'factors_wd': u'0.00001', u'linear_wd': u'0.001', u'_kvstore': u'auto', u'_learning_rate': u'1.0', u'_optimizer': u'adam'}[0m
[31m[04/10/2018 02:35:09 IN

[31m#metrics {"Metrics": {"update.time": {"count": 1, "max": 229.57706451416016, "sum": 229.57706451416016, "min": 229.57706451416016}}, "EndTime": 1523327713.631451, "Dimensions": {"Host": "algo-1", "Operation": "training", "Algorithm": "factorization-machines"}, "StartTime": 1523327713.401627}
[0m
[31m#metrics {"Metrics": {"Max Batches Seen Between Resets": {"count": 1, "max": 40, "sum": 40.0, "min": 40}, "Number of Batches Since Last Reset": {"count": 1, "max": 40, "sum": 40.0, "min": 40}, "Number of Records Since Last Reset": {"count": 1, "max": 40000, "sum": 40000.0, "min": 40000}, "Total Batches Seen": {"count": 1, "max": 761, "sum": 761.0, "min": 761}, "Total Records Seen": {"count": 1, "max": 761000, "sum": 761000.0, "min": 761000}, "Max Records Seen Between Resets": {"count": 1, "max": 40000, "sum": 40000.0, "min": 40000}, "Reset Count": {"count": 1, "max": 20, "sum": 20.0, "min": 20}}, "EndTime": 1523327713.631592, "Dimensions": {"Host": "algo-1", "Meta": "training_data_it

In [17]:
sfm_predictor = sfm.deploy(initial_instance_count=1,
                         instance_type='ml.m4.xlarge')

INFO:sagemaker:Creating model with name: factorization-machines-2018-04-10-02-36-10-107
INFO:sagemaker:Creating endpoint with name factorization-machines-2018-04-10-02-29-53-695


-------------------------------------------------------------------------------------!

In [18]:
from sagemaker.predictor import json_deserializer
import json

def fm_serializer(data):
    js = {'instances': []}
    for row in data:
        js['instances'].append({'features': row.tolist()})
    #print js
    return json.dumps(js)

sfm_predictor.content_type = 'application/json'
sfm_predictor.serializer = fm_serializer
sfm_predictor.deserializer = json_deserializer

In [25]:
import scipy.sparse as sp
import sagemaker.amazon.common as smac
from io import FileIO
import csv

num_test_samples = 0
te_row_ind = []
te_col_ind = []
te_data = []
te_idx = 0
y_te = []

headers = []
with open(headerlocal, 'r') as csvfile:
        headerreader = csv.reader(csvfile) 
        for row in headerreader:
            headers.append("".join(row))

num_features = len(headers)
            
test_records = smac.read_records(FileIO(testlocal))
num_test_samples = num_test_samples + len(test_records)
for test_record in test_records:
    te_row_ind.extend([te_idx] * len(test_record.features['values'].float32_tensor.values))
    te_col_ind.extend(test_record.features['values'].float32_tensor.keys)
    te_data.extend(test_record.features['values'].float32_tensor.values)
    te_idx = te_idx + 1
    y_te.append(test_record.label['values'].float32_tensor.values[0])

print("Creating test sparse matrix of shape {0},{1}".format(num_test_samples, num_features))
X_te_sparse = sp.csr_matrix( (np.array(te_data),(np.array(te_row_ind),np.array(te_col_ind))), shape=(num_test_samples,num_features) )
print("X_te shape: {0}".format(X_te_sparse.shape))
            
predictions = []
for idx in range(0,X_te_sparse.shape[0]):
    result = sfm_predictor.predict(X_te_sparse[idx].toarray())
    predictions += [r['score'] for r in result['predictions']]

predictions = np.array(predictions)
print("Got {0} predictions: e.g. {1}".format(len(predictions), predictions[0]))

Creating test sparse matrix of shape 10000,62520
X_te shape: (10000, 62520)
Got 10000 predictions: e.g. 0.166213095188


In [26]:
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score, average_precision_score
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_curve
from sklearn.metrics import precision_score, precision_recall_fscore_support, recall_score
predvecfm = np.where(predictions > 0.5, 1, 0)
print('Weighted F1: {}'.format(f1_score(y_te, predvecfm,average='weighted')))
print('Accuracy: {}'.format(accuracy_score(y_te, predvecfm)))
print('Weighted ROC: {}'.format(roc_auc_score(y_te, predvecfm, average='weighted')))

Weighted F1: 0.921709510534
Accuracy: 0.9394
Weighted ROC: 0.593726129929


In [27]:
print('Classification report: {}'.format(classification_report(y_te, predvecfm)))

Classification report:              precision    recall  f1-score   support

        0.0       0.94      1.00      0.97      9294
        1.0       0.79      0.19      0.31       706

avg / total       0.93      0.94      0.92     10000



In [28]:
print("Confusion matrix")
print(pd.crosstab(np.array(y_te), predvecfm, rownames=['actuals'], colnames=['predictions']))

Confusion matrix
predictions     0    1
actuals               
0.0          9259   35
1.0           571  135
