# Starbucks Capstone Challenge

## XGBoost Classifier (Hyperparameter Tuning)


In [1]:
import pandas as pd
import boto3
import sagemaker
import os

### Upload the data to S3

In [2]:
# Local folder holding the prepared CSV splits, and the S3 key prefix they are uploaded under.
data_dir = 'ML_data'
prefix = 'sagemaker/ML-tuning-HL'

session = sagemaker.Session()

# Upload the training and validation splits; the returned locations are
# used later as the input channels for training.
train_location = session.upload_data(os.path.join(data_dir, 'train.csv'), key_prefix=prefix)
val_location = session.upload_data(os.path.join(data_dir, 'val.csv'), key_prefix=prefix)

### Import and upload the test data

In [3]:
# Read the test split without treating the first row as a header.
test_csv_path = os.path.join(data_dir, 'test.csv')
test_data = pd.read_csv(test_csv_path, header=None)

In [4]:
# Column 0 is used as the label; every remaining column is a feature.
test_y = test_data.iloc[:, 0]
test_x = test_data.iloc[:, 1:]

In [5]:
# Save the label-free features (no header, no index) and upload that file
# so it can be fed to batch transform.
test_x_path = os.path.join(data_dir, 'test_x.csv')
test_x.to_csv(test_x_path, header=False, index=False)
test_location = session.upload_data(test_x_path, key_prefix=prefix)

### Build and Train XGBoost Classifier

In [6]:
from sagemaker import get_execution_role
# Execution role handed to the estimator below.
role = get_execution_role()

In [7]:
from sagemaker.amazon.amazon_estimator import get_image_uri
# Look up the XGBoost image for the session's region.
# NOTE(review): no image version is passed here; the cell output below shows the
# form get_image_uri(region, 'xgboost', '0.90-1') -- confirm which version is intended.
container = get_image_uri(session.boto_region_name, 'xgboost')

	get_image_uri(region, 'xgboost', '0.90-1').


In [8]:
# Training output goes under the session's default bucket, below the shared prefix.
output_location = 's3://{}/{}/output'.format(session.default_bucket(), prefix)

# Estimator that trains on a single ml.m4.xlarge instance using the container and role set up above.
xgb = sagemaker.estimator.Estimator(
    container,
    role,
    train_instance_count=1,
    train_instance_type='ml.m4.xlarge',
    output_path=output_location,
    sagemaker_session=session,
)

# Baseline hyperparameters for a binary classifier. Some of these
# (max_depth, eta, gamma, min_child_weight, subsample) are also searched
# over by the tuning cell further down.
base_hyperparameters = {
    'max_depth': 5,
    'eta': 0.2,
    'gamma': 4,
    'min_child_weight': 6,
    'subsample': 0.8,
    'silent': 0,
    'objective': 'binary:logistic',
    'early_stopping_rounds': 10,
    'num_round': 500,
}
xgb.set_hyperparameters(**base_hyperparameters)

### Hyperparameter Tuning

In [9]:
from sagemaker.tuner import IntegerParameter, ContinuousParameter, HyperparameterTuner

# Search space explored by the tuner, one range per hyperparameter.
tuning_ranges = {
    'max_depth': IntegerParameter(3, 12),
    'eta': ContinuousParameter(0.05, 0.5),
    'min_child_weight': IntegerParameter(2, 8),
    'subsample': ContinuousParameter(0.5, 0.9),
    'gamma': ContinuousParameter(0, 10),
}

# Run up to 20 training jobs, at most 3 at a time, minimizing validation RMSE.
# NOTE(review): the estimator is configured with objective='binary:logistic';
# confirm RMSE (rather than a classification metric) is the intended tuning objective.
xgb_hyperparameter_tuner = HyperparameterTuner(
    estimator=xgb,
    objective_metric_name='validation:rmse',
    objective_type='Minimize',
    max_jobs=20,
    max_parallel_jobs=3,
    hyperparameter_ranges=tuning_ranges,
)

In [10]:
# Describe the uploaded train/validation CSV files as input channels for training.
CSV_CONTENT_TYPE = 'csv'
s3_input_train = sagemaker.s3_input(s3_data=train_location, content_type=CSV_CONTENT_TYPE)
s3_input_validation = sagemaker.s3_input(s3_data=val_location, content_type=CSV_CONTENT_TYPE)

In [11]:
# Start the tuning job, passing the 'train' and 'validation' input channels.
xgb_hyperparameter_tuner.fit({'train': s3_input_train, 'validation': s3_input_validation})

In [None]:
# Block this cell until the tuning job has finished.
xgb_hyperparameter_tuner.wait()

........................................................

In [None]:
# Display the tuner's best training job (the same value is passed to Estimator.attach below).
xgb_hyperparameter_tuner.best_training_job()

### Test the model

In [22]:
# Build an estimator object from the best training job so it can be used for inference.
xgb_attached = sagemaker.estimator.Estimator.attach(xgb_hyperparameter_tuner.best_training_job())

2020-06-06 23:52:56 Starting - Preparing the instances for training
2020-06-06 23:52:56 Downloading - Downloading input data
2020-06-06 23:52:56 Training - Training image download completed. Training in progress.
2020-06-06 23:52:56 Uploading - Uploading generated training model
2020-06-06 23:52:56 Completed - Training job completed[34mArguments: train[0m
[34m[2020-06-06:23:52:29:INFO] Running standalone xgboost training.[0m
[34m[2020-06-06:23:52:29:INFO] Setting up HPO optimized metric to be : rmse[0m
[34m[2020-06-06:23:52:29:INFO] File size need to be processed in the node: 10.16mb. Available memory size in the node: 8468.4mb[0m
[34m[2020-06-06:23:52:29:INFO] Determined delimiter of CSV input is ','[0m
[34m[23:52:29] S3DistributionType set as FullyReplicated[0m
[34m[23:52:29] 42641x18 matrix with 767538 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2020-06-06:23:52:29:INFO] Determined delimiter of CSV input is ','[0m
[34m[

In [23]:
# Create a transformer from the attached estimator, running on one ml.m4.xlarge instance.
xgb_transformer = xgb_attached.transformer(instance_count = 1, instance_type = 'ml.m4.xlarge')



In [24]:
# Run the transform over the uploaded test features (CSV input, split by line).
xgb_transformer.transform(test_location, content_type='text/csv', split_type='Line')

In [25]:
# Block this cell until the transform job has finished.
xgb_transformer.wait()

......................[34mArguments: serve[0m
[34m[2020-06-07 00:49:47 +0000] [1] [INFO] Starting gunicorn 19.7.1[0m
[34m[2020-06-07 00:49:47 +0000] [1] [INFO] Listening at: http://0.0.0.0:8080 (1)[0m
[34m[2020-06-07 00:49:47 +0000] [1] [INFO] Using worker: gevent[0m
[34m[2020-06-07 00:49:47 +0000] [38] [INFO] Booting worker with pid: 38[0m
[34m[2020-06-07 00:49:47 +0000] [39] [INFO] Booting worker with pid: 39[0m
[34m[2020-06-07 00:49:48 +0000] [40] [INFO] Booting worker with pid: 40[0m
[34m[2020-06-07:00:49:48:INFO] Model loaded successfully for worker : 38[0m
[34m[2020-06-07:00:49:48:INFO] Model loaded successfully for worker : 39[0m
[34m[2020-06-07 00:49:48 +0000] [41] [INFO] Booting worker with pid: 41[0m
[34m[2020-06-07:00:49:48:INFO] Model loaded successfully for worker : 40[0m
[34m[2020-06-07:00:49:48:INFO] Model loaded successfully for worker : 41[0m

[34m[2020-06-07:00:50:19:INFO] Sniff delimiter as ','[0m
[34m[2020-06-07:00:50:19:INFO] Determined d

In [26]:
!aws s3 cp --recursive $xgb_transformer.output_path $data_dir

Completed 196.4 KiB/196.4 KiB (1.8 MiB/s) with 1 file(s) remainingdownload: s3://sagemaker-ap-northeast-1-105243015009/xgboost-200606-2327-020-7662f1d1-2020-06-07-00-46-19-444/test_x.csv.out to ML_data/test_x.csv.out


In [27]:
# Load the raw transform output (no header row).
raw_output = pd.read_csv(os.path.join(data_dir, 'test_x.csv.out'), header=None)

# squeeze() drops the length-1 axis so the scores form a flat sequence;
# each score is then rounded to the nearest whole number to get a class label.
scores = raw_output.squeeze().values
predictions = [round(score) for score in scores]

In [30]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Compare the rounded predictions with the held-out test labels.
accuracy = accuracy_score(test_y, predictions)
f1 = f1_score(test_y, predictions)
precision = precision_score(test_y, predictions, average='binary')
recall = recall_score(test_y, predictions, average='binary')

# Print each metric on its own line, preceded by a blank line.
metric_report = (
    ('Accuracy Score', accuracy),
    ('F1 Score', f1),
    ('Precision', precision),
    ('Recall', recall),
)
for label, value in metric_report:
    print('\n' + label + ':', value)


Accuracy Score: 0.8511931562359297

F1 Score: 0.8813498474241608

Precision: 0.8718039772727273

Recall: 0.8911070780399274
