# Starbucks Capstone Challenge

## XGBoost Classifier 

In [1]:
import pandas as pd
import boto3
import sagemaker
import os

### Upload data to S3

In [2]:
# Local folder holding the prepared CSV splits, and the S3 key prefix they
# are uploaded under.
data_dir = 'ML_data'
prefix = 'sagemaker/ML'

session = sagemaker.Session()

# Upload the validation and training splits (in that order); the returned
# locations are reused later as the training job's input channels.
val_location, train_location = (
    session.upload_data(os.path.join(data_dir, csv_name), key_prefix=prefix)
    for csv_name in ('val.csv', 'train.csv')
)

### Import and upload the test data

In [3]:
# test.csv is read without a header row, so columns are addressed by position.
test_csv_path = os.path.join(data_dir, 'test.csv')
test_data = pd.read_csv(test_csv_path, header=None)

In [4]:
# Column 0 is kept aside as the labels; every remaining column is a feature.
test_y = test_data.iloc[:, 0]
test_x = test_data.iloc[:, 1:]

In [5]:
# Persist the label-free features (no header, no index column) and upload that
# file; its S3 location is the input of the batch transform job.
test_x_path = os.path.join(data_dir, 'test_x.csv')
test_x.to_csv(test_x_path, header=False, index=False)
test_location = session.upload_data(test_x_path, key_prefix=prefix)

### Build and Train XGBoost Classifier

In [6]:
# Look up the SageMaker execution role; it is handed to the Estimator so the
# training job runs under it.
role = sagemaker.get_execution_role()

In [7]:
from sagemaker.amazon.amazon_estimator import get_image_uri

# Resolve the XGBoost training image for the session's region. No repo_version
# is passed, so whatever version the SDK selects by default is used.
region = session.boto_region_name
container = get_image_uri(region, 'xgboost')

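WARNING: There is a more up to date SageMaker XGBoost image. To use the newer image, please set 'repo_version'='0.90-1'. For example:
	get_image_uri(region, 'xgboost', '0.90-1').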


In [8]:
# S3 location under the session's default bucket where the training job
# writes its output.
output_location = 's3://{}/{}/output'.format(session.default_bucket(), prefix)

# Infrastructure settings for the training job: a single ml.m4.xlarge instance.
estimator_settings = dict(
    train_instance_count=1,
    train_instance_type='ml.m4.xlarge',
    output_path=output_location,
    sagemaker_session=session,
)
xgb = sagemaker.estimator.Estimator(container, role, **estimator_settings)

# Hyperparameters forwarded to the XGBoost container. num_round caps training
# at 500 boosting rounds; early_stopping_rounds=10 is expected to end it
# sooner once the validation metric stops improving.
hyperparameters = {
    'max_depth': 5,
    'eta': 0.2,
    'gamma': 4,
    'min_child_weight': 6,
    'subsample': 0.8,
    'silent': 0,
    'objective': 'binary:logistic',
    'early_stopping_rounds': 10,
    'num_round': 500,
}
xgb.set_hyperparameters(**hyperparameters)

In [9]:
def _csv_channel(s3_uri):
    """Wrap an S3 location as a CSV-typed input for the training job."""
    return sagemaker.s3_input(s3_data=s3_uri, content_type='csv')


s3_input_train = _csv_channel(train_location)
s3_input_validation = _csv_channel(val_location)

In [10]:
# Launch the training job with the two named input channels.
training_channels = {
    'train': s3_input_train,
    'validation': s3_input_validation,
}
xgb.fit(training_channels)

2020-06-14 07:08:42 Starting - Starting the training job...
2020-06-14 07:08:45 Starting - Launching requested ML instances......
2020-06-14 07:09:56 Starting - Preparing the instances for training......
2020-06-14 07:11:11 Downloading - Downloading input data
2020-06-14 07:11:11 Training - Downloading the training image...
2020-06-14 07:11:43 Uploading - Uploading generated training model
2020-06-14 07:11:43 Completed - Training job completed
[34mArguments: train[0m
[34m[2020-06-14:07:11:31:INFO] Running standalone xgboost training.[0m
[34m[2020-06-14:07:11:31:INFO] File size need to be processed in the node: 10.16mb. Available memory size in the node: 8474.45mb[0m
[34m[2020-06-14:07:11:31:INFO] Determined delimiter of CSV input is ','[0m
[34m[07:11:31] S3DistributionType set as FullyReplicated[0m
[34m[07:11:31] 42641x18 matrix with 767538 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2020-06-14:07:11:31:INFO] Determined delimi

Training seconds: 53
Billable seconds: 53


### Test the model

In [11]:
# Create a batch transformer from the trained estimator, sized like the
# training job (one ml.m4.xlarge instance).
xgb_transformer = xgb.transformer(
    instance_count=1,
    instance_type='ml.m4.xlarge',
)

In [12]:
# Start the batch transform on the uploaded, label-free test features. The
# input is declared as CSV and split per line.
xgb_transformer.transform(
    test_location,
    content_type='text/csv',
    split_type='Line',
)

In [13]:
# Block until the transform job started in the previous cell has finished;
# the job's log is streamed as this cell's output.
xgb_transformer.wait()

......................[34mArguments: serve[0m
[34m[2020-06-14 07:15:21 +0000] [1] [INFO] Starting gunicorn 19.7.1[0m
[34m[2020-06-14 07:15:21 +0000] [1] [INFO] Listening at: http://0.0.0.0:8080 (1)[0m
[34m[2020-06-14 07:15:21 +0000] [1] [INFO] Using worker: gevent[0m
[34m[2020-06-14 07:15:21 +0000] [38] [INFO] Booting worker with pid: 38[0m
[34m[2020-06-14 07:15:21 +0000] [39] [INFO] Booting worker with pid: 39[0m
[34m[2020-06-14:07:15:21:INFO] Model loaded successfully for worker : 38[0m
[34m[2020-06-14 07:15:21 +0000] [40] [INFO] Booting worker with pid: 40[0m
[34m[2020-06-14:07:15:21:INFO] Model loaded successfully for worker : 39[0m
[34m[2020-06-14 07:15:21 +0000] [41] [INFO] Booting worker with pid: 41[0m
[34m[2020-06-14:07:15:21:INFO] Model loaded successfully for worker : 40[0m
[34m[2020-06-14:07:15:21:INFO] Model loaded successfully for worker : 41[0m

[32m2020-06-14T07:15:51.784:[sagemaker logs]: MaxConcurrentTransforms=4, MaxPayloadInMB=6, BatchStrate

In [14]:
!aws s3 cp --recursive $xgb_transformer.output_path $data_dir

Completed 193.7 KiB/193.7 KiB (1.8 MiB/s) with 1 file(s) remaining
download: s3://sagemaker-ap-northeast-1-105243015009/xgboost-2020-06-14-07-11-54-785/test_x.csv.out to ML_data/test_x.csv.out


In [15]:
# Load the batch-transform output: a headerless file whose first column holds
# one model score per test row.
predictions_raw = pd.read_csv(os.path.join(data_dir, 'test_x.csv.out'), header=None)

# Convert scores to 0/1 labels with an explicit cut-off. The comparison is
# strict so that a score of exactly 0.5 maps to 0, which is what the earlier
# round()-based conversion produced. Selecting the column with iloc (rather
# than squeeze()) also works when the output contains a single row, where
# squeeze() would collapse to a scalar that has no `.values`.
DECISION_THRESHOLD = 0.5
predictions = (predictions_raw.iloc[:, 0] > DECISION_THRESHOLD).astype(int).tolist()

In [16]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Score the predicted labels against the held-out test labels.
accuracy = accuracy_score(test_y, predictions)
f1 = f1_score(test_y, predictions)
precision = precision_score(test_y, predictions, average='binary')
recall = recall_score(test_y, predictions, average='binary')

# Report each metric on its own line, preceded by a blank line.
metric_report = (
    ('\nAccuracy Score:', accuracy),
    ('\nF1 Score:', f1),
    ('\nPrecision:', precision),
    ('\nRecall:', recall),
)
for heading, score in metric_report:
    print(heading, score)


Accuracy Score: 0.7291760468257542

F1 Score: 0.7935236569597803

Precision: 0.7526589971782071

Recall: 0.8390804597701149
