In [1]:
# Train and deploy an XGBoost model on the bank direct-marketing dataset with Amazon SageMaker

## 1. First, download and extract the dataset:

In [7]:
%%sh
# Fetch the direct-marketing sample archive. -N skips the download when the
# local copy is already up to date; -nv replaces the progress dump with a
# one-line summary.
wget -N -nv https://sagemaker-sample-data-us-west-2.s3-us-west-2.amazonaws.com/autopilot/direct_marketing/bank-additional.zip
# -o overwrites files from a previous extraction without prompting (a notebook
# cell cannot answer the prompt), so the cell can be re-run; -q hides the
# per-file listing.
unzip -o -q bank-additional.zip

Archive:  bank-additional.zip
   creating: bank-additional/
  inflating: bank-additional/bank-additional-names.txt  
  inflating: bank-additional/bank-additional.csv  
  inflating: bank-additional/bank-additional-full.csv  


--2021-04-18 08:52:53--  https://sagemaker-sample-data-us-west-2.s3-us-west-2.amazonaws.com/autopilot/direct_marketing/bank-additional.zip
Resolving sagemaker-sample-data-us-west-2.s3-us-west-2.amazonaws.com (sagemaker-sample-data-us-west-2.s3-us-west-2.amazonaws.com)... 52.218.244.185
Connecting to sagemaker-sample-data-us-west-2.s3-us-west-2.amazonaws.com (sagemaker-sample-data-us-west-2.s3-us-west-2.amazonaws.com)|52.218.244.185|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 432828 (423K) [application/zip]
Saving to: ‘bank-additional.zip’

     0K .......... .......... .......... .......... .......... 11%  138M 0s
    50K .......... .......... .......... .......... .......... 23% 33.0M 0s
   100K .......... .......... .......... .......... .......... 35% 43.2M 0s
   150K .......... .......... .......... .......... .......... 47% 28.4M 0s
   200K .......... .......... .......... .......... .......... 59% 41.9M 0s
   250K .......... .......... .......... ...

## 2. Import the SageMaker SDK and define an S3 prefix for the job:


In [8]:
import sagemaker

# S3 key prefix shared by everything this notebook stores.
prefix = 'xgboost-direct-marketing'

# Session reused by the later cells, and the default bucket tied to it.
sess = sagemaker.Session()
bucket = sess.default_bucket()

In [9]:
# Display the name of the default bucket obtained from the session.
bucket

'sagemaker-us-west-2-076084266064'

## 3. Load data and preprocess

In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

data = pd.read_csv('./bank-additional/bank-additional-full.csv')

# One-hot encode the categorical columns. dtype=int forces 0/1 indicator
# columns: pandas 2.x defaults to booleans, which would be written to the CSV
# files (and to the inference payload) as True/False.
data = pd.get_dummies(data, dtype=int)

# y_no is the complement of y_yes, so only y_yes is kept as the label...
data = data.drop(['y_no'], axis=1)
# ...and moved to the first column.
data = pd.concat([data['y_yes'], data.drop(['y_yes'], axis=1)], axis=1)

# Shuffle, then hold out 5% for validation. Both steps are seeded so that a
# Restart & Run All produces the same training/validation files every time.
data = data.sample(frac=1, random_state=123)
train_data, val_data = train_test_split(data, test_size=0.05, random_state=123)

# The header row is written on purpose: the column names travel with the data.
train_data.to_csv('training.csv', index=False, header=True)
val_data.to_csv('validation.csv', index=False, header=True)

## 4. Upload the training and validation files to S3

In [12]:
# S3 location later passed to the estimator as output_path.
output = 's3://{}/{}/output/'.format(bucket, prefix)

# Upload each CSV under its own key prefix and keep the returned values for
# the fit() call.
training = sess.upload_data(
    path='training.csv',
    key_prefix=prefix + "/training",
)
validation = sess.upload_data(
    path="validation.csv",
    key_prefix=prefix + "/validation",
)

In [13]:
# Display the S3 output location built from the bucket and prefix.
output

's3://sagemaker-us-west-2-076084266064/xgboost-direct-marketing/output/'

## 5. Define estimator

In [14]:
from sagemaker.xgboost import XGBoost

# Source of the entry-point script used below:
# https://github.com/PacktPublishing/Learn-Amazon-SageMaker/blob/master/sdkv2/ch7/xgb/xgb-dm.py
xgb_estimator = XGBoost(
    entry_point='xgb-dm.py',
    framework_version='1.0-1',
    py_version='py3',
    role=sagemaker.get_execution_role(),
    instance_count=1,
    instance_type='ml.m5.large',
    output_path=output,
    hyperparameters={'max-depth': 5, 'eval-metric': 'auc'},
)

Couldn't call 'get_role' to get Role ARN from role name AmazonSageMaker-ExecutionRole-20210403T113990 to get Role path.
Assuming role was created in SageMaker AWS console, as the name contains `AmazonSageMaker-ExecutionRole`. Defaulting to Role ARN with service-role in path. If this Role ARN is incorrect, please add IAM read permissions to your role or supply the Role Arn directly.


In [15]:
# Display the execution role ARN (the same call supplies `role` to the estimator).
sagemaker.get_execution_role()

Couldn't call 'get_role' to get Role ARN from role name AmazonSageMaker-ExecutionRole-20210403T113990 to get Role path.
Assuming role was created in SageMaker AWS console, as the name contains `AmazonSageMaker-ExecutionRole`. Defaulting to Role ARN with service-role in path. If this Role ARN is incorrect, please add IAM read permissions to your role or supply the Role Arn directly.


'arn:aws:iam::076084266064:role/service-role/AmazonSageMaker-ExecutionRole-20210403T113990'

## 6. Train the model

In [17]:
# Fit the estimator, passing the uploaded files as the 'training' and 'validation' inputs.
xgb_estimator.fit({'training':training, 'validation':validation})

2021-04-18 09:32:24 Starting - Starting the training job...
2021-04-18 09:32:49 Starting - Launching requested ML instancesProfilerReport-1618738344: InProgress
......
2021-04-18 09:33:49 Starting - Preparing the instances for training......
2021-04-18 09:34:49 Downloading - Downloading input data...
2021-04-18 09:35:09 Training - Downloading the training image...
2021-04-18 09:35:52 Uploading - Uploading generated training model[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Invoking user training script.[0m
[34mINFO:sagemaker-containers:Module xgb-dm does not provide a setup.py. [0m
[34mGenerating setup.py[0m
[34mINFO:sagemaker-containers:Generating setup.cfg[0m
[34mINFO:sagemaker-containers:Generating MANIFEST.in[0m
[34mINFO:sagemaker-containers:Installing module with the following command:[0m
[34m/min

## 7. Deploy the model 

In [19]:
import time

# Endpoint name: the job prefix followed by the current UTC timestamp.
xgb_endpoint_name = prefix + time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())

xgb_predictor = xgb_estimator.deploy(
    endpoint_name=xgb_endpoint_name,
    initial_instance_count=1,
    instance_type='ml.t2.medium',
)



-----------------!

## 8. Get predictions for a few random records from the validation set

In [20]:
# CSV text (no header, no index, trailing newline stripped) for the first ten
# validation rows with the label column removed.
payload = (
    val_data[:10]
    .drop(['y_yes'], axis=1)
    .to_csv(header=False, index=False)
    .rstrip('\n')
)
payload

'35,103,2,999,0,1.4,93.918,-42.7,4.963,5228.1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0\n44,41,3,999,0,1.4,93.918,-42.7,4.962,5228.1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0\n26,203,4,999,0,1.4,94.465,-41.8,4.959,5228.1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0\n26,305,1,999,1,-1.1,94.199,-37.5,0.879,4963.6,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0\n30,63,1,999,0,1.4,93.918,-42.7,4.962,5228.1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0\n29,77,1,999,0,1.4,93.444,-36.1,4.964,5228.1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0\n40,160,3,999,0,1.1,93.994,-36.4,4.857,5191.0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,

In [25]:
# Use CSV for both the request body and the parsed response.
xgb_predictor.deserializer = sagemaker.deserializers.CSVDeserializer()
xgb_predictor.serializer = sagemaker.serializers.CSVSerializer()

response = xgb_predictor.predict(payload)
print(response)

# NOTE(review): nothing in this notebook removes the endpoint created above;
# presumably it should be deleted once testing is done - confirm and add a
# cleanup cell.

[['0.00065220083'], ['2.580175e-05'], ['0.0018599931'], ['0.6353439'], ['0.000257116'], ['0.00063935167'], ['0.00024279703'], ['0.5183001'], ['0.0012866908'], ['0.00034604044']]
