### Imports

In [None]:
# Boto is the Amazon Web Services (AWS) SDK for Python. 
# It enables Python developers to create, configure, and manage AWS services, such as EC2 and S3. 
# https://boto3.amazonaws.com/v1/documentation/api/latest/index.html
import boto3

In [1]:
# Amazon SageMaker Python SDK is an open source library for training & deploying ML models on Amazon SageMaker.
# https://sagemaker.readthedocs.io/en/stable/#
import sagemaker
from sagemaker import get_execution_role
from sagemaker.predictor import csv_serializer   

In [3]:
# import non-aws libraries
import re, sys, math, json, os,  urllib.request
import numpy as np                                
import pandas as pd                               
import matplotlib.pyplot as plt     
from time import gmtime, strftime     

In [4]:
# these two are unique to using jupyter nb
from IPython.display import Image                 
from IPython.display import display               

### Set-up

In [5]:
# To let the notebook instance access and securely upload data to Amazon S3, an IAM role must be specified.
# (You defined this role, with the proper permissions, when you created the SageMaker notebook instance.)
# https://sagemaker.readthedocs.io/en/stable/session.html#sagemaker.session.get_execution_role
role = get_execution_role()

In [6]:
# Settings reused by later cells.
# `prefix` is the key prefix under which this notebook stores its objects in the S3 bucket.
prefix = 'sagemaker/DEMO-xgboost-dm'

# Each region has its own XGBoost container, hosted in a region-specific ECR registry.
# Expand the (region -> registry account id) pairs into the full image URIs.
_xgboost_registry_accounts = {'us-west-2': '433757028032',
                              'us-east-1': '811284229777',
                              'us-east-2': '825641698319',
                              'eu-west-1': '685385470294'}
containers = {region: f'{account}.dkr.ecr.{region}.amazonaws.com/xgboost:latest'
              for region, account in _xgboost_registry_accounts.items()}

In [9]:
# Look up the AWS region configured for the default boto3 session and show it.
default_session = boto3.session.Session()
my_region = default_session.region_name
print(my_region)

us-east-1


In [13]:
# Report the detected region and the XGBoost container image that will be used for it.
region_message = f"Success - the MySageMakerInstance is in the {my_region} region."
container_message = f" You will use the {containers[my_region]} container for your SageMaker endpoint."
print(region_message, "\n", container_message)

Success - the MySageMakerInstance is in the us-east-1 region. 
  You will use the 811284229777.dkr.ecr.us-east-1.amazonaws.com/xgboost:latest container for your SageMaker endpoint.


### Create an S3 bucket that will store your data for this tutorial

In [14]:
# Name of the S3 bucket used throughout this notebook.
# S3 bucket names must be globally unique, so change this value before running.
bucket_name = 'atte-bucket' # <--- CHANGE THIS VARIABLE TO A UNIQUE NAME FOR YOUR BUCKET

In [15]:
# Create an S3 resource object (a resource service client created by name, using the default session).
# It is used below to create the bucket.
# https://boto3.amazonaws.com/v1/documentation/api/latest/_modules/boto3.html#resource
s3 = boto3.resource('s3')

In [16]:
# Create the new bucket. By default, a bucket is created in the US East (N. Virginia) Region,
# so a LocationConstraint is only added when the notebook runs in any other region.
# https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html?highlight=create_bucket#S3.Client.create_bucket
try:
    create_bucket_kwargs = {'Bucket': bucket_name}
    if my_region != 'us-east-1':
        create_bucket_kwargs['CreateBucketConfiguration'] = {'LocationConstraint': my_region}
    s3.create_bucket(**create_bucket_kwargs)
    print('S3 bucket created successfully')
except Exception as e:
    # Report the failure and keep going, so the rest of the notebook stays runnable.
    print('S3 error: ',e)

S3 bucket created successfully


### Load and prepare the data

In [17]:
# Download the cleaned bank-marketing data set to the notebook instance as bank_clean.csv.
data_url = "https://d1.awsstatic.com/tmt/build-train-deploy-machine-learning-model-sagemaker/bank_clean.27f01fbbdf43271788427f3682996ae29ceca05d.csv"
local_csv_name = "bank_clean.csv"
try:
    urllib.request.urlretrieve(data_url, local_csv_name)
    print('Success: downloaded bank_clean.csv.')
except Exception as e:
    # Report the failure instead of raising.
    print('Data load error: ',e)

Success: downloaded bank_clean.csv.


In [18]:
# Read the downloaded CSV into a DataFrame; the file's first column is used as the row index.
csv_path = './bank_clean.csv'
try:
    model_data = pd.read_csv(csv_path, index_col=0)
except Exception as e:
    # Report the failure instead of raising; model_data stays undefined in that case.
    print('Data load error: ',e)
else:
    print('Success: Data loaded into dataframe.')

Success: Data loaded into dataframe.


In [23]:
# Check out the data: print its (rows, columns) shape, then display three randomly sampled rows.
# The sample is not seeded, so different rows are shown on every run.
print(model_data.shape)
model_data.sample(3)

(41188, 61)


Unnamed: 0,age,campaign,pdays,previous,no_previous_contact,not_working,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,...,day_of_week_fri,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_failure,poutcome_nonexistent,poutcome_success,y_no,y_yes
18259,30,3,999,0,1,0,1,0,0,0,...,0,0,0,0,1,0,1,0,1,0
34426,54,1,999,0,1,0,0,1,0,0,...,0,0,1,0,0,0,1,0,1,0
33492,45,2,999,1,1,0,0,1,0,0,...,0,0,0,1,0,1,0,0,1,0


In [29]:
# Shuffle the rows (fixed seed, so the split is reproducible) and cut them 70% / 30%
# into training data and test data.
# Positional .iloc slices replace np.split(DataFrame, ...): np.split goes through
# DataFrame.swapaxes, which pandas has deprecated. The resulting rows are the same.
shuffled_data = model_data.sample(frac=1, random_state=1729)
split_at = int(0.7 * len(shuffled_data))
train_data, test_data = shuffled_data.iloc[:split_at], shuffled_data.iloc[split_at:]
print(train_data.shape, test_data.shape)

(28831, 61) (12357, 61)


### Train the model from the data

We will use gradient-based optimization to iteratively refine the model parameters. Gradient-based optimization is a way to find model parameter values that minimize the model error, using the gradient of the model loss function.

To use an Amazon SageMaker pre-built XGBoost model, you will need to reformat the training data — the CSV must have no header row and must hold the target variable in its first column — and upload it to the S3 bucket, from which the training job loads it.

In [43]:
# list the column names of the training data (the two target columns, "y_no" and "y_yes", come last).
train_data.columns

Index(['age', 'campaign', 'pdays', 'previous', 'no_previous_contact',
       'not_working', 'job_admin.', 'job_blue-collar', 'job_entrepreneur',
       'job_housemaid', 'job_management', 'job_retired', 'job_self-employed',
       'job_services', 'job_student', 'job_technician', 'job_unemployed',
       'job_unknown', 'marital_divorced', 'marital_married', 'marital_single',
       'marital_unknown', 'education_basic.4y', 'education_basic.6y',
       'education_basic.9y', 'education_high.school', 'education_illiterate',
       'education_professional.course', 'education_university.degree',
       'education_unknown', 'default_no', 'default_unknown', 'default_yes',
       'housing_no', 'housing_unknown', 'housing_yes', 'loan_no',
       'loan_unknown', 'loan_yes', 'contact_cellular', 'contact_telephone',
       'month_apr', 'month_aug', 'month_dec', 'month_jul', 'month_jun',
       'month_mar', 'month_may', 'month_nov', 'month_oct', 'month_sep',
       'day_of_week_fri', 'day_of_week_mon'

In [33]:
# Build the training frame in the column layout the XGBoost container reads.
# The container takes column 0 of the CSV as the label (the training log reports label_column=0),
# so the target "y_yes" must come FIRST, followed by the features.
# "y_no" is dropped because it is just the inverse of "y_yes".
# Dropping only "y_no" would leave "y_yes" last and make "age" the label.
# The name `tempfile` is kept because the next cell writes this frame to train.csv.
tempfile = pd.concat([train_data['y_yes'], train_data.drop(['y_no', 'y_yes'], axis=1)], axis=1)
tempfile.shape

(28831, 60)

In [25]:
# Write the training frame to a csv file named 'train.csv' in the current working directory.
# index=False and header=False keep the row index and the column names out of the file,
# so it contains data values only.
tempfile.to_csv('train.csv', index=False, header=False)

In [31]:
# Show the current working directory and its contents, to confirm that train.csv
# has been written to the notebook instance's disk.
! pwd
! ls

/home/ec2-user/SageMaker
bank_clean.csv	nlp-imdb-rnn.ipynb  xgb-bankdata-tutorial.ipynb
lost+found	train.csv	    xgboost-tutorial.ipynb


In [32]:
# Upload train.csv from the notebook instance to the S3 bucket.
# A session manages state about a particular configuration.
# https://boto3.amazonaws.com/v1/documentation/api/latest/guide/session.html?highlight=session
# S3 object keys always use '/' as the separator, so the key is built with an f-string
# (like the other S3 paths in this notebook). os.path.join is meant for local filesystem
# paths and uses the host OS separator.
train_key = f'{prefix}/train/train.csv'
boto3.Session().resource('s3').Bucket(bucket_name).Object(train_key).upload_file('train.csv')

In [44]:
# Describe where the training data lives in S3 and that it is in CSV format.
# https://sagemaker.readthedocs.io/en/stable/inputs.html#sagemaker.inputs.s3_input.config
train_s3_uri = f's3://{bucket_name}/{prefix}/train'
s3_input_train = sagemaker.s3_input(s3_data=train_s3_uri, content_type='csv')

Next, you need to set up the Amazon SageMaker session, create an instance of the XGBoost model (an estimator), and define the model’s hyperparameters.

In [50]:
# Create the SageMaker session; a session manages state about a particular configuration.
# It is passed to the estimator below.
sess = sagemaker.Session()

In [51]:
# Build the XGBoost model object (an "estimator") from the settings defined earlier.
# The generic Estimator class is designed for algorithms that don't have their own, custom class.
# Remember: 'xgboost:latest' was selected when `containers` was defined.
# https://sagemaker.readthedocs.io/en/stable/estimators.html#sagemaker.estimator.Estimator
training_image = containers[my_region]
model_output_uri = f's3://{bucket_name}/{prefix}/output'
xgb = sagemaker.estimator.Estimator(
    training_image,
    role,
    train_instance_count=1,
    train_instance_type='ml.m4.xlarge',
    output_path=model_output_uri,
    sagemaker_session=sess,
)

In [53]:
# Hyperparameters for the training job, collected in one place so they are easy to tune.
# The 'binary:logistic' objective is used because this is a binary classification task.
xgb_hyperparameters = {
    'max_depth': 5,
    'eta': 0.2,
    'gamma': 4,
    'min_child_weight': 6,
    'subsample': 0.8,
    'silent': 0,
    'objective': 'binary:logistic',
    'num_round': 100,
}
xgb.set_hyperparameters(**xgb_hyperparameters)

In [54]:
# Train the model on the ml.m4.xlarge instance configured on the estimator.
# The 'train' channel points at the training data location in S3 defined earlier.
training_channels = {'train': s3_input_train}
xgb.fit(training_channels)

2020-03-20 19:48:14 Starting - Starting the training job...
2020-03-20 19:48:15 Starting - Launching requested ML instances......
2020-03-20 19:49:19 Starting - Preparing the instances for training......
2020-03-20 19:50:17 Downloading - Downloading input data...
2020-03-20 19:51:12 Training - Training image download completed. Training in progress..[34mArguments: train[0m
[34m[2020-03-20:19:51:13:INFO] Running standalone xgboost training.[0m
[34m[2020-03-20:19:51:13:INFO] Path /opt/ml/input/data/validation does not exist![0m
[34m[2020-03-20:19:51:13:INFO] File size need to be processed in the node: 3.38mb. Available memory size in the node: 8519.03mb[0m
[34m[2020-03-20:19:51:13:INFO] Determined delimiter of CSV input is ','[0m
[34m[19:51:13] S3DistributionType set as FullyReplicated[0m
[34m[19:51:13] 28831x59 matrix with 1701029 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[19:51:13] src/tree/updater_prune.cc:74: tree pruning


2020-03-20 19:51:24 Uploading - Uploading generated training model
2020-03-20 19:51:24 Completed - Training job completed
Training seconds: 67
Billable seconds: 67


### Deploy the model

In this step, you will deploy the trained model to an endpoint, reformat then load the CSV data, then run the model to create predictions.

In [55]:
# Deploy the trained model on a server and create an endpoint that you can access.
# This creates a SageMaker Model and EndpointConfig, and deploys an Endpoint from this Model.
# https://sagemaker.readthedocs.io/en/stable/model.html
endpoint_settings = {'initial_instance_count': 1,
                     'instance_type': 'ml.m4.xlarge'}
xgb_predictor = xgb.deploy(**endpoint_settings)

---------------!

Predict whether customers in the test data enrolled for the bank product or not.

In [59]:
# Keep only the feature columns of the test data (both target columns, "y_no" and "y_yes",
# are removed) and convert them to an array for the endpoint. Show the first row as a check.
test_features = test_data.drop(['y_no', 'y_yes'], axis=1)
test_data_array = test_features.values
test_data_array[0]

array([ 29,   2, 999,   0,   1,   0,   0,   1,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   1,   0,   0,   0,   0,   1,   0,
         0,   0,   0,   0,   1,   0,   0,   1,   0,   0,   1,   0,   0,
         0,   1,   0,   0,   0,   0,   1,   0,   0,   0,   0,   0,   0,
         0,   0,   1,   0,   0,   1,   0])

In [60]:
# Set the data type for an inference request.
# The content_type attribute defines the endpoint request content type; the test rows are sent as CSV text.
xgb_predictor.content_type = 'text/csv'

In [61]:
# Set the serializer used for requests. A serializer accepts a single argument, the input data,
# and returns a sequence of bytes.
# It may provide a content_type attribute that defines the endpoint request content type.
# If no serializer is specified, a sequence of bytes is expected for the data.
xgb_predictor.serializer = csv_serializer

In [62]:
# Predict! Send the test rows to the endpoint, then decode the response as UTF-8 text.
# https://sagemaker.readthedocs.io/en/stable/predictors.html
raw_response = xgb_predictor.predict(test_data_array)
predictions = raw_response.decode('utf-8')

In [66]:
# The output is one string holding a series of comma-separated probabilities (csv format).
# Show only the first 50 characters rather than the whole string.
predictions[:50]

'0.0602235160768,0.0892826914787,0.0591339841485,0.'

In [70]:
# Turn the comma-separated prediction string into a NumPy array of probabilities.
# The WHOLE string is parsed: it starts directly with the first value (no leading separator).
# Slicing off its first character would silently corrupt a first value such as '1.5e-05'.
predictions_array = np.fromstring(predictions, sep=',')
predictions_array[:3]

array([0.06022352, 0.08928269, 0.05913398])

In [72]:
# Sanity check: the predictions have the same length as the original test data
# (one prediction per test row), but only one column.
print(test_data.shape)
print(predictions_array.shape)

(12357, 61)
(12357,)


### Evaluate model performance

In [73]:
# Compare actual vs. predicted values in a table called a confusion matrix.
# Rows are the observed "y_yes" labels; columns are the predicted probabilities rounded to 0 or 1.
observed_labels = test_data['y_yes']
predicted_labels = np.round(predictions_array)
cm = pd.crosstab(index=observed_labels,
                 columns=predicted_labels,
                 rownames=['Observed'],
                 colnames=['Predicted'])
cm

Predicted,0.0,1.0
Observed,Unnamed: 1_level_1,Unnamed: 2_level_1
0,10785,151
1,1143,278


In [74]:
# Name the four cells of the confusion matrix (rows = observed, columns = predicted).
tn, fp = cm.iloc[0, 0], cm.iloc[0, 1]   # observed 0: true negatives, false positives
fn, tp = cm.iloc[1, 0], cm.iloc[1, 1]   # observed 1: false negatives, true positives

# Overall classification rate: share of correct predictions, as a percentage.
correct = tp + tn
total = tp + tn + fp + fn
p = correct / total * 100

In [75]:
# Evaluation results: the overall classification rate, then each confusion-matrix cell
# shown as a percentage of its predicted column, with the raw count in parentheses.
predicted_no_total = tn + fn    # everything predicted "No Purchase"
predicted_yes_total = tp + fp   # everything predicted "Purchase"

print("\n{0:<20}{1:<4.1f}%\n".format("Overall Classification Rate: ", p))
print("{0:<15}{1:<15}{2:>8}".format("Predicted", "No Purchase", "Purchase"))
print("Observed")
print("{0:<15}{1:<2.0f}% ({2:<}){3:>6.0f}% ({4:<})".format(
    "No Purchase", tn/predicted_no_total*100, tn, fp/predicted_yes_total*100, fp))
print("{0:<16}{1:<1.0f}% ({2:<}){3:>7.0f}% ({4:<}) \n".format(
    "Purchase", fn/predicted_no_total*100, fn, tp/predicted_yes_total*100, tp))


Overall Classification Rate: 89.5%

Predicted      No Purchase    Purchase
Observed
No Purchase    90% (10785)    35% (151)
Purchase        10% (1143)     65% (278) 



Based on these results, the model correctly predicted whether a customer would enroll for a certificate of deposit for about 90% (89.5%) of the customers in the test data, with a precision of 65% (278/429) for customers predicted to enroll and 90% (10,785/11,928) for customers predicted not to enroll.

## Terminate your resources

In [76]:
# Clean up to stop incurring charges: delete the Amazon SageMaker endpoint,
# then delete every object stored in the S3 bucket.
endpoint_name = xgb_predictor.endpoint
sagemaker.Session().delete_endpoint(endpoint_name)

bucket_to_delete = boto3.resource('s3').Bucket(bucket_name)
bucket_to_delete.objects.all().delete()

[{'ResponseMetadata': {'RequestId': 'ED7B573B28502331',
   'HostId': 'PocR19c56vd/B5HIuFavgI6z2d+b+YBc4nz6fsLGD1qdyckuUCx0LDvmotk5z6DE46nYOxFPcr4=',
   'HTTPStatusCode': 200,
   'HTTPHeaders': {'x-amz-id-2': 'PocR19c56vd/B5HIuFavgI6z2d+b+YBc4nz6fsLGD1qdyckuUCx0LDvmotk5z6DE46nYOxFPcr4=',
    'x-amz-request-id': 'ED7B573B28502331',
    'date': 'Fri, 20 Mar 2020 20:46:06 GMT',
    'connection': 'close',
    'content-type': 'application/xml',
    'transfer-encoding': 'chunked',
    'server': 'AmazonS3'},
   'RetryAttempts': 0},
  'Deleted': [{'Key': 'sagemaker/DEMO-xgboost-dm/output/xgboost-2020-03-20-19-48-14-047/output/model.tar.gz'},
   {'Key': 'sagemaker/DEMO-xgboost-dm/train/train.csv'}]}]