
<h1 align="center"> Machine Learning with Amazon SageMaker </h1>


# <a name="intro"></a><font color='#347B98'> 1 - Data Preparation in SageMaker Notebook Instance</font> <font size='3'></font>


## $\Delta$ 1.1 - Set S3 Bucket and Prefix

> Note: You need to customize the prefix. Use your own name/identifier

In [None]:
# S3 bucket used by every upload and S3 path built later in this notebook
bucket = 'wcd-sagemaker-workshop'
# Key prefix under which this notebook stores its train/validation/score data
# and model output inside the bucket.
prefix = '<yourname>/bank-campaign' # !!! Please change the prefix to your own name, e.g. `david/bank-campaign`

## $\Delta$ 1.2 - Get Execution Role


In [None]:
import boto3
import re
from sagemaker import get_execution_role

# Fetch the SageMaker execution role that is later handed to the Estimator.
# This notebook needs to be executed on the SageMaker instance for this to work.
role = get_execution_role()

In [None]:
# Go take a look at your AmazonSageMaker-ExecutionRole now in IAM.
# The bare expression below shows the value of `role` as the cell output.
role

## $\Delta$ 1.3 - Download Data


In [None]:
import numpy as np                                # For matrix operations and numerical processing
import pandas as pd                               # For munging tabular data
import matplotlib.pyplot as plt                   # For charts and visualizations
from IPython.display import Image                 # For displaying images in the notebook
from IPython.display import display               # For displaying outputs in the notebook
from time import gmtime, strftime                 # For labeling SageMaker models, endpoints, etc.
import sys                                        # For writing outputs to notebook
import math                                       # For ceiling function
import json                                       # For parsing hosting outputs
import os                                         # For manipulating filepath names
import sagemaker                                  # Amazon SageMaker's Python SDK provides many helper functions
from sagemaker.predictor import csv_serializer    # Converts strings for HTTP POST requests on inference


> The code below downloads the bank dataset and unzips it into the current directory

In [None]:
!wget https://archive.ics.uci.edu/ml/machine-learning-databases/00222/bank-additional.zip
!unzip -o bank-additional.zip

## $\Delta$ 1.4 - Light ETL using Pandas


In [None]:
# Display settings: show every column, but keep long frames to one page
pd.set_option('display.max_rows', 20)
pd.set_option('display.max_columns', 500)

# The extracted file is semicolon-delimited
data = pd.read_csv('./bank-additional/bank-additional-full.csv', sep=';')
data

### Exploring Numeric and Categorical Features

In [None]:
# Frequency tables for each categorical feature
for column in data.select_dtypes(include=['object']).columns:
    display(pd.crosstab(index=data[column], columns='% observations', normalize='columns'))

# Histograms for each numeric features
display(data.describe())
%matplotlib inline
hist = data.hist(bins=30, sharey=True, figsize=(10, 10))

### Target Distribution

In [None]:
# Categorical features: share of each level within every value of the target y
for name in data.select_dtypes(include=['object']).columns:
    if name == 'y':
        continue  # the target itself is not cross-tabulated against y
    display(pd.crosstab(index=data[name], columns=data['y'], normalize='columns'))

# Non-object features: one histogram per value of y
for name in data.select_dtypes(exclude=['object']).columns:
    print(name)
    hist = data[[name, 'y']].hist(by='y', bins=30)
    plt.show()

### Correlation Matrix

In [None]:
# Correlation is only defined for numeric columns. Older pandas dropped the
# string columns silently, but pandas >= 2.0 raises on them in DataFrame.corr(),
# so select the numeric columns explicitly (works on every pandas version).
numeric_data = data.select_dtypes(include=[np.number])
display(numeric_data.corr())

# Pairwise scatter plots of the features
pd.plotting.scatter_matrix(data, figsize=(12, 12))
plt.show()

### Feature Preprocessing and Engineering

In [None]:
# Create an indicator variable to capture when pdays takes a value of 999
data['no_previous_contact'] = np.where(data['pdays'] == 999, 1, 0)

# Create an indicator for individuals not actively employed.
# Series.isin replaces np.in1d, which is deprecated as of NumPy 2.0.
data['not_working'] = np.where(data['job'].isin(['student', 'retired', 'unemployed']), 1, 0)

# Dummy encoding. The dtype is pinned because pandas >= 2.0 defaults to boolean
# dummies, which to_csv would write as True/False instead of the 0/1 values
# that earlier pandas versions produced for the CSV files written later.
model_data = pd.get_dummies(data, dtype=np.uint8)


## $\Delta$ 1.5 - Create the Modeling dataset and upload to S3

In [None]:
# Columns excluded from the modeling dataset
model_data = model_data.drop(
    columns=[
        'duration',
        'emp.var.rate',
        'cons.price.idx',
        'cons.conf.idx',
        'euribor3m',
        'nr.employed',
    ]
)


### Train/Validation/Test Split

> Note: we create a test set here mainly to use it later for batch prediction (not for hold out purpose)

In [None]:
# Randomly sort the data (fixed seed for reproducibility), then split out the
# first 70% for training, the next 20% for validation and the last 10% for test.
# Positional slicing replaces np.split(DataFrame, ...), which goes through the
# deprecated DataFrame.swapaxes in recent pandas releases.
shuffled_data = model_data.sample(frac=1, random_state=1729)
train_end = int(0.7 * len(shuffled_data))
validation_end = int(0.9 * len(shuffled_data))

train_data = shuffled_data.iloc[:train_end]
validation_data = shuffled_data.iloc[train_end:validation_end]
test_data = shuffled_data.iloc[validation_end:]


In [None]:
# Training and validation files: the y_yes label comes first, followed by the
# features; both target dummies are removed from the feature part. Files are
# written without header or index.
for csv_name, split in (('train.csv', train_data), ('validation.csv', validation_data)):
    features = split.drop(['y_no', 'y_yes'], axis=1)
    labelled = pd.concat([split['y_yes'], features], axis=1)
    labelled.to_csv(csv_name, index=False, header=False)

# Scoring file: features only, no label column
score_features = test_data.drop(['y_no', 'y_yes'], axis=1)
score_features.to_csv('score.csv', index=False, header=False)

### Upload data to Amazon S3

In [None]:
# (S3 key suffix, local file) for each dataset; every file is uploaded under
# `prefix` inside `bucket`.
uploads = [
    ('train/train.csv', 'train.csv'),
    ('validation/validation.csv', 'validation.csv'),
    ('score/score.csv', 'score.csv'),
]
for key_suffix, local_file in uploads:
    s3_object = boto3.Session().resource('s3').Bucket(bucket).Object(os.path.join(prefix, key_suffix))
    s3_object.upload_file(local_file)

> Now go to the Amazon S3 bucket `wcd-sagemaker-workshop` and open your own prefix (e.g. `david/bank-campaign`) to check that the `train`, `validation` and `score` data have been uploaded

# <a name="training"></a><font color='#347B98'> 2 - SageMaker Model Training</font> <font size='3'></font>


> SageMaker provides a limited suite of built-in algorithms; this section uses its `xgboost` container

## $\Delta$ 2.1 - Get the SageMaker `xgboost` Container

In [None]:
from sagemaker.amazon.amazon_estimator import get_image_uri
# Look up the 'xgboost' container image for the region of the current boto3 session
aws_region = boto3.Session().region_name
container = get_image_uri(aws_region, 'xgboost')

## $\Delta$ 2.2 - Set Input Train/Validation S3 Paths

In [None]:
# S3 locations of the CSV files uploaded earlier in this notebook
train_uri = 's3://{}/{}/train'.format(bucket, prefix)
validation_uri = 's3://{}/{}/validation/'.format(bucket, prefix)

# Input channel definitions handed to the estimator's fit() call
s3_input_train = sagemaker.s3_input(s3_data=train_uri, content_type='csv')
s3_input_validation = sagemaker.s3_input(s3_data=validation_uri, content_type='csv')

## $\Delta$ 2.3 - Train the `xgboost` model using SageMaker Estimator

In [None]:
sess = sagemaker.Session()

# Training output is written under the user's prefix
output_location = 's3://{}/{}/output'.format(bucket, prefix)

xgb = sagemaker.estimator.Estimator(
    container,
    role,
    train_instance_count=1,
    train_instance_type='ml.m4.xlarge',
    output_path=output_location,
    sagemaker_session=sess,
)

# Hyperparameters passed to the xgboost container
hyperparameters = {
    'max_depth': 5,
    'eta': 0.2,
    'gamma': 4,
    'min_child_weight': 6,
    'subsample': 0.8,
    'silent': 0,
    'objective': 'binary:logistic',
    'num_round': 100,
}
xgb.set_hyperparameters(**hyperparameters)

# Start training with the two input channels defined above
data_channels = {'train': s3_input_train, 'validation': s3_input_validation}
xgb.fit(data_channels)

# Name of the training job that was just run
print(xgb.latest_training_job.name)

> Once the training is done, go to the SageMaker management console and check out the details of the training job whose name was printed above

# <a name="deployment"></a><font color='#347B98'> 3 - Model Deployment in SageMaker</font> <font size='3'></font>


### Create a prediction API

In [None]:
# Deploy the trained estimator on a single ml.m4.xlarge instance; the returned
# predictor is used for the prediction calls below.
xgb_predictor = xgb.deploy(instance_type='ml.m4.xlarge', initial_instance_count=1)

In [None]:
# Serialize request payloads with csv_serializer and label them as CSV text
xgb_predictor.serializer = csv_serializer
xgb_predictor.content_type = 'text/csv'

In [None]:
def predict(data, rows=500):
    """Score `data` against the deployed endpoint in mini-batches.

    Args:
        data: Array of feature rows; it is split along its first axis.
        rows: Approximate number of rows per request. The data is cut into
            int(len(data) / rows + 1) nearly equal parts.

    Returns:
        1-D numpy array parsed from the comma-separated values returned by
        the endpoint, in request order.
    """
    n_batches = int(data.shape[0] / float(rows) + 1)
    responses = [
        xgb_predictor.predict(batch).decode('utf-8')
        for batch in np.array_split(data, n_batches)
    ]
    return np.fromstring(','.join(responses), sep=',')


# DataFrame.as_matrix() was removed in pandas 1.0; `.values` returns the same
# ndarray and is available in every pandas version.
predictions = predict(test_data.drop(['y_no', 'y_yes'], axis=1).values)

In [None]:
# Cross-tabulate the actual y_yes labels against the rounded predictions
pd.crosstab(
    index=test_data['y_yes'],
    columns=np.round(predictions),
    rownames=['actuals'],
    colnames=['predictions'],
)

### Batch Prediction using the Batch Transformer

In [None]:
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker.estimator import Estimator
container = get_image_uri(boto3.Session().region_name, 'xgboost')

sess = sagemaker.Session()

# Load the training estimator by attaching to the training job run in section 2.3.
# The job name is taken from the estimator instead of a hard-coded value, which
# only exists in the account that originally ran this notebook. If the kernel
# was restarted since training, replace this with the job name printed in 2.3.
training_job_name = xgb.latest_training_job.name
xgb_saved = Estimator.attach(training_job_name, sagemaker_session=sess)

In [None]:
# Where the batch transform job stores its results
batch_output = 's3://{}/{}/batch-inference'.format(bucket, prefix)

# Where the scoring dataset was uploaded
batch_input = 's3://{}/{}/score'.format(bucket, prefix)

# Create the batch transformer from the attached estimator
transformer = xgb_saved.transformer(
    instance_count=1,
    instance_type='ml.m4.xlarge',
    output_path=batch_output,
)

# Run the job over every object under the input prefix, splitting the CSV by
# line, and block until it finishes
transformer.transform(
    data=batch_input,
    data_type='S3Prefix',
    content_type='text/csv',
    split_type='Line',
)
transformer.wait()

# <a name="cleanup"></a><font color='#347B98'> 4 - Cleaning Up the Environment</font> <font size='3'></font>


In [None]:
# Delete the models created for the endpoint and for the batch transform job
xgb_predictor.delete_model()
transformer.delete_model()

# Delete the endpoint through the predictor: unlike Session.delete_endpoint(name),
# this also removes the endpoint configuration, so nothing is left behind.
xgb_predictor.delete_endpoint()