## Initialize Notebook

In [None]:
# Define the globals BUCKET and ROLE: the default S3 bucket associated with
# the Domain and its execution role, both looked up through the SageMaker SDK.
import sagemaker

# Default bucket of a freshly created SageMaker session
sagemaker_session = sagemaker.Session()
BUCKET = sagemaker_session.default_bucket()

# Execution role associated with the Domain
ROLE = sagemaker.get_execution_role()

In [None]:
import sagemaker_datawrangler                     # For interactive data prep widget
import numpy as np                                # For matrix operations and numerical processing
import pandas as pd                               # For munging tabular data
import matplotlib.pyplot as plt                   # For charts and visualizations
from IPython.display import Image                 # For displaying images in the notebook
from IPython.display import display               # For displaying outputs in the notebook
from time import gmtime, strftime                 # For labeling SageMaker models, endpoints, etc.
import sys                                        # For writing outputs to notebook
import math                                       # For ceiling function
import json                                       # For parsing hosting outputs
import os                                         # For manipulating filepath names
import boto3
import re
import zipfile

In [None]:
# Print the installed pandas version so the environment used for this run is
# recorded alongside the notebook output.
print(pd.__version__)

## Data Import

In [None]:
# Download the sample ZIP archive with the requests library, save it to the
# current directory, and extract its contents there.
import requests

DATA_URL = "https://sagemaker-sample-data-us-west-2.s3-us-west-2.amazonaws.com/autopilot/direct_marketing/bank-additional.zip"

# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(DATA_URL, timeout=60)
# Fail here on an HTTP error instead of writing an error page to disk and
# hitting an unrelated BadZipFile error during extraction.
response.raise_for_status()

with open("bank-additional.zip", "wb") as f:
    f.write(response.content)

with zipfile.ZipFile('bank-additional.zip', 'r') as zip_ref:
    zip_ref.extractall()

In [None]:
# Load the full dataset extracted from the archive.
bank_additional = pd.read_csv('bank-additional/bank-additional-full.csv')

# Preview the first rows; a bare expression uses the notebook's rich table
# rendering, matching the other preview cells in this notebook.
bank_additional.head()

## Data Exploration

In [None]:
# Correlate the features and plot the correlation matrix in the notebook as a
# heatmap from the seaborn library.
import seaborn as sns

# Restrict the correlation to numeric columns: at this point the frame still
# holds string-valued columns (e.g. 'job'), and DataFrame.corr() raises on
# those in pandas >= 2.0 instead of silently dropping them.
corr = bank_additional.select_dtypes(include='number').corr()
sns.heatmap(corr,
            xticklabels=corr.columns.values,
            yticklabels=corr.columns.values)
plt.show()

# Plot pair plot
sns.pairplot(bank_additional)
plt.show()

## Data Transformation

In [None]:
# Add the indicator column 'no_previous_contact': 1 where 'pdays' equals 999,
# 0 otherwise.
bank_additional['no_previous_contact'] = np.where(bank_additional['pdays'] == 999, 1, 0)

# Show the table; a bare expression uses the notebook's rich table rendering,
# matching the other preview cells in this notebook.
bank_additional.head()

In [None]:
# Add the indicator column 'not_working': 1 when 'job' is any of student,
# retired or unemployed, 0 otherwise. All three values are tested in a single
# expression; assigning the column once per job would leave only the result
# of the last comparison.
not_working_jobs = ['student', 'retired', 'unemployed']
bank_additional['not_working'] = np.where(bank_additional['job'].isin(not_working_jobs), 1, 0)

In [None]:
# One-hot encode the categorical columns. dtype=int forces 0/1 indicator
# values: pandas >= 2.0 defaults to bool, which would be written to the
# header-less training CSVs as the strings True/False instead of numbers.
bank_additional = pd.get_dummies(bank_additional, dtype=int)
bank_additional.head()

In [None]:
# Create model_data: the encoded frame without the columns 'duration',
# 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m' and
# 'nr.employed'.
excluded_columns = [
    'duration',
    'emp.var.rate',
    'cons.price.idx',
    'cons.conf.idx',
    'euribor3m',
    'nr.employed',
]
model_data = bank_additional.drop(columns=excluded_columns)
model_data.head()

In [None]:
# Shuffle the rows reproducibly, then cut the shuffled frame into 70% train,
# 20% validation and 10% test. Positional slicing is used instead of np.split,
# which on a DataFrame goes through the deprecated DataFrame.swapaxes.
shuffled_data = model_data.sample(frac=1, random_state=42)
train_end = int(.7 * len(shuffled_data))
validation_end = int(.9 * len(shuffled_data))

train_data = shuffled_data.iloc[:train_end]
validation_data = shuffled_data.iloc[train_end:validation_end]
test_data = shuffled_data.iloc[validation_end:]

In [None]:
# Write the train and validation splits as CSV files without header or index,
# with the 'y_yes' label as the first column followed by the feature columns.
for split_frame, csv_name in ((train_data, 'train.csv'), (validation_data, 'validation.csv')):
    label_column = split_frame['y_yes']
    feature_columns = split_frame.drop(['y_no', 'y_yes'], axis=1)
    pd.concat([label_column, feature_columns], axis=1).to_csv(csv_name, index=False, header=False)

In [None]:
# Upload the CSV files to the bucket with boto3: train.csv goes under the
# train/ prefix and validation.csv under the validation/ prefix.
s3 = boto3.resource('s3')
target_bucket = s3.Bucket(BUCKET)
for local_file, object_key in (
    ('train.csv', 'train/train.csv'),
    ('validation.csv', 'validation/validation.csv'),
):
    target_bucket.upload_file(local_file, object_key)

## Model Training and Tuning

In [None]:
import sagemaker

# Look up the XGBoost container image URI for the region of the current session.
# NOTE(review): 'latest' is an alias, not a pinned version — confirm which
# XGBoost release it resolves to and consider pinning an explicit version.
current_region = sagemaker.session.Session().boto_region_name
CONTAINER = sagemaker.image_uris.retrieve(
    framework='xgboost',
    region=current_region,
    version='latest',
)
print(CONTAINER)

In [None]:
from sagemaker.inputs import TrainingInput

# S3 locations of the CSV files uploaded to the bucket
train_s3_uri = 's3://{}/train/train.csv'.format(BUCKET)
validation_s3_uri = 's3://{}/validation/validation.csv'.format(BUCKET)

# Wrap each location as a CSV training channel
train_input = TrainingInput(s3_data=train_s3_uri, content_type='text/csv')
validation_input = TrainingInput(s3_data=validation_s3_uri, content_type='text/csv')

In [None]:
# Configure the training job. instance_count / instance_type are the SageMaker
# SDK v2 parameter names, consistent with the v2 APIs used elsewhere in this
# notebook (image_uris, TrainingInput, CSVSerializer); the train_* spellings
# are the deprecated v1 names.
xgb_estimator = sagemaker.estimator.Estimator(
    CONTAINER,
    ROLE,
    instance_count=1,
    instance_type='ml.m5.xlarge',
    output_path='s3://{}/output'.format(BUCKET),
)

# Hyperparameters for a binary classifier trained for 100 boosting rounds.
xgb_estimator.set_hyperparameters(
    max_depth=5,
    eta=0.2,
    gamma=4,
    min_child_weight=6,
    subsample=0.8,
    objective='binary:logistic',
    num_round=100,
)

# Start training with the 'train' and 'validation' channels.
xgb_estimator.fit({'train': train_input, 'validation': validation_input})

## Model Hosting

In [None]:
# Deploy the trained model behind a real-time endpoint backed by a single
# ml.m5.xlarge instance. The returned predictor is used for evaluation and is
# deleted in the Clean Up section.
xgb_predictor = xgb_estimator.deploy(initial_instance_count=1, instance_type='ml.m5.xlarge')

## Model Evaluation

In [None]:
from sagemaker.serializers import CSVSerializer

# Attach a CSV serializer to the predictor — presumably so that the arrays
# passed to predict() are sent to the endpoint as CSV text (confirm against
# the SageMaker SDK documentation).
xgb_predictor.serializer = CSVSerializer()

def predict(data, predictor, rows=500 ):
    """Score ``data`` in batches and return all predictions as one array.

    Args:
        data: Array-like with a ``shape`` attribute; it is split along its
            first axis into ``int(len / rows + 1)`` nearly equal batches.
        predictor: Object whose ``predict(batch)`` returns UTF-8 encoded bytes
            holding comma-separated numbers.
        rows: Approximate upper bound on the number of rows per batch.

    Returns:
        A 1-D float array parsed from the batch responses, in batch order.
    """
    batch_count = int(data.shape[0] / float(rows) + 1)
    decoded_responses = [
        predictor.predict(batch).decode('utf-8')
        for batch in np.array_split(data, batch_count)
    ]
    # Join the per-batch responses into one comma-separated string and parse it once.
    return np.fromstring(','.join(decoded_responses), sep=',')

# Score the held-out test split: strip both label columns, convert the
# remaining feature columns to a NumPy array and send it to the endpoint.
test_features = test_data.drop(columns=['y_no', 'y_yes']).to_numpy()
predictions = predict(test_features, xgb_predictor)

In [None]:
# Cross-tabulate the actual labels against the predictions rounded to the
# nearest integer.
rounded_predictions = np.round(predictions)
pd.crosstab(
    index=test_data['y_yes'],
    columns=rounded_predictions,
    rownames=['actuals'],
    colnames=['predictions'],
)

## Clean Up

In [None]:
# Delete the real-time endpoint created in the Model Hosting section so it
# does not keep running after the notebook is finished.
xgb_predictor.delete_endpoint()