# Sagemaker XGBoost Implementation for predicting Bank Customer Behavior

### Introduction

In this mini-project we are using a sample dataset provided by AWS, which contains 60 columns of data about approximately 41,000 bank customers and whether or not they enrolled for a particular bank product.  

The objective is to use __AWS Sagemaker__ to create an XGBoost Model which will predict whether customers enrolled for the bank product.

### Setup

In [25]:
# import libraries
import boto3, re, sys, math, json, os, sagemaker, urllib.request
from io import StringIO
from sagemaker import get_execution_role
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import Image
from IPython.display import display
from time import gmtime, strftime
from sagemaker.serializers import CSVSerializer

# Define IAM role
role = get_execution_role()
my_region = boto3.session.Session().region_name # set the region of the instance


Ignoring unnecessary instance type: None.


Success - the MySageMakerInstance is in the eu-north-1 region. You will use the 662702820516.dkr.ecr.eu-north-1.amazonaws.com/sagemaker-xgboost:1.7-1 container for your SageMaker endpoint.


### Create Bucket and Upload Data

In [3]:
bucket_name = 'arne-t-sagemaker-first-bucket' 

s3 = boto3.resource('s3')

In [14]:
# # buckets created outside us-east-1 require the appropriate LocationConstraint to be specified
# s3.create_bucket(Bucket=bucket_name, CreateBucketConfiguration={ 'LocationConstraint': my_region })
# print('S3 bucket created successfully')

S3 bucket created successfully


In [2]:
# try:
#   urllib.request.urlretrieve ("https://d1.awsstatic.com/tmt/build-train-deploy-machine-learning-model-sagemaker/bank_clean.27f01fbbdf43271788427f3682996ae29ceca05d.csv", "bank_clean.csv")
#   print('Success: downloaded bank_clean.csv.')
# except Exception as e:
#   print('Data load error: ',e)

try:
  model_data = pd.read_csv('./bank_clean.csv',index_col=0)
  print('Success: Data loaded into dataframe.')
except Exception as e:
    print('Data load error: ',e)

Success: Data loaded into dataframe.


In [5]:
model_data.head()

Unnamed: 0,age,campaign,pdays,previous,no_previous_contact,not_working,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,...,day_of_week_fri,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_failure,poutcome_nonexistent,poutcome_success,y_no,y_yes
0,56,1,999,0,1,0,0,0,0,1,...,0,1,0,0,0,0,1,0,1,0
1,57,1,999,0,1,0,0,0,0,0,...,0,1,0,0,0,0,1,0,1,0
2,37,1,999,0,1,0,0,0,0,0,...,0,1,0,0,0,0,1,0,1,0
3,40,1,999,0,1,0,1,0,0,0,...,0,1,0,0,0,0,1,0,1,0
4,56,1,999,0,1,0,0,0,0,0,...,0,1,0,0,0,0,1,0,1,0


### EDA

In [9]:
from fast_ml import eda
from fast_ml.utilities import display_all

summary_df = eda.df_info(model_data)
display_all(summary_df)

Unnamed: 0,data_type,data_type_grp,num_unique_values,sample_unique_values,num_missing,perc_missing
age,int64,Numerical,78,"[56, 57, 37, 40, 45, 59, 41, 24, 25, 29]",0,0.0
campaign,int64,Numerical,42,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]",0,0.0
pdays,int64,Numerical,27,"[999, 6, 4, 3, 5, 1, 0, 10, 7, 8]",0,0.0
previous,int64,Numerical,8,"[0, 1, 2, 3, 4, 5, 6, 7]",0,0.0
no_previous_contact,int64,Numerical,2,"[1, 0]",0,0.0
not_working,int64,Numerical,2,"[0, 1]",0,0.0
job_admin.,int64,Numerical,2,"[0, 1]",0,0.0
job_blue-collar,int64,Numerical,2,"[0, 1]",0,0.0
job_entrepreneur,int64,Numerical,2,"[0, 1]",0,0.0
job_housemaid,int64,Numerical,2,"[1, 0]",0,0.0


In [17]:
# create sweetviz report
import sweetviz as sv

report = sv.analyze(model_data)
report.show_html()

                                             |          | [  0%]   00:00 -> (? left)

Report SWEETVIZ_REPORT.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


### Split data

In [34]:
# sagemaker expects 'y' to be first column, therefore we rearrange the data (and drop redundant columns)
formatted_df = pd.concat([model_data['y_yes'], model_data.drop(['y_no', 'y_yes'], axis=1)], axis=1)
formatted_df.head()

Unnamed: 0,y_yes,age,campaign,pdays,previous,no_previous_contact,not_working,job_admin.,job_blue-collar,job_entrepreneur,...,month_oct,month_sep,day_of_week_fri,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_failure,poutcome_nonexistent,poutcome_success
0,0,56,1,999,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
1,0,57,1,999,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
2,0,37,1,999,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
3,0,40,1,999,0,1,0,1,0,0,...,0,0,0,1,0,0,0,0,1,0
4,0,56,1,999,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0


In [36]:
# Randomly shuffle the data
shuffled_data = formatted_df.sample(frac=1, random_state=1729)

# Determine the indices for splitting the data
train_idx = int(0.6 * len(shuffled_data))
val_idx = int(0.8 * len(shuffled_data))

# Split the data into training, validation, and test sets
train_data = shuffled_data[:train_idx]
val_data = shuffled_data[train_idx:val_idx]
test_data = shuffled_data[val_idx:]

# Print the shapes of each data set to verify the sizes
print("Train Data Shape:", train_data.shape)
print("Validation Data Shape:", val_data.shape)
print("Test Data Shape:", test_data.shape)

Train Data Shape: (24712, 60)
Validation Data Shape: (8238, 60)
Test Data Shape: (8238, 60)


In [37]:
# sagemaker expects 'y' to be first column, therefore we rearrange the data (and drop redundant columns)
train_data.to_csv('train.csv', index=False, header=False)
val_data.to_csv('validation.csv', index=False, header=False)

# upload to bucket and apply as TrainingInput
for f in ('train', 'validation'):
    boto3.Session().resource('s3').Bucket(bucket_name).Object(os.path.join(prefix, f'{f}/{f}.csv')).upload_file(f'{f}.csv')

prefix = 'DEMO-xgboost'
training_input = sagemaker.inputs.TrainingInput(s3_data='s3://{}/{}/train'.format(bucket_name, prefix), content_type='csv')
validation_input = sagemaker.inputs.TrainingInput(s3_data='s3://{}/{}/validation'.format(bucket_name, prefix), content_type='csv')


Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole
Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole


### Set up XGBoost model

In [26]:
# this line automatically looks for the XGBoost image URI (Uniform Resource Identifier which specifies location) and builds an XGBoost container.
xgboost_container = sagemaker.image_uris.retrieve("xgboost", my_region, "1.7-1")  # according to docs, should not use 'latest'

print("Success - the MySageMakerInstance is in the " + my_region + " region. You will use the " + xgboost_container + " container for your SageMaker endpoint.")

Ignoring unnecessary instance type: None.


Success - the MySageMakerInstance is in the eu-north-1 region. You will use the 662702820516.dkr.ecr.eu-north-1.amazonaws.com/sagemaker-xgboost:1.7-1 container for your SageMaker endpoint.


In [45]:
# initialize hyperparameters
hyperparameters = {'max_depth':5,
                   'eta':0.2,
                   'gamma':4,
                   'min_child_weight':6,
                   'subsample':0.8,
                   'verbosity':2,
                   'objective':'binary:logistic',
                   'num_round':100}


# set an output path where the trained model will be saved
# bucket = sagemaker.Session().default_bucket()    # default_bucket method returns default bucket for session or creates one
output_path = 's3://{}/{}/output'.format(bucket_name, prefix)


In [31]:
output_path

's3://arne-t-sagemaker-first-bucket/DEMO-xgboost/output'

In [47]:
# initialize a sagemaker session
sess = sagemaker.Session()

xgb = sagemaker.estimator.Estimator(image_uri=xgboost_container,
                                    role=role,
                                    hyperparameters=hyperparameters,
                                    instance_count=1, 
                                    instance_type='ml.m5.xlarge',
                                    output_path=output_path,
                                    sagemaker_session=sess)


In [48]:
# execute the XGBoost training job
xgb.fit({'train': training_input, 'validation': validation_input})

Creating training-job with name: sagemaker-xgboost-2023-08-28-12-25-09-388


2023-08-28 12:25:09 Starting - Starting the training job...
2023-08-28 12:25:26 Starting - Preparing the instances for training......
2023-08-28 12:26:26 Downloading - Downloading input data...
2023-08-28 12:26:51 Training - Downloading the training image...
2023-08-28 12:27:37 Uploading - Uploading generated training model[34m[2023-08-28 12:27:31.795 ip-10-0-179-47.eu-north-1.compute.internal:8 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2023-08-28 12:27:31.818 ip-10-0-179-47.eu-north-1.compute.internal:8 INFO profiler_config_parser.py:111] User has disabled profiler.[0m
[34m[2023-08-28:12:27:32:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2023-08-28:12:27:32:INFO] Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[34mReturning the value itself[0m
[34m[2023-08-28:12:27:32:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2023-08-28:12:27:32:INFO] Running XGBoost Sagemaker in algorithm mode[0m



2023-08-28 12:27:48 Completed - Training job completed
Training seconds: 82
Billable seconds: 82


In [49]:
# deploys the model on a server and creates a SageMaker endpoint that you can access
xgb_predictor = xgb.deploy(initial_instance_count=1,instance_type='ml.m5.xlarge')

Creating model with name: sagemaker-xgboost-2023-08-28-12-34-55-186
Creating endpoint-config with name sagemaker-xgboost-2023-08-28-12-34-55-186
Creating endpoint with name sagemaker-xgboost-2023-08-28-12-34-55-186


----!

In [57]:
#  predict whether customers in the test data enrolled for the bank product or not 
test_data_array = test_data.drop(['y_yes'], axis=1).values #load the data into an array

# set the serializer type - in this case, converts python data structure to csv
xgb_predictor.serializer = CSVSerializer() 

# predict
predictions = xgb_predictor.predict(test_data_array).decode('utf-8') 

# convert to array
predictions_array = np.genfromtxt(StringIO(predictions), delimiter=',')
print(predictions_array.shape)

(8238,)


In [58]:
cm = pd.crosstab(index=test_data['y_yes'], columns=np.round(predictions_array), rownames=['Observed'], colnames=['Predicted'])
tn = cm.iloc[0,0]; fn = cm.iloc[1,0]; tp = cm.iloc[1,1]; fp = cm.iloc[0,1]; p = (tp+tn)/(tp+tn+fp+fn)*100
print("\n{0:<20}{1:<4.1f}%\n".format("Overall Classification Rate: ", p))
print("{0:<15}{1:<15}{2:>8}".format("Predicted", "No Purchase", "Purchase"))
print("Observed")
print("{0:<15}{1:<2.0f}% ({2:<}){3:>6.0f}% ({4:<})".format("No Purchase", tn/(tn+fn)*100,tn, fp/(tp+fp)*100, fp))
print("{0:<16}{1:<1.0f}% ({2:<}){3:>7.0f}% ({4:<}) \n".format("Purchase", fn/(tn+fn)*100,fn, tp/(tp+fp)*100, tp))


Overall Classification Rate: 89.3%

Predicted      No Purchase    Purchase
Observed
No Purchase    90% (7165)    36% (105)
Purchase        10% (778)     64% (190) 



### Cleanup

In [59]:
# terminate endpoint
xgb_predictor.delete_endpoint(delete_endpoint_config=True)

# delete buckets
bucket_to_delete = boto3.resource('s3').Bucket(bucket_name)
bucket_to_delete.objects.all().delete()

Deleting endpoint configuration with name: sagemaker-xgboost-2023-08-28-12-34-55-186
Deleting endpoint with name: sagemaker-xgboost-2023-08-28-12-34-55-186


[{'ResponseMetadata': {'RequestId': 'PX88YWZM9WZQS6RK',
   'HostId': 'pzZ2vF8/FjiOlhBhDxITxaUdbcWfIizLZFnuZXQ7z04M8bbg4WjMBtoQ3FO9LEzKGxQR1L5+ZxE=',
   'HTTPStatusCode': 200,
   'HTTPHeaders': {'x-amz-id-2': 'pzZ2vF8/FjiOlhBhDxITxaUdbcWfIizLZFnuZXQ7z04M8bbg4WjMBtoQ3FO9LEzKGxQR1L5+ZxE=',
    'x-amz-request-id': 'PX88YWZM9WZQS6RK',
    'date': 'Mon, 28 Aug 2023 13:07:31 GMT',
    'content-type': 'application/xml',
    'transfer-encoding': 'chunked',
    'server': 'AmazonS3',
    'connection': 'close'},
   'RetryAttempts': 0},
  'Deleted': [{'Key': 'DEMO-xgboost/output/sagemaker-xgboost-2023-08-28-12-12-34-953/profiler-output/system/incremental/2023082812/1693224840.algo-1.json'},
   {'Key': 'DEMO-xgboost/output/sagemaker-xgboost-2023-08-28-12-25-09-388/debug-output/index/000000000/000000000040_worker_0.json'},
   {'Key': 'DEMO-xgboost/output/sagemaker-xgboost-2023-08-28-12-25-09-388/debug-output/events/000000000000/000000000000_worker_0.tfevents'},
   {'Key': 'DEMO-xgboost/output/sagemak