In [5]:
#import the necessary libraries
import boto3, re, sys, math, json, os, sagemaker, urllib.request
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
from IPython.display import Image
from IPython.display import display 
from time import gmtime, strftime 
from sagemaker.serializers import CSVSerializer
from sagemaker import get_execution_role
from sagemaker.inputs import TrainingInput

#Define the IAM role in AWS
role = get_execution_role() 
prefix = 'bank-marketing' 
#each region has its XGBoost Container 
# AWS XGBoost container URIs for different regions
containers = {
    'us-east-1': '811284229777.dkr.ecr.us-east-1.amazonaws.com/xgboost:latest',
    'us-east-2': '825641698319.dkr.ecr.us-east-2.amazonaws.com/xgboost:latest',
    'us-west-1': '632365934929.dkr.ecr.us-west-1.amazonaws.com/xgboost:latest',
    'us-west-2': '433757028032.dkr.ecr.us-west-2.amazonaws.com/xgboost:latest'
}
my_region = boto3.session.Session().region_name
print("Great! - your SageMaker Instance is in the" + my_region + "region. You will use the " + containers[my_region] + "container for your SageMaker endpoint to make inference requests.")

Great! - your SageMaker Instance is in theus-east-1region. You will use the 811284229777.dkr.ecr.us-east-1.amazonaws.com/xgboost:latestcontainer for your SageMaker endpoint to make inference requests.


In [7]:
#Download from your s3 bucket the bank marketing csv file on publically available census data 
from io import StringIO
s3 = boto3.resource('s3') 
bucket_name = 'sagemaker-us-east-1-895550936162'
object_key = 'bank-additional-full.csv'

#Load the data into a pandas dataframe

csv_obj = s3.Object(bucket_name, object_key) 
csv_string = csv_obj.get()['Body'].read().decode('utf-8') 

raw_data = pd.read_csv(StringIO(csv_string), sep=';')
raw_data.head()

[2;36m[03/03/25 22:22:01][0m[2;36m [0m[1;94mINFO    [0m Skipping checksum validation.   ]8;id=775744;file:///opt/conda/lib/python3.11/site-packages/botocore/httpchecksum.py\[2mhttpchecksum.py[0m]8;;\[2m:[0m]8;id=64721;file:///opt/conda/lib/python3.11/site-packages/botocore/httpchecksum.py#481\[2m481[0m]8;;\
[2;36m                    [0m         Response did not contain one of [2m                   [0m
[2;36m                    [0m         the following algorithms:       [2m                   [0m
[2;36m                    [0m         [1m[[0m[32m'crc32'[0m, [32m'sha1'[0m, [32m'sha256'[0m[1m][0m.    [2m                   [0m


Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,duration,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,261,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,149,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,226,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,151,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,307,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [9]:
#Now we need to utilize one-hot ecoding method to create a numerical continuous elements since XGBoost does not support/handle categorical data
model_data = pd.get_dummies(raw_data) #apply one-hot encoding
model_data = model_data.astype(int) #convert boolean values to integers of 0s and 1s
model_data.head() #print the table


Unnamed: 0,age,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,job_unemployed,job_unknown,marital_divorced,marital_married,marital_single,marital_unknown,education_basic.4y,education_basic.6y,education_basic.9y,education_high.school,education_illiterate,education_professional.course,education_university.degree,education_unknown,default_no,default_unknown,default_yes,housing_no,housing_unknown,housing_yes,loan_no,loan_unknown,loan_yes,contact_cellular,contact_telephone,month_apr,month_aug,month_dec,month_jul,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,day_of_week_fri,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_failure,poutcome_nonexistent,poutcome_success,y_no,y_yes
0,56,261,1,999,0,1,93,-36,4,5191,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,1,0
1,57,149,1,999,0,1,93,-36,4,5191,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,1,0
2,37,226,1,999,0,1,93,-36,4,5191,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,1,0
3,40,151,1,999,0,1,93,-36,4,5191,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,1,0
4,56,307,1,999,0,1,93,-36,4,5191,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,1,0


In [10]:
#We now need to randomize our data and split the data between train and test using the 70/30 split respectively
train_data, test_data = np.split(model_data.sample(frac=1, random_state=1729), [int(0.7*len(model_data))])
print(train_data.shape, test_data.shape)

(28831, 65) (12357, 65)


  return bound(*args, **kwds)


In [13]:
#Reformat the header and first column of the training data,
#save the new train dataset to your s3 bucket as train.csv and load the datat from the s3 bucket
# Reformat the header and first column
pd.concat([train_data["y_yes"], train_data.drop(["y_no", "y_yes"], axis=1)], axis=1).to_csv("train.csv", index=False, header=False)
boto3.Session().resource('s3').Bucket(bucket_name).Object(os.path.join(prefix, 'train/train.csv')).upload_file('train.csv')
from sagemaker.inputs import TrainingInput

s3_input_train = TrainingInput(s3_data='s3://{}/{}/train'.format(bucket_name, prefix), content_type='csv')


In [14]:
#Now we need to setup the sagemaker session in order to create an instance of the XGBoost model, and define the model's hyperparameters
session_sm = sagemaker.Session() 
xgb = sagemaker.estimator.Estimator(containers[my_region],role, train_instance_count=1, train_instance_type='ml.m4.xlarge')
xgb.set_hyperparameters(eta=0.1, objective='binary:logistic',num_round=25) 


[2;36m                    [0m         renamed in sagemaker>=[1;36m2[0m.         [2m                  [0m
[2;36m                    [0m         See:                             [2m                  [0m
[2;36m                    [0m         [4;94mhttps://sagemaker.readthedocs.io[0m [2m                  [0m
[2;36m                    [0m         [4;94m/en/stable/v2.html[0m for details.  [2m                  [0m
[2;36m                    [0m         renamed in sagemaker>=[1;36m2[0m.         [2m                  [0m
[2;36m                    [0m         See:                             [2m                  [0m
[2;36m                    [0m         [4;94mhttps://sagemaker.readthedocs.io[0m [2m                  [0m
[2;36m                    [0m         [4;94m/en/stable/v2.html[0m for details.  [2m                  [0m


In [15]:
#Now after the data is loaded and the XGboost estimator is configured, we now need to train the model using gradient
#optimization on the ml.m4.xlarge instance
xgb.fit({'train': s3_input_train})

[2;36m[03/03/25 22:44:58][0m[2;36m [0m[1;94mINFO    [0m SageMaker Python SDK will   ]8;id=541642;file:///opt/conda/lib/python3.11/site-packages/sagemaker/telemetry/telemetry_logging.py\[2mtelemetry_logging.py[0m]8;;\[2m:[0m]8;id=709733;file:///opt/conda/lib/python3.11/site-packages/sagemaker/telemetry/telemetry_logging.py#91\[2m91[0m]8;;\
[2;36m                    [0m         collect telemetry to help   [2m                       [0m
[2;36m                    [0m         us better understand our    [2m                       [0m
[2;36m                    [0m         user's needs, diagnose      [2m                       [0m
[2;36m                    [0m         issues, and deliver         [2m                       [0m
[2;36m                    [0m         additional features.        [2m                       [0m
[2;36m                    [0m         To opt out of telemetry,    [2m                       [0m
[2;36m                    [0m        

In [17]:
#Deploy the model and create an endpoint that you can access
xgb_predictor = xgb.deploy(initial_instance_count=1, instance_type='ml.m4.xlarge') 

[2;36m[03/03/25 22:52:02][0m[2;36m [0m[1;94mINFO    [0m Creating model with name:           ]8;id=625730;file:///opt/conda/lib/python3.11/site-packages/sagemaker/session.py\[2msession.py[0m]8;;\[2m:[0m]8;id=511179;file:///opt/conda/lib/python3.11/site-packages/sagemaker/session.py#4094\[2m4094[0m]8;;\
[2;36m                    [0m         xgboost-[1;36m2025[0m-03-03-22-52-02-175     [2m               [0m
[2;36m[03/03/25 22:52:03][0m[2;36m [0m[1;94mINFO    [0m Creating endpoint-config with name  ]8;id=127399;file:///opt/conda/lib/python3.11/site-packages/sagemaker/session.py\[2msession.py[0m]8;;\[2m:[0m]8;id=238396;file:///opt/conda/lib/python3.11/site-packages/sagemaker/session.py#5889\[2m5889[0m]8;;\
[2;36m                    [0m         xgboost-[1;36m2025[0m-03-03-22-52-02-175     [2m               [0m
[2;36m                   [0m[2;36m [0m[1;94mINFO    [0m Creating endpoint with name         ]8;id=713916;file:///opt/conda/li

In [20]:
#Predict whether bank customers in the test dataset will subscribe to a term deposit
test_data_array = test_data.drop(['y_no','y_yes'], axis=1).values #drop the target columns and convert test data to numpy array
xgb_predictor.content_type = 'text/csv' #set the datat type for an inference
xgb_predictor.serializer = CSVSerializer() #set the serializer type
predictions = xgb_predictor.predict(test_data_array).decode('utf-8')#predict 
predictions_array = np.fromstring(predictions[1:], sep=',')# and turn the prediction into an array 
print(predictions_array.shape)

(12357,)


In [22]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Get the actual labels (ground truth)
y_test = test_data['y_yes'].values  # Since 'y_yes' is 1 for term deposit subscription

# Convert predictions to binary (0 or 1) using a threshold of 0.5
y_pred = (predictions_array >= 0.5).astype(int)

# Calculate performance metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

# Print evaluation results
print(f"Model Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print("Confusion Matrix:")
print(conf_matrix)


Model Accuracy: 0.9161
Precision: 0.6781
Recall: 0.5144
F1 Score: 0.5850
Confusion Matrix:
[[10589   347]
 [  690   731]]
