In [1]:
import boto3, re, sys, math, json, os, sagemaker, urllib.request
from sagemaker import get_execution_role
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display
from time import gmtime, strftime
from pathlib import Path
from sagemaker.predictor import csv_serializer
from sagemaker.serializers import CSVSerializer

## Data Prep

In [2]:
# https://archive.ics.uci.edu/ml/datasets/bank+marketing
data_url = "https://d1.awsstatic.com/tmt/build-train-deploy-machine-learning-model-sagemaker/bank_clean.27f01fbbdf43271788427f3682996ae29ceca05d.csv"
urllib.request.urlretrieve(data_url, "bank_clean.csv")
model_data = pd.read_csv("./bank_clean.csv")

In [3]:
train_sz = int(0.7 * len(model_data))
shuffled_df = model_data.sample(frac=1., random_state=1729)
train_data, test_data = np.split(shuffled_df, [train_sz])

In [4]:
print(type(train_data), type(test_data))

<class 'pandas.core.frame.DataFrame'> <class 'pandas.core.frame.DataFrame'>


In [5]:
print(len(model_data), len(train_data), len(test_data))

41188 28831 12357


In [6]:
train_data.to_csv("train.csv")
test_data.to_csv("test.csv")

In [7]:
s3 = boto3.client("s3")

In [8]:
prefix = "bank-marketing"
s3.upload_file("bank_clean.csv", "avilabs-mldata", f"{prefix}/bank_clean.csv")
s3.upload_file("train.csv", "avilabs-mldata", f"{prefix}/train.csv")
s3.upload_file("test.csv", "avilabs-mldata", f"{prefix}/test.csv")

In [9]:
# Convert train.csv per the format expected by sagemaker
pd.concat([train_data['y_yes'], train_data.drop(['y_no', 'y_yes'], axis=1)], axis=1).to_csv('sgfmt_train.csv', index=False, header=False)
s3.upload_file("sgfmt_train.csv", "avilabs-mldata", f"{prefix}/sagemaker/train/train.csv")

## Train

In [10]:
# Get the container that has the model baked into it.
xgboost_container = sagemaker.image_uris.retrieve(
    "xgboost", 
    boto3.session.Session().region_name,
    "latest"
)

In [11]:
# Get the training data from S3. Just specificy the train directory, not the actual filename!
# Maybe the filename is hardcoded into the model code?
s3_data = "s3://avilabs-mldata/bank-marketing/sagemaker/train"
trainset = sagemaker.inputs.TrainingInput(s3_data=s3_data, content_type="csv")

In [13]:
session = sagemaker.Session()
xgb = sagemaker.estimator.Estimator(
    xgboost_container, 
    role=get_execution_role(),
    instance_count=1, 
    instance_type="ml.m4.xlarge",
    output_path="s3://avilabs-mldata/bank-marketing/sagemaker/output",
    sagemaker_sess=session
)
xgb.set_hyperparameters(
    max_depth=5,
    eta=0.2,
    gamma=4,
    min_child_weight=6,
    subsample=0.8,
    silent=0,
    objective="binary:logistic",
    num_round=100
)

In [14]:
xgb.fit({"train": trainset})

2021-08-22 05:44:15 Starting - Starting the training job...
2021-08-22 05:44:17 Starting - Launching requested ML instancesProfilerReport-1629611054: InProgress
...
2021-08-22 05:45:14 Starting - Preparing the instances for training.........
2021-08-22 05:46:39 Downloading - Downloading input data...
2021-08-22 05:47:15 Training - Downloading the training image..[34mArguments: train[0m
[34m[2021-08-22:05:47:27:INFO] Running standalone xgboost training.[0m
[34m[2021-08-22:05:47:27:INFO] Path /opt/ml/input/data/validation does not exist![0m
[34m[2021-08-22:05:47:27:INFO] File size need to be processed in the node: 3.54mb. Available memory size in the node: 8417.89mb[0m
[34m[2021-08-22:05:47:27:INFO] Determined delimiter of CSV input is ','[0m
[34m[05:47:27] S3DistributionType set as FullyReplicated[0m
[34m[05:47:27] 28831x60 matrix with 1729860 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[05:47:27] src/tree/updater_prune.cc:74:

## Deploy

In [15]:
xgb_predictor = xgb.deploy(initial_instance_count=1, instance_type="ml.m4.xlarge")
xgb_predictor.serializer = CSVSerializer()

-----------------!

In [16]:
if not Path("./test.csv").exists():
    print("Download test.csv from s3://avilabs-mldata/bank-marketing/test.csv")
else:
    test_data = pd.read_csv("./test.csv", header=None)
    testset = test_data.drop(["y_no", "y_yes"], axis=1).values

KeyError: "['y_no' 'y_yes'] not found in axis"

In [None]:
predictions = xgb_predictor.predict(testset).decode("utf-8")

In [None]:
type(predictions)

In [None]:
preds = np.fromstring(predictions, sep=",")

In [None]:
preds.shape

In [None]:
confusion_mat = pd.crosstab(
    index=test_data["y_yes"],
    columns=np.round(preds),
    rownames=["Observed"],
    colnames=["Predicted"]
)
true_neg = confusion_mat.iloc[0, 0]
false_neg = confusion_mat.iloc[1, 0]
false_pos = cm.iloc[0, 1]
true_pos = cm.iloc[1, 1]
accuracy = 100 * (true_pos + true_neg) / (true_pos + true_neg + false_pos, false_neg)

In [None]:
confusion_mat

In [18]:
xgb_predictor.delete_endpoint(delete_endpoint_config=True)