# 1. Import Libraries

In [None]:
# General Data Manipulation Libraries
import numpy as np
import pandas as pd

# Model & Helper Libraries
import os
import io
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score, StratifiedKFold

# Plotting Tools
import matplotlib.pyplot as plt

# Sagemaker Unique Libraries
import sagemaker
import boto3
from sagemaker.tuner import (
    IntegerParameter,
    CategoricalParameter,
    ContinuousParameter,
    HyperparameterTuner,
)
from sagemaker.inputs import TrainingInput

# 2. Configure Boto3 Clients and Sessions

In [None]:
# Create one boto3 session and derive the region and the SageMaker client from it.
boto_session = boto3.Session()
region = boto_session.region_name
smclient = boto_session.client("sagemaker")

# IAM role used by SageMaker and the S3 bucket that holds all project data.
role = sagemaker.get_execution_role()
bucket = "sagemaker-santander"

print(
    f'AWS Region name : {region},\n'
    f'Session : {smclient},\n'
    f'Role : {role}'
)

# 3. Load Data

In [None]:
# Print the keys of the objects stored under the input-data prefix.
s3client = boto3.client('s3')  # S3 client
subfolder = 'input-data'  # Key prefix that holds the raw input files
# The response has no "Contents" entry when nothing matches the prefix, so
# fall back to an empty list instead of raising KeyError.
# NOTE: a single list_objects call returns at most 1,000 keys.
contents = s3client.list_objects(Bucket=bucket, Prefix=subfolder).get('Contents', [])
for obj in contents:
    print(obj['Key'])

In [None]:
# Download the training CSV object from S3 and parse its bytes into a DataFrame.
input_file = 'input-data/train.csv'
response = s3client.get_object(Bucket=bucket, Key=input_file)
raw_bytes = response['Body'].read()
df = pd.read_csv(io.BytesIO(raw_bytes))

# Preview the first rows.
df.head()

# 4. Data Preparation

In [None]:
# Every column except the row identifier and the label is used as a feature.
var_colums = [c for c in df.columns if c not in ('ID_code', 'target')]
X = df.loc[:, var_colums]
y = df.loc[:, 'target']

# 80-20 split for training and validation. A fixed random_state makes the
# split reproducible across notebook runs, and stratifying on the label keeps
# the class proportions the same in both partitions.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
X_train.shape, X_valid.shape, y_train.shape, y_valid.shape

In [None]:
# Write the train and validation sets to local CSV files, with the label as
# the first column and neither a header row nor an index column.
for csv_name, labels, features in (
    ("train.csv", y_train, X_train),
    ("validation.csv", y_valid, X_valid),
):
    pd.concat([labels, features], axis=1).to_csv(csv_name, index=False, header=False)

In [None]:
# Upload the two local CSV files to their own prefixes in the project bucket.
s3_bucket = boto3.Session().resource("s3").Bucket(bucket)
s3_bucket.Object("train/train.csv").upload_file("train.csv")
s3_bucket.Object("validation/validation.csv").upload_file("validation.csv")

In [None]:
# Sanity-check the local training file. It was written with header=False, so
# header=None stops pandas from consuming the first record as column names.
pd.read_csv('train.csv', header=None)

# 5. Model Setup

In [None]:
sess = sagemaker.Session()

# Resolve the built-in XGBoost container image for the current region.
# NOTE(review): "latest" is an alias rather than a pinned version. Consider
# pinning an explicit XGBoost version for reproducible training; confirm which
# version the existing jobs used before changing it.
container = sagemaker.image_uris.retrieve("xgboost", boto3.Session().region_name, "latest")

# Generic estimator around the XGBoost container; output is written under the
# bucket's "output" prefix.
xgb = sagemaker.estimator.Estimator(
    container,
    role,
    instance_count=1,
    instance_type="ml.m5.xlarge",
    output_path="s3://{}/output".format(bucket),
    sagemaker_session=sess,
)

# Static hyperparameters shared by every training job. rate_drop (only read by
# the dart booster) and tweedie_variance_power (only read by the reg:tweedie
# objective) were removed: neither applies to a binary:logistic model trained
# with the default booster.
xgb.set_hyperparameters(
    eval_metric="auc",
    objective="binary:logistic",
    num_round=100,
)

## 5.1 Hyper-parameter Tuning

In [None]:
# Search space: only the tree depth is tuned, over the integers 1..4.
hyperparameter_ranges = {
    "max_depth": IntegerParameter(1, 4),
}

# Metric the tuner uses to compare training jobs.
objective_metric_name = "validation:auc"

# max_parallel_jobs is capped at max_jobs: a parallelism of 3 made no sense
# for a tuning job that launches only 2 training jobs in total.
tuner = HyperparameterTuner(
    xgb,
    objective_metric_name,
    hyperparameter_ranges,
    max_jobs=2,
    max_parallel_jobs=2,
)

# 6. Model Training

In [None]:
# Build one CSV training input per channel; each channel reads from the S3
# prefix of the same name inside the project bucket.
channels = {
    channel: sagemaker.inputs.TrainingInput(
        s3_data=f"s3://{bucket}/{channel}",
        content_type="csv",
    )
    for channel in ("train", "validation")
}
s3_input_train = channels["train"]
s3_input_validation = channels["validation"]

# Launch the hyperparameter tuning job on both channels.
tuner.fit(channels, include_cls_metadata=False)

# 7. Examine Results

In [None]:
# Name of the tuning job whose results are inspected.
tuning_job_name = 'xgboost-210923-2339'
# NOTE: from here on `tuner` is the analytics object for that job, not a
# HyperparameterTuner.
tuner = sagemaker.HyperparameterTuningJobAnalytics(tuning_job_name)

full_df = tuner.dataframe()

# Always rebind `df` to tuning results, so that an empty result set cannot
# leave `df` pointing at a DataFrame assigned by an earlier cell.
df = full_df
if len(full_df) > 0:
    # Keep only the jobs that reported a finite objective value.
    df = full_df[full_df["FinalObjectiveValue"] > -float("inf")]

if len(df) > 0:
    df = df.sort_values("FinalObjectiveValue", ascending=False)
    print("Number of training jobs with valid objective: %d" % len(df))
    print({"lowest": min(df["FinalObjectiveValue"]), "highest": max(df["FinalObjectiveValue"])})
    # None means "no limit"; the old -1 sentinel is rejected by current pandas.
    pd.set_option("display.max_colwidth", None)  # Don't truncate TrainingJobName
else:
    print("No training jobs have reported valid results yet.")

df

In [None]:
# Bokeh imports for the interactive tuning-result plots.
import bokeh
import bokeh.io
import bokeh.layouts  # imported explicitly because bokeh.layouts.Column is used to stack the per-hyperparameter figures
from bokeh.models import HoverTool
from bokeh.plotting import figure, show

# Send Bokeh output to the notebook.
bokeh.io.output_notebook()


class HoverHelper:
    """Build Bokeh tool lists whose hover tooltips describe tuning-job training runs."""

    def __init__(self, tuning_analytics):
        """Keep the analytics object whose ``tuning_ranges`` name the tuned hyperparameters."""
        self.tuner = tuning_analytics

    def hovertool(self):
        """Return a HoverTool showing the objective value, the job name and every tuned hyperparameter."""
        fixed_fields = [
            ("FinalObjectiveValue", "@FinalObjectiveValue"),
            ("TrainingJobName", "@TrainingJobName"),
        ]
        # One tooltip row per tuned hyperparameter, looked up by column name.
        range_fields = [(name, "@{%s}" % name) for name in self.tuner.tuning_ranges]
        return HoverTool(tooltips=fixed_fields + range_fields)

    def tools(self, standard_tools="pan,crosshair,wheel_zoom,zoom_in,zoom_out,undo,reset"):
        """Return the hover tool followed by the standard tool string, as a list."""
        return [self.hovertool(), standard_tools]


hover = HoverHelper(tuner)

# Objective value of each training job against its start time.
# `width`/`height` replace the `plot_width`/`plot_height` keywords, which
# newer Bokeh releases no longer accept.
p = figure(width=900, height=400, tools=hover.tools(), x_axis_type="datetime")
p.circle(source=df, x="TrainingStartTime", y="FinalObjectiveValue")
show(p)

In [None]:
import bokeh.layouts  # explicit import: bokeh.layouts.Column is used at the end of this cell


def is_num(x):
    """Return True when ``x`` can be converted to a float, False otherwise."""
    try:
        float(x)
    except (TypeError, ValueError, OverflowError):
        return False
    return True


# Look up the objective metric name of the tuning job; it labels the y axis.
tuning_job_result = smclient.describe_hyper_parameter_tuning_job(
    HyperParameterTuningJobName=tuning_job_name
)
objective_name = tuning_job_result['HyperParameterTuningJobConfig']['HyperParameterTuningJobObjective']["MetricName"]

# One figure per tuned hyperparameter: objective value against that parameter.
ranges = tuner.tuning_ranges
figures = []
for hp_name, hp_range in ranges.items():
    categorical_args = {}
    vals = hp_range.get("Values")
    if vals:
        # This is marked as categorical.  Check if all options are actually numbers.
        if all(is_num(x) for x in vals):
            # Bokeh has issues plotting a "categorical" range that's actually numeric, so plot as numeric
            print("Hyperparameter %s is tuned as categorical, but all values are numeric" % hp_name)
        else:
            # Set up extra options for plotting categoricals.  A bit tricky when they're actually numbers.
            categorical_args["x_range"] = vals

    # Now plot it. `width`/`height` replace the `plot_width`/`plot_height`
    # keywords, which newer Bokeh releases no longer accept.
    p = figure(
        width=500,
        height=500,
        title="Objective vs %s" % hp_name,
        tools=hover.tools(),
        x_axis_label=hp_name,
        y_axis_label=objective_name,
        **categorical_args,
    )
    p.circle(source=df, x=hp_name, y="FinalObjectiveValue")
    figures.append(p)
show(bokeh.layouts.Column(*figures))

# 8. Model Deployment

In [None]:
# Load the analytics for the tuning job, then pull the best training job's
# summary from the job description and the per-job results table.
results = sagemaker.analytics.HyperparameterTuningJobAnalytics(tuning_job_name)
best_training_job_summary = results.description()["BestTrainingJob"]
results_df = results.dataframe()


In [None]:
# Display the "BestTrainingJob" entry taken from the tuning job description.
best_training_job_summary

In [None]:
# Attach to an existing hyperparameter tuning job.
tuning_job_details = smclient.describe_hyper_parameter_tuning_job(
    HyperParameterTuningJobName=tuning_job_name
)
xgb_tuner = HyperparameterTuner.attach(
    tuning_job_name,
    job_details=tuning_job_details,
    sagemaker_session=sagemaker.Session(),
    estimator_cls=None,
)

# Get the best XGBoost training job name from the HPO job
xgb_best_training_job = xgb_tuner.best_training_job()
print(xgb_best_training_job)
# Attach estimator to the best training job name
best_estimator = sagemaker.estimator.Estimator.attach(xgb_best_training_job)

# Create model to be passed to the inference pipeline.
# predictor_cls is set so that deploy() returns a Predictor bound to the new
# endpoint; without it Model.deploy() returns None and `predictor` is unusable.
best_model = sagemaker.model.Model(
    model_data=best_estimator.model_data,
    role=sagemaker.get_execution_role(),
    image_uri=best_estimator.image_uri,
    predictor_cls=sagemaker.predictor.Predictor,
)

# Create a real-time endpoint backed by a single ml.m5.large instance.
predictor = best_model.deploy(initial_instance_count=1, instance_type="ml.m5.large")