## Dataset

The dataset used is the UCI Bank Marketing dataset (`bank-additional`), downloaded in the Data Acquisition section below.

This notebook is based on the following tutorials on automatic model tuning with the bank dataset:
- https://sagemaker-examples.readthedocs.io/en/latest/hyperparameter_tuning/xgboost_random_log/hpo_xgboost_random_log.html
- Old tutorial https://docs.aws.amazon.com/en_jp/sagemaker/latest/dg/automatic-model-tuning-ex-data.html


TODO document the dataset



# Session initialisation

In [3]:
import sys
!{sys.executable} -m pip install "sagemaker>=2.121.0"


[0m

In [None]:
import boto3
import sagemaker
from sagemaker.workflow.pipeline_context import PipelineSession

# Create the SageMaker session shared by the rest of the notebook and read its settings.
sagemaker_session = sagemaker.session.Session()
region = sagemaker_session.boto_region_name  # region used for the container image lookup
role = sagemaker.get_execution_role()  # execution role handed to the Estimator
default_bucket = sagemaker_session.default_bucket()  # bucket name used to build the S3 URIs


# Parameters

In [None]:
from time import gmtime, strftime
import time

# UTC timestamp (yymmddHHMM) that distinguishes this run's local folder and S3 prefix.
run_id = strftime("%y%m%d%H%M", gmtime())

# Naming components combined below into job names and storage prefixes.
stage_prefix, project_prefix, variant_prefix = "L", "bank", "autotune"

In [None]:
# Short prefix: variant + run id. Long prefix: stage + project + short prefix.
job_prefix_short = "/".join((variant_prefix, run_id))
job_prefix_long = "/".join((stage_prefix, project_prefix, job_prefix_short))

In [None]:
print(f"{job_prefix_short=}")
print(f"{job_prefix_long=}")

In [None]:
import os
# Local working folder for this run.
base_folder = os.path.join("./generated", job_prefix_short)

# S3 locations: one prefix for this run's datasets, one shared prefix for job output.
bucket_root = f"s3://{default_bucket}"
base_uri = f"{bucket_root}/{job_prefix_long}"
base_uri_for_jobs = f"{bucket_root}/{stage_prefix}-jobs"

In [None]:
print(f"{base_uri=}")
print(f"{base_uri_for_jobs=}")

# Data Acquisition

In [None]:
data_source_uri = "https://archive.ics.uci.edu/ml/machine-learning-databases/00222/bank-additional.zip"

In [None]:
import os
from io import BytesIO
from zipfile import ZipFile
from urllib.request import urlopen

data_folder = os.path.join(base_folder, "data")

raw_data_folder = os.path.join(data_folder, "raw")
os.makedirs(raw_data_folder, exist_ok=True)

# Download the whole archive into memory, then unpack it into the raw data folder.
# The context managers close the HTTP response and the zip handle even when the
# download or the extraction fails.
with urlopen(data_source_uri) as response:
    archive_bytes = BytesIO(response.read())

with ZipFile(archive_bytes) as source_zip:
    source_zip.extractall(raw_data_folder)

In [None]:
import pandas as pd
pd.set_option('display.max_columns', 500)     # Make sure we can see all of the columns
pd.set_option('display.max_rows', 10)         # Keep the output on one page

In [None]:
dataset_folder = os.path.join(raw_data_folder, "bank-additional")

In [None]:
os.listdir(dataset_folder)

In [None]:
raw_dataset_path = os.path.join(dataset_folder, "bank-additional-full.csv")
df = pd.read_csv(raw_dataset_path, sep=';')

In [None]:
df.head(10)

# Feature Engineering

In [None]:
import numpy as np

# Indicator: 1 when pdays takes the value 999, 0 otherwise.
df['no_previous_contact'] = np.where(df['pdays'] == 999, 1, 0)

# Indicator: 1 for individuals not actively employed (student, retired, unemployed).
# Series.isin replaces np.in1d, which is deprecated in recent NumPy releases.
df['not_working'] = np.where(df['job'].isin(['student', 'retired', 'unemployed']), 1, 0)

# Convert categorical variables to sets of 0/1 indicator columns.
# The dtype is pinned because pandas >= 2.0 defaults to bool, which would be written
# to CSV as True/False instead of numeric 1/0.
model_data = pd.get_dummies(df, dtype=np.uint8)

In [None]:
model_data.head(10)

In [None]:
# Remove the columns that are not used as model features.
# NOTE(review): this rebinds model_data, so running the cell a second time raises a
# KeyError because the columns are already gone.
model_data = model_data.drop(
    columns=[
        'duration',
        'emp.var.rate',
        'cons.price.idx',
        'cons.conf.idx',
        'euribor3m',
        'nr.employed',
    ]
)

In [None]:
model_data.head(10)

In [None]:
# Shuffle once with a fixed seed, then cut into 70% train / 20% validation / 10% test.
# Positional slicing is used instead of np.split, which on a DataFrame goes through the
# deprecated DataFrame.swapaxes; the split boundaries are unchanged.
shuffled_data = model_data.sample(frac=1, random_state=1729)
train_end = int(0.7 * len(model_data))
validation_end = int(0.9 * len(model_data))

train_data = shuffled_data.iloc[:train_end]
validation_data = shuffled_data.iloc[train_end:validation_end]
test_data = shuffled_data.iloc[validation_end:]


In [None]:
def _label_first(frame):
    """Return `frame` with `y_yes` as the first column and `y_no`/`y_yes` removed from the rest."""
    features = frame.drop(['y_no', 'y_yes'], axis=1)
    return pd.concat([frame['y_yes'], features], axis=1)


train_data_final = _label_first(train_data)
validation_data_final = _label_first(validation_data)
test_data_final = _label_first(test_data)

In [None]:
def save_dataset(df, prefix):
    """Write `df` to `<data_folder>/<prefix>/<prefix>.csv`, creating the folder if needed.

    The CSV is written without the index column and with a header row.
    """
    # NOTE(review): the SageMaker built-in XGBoost documentation says CSV training input
    # must not contain a header record - confirm whether header=True is intended here
    # (the read-back cell further down relies on the header being present).
    target_folder = os.path.join(data_folder, f"{prefix}")
    os.makedirs(target_folder, exist_ok=True)
    df.to_csv(os.path.join(target_folder, f"{prefix}.csv"), index=False, header=True)


for split_name, split_frame in (
    ("train", train_data_final),
    ("validation", validation_data_final),
    ("test", test_data_final),
):
    save_dataset(split_frame, split_name)

In [None]:
os.listdir(os.path.join(data_folder, "train"))

In [None]:
os.listdir(os.path.join(data_folder, "validation"))

In [None]:
os.listdir(os.path.join(data_folder, "test"))

In [None]:
# Upload the three local CSV files to S3.

def upload_dataset(prefix):
    """Upload `<data_folder>/<prefix>/<prefix>.csv` to `<base_uri>/<prefix>`.

    Returns the value returned by `S3Uploader.upload`, which the notebook uses
    as the S3 URI of the uploaded dataset.
    """
    return sagemaker.s3.S3Uploader.upload(
        local_path=os.path.join(data_folder, f"{prefix}/{prefix}.csv"),
        desired_s3_uri=f"{base_uri}/{prefix}",
    )


train_data_uri, validation_data_uri, test_data_uri = (
    upload_dataset(split_name) for split_name in ("train", "validation", "test")
)

In [None]:
print(train_data_uri)

In [None]:
# Read the uploaded training set back from S3 as a sanity check.
# Only `train_df` is bound here: `df` holds the raw dataset used by the feature
# engineering cells and must not be overwritten, otherwise re-running them fails.
train_df = pd.read_csv(train_data_uri)
train_df.head(10)

In [None]:
train_df.describe()

TODO: simpler version
#xgbDF['target'] = xgbDF['target'].map(diagnoses) # re-mapping


# Training parameters

In [None]:
# Compute resources requested for the training jobs.
instance_type = "ml.c4.2xlarge"  # alternative: "ml.m5.large"
instance_count = 2  # number of instances per training job

# Training Job

API Reference
- https://docs.aws.amazon.com/sagemaker/latest/dg/automatic-model-tuning-ex-tuning-job.html
- https://docs.aws.amazon.com/sagemaker/latest/dg/automatic-model-tuning-define-metrics.html

Examples: 
- https://aws.plainenglish.io/tuning-your-model-hyperparameters-with-aws-sagemaker-ba4071ad0792

TODO
    "StoppingCondition": {
      "MaxRuntimeInSeconds": 43200
    }
}

In [None]:
# run_id is left out of the base job name because a timestamp is added to it
training_job_name = f"{stage_prefix}-{project_prefix}-{variant_prefix}"

In [None]:
print(f"{training_job_name=}")

In [None]:
from sagemaker.estimator import Estimator
from sagemaker.image_uris import retrieve as model_image

In [None]:
# Look up the XGBoost container image for this region and instance type.
# `model_image` is the alias under which sagemaker.image_uris.retrieve was imported.
image_uri = model_image(
    framework="xgboost",
    version="1.0-1",
    py_version="py3",
    region=region,
    instance_type=instance_type,
)

API Reference for SageMaker Training Estimator 
- https://sagemaker.readthedocs.io/en/stable/api/training/estimators.html
API Reference for XGBoost hyperparameters
- https://docs.aws.amazon.com/sagemaker/latest/dg/xgboost_hyperparameters.html
- https://github.com/dmlc/xgboost/blob/master/doc/parameter.rst#learning-task-parameters


In [None]:
# Estimator for the XGBoost container: what to run, under which role, on which
# instances, and where the job output is written.
xbg_train_estimator = Estimator(
    image_uri=image_uri,
    role=role,
    instance_type=instance_type,
    instance_count=instance_count,
    base_job_name=training_job_name,
    output_path=base_uri_for_jobs,
    sagemaker_session=sagemaker_session,
)

In [None]:
# Static hyperparameters for the baseline training job.
# NOTE(review): rate_drop and tweedie_variance_power do not look related to a
# binary:logistic objective - confirm against the XGBoost parameter reference
# whether they have any effect here.
xbg_train_estimator.set_hyperparameters(
    objective="binary:logistic",
    eval_metric = "auc",
    num_round=100,
    rate_drop=0.3,
    tweedie_variance_power=1.4
)
# No prior knowledge of good values: the estimator uses the values from the tutorial.

In [None]:
# TrainingInput must be imported before its first use: the only other import of it
# sits in a later cell, so this cell would raise NameError on a fresh kernel.
from sagemaker.inputs import TrainingInput

# Input channel for the training set stored on S3 as CSV.
train_input = TrainingInput(
    train_data_uri,
    content_type="csv"
)

# Input channel for the validation set stored on S3 as CSV.
validation_input = TrainingInput(
    validation_data_uri,
    content_type="csv"
)


In [None]:
%%time

# Launch the baseline training job on the "train" and "validation" channels.
# NOTE(review): Estimator.fit() presumably returns None, so `train_job` would not hold
# a job handle - confirm; the job is read via latest_training_job further down.
train_job = xbg_train_estimator.fit(
    inputs={
        "train": train_input,
        "validation": validation_input
    }
)

## Analyse training result

In [None]:
analytics = xbg_train_estimator.training_job_analytics

In [None]:
analytics.dataframe()

In [None]:
# Name of the job that was just run, and the location of its model artifact.
# Note: this rebinds training_job_name from the base name to the actual job name.
latest_job = xbg_train_estimator.latest_training_job
training_job_name = latest_job.job_name
model_uri = "/".join(
    [xbg_train_estimator.output_path, training_job_name, "output", "model.tar.gz"]
)

In [None]:
print(f"{training_job_name=}")
print(f"{model_uri=}")

TODO
download model

# Tuning job

API Reference
- https://sagemaker.readthedocs.io/en/stable/api/training/tuner.html

XGBoost Tuning Examples
- https://sagemaker-examples.readthedocs.io/en/latest/hyperparameter_tuning/xgboost_random_log/hpo_xgboost_random_log.html


The `HyperparameterTuner` class searches for the best hyperparameter values by launching a series of training jobs. Here it uses the Bayesian search strategy, which treats the choice of the next values to try as a regression problem over the results of the jobs already run.

TODO
tuning_job_config = {
    "RandomSeed" : 123
  }

In [None]:
# `job_name` was never defined, so the name is built from the same prefixes as the
# training job. run_id is included because an explicit tuning job name gets no
# automatic timestamp and has to be unique. The result is 29 characters long, which
# fits the 32-character limit on tuning job names.
tuning_job_name = f"{stage_prefix}-{project_prefix}-{variant_prefix}-{run_id}-at"

In [None]:
print(f"{tuning_job_name=}")

Instead of passing a fixed value for each hyperparameter, the search grid defines the range of values to be tested for each of them.



In [None]:
from sagemaker.tuner import ContinuousParameter, IntegerParameter

# Ranges explored by the tuner, keyed by XGBoost hyperparameter name.
tuner_search_grid = dict(
    eta=ContinuousParameter(0, 1),
    alpha=ContinuousParameter(0, 2),
    min_child_weight=ContinuousParameter(1, 10),
    max_depth=IntegerParameter(1, 10),
)

In [None]:
from sagemaker.tuner import HyperparameterTuner

# Fixed (non-tuned) hyperparameters: the objective and the evaluation metric the
# training jobs report. Values set by earlier set_hyperparameters calls are kept.
xbg_train_estimator.set_hyperparameters(
    objective="binary:logistic",
    eval_metric="auc",
    num_round=100,
    rate_drop=0.3,
)

# Tuner: maximize validation AUC with Bayesian search over the grid, running at most
# 10 training jobs, 2 at a time, with automatic early stopping.
hyperparam_tuner = HyperparameterTuner(
    estimator=xbg_train_estimator,
    hyperparameter_ranges=tuner_search_grid,
    objective_metric_name="validation:auc",
    objective_type="Maximize",
    strategy="Bayesian",
    early_stopping_type="Auto",
    max_jobs=10,
    max_parallel_jobs=2,
)

- `estimator` is the training estimator based on XGBoost.
- `objective_metric_name` and `objective_type`, together with `eval_metric` in the training job, set which metric the tuning job uses to evaluate the validation data during the process and whether it should minimize or maximize it. Here: "Maximize" AUC.
- `early_stopping_type` is set to "Auto" so that a training job can be stopped before it completes when it clearly will not reach a higher performance than the previously trained models.
- `max_jobs` and `max_parallel_jobs` set a maximum of 10 training jobs in total, of which 2 may run in parallel.


In [None]:
%%time

from sagemaker.inputs import TrainingInput

# Start the tuning job under the explicit name, using the same train and validation
# channels as the baseline training job.
# wait=False is passed, so presumably the call returns before the tuning job has
# finished - the result cells further down may need to be re-run later.
hyperparam_tuner.fit(
    wait = False,
    job_name = tuning_job_name,
    inputs = {
        "train": train_input, 
        "validation": validation_input
    }
)

TODO: wait for the tuning job to end before analysing the results

##  Tuning result analysis

In [None]:
from pprint import pprint

In [None]:
description = hyperparam_tuner.describe()

In [None]:
pprint(description)

In [None]:
pprint(description['HyperParameterTuningJobName'])

In [None]:
pprint(description['HyperParameterTuningJobStatus'])

In [None]:
pprint(description['ObjectiveStatusCounters'])

In [None]:
pprint(description['TrainingJobStatusCounters'])

In [None]:
# pprint() returns None, so the value is stored first and printed afterwards.
# .get() is used because the key is presumably absent until a training job has
# completed (confirm); in that case None is shown instead of raising a KeyError.
best_training_job = description.get('BestTrainingJob')
pprint(best_training_job)

In [None]:
# NOTE(review): best_training_job() presumably returns the name of the best training
# job, not an Estimator object, which would make this variable name misleading -
# confirm against the SageMaker SDK tuner reference.
best_estimator = hyperparam_tuner.best_training_job()

In [None]:
best_estimator 

## Tuner data Analysis

API Reference Analytics
- https://sagemaker.readthedocs.io/en/stable/api/training/analytics.html

In [None]:
analytics = hyperparam_tuner.analytics()

In [None]:
analytics

In [None]:
analytics.description()

In [None]:
analytics.training_job_summaries()

In [None]:
analytics.dataframe()

# TODO register Best candidate

# TODO cleanup