# Bank Marketing using AutoML

## Notebook Description

**Dataset Reference:** https://archive.ics.uci.edu/ml/datasets/bank+marketing

**Type of problem:** Classification

**Type of solution:** AutoML - generation of an ensembling model

**Stack:**
- pandas, numpy 
- SageMaker AutoML/AutoPilot
- Studio's prebuilt image DataScience 3.0 (conda) and XGBoost Stack

**Steps:**
- download data
- prepare the data: the features are left untouched, but the file is re-saved as a comma-separated CSV (a requirement of AutoML)
- upload the full dataset to S3 
- configure the automl job and run it 
- get the run information and analytics
- download the model and its reports (explainability and model insights)
- delete the data and output generated on S3


# Session initialisation

In [3]:
import sys
!{sys.executable} -m pip install "sagemaker>=2.121.0"


[0m

In [None]:
import boto3
import sagemaker
from sagemaker.workflow.pipeline_context import PipelineSession

# Session, region, execution role and default bucket are resolved once here
# and reused by the later cells.
sagemaker_session = sagemaker.session.Session()
region = sagemaker_session.boto_region_name
# NOTE(review): get_execution_role() presumably needs to run inside SageMaker
# (Studio / notebook instance) — confirm before running this elsewhere.
role = sagemaker.get_execution_role()
# Bucket used for the dataset upload, the AutoML job output and the S3 cleanup.
default_bucket = sagemaker_session.default_bucket()

# Parameters

In [None]:
from time import gmtime, strftime
import time

# Naming components combined below into the local folder, the S3 prefixes
# and the AutoML job name.
stage_prefix, project_prefix, variant_prefix = "L", "bank", "automl"

# UTC timestamp (yymmddHHMM) identifying this notebook run.
run_id = strftime("%y%m%d%H%M", gmtime())

In [None]:
# Run-scoped prefix: <variant>/<run id>
job_prefix_short = "/".join((variant_prefix, run_id))
# Fully qualified prefix: <stage>/<project>/<variant>/<run id>
job_prefix_long = "/".join((stage_prefix, project_prefix, job_prefix_short))

In [None]:
print(f"{job_prefix_short=}")
print(f"{job_prefix_long=}")

In [None]:
import os
# Local working directory for this run (downloaded data, prepared data, model artifacts).
base_folder = os.path.join("./generated", job_prefix_short)
# Run-specific S3 location that receives the uploaded input dataset.
base_uri = f"s3://{default_bucket}/{job_prefix_long}"
# S3 location handed to the AutoML job as its output path. It is keyed by the
# stage only, not by the run id, so it is shared by every run of this notebook.
base_uri_for_jobs = f"s3://{default_bucket}/{stage_prefix}-jobs"

In [None]:
print(f"{base_folder=}")
print(f"{base_uri=}")
print(f"{base_uri_for_jobs=}")

# Data Acquisition

In [None]:
data_source_uri = "https://archive.ics.uci.edu/ml/machine-learning-databases/00222/bank-additional.zip"

In [None]:
import os
from io import BytesIO
from zipfile import ZipFile
from urllib.request import urlopen

# Local layout: <base_folder>/data/raw receives the extracted archive.
data_folder = os.path.join(base_folder, "data")
raw_data_folder = os.path.join(data_folder, "raw")
os.makedirs(raw_data_folder, exist_ok=True)

# Download the archive fully into memory, then unpack it. Both the HTTP
# response and the zip handle are context-managed so they are closed even if
# the download or the extraction fails.
with urlopen(data_source_uri) as response:
    archive_bytes = BytesIO(response.read())

with ZipFile(archive_bytes) as source_zip:
    source_zip.extractall(raw_data_folder)

In [None]:
import pandas as pd
pd.set_option('display.max_columns', 500)     # Make sure we can see all of the columns
pd.set_option('display.max_rows', 10)         # Keep the output on one page

In [None]:
dataset_folder = os.path.join(raw_data_folder, "bank-additional")

In [None]:
os.listdir(dataset_folder)

In [None]:
raw_dataset_path = os.path.join(dataset_folder, "bank-additional-full.csv")
df = pd.read_csv(raw_dataset_path, sep=';')

In [None]:
df.head(10)

In [None]:
# TODO jobout
# TODO job data clean up S3

# Data Preparation

In [None]:
# save as comma separated as AutoML requires comma separated CSV

def save_dataset(df, prefix, base_dir=None):
    """Write ``df`` as a comma-separated CSV and return the file's local path.

    The file is written to ``<base_dir>/<prefix>/<prefix>.csv`` with a header
    row and without the index column. The sub-folder is created if needed.

    Args:
        df: DataFrame to write.
        prefix: Name used for both the sub-folder and the CSV file.
        base_dir: Directory under which the sub-folder is created. When
            omitted, the notebook-level ``data_folder`` is used.

    Returns:
        Path of the CSV file that was written.
    """
    if base_dir is None:
        # Looked up at call time so the notebook-level folder is honoured.
        base_dir = data_folder
    name = f"{prefix}"
    target_folder = os.path.join(base_dir, name)
    os.makedirs(target_folder, exist_ok=True)
    # Join the components separately instead of embedding a "/" in the name.
    local_filename = os.path.join(target_folder, f"{name}.csv")
    df.to_csv(local_filename, index=False, header=True)
    return local_filename

input_dataset_path = save_dataset(df, "input")

In [None]:
# upload
input_data_uri = sagemaker.s3.S3Uploader.upload(
        local_path=input_dataset_path,
        desired_s3_uri=f"{base_uri}/input"
    )

In [None]:
print(input_data_uri)

In [None]:
# Read the uploaded CSV back from its S3 URI and preview the first rows.
# NOTE(review): the chained assignment also rebinds `df` to the re-read copy —
# confirm that is intended.
# NOTE(review): reading an s3:// URI with pandas presumably needs s3fs in the
# kernel — confirm.
input_df = df = pd.read_csv(input_data_uri)
input_df.head(10)

## Training Parameters

API Reference
- https://sagemaker.readthedocs.io/en/stable/api/training/automl.html

*Parameters*
- max_candidates (int) – The maximum number of times a training job is allowed to run. Sensible values are over 10.
- total_job_runtime_in_seconds (int) – the total wait time of an AutoML job in seconds. Sensible values are over 1800.


In [None]:
max_candidates = 10
validation_fraction = 0.2
total_job_runtime_in_seconds = 1800

label_column = "y"

## AutoML Training

In [None]:
job_name = job_prefix_long.replace("/","-")

In [None]:
print(f"{job_name=}")

In [None]:
from sagemaker.automl.automl import AutoML
from sagemaker.automl.automl import AutoMLInput
from sagemaker.workflow.automl_step import AutoMLStep

 
# AutoML job definition: results are written under base_uri_for_jobs, and the
# job is bounded by both a candidate count and a total runtime.
automl = AutoML(
    role=role,
    output_path=base_uri_for_jobs,
    target_attribute_name=label_column,
    validation_fraction=validation_fraction,
    sagemaker_session=sagemaker_session,
    total_job_runtime_in_seconds=total_job_runtime_in_seconds, 
    max_candidates = max_candidates,
    mode="ENSEMBLING"
)

# Training input: the CSV uploaded to S3, declared with the same label column
# as the job definition above.
input_raw = AutoMLInput(
    inputs=input_data_uri,    
    target_attribute_name=label_column
)

In [None]:
%%time

# Launch the AutoML job on the uploaded dataset under the run-specific job name.
# NOTE(review): `step_args` looks like a leftover name from a pipeline-step
# variant of this code; no later cell reads it — confirm before relying on
# the return value of fit().
step_args = automl.fit(
    inputs=[input_raw],
    job_name=job_name
)


In [None]:
# TODO look up sagemaker_session.logs_for_job(job_name, wait=False, poll=10, log_type='All')

# Inspect the best candidate

In [None]:
from pprint import pprint

In [None]:
#automl.__dict__

In [None]:
#automl.latest_auto_ml_job.__dict__

In [None]:
best_candidate = automl.best_candidate()

In [None]:
#pprint(best_candidate)

In [None]:
print(best_candidate['CandidateName'])

In [None]:
print(best_candidate['InferenceContainers'][0]['ModelDataUrl'])

In [None]:
pprint(best_candidate['FinalAutoMLJobObjectiveMetric'])

In [None]:
pprint(best_candidate['CandidateProperties']['CandidateArtifactLocations'])

In [None]:
#pprint(best_candidate['CandidateProperties']['CandidateMetrics'])

In [None]:
pprint([ (metric['StandardMetricName'], metric['Value']) 
          for metric in best_candidate['CandidateProperties']['CandidateMetrics']
       ])

In [None]:
#pprint(automl.list_candidates())

In [None]:
# TODO describe

In [None]:
# Describe the AutoML job first, then display the result: the name has to be
# bound before it can be shown when the notebook runs top to bottom.
analytics = automl.describe_auto_ml_job()
analytics

# Export best candidate

In [None]:
from sagemaker.s3 import S3Downloader

def _download_artifact(s3_uri, local_folder):
    """Create ``local_folder`` if needed and download ``s3_uri`` into it."""
    os.makedirs(local_folder, exist_ok=True)
    S3Downloader.download(s3_uri=s3_uri,
                          local_path=local_folder,
                          sagemaker_session=sagemaker_session)


# Everything exported for the best candidate goes under <base_folder>/model.
model_output_folder = os.path.join(base_folder, "model")
candidate_artifacts = best_candidate['CandidateProperties']['CandidateArtifactLocations']

# Explainability report
explainability_folder = os.path.join(model_output_folder, "explainability")
explainability_uri = candidate_artifacts['Explainability']
_download_artifact(explainability_uri, explainability_folder)

# Model insights report
model_insight_folder = os.path.join(model_output_folder, "model-insight")
model_insights_uri = candidate_artifacts['ModelInsights']
_download_artifact(model_insights_uri, model_insight_folder)

# Model data of the first inference container; the extraction cell that
# follows looks for model.tar.gz inside model_data_folder.
model_data_folder = os.path.join(model_output_folder, "model-data")
model_data_uri = best_candidate['InferenceContainers'][0]['ModelDataUrl']
_download_artifact(model_data_uri, model_data_folder)

print(f"{model_output_folder=}")

In [None]:
import tarfile

# Unpack the downloaded model archive in place, inside model_data_folder.
tar_filename = os.path.join(model_data_folder, "model.tar.gz")
# NOTE(review): extractall() runs without a member filter; presumably fine
# because the archive is this job's own output — confirm.
with tarfile.open(tar_filename, 'r') as archive:
    archive.extractall(model_data_folder)
    
print(f"{tar_filename=}")

# Cleanup

In [None]:
from pprint import pprint

## Delete local data

In [None]:
import shutil
# Best-effort removal of everything generated locally for this run: an
# OSError is reported on stdout and does not stop the notebook.
try:
    shutil.rmtree(base_folder)
except OSError as err:
    print(f"Error: {err.filename} - {err.strerror}.")

## Delete S3 data

In [None]:
# TODO exception management
s3 = boto3.Session(region_name = region).resource('s3')
bucket = s3.Bucket(default_bucket)
collection_to_be_deleted = bucket.object_versions.filter(Prefix=f"{job_prefix_long}")

In [None]:
pprint(list(collection_to_be_deleted.all()))

In [None]:
deleted_keys = collection_to_be_deleted.delete()

In [None]:
# Each element is one batch response. A batch in which nothing was deleted may
# carry no 'Deleted' entry, so fall back to an empty list instead of raising.
pprint([element.get('Deleted', []) for element in deleted_keys])

# TODO Delete experiment and job

# TODO

In [None]:
step_auto_ml_training

## Define a Create Model Step to Create a Model

In order to perform batch transformation using the example model, create a SageMaker model.


In [None]:
from sagemaker.workflow.model_step import ModelStep

# NOTE(review): `step_auto_ml_training`, `pipeline_session` and
# `instance_type_param` are not defined anywhere in the visible notebook, so
# this cell cannot run as-is — confirm where they should come from.
best_auto_ml_model = step_auto_ml_training.get_best_auto_ml_model(
    role, 
    sagemaker_session=pipeline_session
)

# Arguments for creating a model from the best AutoML candidate.
step_args_create_model = best_auto_ml_model.create(
    instance_type=instance_type_param
)

# Wrap the create-model arguments in a step named "AutoMLCreateModel".
step_create_model = ModelStep(
    name="AutoMLCreateModel", 
    step_args=step_args_create_model
) 

## Define a Register Model Step to Create a Model Package


In [None]:
from sagemaker.model_metrics import ModelMetrics, MetricsSource

# Point the model metrics at the best candidate's model-insights and
# explainability JSON reports.
# NOTE(review): `step_auto_ml_training` is not defined anywhere in the visible
# notebook — confirm where it should come from.
model_metrics = ModelMetrics(
    model_statistics=MetricsSource(
        s3_uri=step_auto_ml_training.properties.BestCandidateProperties.ModelInsightsJsonReportPath,
        content_type="application/json",
    ),
    explainability=MetricsSource(
        s3_uri=step_auto_ml_training.properties.BestCandidateProperties.ExplainabilityJsonReportPath,
        content_type="application/json",
    ),
)

# Registration arguments: CSV in and out, with the metrics defined above.
# NOTE(review): `best_auto_ml_model`, `model_package_group_name` and
# `model_approval_status` must already exist when this cell runs; the last two
# are not defined in the visible notebook — confirm.
register_args = best_auto_ml_model.register(
    content_types=["text/csv"],
    response_types=["text/csv"],
    inference_instances=["ml.t2.medium", "ml.m5.xlarge"],
    transform_instances=["ml.m5.xlarge"],
    model_package_group_name=model_package_group_name,
    approval_status=model_approval_status,
    model_metrics=model_metrics
)

# Wrap the registration arguments in a step named "AutoMLRegisterModel".
step_register = ModelStep(
    name="AutoMLRegisterModel", 
    step_args=register_args
)