# Dataset

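The dataset used here is the UCI Bank Marketing dataset (`bank-additional`), which is downloaded from the UCI Machine Learning Repository in the *Data Acquisition* section.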

Based on this tutorial:
https://docs.aws.amazon.com/en_jp/sagemaker/latest/dg/automatic-model-tuning-ex-data.html

TODO document the dataset

# Session initialisation

In [2]:
import sys
!{sys.executable} -m pip install "sagemaker>=2.121.0"


[0m

In [3]:
import boto3
import sagemaker
from sagemaker.workflow.pipeline_context import PipelineSession

# Session object that is later passed explicitly to the AutoML estimator and
# to the S3Downloader calls.
sagemaker_session = sagemaker.session.Session()

# Execution role for the jobs, plus the session's default bucket and region,
# which the path/URI cells and the S3 cleanup cells build on.
role = sagemaker.get_execution_role()
default_bucket = sagemaker_session.default_bucket()
region = sagemaker_session.boto_region_name

# Parameters

In [4]:
from time import gmtime, strftime
import time

# UTC timestamp (yymmddHHMM) that keeps the names and paths of each run apart.
run_id = strftime("%y%m%d%H%M", gmtime())

# Naming components combined into job prefixes, local folders and S3 URIs.
stage_prefix, project_prefix, variant_prefix = "L", "bank", "automl"

In [5]:
# Short prefix: variant + run id. Long prefix: stage + project + short prefix.
job_prefix_short = "/".join([variant_prefix, run_id])
job_prefix_long = "/".join([stage_prefix, project_prefix, job_prefix_short])

In [6]:
print(f"{job_prefix_short=}")
print(f"{job_prefix_long=}")

job_prefix_short='automl/2212291610'
job_prefix_long='L/bank/automl/2212291610'


In [7]:
import os
# Local scratch folder for this run (relative to the current working directory).
base_folder = os.path.join("./generated", job_prefix_short)

# S3 locations in the default bucket: a per-stage prefix that receives the
# AutoML job output, and a per-run prefix for this run's own data.
base_uri_for_jobs = f"s3://{default_bucket}/{stage_prefix}-jobs"
base_uri = f"s3://{default_bucket}/{job_prefix_long}"

In [8]:
print(f"{base_folder=}")
print(f"{base_uri=}")
print(f"{base_uri_for_jobs=}")

base_folder='./generated/automl/2212291610'
base_uri='s3://sagemaker-eu-west-1-102959664345/L/bank/automl/2212291610'
base_uri_for_jobs='s3://sagemaker-eu-west-1-102959664345/L-jobs'


# Data Acquisition

In [9]:
data_source_uri = "https://archive.ics.uci.edu/ml/machine-learning-databases/00222/bank-additional.zip"

In [10]:
import os
from io import BytesIO
from zipfile import ZipFile
from urllib.request import urlopen

# Local layout: <base_folder>/data/raw receives the unpacked source archive.
data_folder = os.path.join(base_folder, "data")

raw_data_folder = os.path.join(data_folder, "raw")
os.makedirs(raw_data_folder, exist_ok=True)

# Read the whole archive into memory first: ZipFile needs a seekable stream,
# which the HTTP response is not. The context managers close the HTTP
# connection and the archive handle even if the download or extraction fails.
with urlopen(data_source_uri) as response:
    archive_bytes = BytesIO(response.read())

with ZipFile(archive_bytes) as source_zip:
    source_zip.extractall(raw_data_folder)

In [11]:
import pandas as pd
# Notebook display settings: wide enough to show every column, short enough
# to keep each rendered frame on one page.
pd.options.display.max_columns = 500
pd.options.display.max_rows = 10

In [12]:
dataset_folder = os.path.join(raw_data_folder, "bank-additional")

In [13]:
os.listdir(dataset_folder)

['bank-additional.csv',
 'bank-additional-full.csv',
 '.Rhistory',
 '.DS_Store',
 'bank-additional-names.txt']

In [14]:
raw_dataset_path = os.path.join(dataset_folder, "bank-additional-full.csv")
df = pd.read_csv(raw_dataset_path, sep=';')

In [15]:
df.head(10)

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,duration,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,261,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,149,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8,24,technician,single,professional.course,no,yes,no,telephone,may,mon,380,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
9,25,services,single,high.school,no,yes,no,telephone,may,mon,50,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [16]:
# TODO jobout
# TODO job data clean up S3

# Data Preparation

In [17]:
# save as comma separated as AutoML requires comma separated CSV

def save_dataset(df, prefix):
    """Write ``df`` as a comma-separated CSV under the module-level ``data_folder``.

    The file is stored as ``<data_folder>/<prefix>/<prefix>.csv`` with a header
    row and without the index column; the sub-folder is created when missing.

    Args:
        df: Object exposing ``to_csv`` (a pandas DataFrame in this notebook).
        prefix: Name used for both the sub-folder and the CSV file stem.

    Returns:
        The local path of the written CSV file.
    """
    name = f"{prefix}"
    os.makedirs(os.path.join(data_folder, name), exist_ok=True)

    csv_path = os.path.join(data_folder, f"{name}/{name}.csv")
    df.to_csv(csv_path, header=True, index=False)
    return csv_path

input_dataset_path = save_dataset(df, "input")

In [18]:
# Upload the comma-separated dataset to this run's S3 input prefix; the
# returned value is the S3 URI of the uploaded object. The session is passed
# explicitly so the upload goes through the same session as the S3Downloader
# calls later in the notebook instead of an implicitly created default one.
input_data_uri = sagemaker.s3.S3Uploader.upload(
    local_path=input_dataset_path,
    desired_s3_uri=f"{base_uri}/input",
    sagemaker_session=sagemaker_session,
)

In [19]:
print(input_data_uri)

s3://sagemaker-eu-west-1-102959664345/L/bank/automl/2212291610/input/input.csv


In [20]:
# Read the uploaded CSV back from its S3 URI as a round-trip check.
# Both names are bound to the same frame, so `df` no longer refers to the
# semicolon-separated raw load from here on.
# NOTE(review): reading an s3:// URI with pandas presumably relies on an
# fsspec/s3fs installation in the kernel — confirm for new environments.
df = pd.read_csv(input_data_uri)
input_df = df
input_df.head(10)

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,duration,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,261,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,149,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8,24,technician,single,professional.course,no,yes,no,telephone,may,mon,380,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
9,25,services,single,high.school,no,yes,no,telephone,may,mon,50,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


## Training Parameters

API Reference
- https://sagemaker.readthedocs.io/en/stable/api/training/automl.html

*Parameters*
- max_candidates (int) – The maximum number of times a training job is allowed to run. Sensible values are over 10.
- total_job_runtime_in_seconds (int) – the total wait time of an AutoML job in seconds. Sensible values are over 1800.


In [21]:
# Column of the dataset used as the AutoML target attribute.
label_column = "y"

# Job limits and validation split, handed to the AutoML estimator constructor.
total_job_runtime_in_seconds = 1800
validation_fraction = 0.2
max_candidates = 10

## AutoML Training

In [22]:
# Flatten the long prefix into a single dash-separated job name
# (presumably because "/" is not accepted in SageMaker job names — TODO confirm).
job_name = "-".join(job_prefix_long.split("/"))

In [23]:
print(f"{job_name=}")

job_name='L-bank-automl-2212291610'


In [24]:
from sagemaker.automl.automl import AutoML
from sagemaker.automl.automl import AutoMLInput
from sagemaker.workflow.automl_step import AutoMLStep

 
# AutoML estimator: predicts the label column under the configured candidate
# and runtime limits, and writes job output below base_uri_for_jobs.
automl = AutoML(
    role=role,
    sagemaker_session=sagemaker_session,
    output_path=base_uri_for_jobs,
    target_attribute_name=label_column,
    mode="ENSEMBLING",
    max_candidates=max_candidates,
    total_job_runtime_in_seconds=total_job_runtime_in_seconds,
    validation_fraction=validation_fraction,
)

# Input channel: the comma-separated CSV previously uploaded to S3.
input_raw = AutoMLInput(
    inputs=input_data_uri,
    target_attribute_name=label_column,
)

In [25]:
%%time

# Start the AutoML job under the explicit job_name. The recorded cell output
# shows the call returning only after the job had finished (~30 min wall time).
# NOTE(review): the name `step_args` looks like a leftover from a pipeline
# version of this notebook; its value is not used by any visible cell.
step_args = automl.fit(
    inputs=[input_raw],
    job_name=job_name
)


....................................................................................................................................................................................
..Done
CPU times: user 1.04 s, sys: 108 ms, total: 1.15 s
Wall time: 30min 20s


In [26]:
# TODO look up sagemaker_session.logs_for_job(job_name, wait=False, poll=10, log_type='All')

# Inspect the best candidate

In [27]:
from pprint import pprint

In [28]:
#automl.__dict__

In [29]:
#automl.latest_auto_ml_job.__dict__

In [30]:
best_candidate = automl.best_candidate()

In [31]:
#pprint(best_candidate)

In [32]:
print(best_candidate['CandidateName'])

WeightedEnsemble-L2-FULL-t3102959664345L-bank-automl-2212291610


In [33]:
print(best_candidate['InferenceContainers'][0]['ModelDataUrl'])

s3://sagemaker-eu-west-1-102959664345/L-jobs/L-bank-automl-2212291610/sagemaker-automl-candidates/model/WeightedEnsemble-L2-FULL-t3/model.tar.gz


In [34]:
pprint(best_candidate['FinalAutoMLJobObjectiveMetric'])

{'MetricName': 'F1', 'Type': 'Maximize', 'Value': 0.6684210300445557}


In [35]:
pprint(best_candidate['CandidateProperties']['CandidateArtifactLocations'])

{'Explainability': 's3://sagemaker-eu-west-1-102959664345/L-jobs/L-bank-automl-2212291610/documentation/explainability/output/L-bank-automl-2212291610-t3-1-4feebddbd584449fbfd034d7fe740cdf2',
 'ModelInsights': 's3://sagemaker-eu-west-1-102959664345/L-jobs/L-bank-automl-2212291610/documentation/model_monitor/output/WeightedEnsemble-L2-FULL-t3102959664345L-bank-automl-2212291610'}


In [36]:
#pprint(best_candidate['CandidateProperties']['CandidateMetrics'])

In [37]:
pprint([ (metric['StandardMetricName'], metric['Value']) 
          for metric in best_candidate['CandidateProperties']['CandidateMetrics']
       ])

[('Accuracy', 0.9082301259040833),
 ('F1', 0.6684210300445557),
 ('BalancedAccuracy', 0.8702046871185303),
 ('AUC', 0.9515122771263123),
 ('Precision', 0.5636094808578491),
 ('Recall', 0.8211206793785095),
 ('LogLoss', 0.19093722105026245),
 ('InferenceLatency', 0.17369800806045532)]


In [38]:
#pprint(automl.list_candidates())

In [None]:
# TODO describe the AutoML job

In [45]:
analytics

{'AutoMLJobName': 'L-bank-automl-2212291610',
 'AutoMLJobArn': 'arn:aws:sagemaker:eu-west-1:102959664345:automl-job/L-bank-automl-2212291610',
 'InputDataConfig': [{'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix',
     'S3Uri': 's3://sagemaker-eu-west-1-102959664345/L/bank/automl/2212291610/input/input.csv'}},
   'TargetAttributeName': 'y',
   'ContentType': 'text/csv;header=present',
   'ChannelType': 'training'}],
 'OutputDataConfig': {'S3OutputPath': 's3://sagemaker-eu-west-1-102959664345/L-jobs'},
 'RoleArn': 'arn:aws:iam::102959664345:role/salvia/labbench/salvia-labbench-eu-west-1',
 'AutoMLJobConfig': {'CompletionCriteria': {'MaxCandidates': 10,
   'MaxAutoMLJobRuntimeInSeconds': 1800},
  'SecurityConfig': {'EnableInterContainerTrafficEncryption': False},
  'DataSplitConfig': {'ValidationFraction': 0.20000000298023224},
  'Mode': 'ENSEMBLING'},
 'CreationTime': datetime.datetime(2022, 12, 29, 16, 10, 45, 954000, tzinfo=tzlocal()),
 'EndTime': datetime.datetime(2022, 12, 

In [44]:
analytics = automl.describe_auto_ml_job()

# Export best candidate

In [39]:
from sagemaker.s3 import S3Downloader

# Everything exported for the best candidate lands under <base_folder>/model.
model_output_folder = os.path.join(base_folder, "model")


def _download_artifact(s3_uri, subfolder):
    """Download the content at ``s3_uri`` into ``<model_output_folder>/<subfolder>``.

    Args:
        s3_uri: S3 URI (object or prefix) taken from the best candidate.
        subfolder: Name of the local sub-folder below ``model_output_folder``;
            created when missing.

    Returns:
        The local folder the artifact was downloaded into.
    """
    local_folder = os.path.join(model_output_folder, subfolder)
    os.makedirs(local_folder, exist_ok=True)
    S3Downloader.download(
        s3_uri=s3_uri,
        local_path=local_folder,
        sagemaker_session=sagemaker_session,
    )
    return local_folder


# Explainability report of the best candidate.
explainability_uri = best_candidate['CandidateProperties']['CandidateArtifactLocations']['Explainability']
explainability_folder = _download_artifact(explainability_uri, "explainability")

# Model insights report of the best candidate.
model_insights_uri = best_candidate['CandidateProperties']['CandidateArtifactLocations']['ModelInsights']
model_insight_folder = _download_artifact(model_insights_uri, "model-insight")

# Model archive (model.tar.gz) of the first inference container.
model_data_uri = best_candidate['InferenceContainers'][0]['ModelDataUrl']
model_data_folder = _download_artifact(model_data_uri, "model-data")

print(f"{model_output_folder=}")

model_output_folder='./generated/automl/2212291610/model'


In [40]:
import tarfile

# Unpack the downloaded model archive next to the archive itself, inside
# model_data_folder.
tar_filename = os.path.join(model_data_folder, "model.tar.gz")
with tarfile.open(name=tar_filename, mode="r") as archive:
    archive.extractall(path=model_data_folder)

print(f"{tar_filename=}")

tar_filename='./generated/automl/2212291610/model/model-data/model.tar.gz'


# Cleanup

In [None]:
from pprint import pprint

## Delete local data

In [None]:
import shutil
# Best-effort removal of this run's local files: an OSError is reported on
# stdout (file name and OS error text) rather than propagated.
try:
    shutil.rmtree(base_folder)
except OSError as err:
    failure = (err.filename, err.strerror)
    print("Error: %s - %s." % failure)

## Delete S3 data

In [None]:
# TODO exception management
# Select every object *version* stored under this run's prefix in the default
# bucket. Nothing is deleted here; later cells list and then delete the selection.
# NOTE(review): the prefix has no trailing "/", so keys that merely start with
# the same characters would be selected as well — confirm this is intended.
s3 = boto3.Session(region_name=region).resource("s3")
bucket = s3.Bucket(default_bucket)
collection_to_be_deleted = bucket.object_versions.filter(Prefix=job_prefix_long)

In [None]:
pprint(list(collection_to_be_deleted.all()))

In [None]:
deleted_keys = collection_to_be_deleted.delete()

In [None]:
# `deleted_keys` holds one response per delete batch. A batch in which every
# delete failed carries only 'Errors' and no 'Deleted' key, so both keys are
# looked up defensively instead of raising KeyError.
pprint([element.get('Deleted', []) for element in deleted_keys])

# Surface failed deletions instead of silently ignoring them.
delete_errors = [
    error
    for element in deleted_keys
    for error in element.get('Errors', [])
]
if delete_errors:
    pprint(delete_errors)

# TODO Delete experiment and job

# TODO

In [None]:
step_auto_ml_training

## Define a Create Model Step to Create a Model

In order to perform batch transformation using the example model, create a SageMaker model.


In [None]:
from sagemaker.workflow.model_step import ModelStep

# Draft pipeline code (this section sits under the "TODO" heading and has no
# recorded execution).
# NOTE(review): `step_auto_ml_training`, `pipeline_session` and
# `instance_type_param` are not defined in any visible cell of this notebook;
# they presumably come from a pipeline variant (an AutoMLStep and a
# PipelineSession are imported earlier but never instantiated) — this cell
# cannot run as-is.
best_auto_ml_model = step_auto_ml_training.get_best_auto_ml_model(
    role, 
    sagemaker_session=pipeline_session
)

# Arguments for the model-creation step, built from the best AutoML model.
step_args_create_model = best_auto_ml_model.create(
    instance_type=instance_type_param
)

# Pipeline step wrapping the model creation.
step_create_model = ModelStep(
    name="AutoMLCreateModel", 
    step_args=step_args_create_model
) 

## Define a Register Model Step to Create a Model Package


In [None]:
from sagemaker.model_metrics import ModelMetrics, MetricsSource

# Draft pipeline code (this section sits under the "TODO" heading and has no
# recorded execution).
# NOTE(review): `step_auto_ml_training`, `model_package_group_name` and
# `model_approval_status` are not defined in any visible cell of this
# notebook, and `best_auto_ml_model` only exists if the preceding draft cell
# ran — this cell cannot run as-is.

# Metrics attached to the model package: the model-insights and explainability
# JSON reports referenced through the AutoML step's properties.
model_metrics = ModelMetrics(
    model_statistics=MetricsSource(
        s3_uri=step_auto_ml_training.properties.BestCandidateProperties.ModelInsightsJsonReportPath,
        content_type="application/json",
    ),
    explainability=MetricsSource(
        s3_uri=step_auto_ml_training.properties.BestCandidateProperties.ExplainabilityJsonReportPath,
        content_type="application/json",
    ),
)

# Registration arguments: CSV in/out, the listed instance types, and the
# package group / approval status supplied by the undefined names noted above.
register_args = best_auto_ml_model.register(
    content_types=["text/csv"],
    response_types=["text/csv"],
    inference_instances=["ml.t2.medium", "ml.m5.xlarge"],
    transform_instances=["ml.m5.xlarge"],
    model_package_group_name=model_package_group_name,
    approval_status=model_approval_status,
    model_metrics=model_metrics
)

# Pipeline step wrapping the model registration.
step_register = ModelStep(
    name="AutoMLRegisterModel", 
    step_args=register_args
)