# Dataset

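The dataset used is the UCI Bank Marketing dataset (`bank-additional`), downloaded from the UCI Machine Learning Repository in the Data Acquisition section below.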

Based on this tutorial:
https://docs.aws.amazon.com/en_jp/sagemaker/latest/dg/automatic-model-tuning-ex-data.html

TODO document the dataset

# Session initialisation

In [2]:
import sys
!{sys.executable} -m pip install "sagemaker>=2.121.0"


[0m

In [3]:
import boto3
import sagemaker
from sagemaker.workflow.pipeline_context import PipelineSession

# Default SageMaker session; reused below by the AutoML job and the S3 artifact downloads.
sagemaker_session = sagemaker.session.Session()
# Region the session is bound to (not referenced again in the visible cells).
region = sagemaker_session.boto_region_name
# IAM role handed to the AutoML job and to the model-creation step.
role = sagemaker.get_execution_role()
# Session's default S3 bucket; root of the S3 prefix the dataset splits are uploaded to.
default_bucket = sagemaker_session.default_bucket()


# Parameters

In [25]:
from time import gmtime, strftime
import time

# Timestamp-based run identifier (UTC, yymmddHHMM); appended to the AutoML job name
# and to the local model output folder.
run_id = strftime("%y%m%d%H%M", gmtime())

# Naming components shared by local folders, S3 prefixes and the AutoML job name.
stage_prefix, project_prefix, variant_prefix = "L", "bank", "automlwdp"

In [26]:
print(f"{run_id=}")

run_id='2212282105'


# Data Acquisition

In [5]:
data_source_uri = "https://archive.ics.uci.edu/ml/machine-learning-databases/00222/bank-additional.zip"

In [6]:
import os
from io import BytesIO
from zipfile import ZipFile
from urllib.request import urlopen

# Local working area: ./generated/data/<stage>/<project>/<variant>
data_folder = os.path.join("./generated/data", stage_prefix, project_prefix, variant_prefix)
os.makedirs(data_folder, exist_ok=True)

# Sub-folder for raw archive contents; created here but not written to by this cell.
raw_data_folder = os.path.join(data_folder, "raw")
os.makedirs(raw_data_folder, exist_ok=True)

# Download the archive into memory and unpack it under data_folder.
# The context managers close the HTTP response and the zip handle even when
# reading or extraction raises, instead of leaving them open for the whole session.
with urlopen(data_source_uri) as response:
    archive_bytes = BytesIO(response.read())
with ZipFile(archive_bytes) as source_zip:
    source_zip.extractall(data_folder)


In [7]:
import pandas as pd
pd.set_option('display.max_columns', 500)     # Make sure we can see all of the columns
pd.set_option('display.max_rows', 5)         # Keep the output on one page

In [8]:
dataset_folder = os.path.join(data_folder, "bank-additional")

In [9]:
os.listdir(dataset_folder)

['bank-additional.csv',
 'bank-additional-full.csv',
 '.Rhistory',
 '.DS_Store',
 'bank-additional-names.txt']

In [10]:
raw_dataset_path = os.path.join(dataset_folder, "bank-additional-full.csv")
df = pd.read_csv(raw_dataset_path, sep=';')

In [11]:
df.head(10)

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,duration,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,261,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,149,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8,24,technician,single,professional.course,no,yes,no,telephone,may,mon,380,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
9,25,services,single,high.school,no,yes,no,telephone,may,mon,50,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


# Feature Engineering

In [12]:
import numpy as np

# Indicator: 1 when pdays takes the value 999, otherwise 0.
df['no_previous_contact'] = np.where(df['pdays'] == 999, 1, 0)
# Indicator: 1 for individuals not actively employed (student / retired / unemployed), otherwise 0.
# Series.isin replaces np.in1d, which is deprecated in NumPy 2.0.
df['not_working'] = np.where(df['job'].isin(['student', 'retired', 'unemployed']), 1, 0)
# model_data is the same DataFrame object as df; categorical columns are left
# as-is here (no indicator / one-hot encoding is applied in this cell).
model_data = df

In [13]:
model_data.head(10)

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,duration,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y,no_previous_contact,not_working
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,261,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no,1,0
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,149,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8,24,technician,single,professional.course,no,yes,no,telephone,may,mon,380,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no,1,0
9,25,services,single,high.school,no,yes,no,telephone,may,mon,50,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no,1,0


In [14]:
# Exclude these columns from the model input; drop() returns a new DataFrame,
# so the original df keeps them.
model_data = model_data.drop(
    columns=[
        'duration',
        'emp.var.rate',
        'cons.price.idx',
        'cons.conf.idx',
        'euribor3m',
        'nr.employed',
    ]
)

In [15]:
model_data.head(10)

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,campaign,pdays,previous,poutcome,y,no_previous_contact,not_working
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,1,999,0,nonexistent,no,1,0
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,1,999,0,nonexistent,no,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8,24,technician,single,professional.course,no,yes,no,telephone,may,mon,1,999,0,nonexistent,no,1,0
9,25,services,single,high.school,no,yes,no,telephone,may,mon,1,999,0,nonexistent,no,1,0


In [16]:
# Shuffle once with a fixed seed, then cut into 70% train / 20% validation / 10% test.
# Plain positional slices are used instead of np.split: np.split on a DataFrame
# relies on DataFrame.swapaxes, which pandas deprecates (FutureWarning since 2.1).
shuffled = model_data.sample(frac=1, random_state=1729)
train_end = int(0.7 * len(model_data))
validation_end = int(0.9 * len(model_data))

train_data = shuffled.iloc[:train_end]
validation_data = shuffled.iloc[train_end:validation_end]
test_data = shuffled.iloc[validation_end:]

def save_dataset(df, prefix):
    """Write ``df`` as CSV to ``<data_folder>/<prefix>/<prefix>.csv``.

    The target sub-folder is created when missing. The file keeps the header
    row and omits the DataFrame index.
    """
    split_dir = os.path.join(data_folder, f"{prefix}")
    os.makedirs(split_dir, exist_ok=True)
    csv_path = os.path.join(split_dir, f"{prefix}.csv")
    df.to_csv(csv_path, header=True, index=False)

# Persist each split as <data_folder>/<name>/<name>.csv (header kept, index omitted).
save_dataset(train_data, "train")
save_dataset(validation_data, "validation")
save_dataset(test_data, "test")

In [17]:
os.listdir(os.path.join(data_folder, "train"))

['train.csv']

In [18]:
os.listdir(os.path.join(data_folder, "validation"))

['validation.csv']

In [19]:
# Upload the local splits to S3 under s3://<bucket>/<stage>/<project>/<variant>/<split>/
base_uri = f"s3://{default_bucket}/{stage_prefix}/{project_prefix}/{variant_prefix}"


def upload_dataset(prefix):
    """Upload the local ``<prefix>/<prefix>.csv`` split and return the S3 URI of the uploaded file."""
    return sagemaker.s3.S3Uploader.upload(
        local_path=os.path.join(data_folder, f"{prefix}/{prefix}.csv"),
        desired_s3_uri=f"{base_uri}/{prefix}",
    )


# Uploads run in the order train, validation, test.
train_data_uri, validation_data_uri, test_data_uri = (
    upload_dataset(split) for split in ("train", "validation", "test")
)

In [20]:
print(train_data_uri)

s3://sagemaker-eu-west-1-102959664345/L/bank/automlwdp/train/train.csv


In [21]:
train_df = pd.read_csv(train_data_uri)
train_df.head(10)

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,campaign,pdays,previous,poutcome,y,no_previous_contact,not_working
0,54,self-employed,married,university.degree,no,no,no,cellular,oct,thu,3,999,0,nonexistent,no,1,0
1,56,blue-collar,married,basic.9y,unknown,no,no,telephone,jun,fri,2,999,0,nonexistent,no,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8,39,entrepreneur,married,university.degree,no,yes,no,telephone,jun,wed,1,999,0,nonexistent,no,1,0
9,27,admin.,single,high.school,no,yes,no,cellular,may,thu,1,999,0,nonexistent,no,1,0


## Training Parameters

TODO

In [22]:
# Passed to AutoML(...) as max_candidates.
max_candidates = 5
# NOTE(review): defined but never passed to AutoML(...) in the visible cells.
validation_fraction = 0.2
# NOTE(review): only referenced from a commented-out AutoML(...) argument.
total_job_runtime_in_seconds = 1800

# Name of the target column in the uploaded CSV splits.
label_column = "y"


## AutoML Training

In [27]:
from sagemaker.automl.automl import AutoML
from sagemaker.automl.automl import AutoMLInput
from sagemaker.workflow.automl_step import AutoMLStep
 
# Configure the AutoML job: predict `label_column`, cap the number of candidates,
# and run in ENSEMBLING mode through the regular SageMaker session.
automl = AutoML(
    role=role,
    target_attribute_name=label_column,
    sagemaker_session=sagemaker_session,
    # Runtime cap intentionally disabled; total_job_runtime_in_seconds is therefore unused.
    #total_job_runtime_in_seconds=total_job_runtime_in_seconds, 
    max_candidates = max_candidates,
    mode="ENSEMBLING"
)

# Training channel: the uploaded train split.
input_training = AutoMLInput(
    inputs=train_data_uri,
    target_attribute_name=label_column,
    channel_type="training",
)

# Validation channel: the uploaded validation split (the test split is not passed to the job).
input_validation = AutoMLInput(
    inputs=validation_data_uri,    
    target_attribute_name=label_column,
    channel_type="validation",
)


# Job name is <stage>-<project>-<variant>-<run_id>. The return value is kept in
# step_args and later handed to AutoMLStep.
# NOTE(review): with a regular Session (not a PipelineSession) this call presumably
# starts the job and blocks until it finishes — the recorded output shows a long
# wait ended by KeyboardInterrupt; confirm what fit() returns in that case.
step_args = automl.fit(
    inputs=[input_training, input_validation],
    job_name=f"{stage_prefix}-{project_prefix}-{variant_prefix}-{run_id}"
)


............................................................................................................

KeyboardInterrupt: 

# Inspect the best candidate

In [28]:
automl.__dict__

{'role': 'arn:aws:iam::102959664345:role/salvia/labbench/salvia-labbench-eu-west-1',
 'output_kms_key': None,
 'output_path': 's3://sagemaker-eu-west-1-102959664345/',
 'base_job_name': None,
 'compression_type': None,
 'volume_kms_key': None,
 'encrypt_inter_container_traffic': False,
 'vpc_config': None,
 'problem_type': None,
 'max_candidate': 5,
 'max_runtime_per_training_job_in_seconds': None,
 'total_job_runtime_in_seconds': None,
 'target_attribute_name': 'y',
 'job_objective': None,
 'generate_candidate_definitions_only': False,
 'tags': None,
 'content_type': None,
 's3_data_type': None,
 'feature_specification_s3_uri': None,
 'validation_fraction': None,
 'mode': 'ENSEMBLING',
 'auto_generate_endpoint_name': None,
 'endpoint_name': None,
 'current_job_name': 'L-bank-automlwdp-2212282105',
 '_auto_ml_job_desc': None,
 '_best_candidate': None,
 'sagemaker_session': <sagemaker.session.Session at 0x7faa22900400>,
 'latest_auto_ml_job': <sagemaker.automl.automl.AutoMLJob at 0x7fa9

In [None]:
automl.latest_auto_ml_job.__dict__

In [29]:
best_candidate = automl.best_candidate()

In [33]:
from pprint import pprint

In [None]:
pprint(best_candidate)

In [30]:
print(best_candidate['CandidateName'])

WeightedEnsemble-L3-FU-t2102959664345L-bank-automlwdp-2212282105


In [34]:
pprint(best_candidate['FinalAutoMLJobObjectiveMetric'])

{'MetricName': 'F1', 'Type': 'Maximize', 'Value': 0.47309744358062744}


In [35]:
pprint(best_candidate['CandidateProperties']['CandidateArtifactLocations'])

{'Explainability': 's3://sagemaker-eu-west-1-102959664345/L-bank-automlwdp-2212282105/documentation/explainability/output/L-bank-automlwdp-2212282105-t2-1-97f9c46768d0460ba9876ea482ac82',
 'ModelInsights': 's3://sagemaker-eu-west-1-102959664345/L-bank-automlwdp-2212282105/documentation/model_monitor/output/WeightedEnsemble-L3-FU-t2102959664345L-bank-automlwdp-2212282105'}


In [37]:
pprint(best_candidate['CandidateProperties']['CandidateMetrics'])

[{'MetricName': 'Accuracy',
  'Set': 'Validation',
  'StandardMetricName': 'Accuracy',
  'Value': 0.8680505156517029},
 {'MetricName': 'F1',
  'Set': 'Validation',
  'StandardMetricName': 'F1',
  'Value': 0.47309744358062744},
 {'MetricName': 'BalancedAccuracy',
  'Set': 'Validation',
  'StandardMetricName': 'BalancedAccuracy',
  'Value': 0.7164977788925171},
 {'MetricName': 'AUC',
  'Set': 'Validation',
  'StandardMetricName': 'AUC',
  'Value': 0.7671751379966736},
 {'MetricName': 'Precision',
  'Set': 'Validation',
  'StandardMetricName': 'Precision',
  'Value': 0.433777779340744},
 {'MetricName': 'Recall',
  'Set': 'Validation',
  'StandardMetricName': 'Recall',
  'Value': 0.5202558636665344},
 {'MetricName': 'LogLoss',
  'Set': 'Validation',
  'StandardMetricName': 'LogLoss',
  'Value': 0.4538300335407257},
 {'MetricName': 'InferenceLatency',
  'Set': 'Validation',
  'StandardMetricName': 'InferenceLatency',
  'Value': 0.6369072794914246}]


# Export best candidate

In [None]:
from sagemaker.s3 import S3Downloader

# Local export area: ./generated/model/<stage>/<project>/<variant>/<run_id>
model_output_folder = os.path.join("./generated/model", stage_prefix, project_prefix, variant_prefix, run_id)
# Create the export folder itself (this line previously re-created data_folder).
os.makedirs(model_output_folder, exist_ok=True)


def _download_artifact(s3_uri, subfolder):
    """Download ``s3_uri`` into ``<model_output_folder>/<subfolder>`` and return that local folder."""
    local_folder = os.path.join(model_output_folder, subfolder)
    os.makedirs(local_folder, exist_ok=True)
    S3Downloader.download(s3_uri=s3_uri,
                          local_path=local_folder,
                          sagemaker_session=sagemaker_session)
    return local_folder


artifact_locations = best_candidate['CandidateProperties']['CandidateArtifactLocations']

# Explainability report of the best candidate.
explainability_uri = artifact_locations['Explainability']
explainability_folder = _download_artifact(explainability_uri, "explainability")

# Model-insights report of the best candidate.
model_insights_uri = artifact_locations['ModelInsights']
model_insight_folder = _download_artifact(model_insights_uri, "model-insight")

# Model artifact of the first inference container; model_data_folder is reused
# by the later cell that unpacks model.tar.gz.
model_data_uri = best_candidate['InferenceContainers'][0]['ModelDataUrl']
model_data_folder = _download_artifact(model_data_uri, "model-data")

# Download the source archive again and unpack it into raw_data_folder.
# The result was previously assigned to the misspelled name `esponse`, so the next
# line read from the earlier, already-consumed `response` and got no data.
with urlopen(data_source_uri) as response:
    archive_bytes = BytesIO(response.read())
with ZipFile(archive_bytes) as source_zip:
    source_zip.extractall(raw_data_folder)

print(f"{model_output_folder=}")

In [None]:

# Wrap the arguments captured from automl.fit(...) into a pipeline step named "AutoMLStep".
# NOTE(review): step_args comes from a fit() call made with a regular Session in the
# visible cells — confirm it holds step arguments rather than None.
step_auto_ml_training = AutoMLStep(
    name="AutoMLStep",
    step_args=step_args,
) 

In [None]:
import tarfile

# Unpack the downloaded model archive in place, next to the archive itself.
# Assumes the downloaded artifact is named model.tar.gz — TODO confirm.
tar_filename = os.path.join(model_data_folder, "model.tar.gz")
# Mode 'r' lets tarfile detect the compression transparently.
with tarfile.open(tar_filename, 'r') as archive:
    archive.extractall(model_data_folder)

# TODO

## Define a Create Model Step to Create a Model

In order to perform batch transformation using the example model, create a SageMaker model.


In [None]:
from sagemaker.workflow.model_step import ModelStep

# Resolve the best candidate of the AutoML step as a model object bound to a pipeline session.
# NOTE(review): `pipeline_session` is not defined in the visible cells (PipelineSession
# is imported but never instantiated) — define it before running this cell.
best_auto_ml_model = step_auto_ml_training.get_best_auto_ml_model(
    role, 
    sagemaker_session=pipeline_session
)

# NOTE(review): `instance_type_param` is not defined in the visible cells.
step_args_create_model = best_auto_ml_model.create(
    instance_type=instance_type_param
)

# Pipeline step named "AutoMLCreateModel" that creates the SageMaker model.
step_create_model = ModelStep(
    name="AutoMLCreateModel", 
    step_args=step_args_create_model
) 

## Define a Register Model Step to Create a Model Package


In [None]:
from sagemaker.model_metrics import ModelMetrics, MetricsSource

# Attach the AutoML step's model-insights and explainability JSON reports as model metrics.
model_metrics = ModelMetrics(
    model_statistics=MetricsSource(
        s3_uri=step_auto_ml_training.properties.BestCandidateProperties.ModelInsightsJsonReportPath,
        content_type="application/json",
    ),
    explainability=MetricsSource(
        s3_uri=step_auto_ml_training.properties.BestCandidateProperties.ExplainabilityJsonReportPath,
        content_type="application/json",
    ),
)

# Registration arguments: CSV in / CSV out, with the listed inference and transform instance types.
# NOTE(review): `model_package_group_name` and `model_approval_status` are not defined
# in the visible cells — define them before running this cell.
register_args = best_auto_ml_model.register(
    content_types=["text/csv"],
    response_types=["text/csv"],
    inference_instances=["ml.t2.medium", "ml.m5.xlarge"],
    transform_instances=["ml.m5.xlarge"],
    model_package_group_name=model_package_group_name,
    approval_status=model_approval_status,
    model_metrics=model_metrics
)

# Pipeline step named "AutoMLRegisterModel" that registers the model package.
step_register = ModelStep(
    name="AutoMLRegisterModel", 
    step_args=register_args
)