## Parallel Hyper Parameter Optimization (HPO) using SageMaker Tuning

### 1. Imports

In [1]:
# ML Imports 
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import json
import os

# AWS Imports 
from sagemaker.tuner import IntegerParameter, CategoricalParameter, ContinuousParameter, HyperparameterTuner
from sagemaker.serializers import CSVSerializer
from sagemaker.inputs import TrainingInput
from sagemaker import get_execution_role
import sagemaker
import boto3

### 2. Essentials 

In [2]:
# One boto3 session backs every AWS handle created in this notebook, so the S3 resource, the
# SageMaker client and the SageMaker SDK session all share the same credentials and region.
session = boto3.Session()
sagemaker_session = sagemaker.Session(boto_session=session)

# Execution role attached to this notebook; it is handed to the Estimator further down.
sagemaker_execution_role = get_execution_role()
print('Role = {}'.format(sagemaker_execution_role))

# Clients and Resources (both derived from `session` rather than boto3's implicit default session)
s3 = session.resource('s3')
sagemaker_client = session.client('sagemaker')

# Default SageMaker bucket for this account/region, and the key prefix under which this
# notebook stores its data sets and model artifacts.
BUCKET = sagemaker_session.default_bucket()
PREFIX = 'xgboost-clf'

Role = arn:aws:iam::892313895307:role/service-role/AmazonSageMaker-ExecutionRole-20200629T123070


##### Upload Train & Test Sets to S3 and Create Pointers to Data

In [3]:
# `sagemaker_session.default_bucket()` already creates the default bucket when it is missing,
# so an explicit `create_bucket` call is redundant. The bare call also omitted
# `CreateBucketConfiguration`, which S3 requires for any region other than us-east-1.
# Just obtain a handle to the existing bucket.
s3.Bucket(BUCKET)

s3.Bucket(name='sagemaker-us-east-1-892313895307')

In [4]:
# Upload each local data set to s3://BUCKET/PREFIX/<split>/<split>.csv.
# S3 keys always use '/' as the separator, so they are built explicitly instead of with
# `os.path.join`, which would insert the host OS separator.
for split in ('train', 'test', 'batch_test'):
    relative_path = '{0}/{0}.csv'.format(split)
    s3_key = '{}/{}'.format(PREFIX, relative_path)
    s3.Bucket(BUCKET).Object(s3_key).upload_file('./DATA/{}'.format(relative_path))

##### Create Pointers to the uploaded files 

In [5]:
# S3 prefixes (not individual objects) that hold the uploaded train and test CSV files.
train_set_location, test_set_location = (
    template.format(BUCKET, PREFIX)
    for template in ('s3://{}/{}/train/', 's3://{}/{}/test/')
)

In [6]:
# Show both S3 locations, one per line.
print(train_set_location, test_set_location, sep='\n')

s3://sagemaker-us-east-1-892313895307/xgboost-clf/train/
s3://sagemaker-us-east-1-892313895307/xgboost-clf/test/


In [7]:
# Wrap each S3 prefix in a TrainingInput channel; both channels are declared as CSV.
train_set_pointer, test_set_pointer = (
    TrainingInput(s3_data=location, content_type='csv')
    for location in (train_set_location, test_set_location)
)

In [8]:
# Pretty-print the attributes stored on the training channel object.
train_pointer_attrs = vars(train_set_pointer)
print(json.dumps(train_pointer_attrs, indent=2))

{
  "config": {
    "DataSource": {
      "S3DataSource": {
        "S3DataType": "S3Prefix",
        "S3Uri": "s3://sagemaker-us-east-1-892313895307/xgboost-clf/train/",
        "S3DataDistributionType": "FullyReplicated"
      }
    },
    "ContentType": "csv"
  }
}


### 3. Train a Model using SageMaker + Built-in XGBoost Algorithm

In [9]:
# Look up the XGBoost 1.0-1 training image URI for the region of the boto3 session.
container_uri = sagemaker.image_uris.retrieve(**{
    'region': session.region_name,
    'framework': 'xgboost',
    'version': '1.0-1',
    'image_scope': 'training',
})

In [10]:
# Generic Estimator wrapping the XGBoost container: one ml.m5.large instance per training job,
# with model artifacts written under the notebook's S3 prefix.
estimator_settings = {
    'image_uri': container_uri,
    'role': sagemaker_execution_role,
    'instance_count': 1,
    'instance_type': 'ml.m5.large',
    'output_path': 's3://{}/{}/model-artifacts'.format(BUCKET, PREFIX),
    'sagemaker_session': sagemaker_session,
    'base_job_name': 'clf-xgboost',
}
xgb = sagemaker.estimator.Estimator(**estimator_settings)

In [11]:
# Static hyperparameters that are set once on the estimator and are not part of the tuning ranges.
static_hyperparameters = {
    'objective': 'binary:logistic',
    'num_round': 100,
}
xgb.set_hyperparameters(**static_hyperparameters)

In [12]:
# Search space for the tuning job: three continuous ranges and one integer range.
hyperparameter_ranges = dict(
    eta=ContinuousParameter(0, 1),
    min_child_weight=ContinuousParameter(1, 10),
    alpha=ContinuousParameter(0, 2),
    max_depth=IntegerParameter(1, 10),
)

In [13]:
# Name of the metric the tuning job optimizes; passed to HyperparameterTuner in the next cell.
objective_metric_name = 'validation:accuracy'

In [14]:
# Tuning job definition: at most 10 training jobs in total, up to 5 of them running concurrently.
tuner = HyperparameterTuner(
    estimator=xgb,
    objective_metric_name=objective_metric_name,
    hyperparameter_ranges=hyperparameter_ranges,
    max_jobs=10,
    max_parallel_jobs=5,
)

In [15]:
# Launch the tuning job: the train channel feeds training, and the test set is supplied as the
# validation channel.
tuning_channels = {
    'train': train_set_pointer,
    'validation': test_set_pointer,
}
tuner.fit(tuning_channels, include_cls_metadata=False)

.......................................................................................................!


In [16]:
# Name of the tuning job most recently launched through `tuner`; later cells use it to query results.
hpo_job_name = tuner.latest_tuning_job.job_name
hpo_job_name

'sagemaker-xgboost-201209-1711'

In [17]:
# Fetch the tuning job's description through the SageMaker client and read its overall status.
tuning_job_results = sagemaker_client.describe_hyper_parameter_tuning_job(HyperParameterTuningJobName=hpo_job_name)
status = tuning_job_results['HyperParameterTuningJobStatus']
status

'Completed'

In [18]:
# Summary of the best training job, taken from the tuning job description fetched above.
best_training_job = tuning_job_results['BestTrainingJob']
best_training_job

{'TrainingJobName': 'sagemaker-xgboost-201209-1711-008-d299a5c2',
 'TrainingJobArn': 'arn:aws:sagemaker:us-east-1:892313895307:training-job/sagemaker-xgboost-201209-1711-008-d299a5c2',
 'CreationTime': datetime.datetime(2020, 12, 9, 17, 15, 41, tzinfo=tzlocal()),
 'TrainingStartTime': datetime.datetime(2020, 12, 9, 17, 18, 12, tzinfo=tzlocal()),
 'TrainingEndTime': datetime.datetime(2020, 12, 9, 17, 19, 25, tzinfo=tzlocal()),
 'TrainingJobStatus': 'Completed',
 'TunedHyperParameters': {'alpha': '1.8711020244174286',
  'eta': '0.0228779762711755',
  'max_depth': '1',
  'min_child_weight': '9.952159410331406'},
 'FinalHyperParameterTuningJobObjectiveMetric': {'MetricName': 'validation:accuracy',
  'Value': 0.7961400151252747},
 'ObjectiveStatus': 'Succeeded'}

### 4. Evaluate Results
We can list the hyperparameters and objective metrics of all training jobs and pick the training job with the best objective metric.

In [19]:
# Load per-training-job results for the tuning job into a DataFrame.
# The analytics object gets its own name instead of rebinding `tuner`, which still refers to the
# HyperparameterTuner used to launch the job; reusing the name would break re-running earlier cells.
tuner_analytics = sagemaker.HyperparameterTuningJobAnalytics(hpo_job_name)
hpo_results_df = tuner_analytics.dataframe()

In [20]:
# Display the results table (one row per training job of the tuning job).
hpo_results_df

Unnamed: 0,alpha,eta,max_depth,min_child_weight,TrainingJobName,TrainingJobStatus,FinalObjectiveValue,TrainingStartTime,TrainingEndTime,TrainingElapsedTimeSeconds
0,1.91401,0.05837,1.0,9.0554,sagemaker-xgboost-201209-1711-010-656e513b,Completed,0.79614,2020-12-09 17:18:14+00:00,2020-12-09 17:19:26+00:00,72.0
1,0.43065,0.453878,1.0,9.749736,sagemaker-xgboost-201209-1711-009-f515aaa9,Completed,0.79614,2020-12-09 17:17:49+00:00,2020-12-09 17:18:57+00:00,68.0
2,1.871102,0.022878,1.0,9.952159,sagemaker-xgboost-201209-1711-008-d299a5c2,Completed,0.79614,2020-12-09 17:18:12+00:00,2020-12-09 17:19:25+00:00,73.0
3,1.718331,0.842489,2.0,4.76157,sagemaker-xgboost-201209-1711-007-95579536,Completed,0.79534,2020-12-09 17:18:22+00:00,2020-12-09 17:19:33+00:00,71.0
4,0.712389,0.749532,4.0,4.465488,sagemaker-xgboost-201209-1711-006-15241baa,Completed,0.77644,2020-12-09 17:18:14+00:00,2020-12-09 17:19:26+00:00,72.0
5,0.823433,0.901349,2.0,7.495843,sagemaker-xgboost-201209-1711-005-967cf7f8,Completed,0.79373,2020-12-09 17:14:08+00:00,2020-12-09 17:15:15+00:00,67.0
6,0.108732,0.16206,9.0,5.529233,sagemaker-xgboost-201209-1711-004-8352abcf,Completed,0.79011,2020-12-09 17:14:10+00:00,2020-12-09 17:15:24+00:00,74.0
7,0.853271,0.188539,1.0,8.609403,sagemaker-xgboost-201209-1711-003-254bbf0a,Completed,0.79614,2020-12-09 17:14:10+00:00,2020-12-09 17:15:24+00:00,74.0
8,0.631308,0.171973,9.0,2.621634,sagemaker-xgboost-201209-1711-002-cdd916a3,Completed,0.79011,2020-12-09 17:13:46+00:00,2020-12-09 17:14:53+00:00,67.0
9,1.060636,0.918407,1.0,9.268074,sagemaker-xgboost-201209-1711-001-d71be9dd,Completed,0.79614,2020-12-09 17:14:11+00:00,2020-12-09 17:15:23+00:00,72.0
