## Perform Hyper-Parameter Optimization using SageMaker Tuning Jobs

### 1. Imports

In [1]:
# ML Imports 
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import json
import os

# AWS Imports 
from sagemaker.tuner import IntegerParameter, CategoricalParameter, ContinuousParameter, HyperparameterTuner
from sagemaker.serializers import CSVSerializer
from sagemaker.inputs import TrainingInput
from sagemaker import get_execution_role
import sagemaker
import boto3

### 2. Essentials 

In [2]:
# Resolve the IAM execution role attached to this notebook and echo it so the
# reader can confirm which role is in use.
sagemaker_execution_role = get_execution_role()
print('Role = {}'.format(sagemaker_execution_role))

# A single boto3 session is the one source of region/credentials for every
# client and resource created in this notebook.
session = boto3.Session()

# Clients and Resources -- all derived from `session` so they cannot drift
# apart (e.g. end up pointing at different regions or profiles).
s3 = session.resource('s3')
sagemaker_session = sagemaker.Session(boto_session=session)
sagemaker_client = session.client('sagemaker')

BUCKET = 'sagemaker-demo-892313895307' # USE YOUR ACCOUNT ID OR INITIALS AS SUFFIX
PREFIX = 'xgboost-clf'

Role = arn:aws:iam::892313895307:role/service-role/AmazonSageMaker-ExecutionRole-20200826T084395


##### Upload Train & Test Sets to S3 and Create Pointers to Data

In [3]:
# S3 only accepts a bare CreateBucket request in us-east-1; every other region
# must be named explicitly via LocationConstraint (and 'us-east-1' itself is
# not a valid LocationConstraint value, hence the guard).
create_bucket_kwargs = {'Bucket': BUCKET}
if session.region_name not in (None, 'us-east-1'):
    create_bucket_kwargs['CreateBucketConfiguration'] = {
        'LocationConstraint': session.region_name
    }

# Kept as the last expression so the cell still displays the Bucket resource.
s3.create_bucket(**create_bucket_kwargs)

s3.Bucket(name='sagemaker-demo-892313895307')

In [5]:
# Upload each local CSV split to s3://BUCKET/PREFIX/<split>/<split>.csv.
# S3 keys always use '/' as the separator, so they are built with explicit
# string formatting instead of os.path.join (which follows the local OS
# convention and would emit backslashes on Windows).
for split_name in ('train', 'test', 'batch_test'):
    relative_path = '{0}/{0}.csv'.format(split_name)
    object_key = '{}/{}'.format(PREFIX, relative_path)
    s3.Bucket(BUCKET).Object(object_key).upload_file('./DATA/{}'.format(relative_path))

##### Create Pointers to the uploaded files 

In [6]:
# S3 prefixes (folders, not individual objects) that hold the uploaded splits.
train_set_location = f's3://{BUCKET}/{PREFIX}/train/'
test_set_location = f's3://{BUCKET}/{PREFIX}/test/'

In [7]:
# Show both S3 prefixes, one per line.
print(train_set_location, test_set_location, sep='\n')

s3://sagemaker-demo-892313895307/xgboost-clf/train/
s3://sagemaker-demo-892313895307/xgboost-clf/test/


In [8]:
# Wrap each S3 prefix in a TrainingInput channel definition; both channels
# are declared as CSV content.
train_set_pointer = TrainingInput(train_set_location, content_type='csv')
test_set_pointer = TrainingInput(test_set_location, content_type='csv')

In [9]:
# Pretty-print the attributes stored on the training channel object.
train_pointer_attrs = vars(train_set_pointer)
print(json.dumps(train_pointer_attrs, indent=2))

{
  "config": {
    "DataSource": {
      "S3DataSource": {
        "S3DataType": "S3Prefix",
        "S3Uri": "s3://sagemaker-demo-892313895307/xgboost-clf/train/",
        "S3DataDistributionType": "FullyReplicated"
      }
    },
    "ContentType": "csv"
  }
}


### 3. Train a Model using SageMaker + Built-in XGBoost Algorithm

In [10]:
# Look up the XGBoost training image URI for the region this session is
# configured for.
container_uri = sagemaker.image_uris.retrieve(
    framework='xgboost',
    version='1.0-1',
    region=session.region_name,
    image_scope='training',
)

In [11]:
# Model artifacts are written under the same bucket/prefix as the data.
model_artifacts_uri = 's3://{}/{}/model-artifacts'.format(BUCKET, PREFIX)

# Estimator describing a single training job: which image to run, under which
# role, on what hardware, and where to write its output.
xgb = sagemaker.estimator.Estimator(
    image_uri=container_uri,
    role=sagemaker_execution_role,
    instance_type='ml.m5.large',
    instance_count=1,
    output_path=model_artifacts_uri,
    base_job_name='clf-xgboost',
    sagemaker_session=sagemaker_session,
)

In [12]:
# Hyperparameters that stay fixed for every training job; the remaining ones
# are searched by the tuner.
static_hyperparameters = {
    'objective': 'binary:logistic',
    'num_round': 100,
}
xgb.set_hyperparameters(**static_hyperparameters)

In [13]:
# Search space for the tuning job: three continuous ranges and one integer
# range, keyed by hyperparameter name.
hyperparameter_ranges = dict(
    eta=ContinuousParameter(0, 1),
    min_child_weight=ContinuousParameter(1, 10),
    alpha=ContinuousParameter(0, 2),
    max_depth=IntegerParameter(1, 10),
)

In [14]:
# Name of the metric the tuning job uses to compare training jobs; it is
# handed to HyperparameterTuner as the objective.
objective_metric_name = 'validation:accuracy'

In [15]:
# Tuning job definition: at most 10 training jobs in total, with up to 5
# running at the same time.
tuner = HyperparameterTuner(
    estimator=xgb,
    objective_metric_name=objective_metric_name,
    hyperparameter_ranges=hyperparameter_ranges,
    max_jobs=10,
    max_parallel_jobs=5,
)

In [16]:
# Launch the tuning job: the train prefix feeds the 'train' channel and the
# test prefix is used as the 'validation' channel.
tuner.fit(
    inputs={
        'train': train_set_pointer,
        'validation': test_set_pointer,
    },
    include_cls_metadata=False,
)

........................................................................................................!


In [17]:
# Remember the name of the tuning job that was just launched; later cells use
# it to look the job up again.
latest_tuning_job = tuner.latest_tuning_job
hpo_job_name = latest_tuning_job.job_name
hpo_job_name

'sagemaker-xgboost-201207-2113'

In [18]:
# Fetch the tuning job's description from the SageMaker API and display its
# overall status.
tuning_job_results = sagemaker_client.describe_hyper_parameter_tuning_job(
    HyperParameterTuningJobName=hpo_job_name
)
status = tuning_job_results['HyperParameterTuningJobStatus']
status

'Completed'

In [19]:
# Pull the 'BestTrainingJob' entry out of the describe response and display it.
best_training_job = tuning_job_results['BestTrainingJob']
best_training_job

{'TrainingJobName': 'sagemaker-xgboost-201207-2113-008-d1c5df61',
 'TrainingJobArn': 'arn:aws:sagemaker:us-east-1:892313895307:training-job/sagemaker-xgboost-201207-2113-008-d1c5df61',
 'CreationTime': datetime.datetime(2020, 12, 7, 21, 17, 21, tzinfo=tzlocal()),
 'TrainingStartTime': datetime.datetime(2020, 12, 7, 21, 19, 42, tzinfo=tzlocal()),
 'TrainingEndTime': datetime.datetime(2020, 12, 7, 21, 20, 46, tzinfo=tzlocal()),
 'TrainingJobStatus': 'Completed',
 'TunedHyperParameters': {'alpha': '1.4163806530351208',
  'eta': '0.08060260859718339',
  'max_depth': '3',
  'min_child_weight': '4.199881332801262'},
 'FinalHyperParameterTuningJobObjectiveMetric': {'MetricName': 'validation:accuracy',
  'Value': 0.7961400151252747},
 'ObjectiveStatus': 'Succeeded'}

### 4. Evaluate Results
We can list the hyperparameters and objective metric of every training job, then pick the training job with the best objective metric.

In [20]:
# Use a dedicated name for the analytics helper. Rebinding `tuner` here would
# replace the HyperparameterTuner created earlier with a different kind of
# object, so re-running the `tuner.fit(...)` cell afterwards would fail.
tuning_job_analytics = sagemaker.HyperparameterTuningJobAnalytics(hpo_job_name)

# Tabulate the tuning job's results as a pandas DataFrame.
hpo_results_df = tuning_job_analytics.dataframe()

In [21]:
# Display the full results table (bare last expression -> rich DataFrame view).
hpo_results_df

Unnamed: 0,alpha,eta,max_depth,min_child_weight,TrainingJobName,TrainingJobStatus,FinalObjectiveValue,TrainingStartTime,TrainingEndTime,TrainingElapsedTimeSeconds
0,0.538207,0.069231,2.0,2.748412,sagemaker-xgboost-201207-2113-010-b5bbff17,Completed,0.79614,2020-12-07 21:19:43+00:00,2020-12-07 21:20:50+00:00,67.0
1,0.498207,0.059067,2.0,2.568412,sagemaker-xgboost-201207-2113-009-1468ef7f,Completed,0.79614,2020-12-07 21:19:51+00:00,2020-12-07 21:21:04+00:00,73.0
2,1.416381,0.080603,3.0,4.199881,sagemaker-xgboost-201207-2113-008-d1c5df61,Completed,0.79614,2020-12-07 21:19:42+00:00,2020-12-07 21:20:46+00:00,64.0
3,0.236732,0.150427,5.0,1.176268,sagemaker-xgboost-201207-2113-007-9829f0f8,Completed,0.79051,2020-12-07 21:19:24+00:00,2020-12-07 21:20:36+00:00,72.0
4,1.750831,0.089605,7.0,8.845725,sagemaker-xgboost-201207-2113-006-206d64c1,Completed,0.79574,2020-12-07 21:19:46+00:00,2020-12-07 21:21:41+00:00,115.0
5,0.378858,0.343983,6.0,4.500288,sagemaker-xgboost-201207-2113-005-291ac3a6,Completed,0.78649,2020-12-07 21:15:44+00:00,2020-12-07 21:17:00+00:00,76.0
6,0.433438,0.906519,9.0,8.865288,sagemaker-xgboost-201207-2113-004-7aad9f90,Completed,0.75312,2020-12-07 21:15:40+00:00,2020-12-07 21:17:14+00:00,94.0
7,1.811568,0.234967,10.0,9.540089,sagemaker-xgboost-201207-2113-003-54c3806f,Completed,0.7877,2020-12-07 21:15:47+00:00,2020-12-07 21:17:03+00:00,76.0
8,0.853375,0.564844,10.0,9.291,sagemaker-xgboost-201207-2113-002-67b3fb26,Completed,0.76518,2020-12-07 21:15:43+00:00,2020-12-07 21:16:54+00:00,71.0
9,1.006244,0.344265,8.0,7.596766,sagemaker-xgboost-201207-2113-001-41509c17,Completed,0.78689,2020-12-07 21:15:55+00:00,2020-12-07 21:17:23+00:00,88.0
