## Perform Hyper-Parameter Optimization using SageMaker Tuning Jobs

### 1. Imports

In [10]:
# ML Imports 
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import json
import os

# AWS Imports 
from sagemaker.tuner import IntegerParameter, CategoricalParameter, ContinuousParameter, HyperparameterTuner
from sagemaker.serializers import CSVSerializer
from sagemaker.inputs import TrainingInput
from sagemaker import get_execution_role
import sagemaker
import boto3

### 2. Essentials 

In [3]:
# Resolve the IAM execution role attached to this notebook; it is handed to the
# SageMaker Estimator further down so training jobs can act on our behalf.
sagemaker_execution_role = get_execution_role()
print('Role = {}'.format(sagemaker_execution_role))

# A single boto3 session is the one source of credentials and region for every
# client/resource below, so the S3 resource, the SageMaker SDK session and the
# low-level SageMaker client cannot end up configured differently.
session = boto3.Session()

# Clients and Resources (all derived from `session`)
s3 = session.resource('s3')
sagemaker_session = sagemaker.Session(boto_session=session)
sagemaker_client = session.client('sagemaker')

BUCKET = 'sagemaker-demo-892313895307' # USE YOUR ACCOUNT ID OR INITIALS AS SUFFIX
PREFIX = 'xgboost-clf'

Role = arn:aws:iam::892313895307:role/service-role/AmazonSageMaker-ExecutionRole-20200826T084395


##### Upload Train & Test Sets to S3 and Create Pointers to Data

In [4]:
# S3 only accepts a CreateBucket call without a LocationConstraint in
# us-east-1; every other region requires the constraint to be spelled out.
create_bucket_kwargs = {'Bucket': BUCKET}
if session.region_name not in (None, 'us-east-1'):
    create_bucket_kwargs['CreateBucketConfiguration'] = {
        'LocationConstraint': session.region_name
    }

# Keep the call as the cell's last expression so the Bucket is still displayed.
s3.create_bucket(**create_bucket_kwargs)

s3.Bucket(name='sagemaker-demo-892313895307')

In [7]:
# Upload each local split to s3://BUCKET/PREFIX/<split>/<split>.csv.
# S3 keys always use '/' as the separator, so they are built explicitly rather
# than with os.path.join (which would emit backslashes on Windows).
for split_name in ('train', 'test', 'batch_test'):
    local_csv_path = './DATA/{0}/{0}.csv'.format(split_name)
    s3_key = '{0}/{1}/{1}.csv'.format(PREFIX, split_name)
    s3.Bucket(BUCKET).Object(s3_key).upload_file(local_csv_path)



##### Create Pointers to the uploaded files 

In [8]:
# S3 prefixes (note the trailing slash) under which the train and test CSVs live.
train_set_location = f's3://{BUCKET}/{PREFIX}/train/'
test_set_location = f's3://{BUCKET}/{PREFIX}/test/'

In [9]:
# Show both S3 prefixes, one per line, as a quick sanity check.
print(train_set_location, test_set_location, sep='\n')

s3://sagemaker-demo-892313895307/xgboost-clf/train/
s3://sagemaker-demo-892313895307/xgboost-clf/test/


In [11]:
# Wrap each S3 prefix in a TrainingInput so it can be passed as a data channel,
# declaring the content type of the files stored under it.
train_set_pointer = TrainingInput(
    content_type='csv',
    s3_data=train_set_location,
)
test_set_pointer = TrainingInput(
    content_type='csv',
    s3_data=test_set_location,
)

In [12]:
# Pretty-print the attributes of the train channel to inspect its configuration.
print(json.dumps(vars(train_set_pointer), indent=2))

{
  "config": {
    "DataSource": {
      "S3DataSource": {
        "S3DataType": "S3Prefix",
        "S3Uri": "s3://sagemaker-demo-892313895307/xgboost-clf/train/",
        "S3DataDistributionType": "FullyReplicated"
      }
    },
    "ContentType": "csv"
  }
}


### 3. Train a Model using SageMaker + Built-in XGBoost Algorithm

In [13]:
# Look up the training image URI of the built-in XGBoost framework (version
# 1.0-1) for the region the boto3 session is configured with.
container_uri = sagemaker.image_uris.retrieve(
    framework='xgboost',
    version='1.0-1',
    image_scope='training',
    region=session.region_name,
)

In [14]:
# Estimator describing how each training job is run: which container image,
# under which role, on what hardware, and where model artifacts are written.
xgb = sagemaker.estimator.Estimator(
    image_uri=container_uri,
    role=sagemaker_execution_role,
    sagemaker_session=sagemaker_session,
    base_job_name='clf-xgboost',
    # Compute used by every training job
    instance_type='ml.m5.large',
    instance_count=1,
    # Artifacts land under the same bucket/prefix as the datasets
    output_path='s3://{}/{}/model-artifacts'.format(BUCKET, PREFIX),
)

In [15]:
# Static hyperparameters; these are not part of the tuning ranges defined for
# the tuner, so they are set once on the estimator.
static_hyperparameters = {
    'objective': 'binary:logistic',
    'num_round': 100,
}
xgb.set_hyperparameters(**static_hyperparameters)

In [16]:
# Search space handed to the tuner: three continuous ranges and one integer range.
hyperparameter_ranges = dict(
    eta=ContinuousParameter(min_value=0, max_value=1),
    min_child_weight=ContinuousParameter(min_value=1, max_value=10),
    alpha=ContinuousParameter(min_value=0, max_value=2),
    max_depth=IntegerParameter(min_value=1, max_value=10),
)

In [17]:
# Name of the metric the tuning job optimizes; the 'validation' prefix matches
# the 'validation' data channel supplied when the tuner is fitted.
objective_metric_name = 'validation:accuracy'

In [18]:
# Tuning job definition: 10 training jobs in total, at most 5 running at once.
# The optimization direction is left at the SDK default
# (NOTE(review): 'Maximize' per the SDK docs, which suits accuracy - confirm).
tuner = HyperparameterTuner(
    estimator=xgb,
    objective_metric_name=objective_metric_name,
    hyperparameter_ranges=hyperparameter_ranges,
    max_jobs=10,
    max_parallel_jobs=5,
)

In [19]:
# Launch the tuning job with the train channel and the held-out test set
# supplied as the 'validation' channel.
tuner.fit(
    inputs=dict(train=train_set_pointer, validation=test_set_pointer),
    include_cls_metadata=False,
)

................................................................................................................!


In [20]:
# Keep the name of the tuning job that was just launched; the cells below use
# it to look the job up through the SageMaker API.
hpo_job_name = tuner.latest_tuning_job.job_name
hpo_job_name

'sagemaker-xgboost-201109-0251'

In [21]:
# Fetch the full description of the tuning job from the low-level client and
# surface its overall status as the cell output.
tuning_job_results = sagemaker_client.describe_hyper_parameter_tuning_job(
    HyperParameterTuningJobName=hpo_job_name
)
status = tuning_job_results['HyperParameterTuningJobStatus']
status

'Completed'

In [22]:
# Pull the 'BestTrainingJob' entry out of the tuning job description and
# display it (tuned hyperparameters and final objective metric included).
best_training_job = tuning_job_results['BestTrainingJob']
best_training_job

{'TrainingJobName': 'sagemaker-xgboost-201109-0251-006-b67ff3a6',
 'TrainingJobArn': 'arn:aws:sagemaker:us-east-1:892313895307:training-job/sagemaker-xgboost-201109-0251-006-b67ff3a6',
 'CreationTime': datetime.datetime(2020, 11, 9, 2, 56, 9, tzinfo=tzlocal()),
 'TrainingStartTime': datetime.datetime(2020, 11, 9, 2, 58, 24, tzinfo=tzlocal()),
 'TrainingEndTime': datetime.datetime(2020, 11, 9, 2, 59, 35, tzinfo=tzlocal()),
 'TrainingJobStatus': 'Completed',
 'TunedHyperParameters': {'alpha': '0.4370194182464262',
  'eta': '0.1472495646102641',
  'max_depth': '1',
  'min_child_weight': '1.2354284387255998'},
 'FinalHyperParameterTuningJobObjectiveMetric': {'MetricName': 'validation:accuracy',
  'Value': 0.7961400151252747},
 'ObjectiveStatus': 'Succeeded'}

### 4. Evaluate Results
We can list the hyperparameters and objective metric of every training job in the tuning run and pick the training job with the best objective metric.

In [23]:
# Load per-training-job results of the tuning run into a DataFrame.
# NOTE(review): this rebinds `tuner` from the HyperparameterTuner created
# earlier to an analytics object; the name is kept as-is because later cells
# may rely on it, but a distinct name would be clearer.
tuner = sagemaker.HyperparameterTuningJobAnalytics(
    hyperparameter_tuning_job_name=hpo_job_name
)
hpo_results_df = tuner.dataframe()

In [24]:
# Display one row per training job: sampled hyperparameters, final objective
# value and timing columns.
hpo_results_df

Unnamed: 0,alpha,eta,max_depth,min_child_weight,TrainingJobName,TrainingJobStatus,FinalObjectiveValue,TrainingStartTime,TrainingEndTime,TrainingElapsedTimeSeconds
0,1.13337,0.022724,7.0,8.98436,sagemaker-xgboost-201109-0251-010-bdc973d6,Completed,0.79614,2020-11-09 03:00:04+00:00,2020-11-09 03:01:15+00:00,71.0
1,1.215318,0.588947,1.0,4.84324,sagemaker-xgboost-201109-0251-009-ab27c289,Completed,0.79614,2020-11-09 02:58:24+00:00,2020-11-09 02:59:37+00:00,73.0
2,1.055528,0.462787,2.0,4.617471,sagemaker-xgboost-201109-0251-008-14e5e263,Completed,0.79534,2020-11-09 02:58:34+00:00,2020-11-09 02:59:43+00:00,69.0
3,1.52208,0.16872,2.0,1.00667,sagemaker-xgboost-201109-0251-007-8ae3987a,Completed,0.79574,2020-11-09 02:58:31+00:00,2020-11-09 02:59:46+00:00,75.0
4,0.437019,0.14725,1.0,1.235428,sagemaker-xgboost-201109-0251-006-b67ff3a6,Completed,0.79614,2020-11-09 02:58:24+00:00,2020-11-09 02:59:35+00:00,71.0
5,1.920099,0.854157,4.0,3.046718,sagemaker-xgboost-201109-0251-005-d23e9bd7,Completed,0.77161,2020-11-09 02:54:21+00:00,2020-11-09 02:55:31+00:00,70.0
6,1.295331,0.20969,8.0,8.753706,sagemaker-xgboost-201109-0251-004-812f9995,Completed,0.79292,2020-11-09 02:54:33+00:00,2020-11-09 02:57:16+00:00,163.0
7,0.257386,0.932444,10.0,2.3,sagemaker-xgboost-201109-0251-003-d00a6d40,Completed,0.73583,2020-11-09 02:54:19+00:00,2020-11-09 02:55:31+00:00,72.0
8,0.417937,0.013581,3.0,1.129578,sagemaker-xgboost-201109-0251-002-85c197ab,Completed,0.79534,2020-11-09 02:54:39+00:00,2020-11-09 02:55:44+00:00,65.0
9,1.563565,0.757821,8.0,9.645825,sagemaker-xgboost-201109-0251-001-06aab40d,Completed,0.76679,2020-11-09 02:54:12+00:00,2020-11-09 02:55:26+00:00,74.0
