### Imports 

In [1]:
# ML Imports 
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import json
import os

# AWS Imports 
from sagemaker.tuner import IntegerParameter, CategoricalParameter, ContinuousParameter, HyperparameterTuner
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker.predictor import csv_serializer
from sagemaker import get_execution_role
import sagemaker
import boto3

### 1. Load & Prep Data

The <a href='https://homepages.inf.ed.ac.uk/imurray2/teaching/oranges_and_lemons/'>Fruits Dataset</a> was originally created by Dr. Iain Murray of the University of Edinburgh and more recently extended by the University of Michigan. It is a simple multi-class dataset with 4 feature columns and 4 classes (fruits). The 4 classes are apple, orange, mandarin and lemon. The four features are the mass, width, height and color score of the fruit.

The color score feature places the fruit's color on a 0 - 1 color-spectrum scale; each range below corresponds to one color. <br><br>
<table align="left" style="width:50%">
    <tr>
        <th>Color</th>
        <th>Range</th>
    </tr>
    <tr>
        <td>Red</td>
        <td>0.85 - 1.00</td>
    </tr>
    <tr>
        <td>Orange</td>
        <td>0.75 - 0.85</td>
    </tr>
    <tr>
        <td>Yellow</td>
        <td>0.65 - 0.75</td>
    </tr>
    <tr>
        <td>Green</td>
        <td>0.45 - 0.65</td>
    </tr>
</table>

In [2]:
# Read the raw fruit measurements, supplying the column names ourselves.
# header=None spells out what pandas already assumes when `names` is given:
# the first line of the file is treated as data, not as a header.
df = pd.read_csv(
    './DATA/fruits.csv',
    header=None,
    names=['class', 'mass', 'width', 'height', 'color_score'],
)

In [3]:
# Preview 10 random rows. The fixed seed keeps this preview identical across
# Restart & Run All instead of showing different rows on every execution.
df.sample(10, random_state=123)

Unnamed: 0,class,mass,width,height,color_score
44,lemon,200,7.3,10.5,0.72
8,apple,178,7.1,7.8,0.92
47,lemon,196,7.3,9.7,0.72
33,orange,190,7.5,8.1,0.74
14,apple,152,7.6,7.3,0.69
25,orange,356,9.2,9.2,0.75
16,apple,156,7.6,7.5,0.67
31,orange,210,7.8,8.0,0.82
28,orange,140,6.7,7.1,0.72
36,orange,160,7.1,7.6,0.76


In [4]:
# Dimensions of the loaded frame as (rows, columns)
df.shape

(59, 5)

In [5]:
# Distinct fruit names, in order of first appearance in the data
df['class'].drop_duplicates().tolist()

['apple', 'mandarin', 'orange', 'lemon']

In [6]:
# Features: the four numeric measurements. Target: the fruit name.
X = df.loc[:, ['mass', 'width', 'height', 'color_score']]
y = df.loc[:, 'class']

##### Encode the classes into numerical values using Sklearn's LabelEncoder

In [7]:
# Map each fruit name to an integer id. LabelEncoder orders its classes
# alphabetically: apple=0, lemon=1, mandarin=2, orange=3.
label_encoder = LabelEncoder()
label_encoder.fit(['apple', 'orange', 'mandarin', 'lemon'])
# Encode from the untouched df['class'] column rather than from `y` itself, so
# re-running this cell does not feed already-encoded integers back into the
# encoder (which raises on labels it has not seen).
y = label_encoder.transform(df['class'])

In [8]:
# Display the integer-encoded labels
y

array([0, 0, 0, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

##### Split X, y into train and test sets

In [9]:
# Hold out a quarter of the samples (scikit-learn's default split size, stated
# explicitly here) with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=123
)

In [10]:
# (rows, columns) of the training features
X_train.shape

(44, 4)

In [11]:
# (rows, columns) of the test features
X_test.shape

(15, 4)

##### Scale feature columns using Sklearn's MinMaxScaler

In [12]:
# Learn each feature's min/max from the training split only, then apply that
# same scaling to both splits so no test-set statistics reach the scaler.
scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [13]:
# First row of the scaled training features
X_train[0]

array([0.32142857, 0.32352941, 0.50769231, 1.        ])

In [14]:
# Encoded label of the first training sample
y_train[0]

0

In [15]:
# Show the container types of the training features and labels at this point
print(*(type(split) for split in (X_train, y_train)))

<class 'numpy.ndarray'> <class 'numpy.ndarray'>


##### Combine Scaled X & y into Train and Test DataFrames 

In [16]:
# Wrap the scaled arrays in DataFrames and put the label column first,
# followed by the four feature columns.
y_train = pd.DataFrame(y_train, columns=['class'])
X_train = pd.DataFrame(X_train, columns=['mass', 'width', 'height', 'color_score'])
# Both frames share the same default 0..n-1 index, so an index join lines the rows up.
train_df = y_train.join(X_train)
train_df.head()

Unnamed: 0,class,mass,width,height,color_score
0,0,0.321429,0.323529,0.507692,1.0
1,1,0.421429,0.411765,0.969231,0.323529
2,0,0.364286,0.382353,0.584615,0.970588
3,0,0.278571,0.352941,0.476923,0.852941
4,1,0.192857,0.058824,0.646154,0.352941


In [17]:
# Same layout as the training frame: label column first, then the features.
y_test = pd.DataFrame(y_test, columns=['class'])
X_test = pd.DataFrame(X_test, columns=['mass', 'width', 'height', 'color_score'])
# Both frames share the same default 0..n-1 index, so an index join lines the rows up.
test_df = y_test.join(X_test)
test_df.head()

Unnamed: 0,class,mass,width,height,color_score
0,1,0.142857,0.058824,0.538462,0.382353
1,3,0.371429,0.529412,0.646154,0.588235
2,0,0.314286,0.441176,0.569231,0.323529
3,1,0.157143,0.058824,0.676923,0.441176
4,3,0.457143,0.5,0.8,0.529412


##### Create a DataFrame for Batch Inference without the Class column

In [18]:
# Batch inference input: the test rows with the label column removed
batch_test_df = test_df.drop(columns=['class'])
batch_test_df.head()

Unnamed: 0,mass,width,height,color_score
0,0.142857,0.058824,0.538462,0.382353
1,0.371429,0.529412,0.646154,0.588235
2,0.314286,0.441176,0.569231,0.323529
3,0.157143,0.058824,0.676923,0.441176
4,0.457143,0.5,0.8,0.529412


##### Write Train & Test Sets to Local Directory

In [19]:
# Persist every split as a bare CSV: no header row and no index column.
for frame, csv_path in [(train_df, './DATA/train.csv'),
                        (test_df, './DATA/test.csv'),
                        (batch_test_df, './DATA/batch_test.csv')]:
    frame.to_csv(csv_path, header=False, index=False)

### 2. Essentials 

In [20]:
# S3 bucket and key prefix under which this demo stores its data and model artifacts
BUCKET = 'sagemaker-demo-892313895307' # USE YOUR ACCOUNT ID OR INITIALS AS SUFFIX
PREFIX = 'fruits-clf'

# IAM role this notebook runs as; it is handed to SageMaker when jobs are created
sagemaker_execution_role = get_execution_role()
print('Role = {}'.format(sagemaker_execution_role))

# boto3 session plus the S3 resource and SageMaker handles used in later cells
session = boto3.Session()
s3 = session.resource(service_name='s3')
sagemaker_client = boto3.client(service_name='sagemaker')
sagemaker_session = sagemaker.Session()

Role = arn:aws:iam::892313895307:role/service-role/AmazonSageMaker-ExecutionRole-20201021T165275


##### Upload Train & Test Sets to S3 and Create Pointers to Data

In [21]:
# Create the bucket in the session's region. S3 rejects a LocationConstraint for
# us-east-1 but requires one for every other region, so only pass it when needed.
bucket_region = session.region_name
create_bucket_kwargs = {}
if bucket_region and bucket_region != 'us-east-1':
    create_bucket_kwargs['CreateBucketConfiguration'] = {'LocationConstraint': bucket_region}
s3.create_bucket(Bucket=BUCKET, **create_bucket_kwargs)

s3.Bucket(name='sagemaker-demo-892313895307')

In [22]:
# Upload each local CSV into its own "folder" under PREFIX.
# S3 keys always use '/' as the separator, so build them with string formatting
# rather than os.path.join, whose separator depends on the operating system.
for s3_folder, file_name in [('train', 'train.csv'),
                             ('test', 'test.csv'),
                             ('batch_test', 'batch_test.csv')]:
    s3_key = '{}/{}/{}'.format(PREFIX, s3_folder, file_name)
    s3.Bucket(BUCKET).Object(s3_key).upload_file('./DATA/{}'.format(file_name))

##### Create Pointers to the uploaded files 

In [23]:
# S3 prefixes (not individual files) that hold the train and test CSVs
train_set_location = 's3://{}/{}/train/'.format(BUCKET, PREFIX)
test_set_location = 's3://{}/{}/test/'.format(BUCKET, PREFIX)

In [24]:
# Show both S3 prefixes, one per line
print(train_set_location, test_set_location, sep='\n')

s3://sagemaker-demo-892313895307/fruits-clf/train/
s3://sagemaker-demo-892313895307/fruits-clf/test/


In [25]:
# Describe the two S3 prefixes as CSV input channels for training jobs.
# NOTE: per the warning this cell emits, `s3_input` is renamed to `TrainingInput`
# in SageMaker Python SDK v2.
train_set_pointer = sagemaker.s3_input(s3_data=train_set_location, content_type='csv')
test_set_pointer = sagemaker.s3_input(s3_data=test_set_location, content_type='csv')

's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.
's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.


In [26]:
# Pretty-print the attributes of the training input to inspect its channel config
print(json.dumps(vars(train_set_pointer), indent=2))

{
  "config": {
    "DataSource": {
      "S3DataSource": {
        "S3DataType": "S3Prefix",
        "S3Uri": "s3://sagemaker-demo-892313895307/fruits-clf/train/",
        "S3DataDistributionType": "FullyReplicated"
      }
    },
    "ContentType": "csv"
  }
}


### 3. Train a Model using SageMaker + Built-in XGBoost Algorithm

In [27]:
# Look up the built-in XGBoost (version 1.0-1) container image for the session's region.
# NOTE: per the warning this cell emits, `get_image_uri` is deprecated in
# SageMaker Python SDK v2.
container_uri = get_image_uri(session.region_name, 'xgboost', '1.0-1')

'get_image_uri' method will be deprecated in favor of 'ImageURIProvider' class in SageMaker Python SDK v2.


In [28]:
# Estimator that runs the XGBoost container on a single ml.m5.large instance
# and writes its model artifacts under the demo prefix in S3.
xgb = sagemaker.estimator.Estimator(
    image_name=container_uri,
    role=sagemaker_execution_role,
    train_instance_count=1,
    train_instance_type='ml.m5.large',
    output_path='s3://{}/{}/model-artifacts'.format(BUCKET, PREFIX),
    sagemaker_session=sagemaker_session,
    base_job_name='fruits-clf-xgboost',
)

Parameter image_name will be renamed to image_uri in SageMaker Python SDK v2.


In [29]:
# Fixed (non-tuned) hyperparameters: a 4-class softmax objective trained for 100 rounds
xgb.set_hyperparameters(
    objective='multi:softmax',
    num_class=4,
    num_round=100,
)

In [30]:
# Search space explored by the tuning job, keyed by XGBoost hyperparameter name
hyperparameter_ranges = dict(
    eta=ContinuousParameter(0, 1),
    min_child_weight=ContinuousParameter(1, 10),
    alpha=ContinuousParameter(0, 2),
    max_depth=IntegerParameter(1, 10),
)

In [31]:
# Metric the tuning job optimizes: accuracy measured on the 'validation' channel
objective_metric_name = 'validation:accuracy'

In [32]:
# Tuning job definition: at most 10 training jobs in total, up to 5 running at once
tuner = HyperparameterTuner(
    estimator=xgb,
    objective_metric_name=objective_metric_name,
    hyperparameter_ranges=hyperparameter_ranges,
    max_jobs=10,
    max_parallel_jobs=5,
)

In [33]:
# Launch the tuning job. NOTE: the 'validation' channel is fed the held-out test
# split, so the tuned objective is measured on the same data used as the test set.
tuner.fit({'train': train_set_pointer, 'validation': test_set_pointer}, include_cls_metadata=False)
# In SageMaker Python SDK v1 (used here) fit() only submits the job. Block until it
# finishes so the cells below, which read the job status and BestTrainingJob, see a
# completed job on a fresh Restart & Run All.
tuner.wait()

In [34]:
# Name of the tuning job just launched; later cells use it to look the job up
hpo_job_name = tuner.latest_tuning_job.job_name
hpo_job_name

'sagemaker-xgboost-201022-0335'

In [36]:
# Fetch the tuning job's description from the SageMaker API and surface its status
tuning_job_results = sagemaker_client.describe_hyper_parameter_tuning_job(
    HyperParameterTuningJobName=hpo_job_name
)
status = tuning_job_results['HyperParameterTuningJobStatus']
status

'Completed'

In [37]:
# Best training job entry from the tuning job description, including its tuned
# hyperparameters and final objective metric value
best_training_job = tuning_job_results['BestTrainingJob']
best_training_job

{'TrainingJobName': 'sagemaker-xgboost-201022-0335-004-02b57de6',
 'TrainingJobArn': 'arn:aws:sagemaker:us-east-1:892313895307:training-job/sagemaker-xgboost-201022-0335-004-02b57de6',
 'CreationTime': datetime.datetime(2020, 10, 22, 3, 35, 33, tzinfo=tzlocal()),
 'TrainingStartTime': datetime.datetime(2020, 10, 22, 3, 37, 50, tzinfo=tzlocal()),
 'TrainingEndTime': datetime.datetime(2020, 10, 22, 3, 39, 6, tzinfo=tzlocal()),
 'TrainingJobStatus': 'Completed',
 'TunedHyperParameters': {'alpha': '0.014274175863314476',
  'eta': '0.60388441559809',
  'max_depth': '10',
  'min_child_weight': '1.1710502160852412'},
 'FinalHyperParameterTuningJobObjectiveMetric': {'MetricName': 'validation:accuracy',
  'Value': 0.7333300113677979},
 'ObjectiveStatus': 'Succeeded'}

### 4. Evaluate Results
We can list the hyperparameters and objective metric of every training job and pick the training job with the best objective metric.

In [38]:
# Load per-training-job results for the tuning job into a DataFrame.
# NOTE(review): this rebinds `tuner` from the HyperparameterTuner created earlier to
# an analytics object; later cells may rely on either meaning - confirm before renaming.
tuner = sagemaker.HyperparameterTuningJobAnalytics(hyperparameter_tuning_job_name=hpo_job_name)
hpo_results_df = tuner.dataframe()

In [39]:
# Show one row per training job with its hyperparameters and final objective value
hpo_results_df

Unnamed: 0,alpha,eta,max_depth,min_child_weight,TrainingJobName,TrainingJobStatus,FinalObjectiveValue,TrainingStartTime,TrainingEndTime,TrainingElapsedTimeSeconds
0,1.140055,0.931075,10.0,8.116883,sagemaker-xgboost-201022-0335-010-1d430b82,Completed,0.26667,2020-10-22 03:41:33+00:00,2020-10-22 03:43:01+00:00,88.0
1,0.371989,0.282882,6.0,8.241951,sagemaker-xgboost-201022-0335-009-f0dd9e71,Completed,0.6,2020-10-22 03:41:40+00:00,2020-10-22 03:42:41+00:00,61.0
2,0.517009,0.334395,8.0,4.996766,sagemaker-xgboost-201022-0335-008-fea4a371,Completed,0.66667,2020-10-22 03:41:20+00:00,2020-10-22 03:42:22+00:00,62.0
3,0.0,0.608626,10.0,4.577637,sagemaker-xgboost-201022-0335-007-e39d5546,Completed,0.6,2020-10-22 03:40:52+00:00,2020-10-22 03:41:56+00:00,64.0
4,1.206257,0.043765,10.0,7.720101,sagemaker-xgboost-201022-0335-006-1b854583,Completed,0.4,2020-10-22 03:41:05+00:00,2020-10-22 03:42:07+00:00,62.0
5,1.339357,0.502173,3.0,6.517747,sagemaker-xgboost-201022-0335-005-95b9456f,Completed,0.53333,2020-10-22 03:37:28+00:00,2020-10-22 03:38:35+00:00,67.0
6,0.014274,0.603884,10.0,1.17105,sagemaker-xgboost-201022-0335-004-02b57de6,Completed,0.73333,2020-10-22 03:37:50+00:00,2020-10-22 03:39:06+00:00,76.0
7,0.508539,0.242112,6.0,2.75097,sagemaker-xgboost-201022-0335-003-7b94c107,Completed,0.6,2020-10-22 03:37:26+00:00,2020-10-22 03:39:00+00:00,94.0
8,1.534186,0.235733,3.0,4.508081,sagemaker-xgboost-201022-0335-002-630f4ddd,Completed,0.53333,2020-10-22 03:37:24+00:00,2020-10-22 03:38:41+00:00,77.0
9,1.714294,0.3388,5.0,3.568455,sagemaker-xgboost-201022-0335-001-31be91da,Completed,0.6,2020-10-22 03:37:34+00:00,2020-10-22 03:38:47+00:00,73.0
