### 1. Imports 

In [1]:
# ML Imports 
import pandas as pd
import numpy as np
import json
import os

# AWS Imports 
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker.predictor import csv_serializer
from sagemaker import get_execution_role
import sagemaker
import boto3

### 2. Essentials 

In [2]:
# Get the Execution Role for this Notebook and an AWS Session to create Clients
sagemaker_execution_role = get_execution_role()
print('Role = {}'.format(sagemaker_execution_role))
session = boto3.Session()

# Clients and Resources
s3 = session.resource('s3')
sagemaker_session = sagemaker.Session()

# S3 bucket names are globally unique, so the suffix is derived from the caller's own
# AWS account ID instead of being hard-coded to a single account.
ACCOUNT_ID = session.client('sts').get_caller_identity()['Account']
BUCKET = 'sagemaker-demo-{}'.format(ACCOUNT_ID)
# Key prefix under which every object of this demo is stored
PREFIX = 'fruits-clf'

Role = arn:aws:iam::892313895307:role/service-role/AmazonSageMaker-ExecutionRole-20200827T161464


##### Upload Train & Test Sets to S3 and Create Pointers to Data

In [3]:
# Outside us-east-1, S3 rejects CreateBucket unless the region is passed as a
# LocationConstraint; us-east-1 itself must NOT be passed as a constraint.
create_bucket_kwargs = {'Bucket': BUCKET}
if session.region_name and session.region_name != 'us-east-1':
    create_bucket_kwargs['CreateBucketConfiguration'] = {'LocationConstraint': session.region_name}

try:
    s3.create_bucket(**create_bucket_kwargs)
except s3.meta.client.exceptions.BucketAlreadyOwnedByYou:
    # Re-running the notebook: the bucket already exists in this account, which is fine.
    pass

# Display the bucket handle as the cell result
s3.Bucket(BUCKET)

s3.Bucket(name='sagemaker-demo-892313895307')

In [4]:
# (S3 key relative to PREFIX, local file) for every dataset the notebook needs in S3.
dataset_uploads = [
    ('train/train.csv', './DATA/train/train.csv'),
    ('test/test.csv', './DATA/test/test.csv'),
    ('batch_test/batch_test.csv', './DATA/test/batch_test.csv'),
]

demo_bucket = s3.Bucket(BUCKET)
for key_suffix, local_path in dataset_uploads:
    # S3 keys always use '/' as the separator, so build them explicitly rather than
    # with os.path.join, whose separator depends on the local operating system.
    demo_bucket.Object('{}/{}'.format(PREFIX, key_suffix)).upload_file(local_path)

##### Create Pointers to the uploaded files 

In [5]:
# Both locations live under the same bucket/prefix pair; only the trailing folder differs.
s3_location_parts = (BUCKET, PREFIX)
train_set_location = 's3://{}/{}/train/'.format(*s3_location_parts)
test_set_location = 's3://{}/{}/test/'.format(*s3_location_parts)

In [6]:
# Show the two S3 prefixes, one per line
print(train_set_location, test_set_location, sep='\n')

s3://sagemaker-demo-892313895307/fruits-clf/train/
s3://sagemaker-demo-892313895307/fruits-clf/test/


In [7]:
# Describe each S3 prefix as a CSV input channel for the training job.
train_set_pointer = sagemaker.s3_input(
    content_type='csv',
    s3_data=train_set_location,
)
test_set_pointer = sagemaker.s3_input(
    content_type='csv',
    s3_data=test_set_location,
)

's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.
's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.


In [8]:
# Inspect the channel configuration that the pointer object holds
train_pointer_attributes = vars(train_set_pointer)
print(json.dumps(train_pointer_attributes, indent=2))

{
  "config": {
    "DataSource": {
      "S3DataSource": {
        "S3DataType": "S3Prefix",
        "S3Uri": "s3://sagemaker-demo-892313895307/fruits-clf/train/",
        "S3DataDistributionType": "FullyReplicated"
      }
    },
    "ContentType": "csv"
  }
}


### 3. Train a Model using SageMaker + Built-in XGBoost Algorithm

In [9]:
# Look up the built-in XGBoost (version 1.0-1) container image for the session's region
container_uri = get_image_uri(
    region_name=session.region_name,
    repo_name='xgboost',
    repo_version='1.0-1',
)

'get_image_uri' method will be deprecated in favor of 'ImageURIProvider' class in SageMaker Python SDK v2.


In [10]:
# Model artifacts are written next to the datasets, under the same bucket/prefix
model_artifacts_location = 's3://{}/{}/model-artifacts'.format(BUCKET, PREFIX)

# Estimator: which container to run, on which instances, with which role
xgb = sagemaker.estimator.Estimator(
    image_name=container_uri,
    role=sagemaker_execution_role,
    sagemaker_session=sagemaker_session,
    base_job_name='fruits-clf-xgboost',
    train_instance_type='ml.m5.large',
    train_instance_count=2,
    output_path=model_artifacts_location,
)

# Multi-class classification over 4 classes, boosted for 100 rounds
xgb_hyperparameters = {
    'objective': 'multi:softmax',
    'num_class': 4,
    'num_round': 100,
}
xgb.set_hyperparameters(**xgb_hyperparameters)

Parameter image_name will be renamed to image_uri in SageMaker Python SDK v2.


In [11]:
# Launch the training job: the train split feeds the 'train' channel and the
# test split is used as the 'validation' channel.
training_channels = {
    'train': train_set_pointer,
    'validation': test_set_pointer,
}
xgb.fit(training_channels)

2020-11-04 15:45:32 Starting - Starting the training job...
2020-11-04 15:45:34 Starting - Launching requested ML instances......
2020-11-04 15:46:48 Starting - Preparing the instances for training.........
2020-11-04 15:48:13 Downloading - Downloading input data...
2020-11-04 15:48:59 Training - Training image download completed. Training in progress..[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value multi:softmax to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34m[15:48:54] 44x4 matrix with 176 entries loaded from

[34mINFO:RabitTracker:[34]#011train-merror:0.00000#011validation-merror:0.13333[0m
[34mINFO:RabitTracker:[35]#011train-merror:0.00000#011validation-merror:0.13333[0m
[34mINFO:RabitTracker:[36]#011train-merror:0.00000#011validation-merror:0.13333[0m
[34mINFO:RabitTracker:[37]#011train-merror:0.00000#011validation-merror:0.13333[0m
[34mINFO:RabitTracker:[38]#011train-merror:0.00000#011validation-merror:0.13333[0m
[34mINFO:RabitTracker:[39]#011train-merror:0.00000#011validation-merror:0.13333[0m
[34mINFO:RabitTracker:[40]#011train-merror:0.00000#011validation-merror:0.13333[0m
[34mINFO:RabitTracker:[41]#011train-merror:0.00000#011validation-merror:0.13333[0m
[34mINFO:RabitTracker:[42]#011train-merror:0.00000#011validation-merror:0.13333[0m
[34mINFO:RabitTracker:[43]#011train-merror:0.00000#011validation-merror:0.13333[0m
[34mINFO:RabitTracker:[44]#011train-merror:0.00000#011validation-merror:0.13333[0m
[34mINFO:RabitTracker:[45]#011train-merror:0.00000#011validation

### 4. Host the Trained Model as a SageMaker Endpoint (using Estimator object)

In [12]:
# Host the trained model behind a real-time endpoint backed by two ml.m5.large instances
xgb_predictor = xgb.deploy(
    instance_type='ml.m5.large',
    initial_instance_count=2,
)

Parameter image will be renamed to image_uri in SageMaker Python SDK v2.


---------------!

### 5. Real Time Inference from the Endpoint

In [13]:
# Send request payloads to the endpoint as CSV: serialize with csv_serializer
# and declare the matching content type.
xgb_predictor.serializer = csv_serializer
xgb_predictor.content_type = 'text/csv'

In [15]:
# The test CSV has no header row, so column names are supplied here; the label comes first.
test_columns = ['class', 'mass', 'width', 'height', 'color_score']
test_df = pd.read_csv('./DATA/test/test.csv', header=None, names=test_columns)

In [16]:
# Preview the first five rows of the test set
test_df.head(n=5)

Unnamed: 0,class,mass,width,height,color_score
0,1,0.142857,0.058824,0.538462,0.382353
1,3,0.371429,0.529412,0.646154,0.588235
2,0,0.314286,0.441176,0.569231,0.323529
3,1,0.157143,0.058824,0.676923,0.441176
4,3,0.457143,0.5,0.8,0.529412


In [17]:
# Draw one random test row to use as a single real-time request
# (no seed is set, so a different row is picked on each run)
X = test_df.sample(n=1)
X

Unnamed: 0,class,mass,width,height,color_score
5,3,1.021429,1.117647,0.8,0.441176


In [18]:
# Flatten the one-row frame into a 1-D array, then show everything after the label.
# NOTE: this rebinds X from a DataFrame to an array, so the cell cannot be re-run
# without first re-running the sampling cell.
X = X.iloc[0].values
X[1:]

array([1.02142857, 1.11764706, 0.8       , 0.44117647])

In [19]:
# Drop element 0 (the class label) so that only the feature values are sent to the endpoint
payload = X[1:]

In [21]:
%%time

predicted_class = xgb_predictor.predict(payload).decode('utf-8')
predicted_class

CPU times: user 3.46 ms, sys: 2.04 ms, total: 5.5 ms
Wall time: 24.5 ms


'3.0'

### 6. Evaluate Hosted Model for Accuracy

In [22]:
# Score every test row against the hosted endpoint, one request per row.
#   predictions - class returned by the endpoint for each row (decoded text)
#   expected    - true class label of each row
#   correct     - number of rows where the two agree
predictions = []
expected = []
correct = 0
for row in test_df.values:
    # Column 0 is the class label; the remaining columns are the features
    expected_class = row[0]
    payload = row[1:]
    predicted_class = xgb_predictor.predict(payload).decode('utf-8')
    # Compare numerically: matching the text '3.0' against str(3.0) only works while
    # both sides happen to be formatted identically (it breaks on '3' or on trailing
    # whitespace in the response).
    if float(predicted_class) == float(expected_class):
        correct += 1
    predictions.append(predicted_class)
    expected.append(expected_class)

In [23]:
# Share of test rows classified correctly, as a percentage
accuracy_percent = correct/len(predictions) * 100
print('Accuracy = {:.2f}%'.format(accuracy_percent))

Accuracy = 86.67%


#### Confusion Matrix

In [24]:
# Convert both label lists to Series, then cross-tabulate actual vs. predicted classes;
# margins=True appends the 'All' totals row and column.
expected, predictions = (
    pd.Series(np.array(labels)) for labels in (expected, predictions)
)
pd.crosstab(
    index=expected,
    columns=predictions,
    margins=True,
    rownames=['Actual'],
    colnames=['Predicted'],
)

Predicted,0.0,1.0,2.0,3.0,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0.0,4,1,0,0,5
1.0,0,2,0,1,3
2.0,0,0,1,0,1
3.0,0,0,0,6,6
All,4,3,1,7,15
