## Train XGBoost Model using SageMaker Training + Serve (Host) the Model as a SageMaker Endpoint

### 1. Imports 

In [1]:
import pandas as pd
import numpy as np
import json
import os

from sagemaker.serializers import CSVSerializer
from sagemaker.inputs import TrainingInput
from sagemaker.predictor import Predictor
from sagemaker import get_execution_role
import sagemaker
import boto3

In [2]:
# Display the installed SageMaker Python SDK version so readers can reproduce the environment
sagemaker.__version__

'2.16.4.dev0'

### 2. Essentials 

In [3]:
# IAM role attached to this notebook; it is handed to the training job later on
sagemaker_execution_role = get_execution_role()
print('Role = {}'.format(sagemaker_execution_role))

# The boto3 session backs the S3 resource; the SageMaker session supplies
# the default bucket and is reused by the Estimator.
session = boto3.Session()
s3 = session.resource(service_name='s3')
sagemaker_session = sagemaker.Session()

# Where every dataset and model artifact of this notebook lives in S3
PREFIX = 'clf'
BUCKET = sagemaker_session.default_bucket()

Role = arn:aws:iam::892313895307:role/service-role/AmazonSageMaker-ExecutionRole-20200629T123070


##### Upload Train & Test Sets to S3 and Create Pointers to Data

In [4]:
# default_bucket() normally creates this bucket already, so this call is only a
# safety net. Outside us-east-1, S3 requires an explicit LocationConstraint, and
# re-creating a bucket this account already owns raises BucketAlreadyOwnedByYou
# there, so both cases are handled.
create_bucket_kwargs = {'Bucket': BUCKET}
if session.region_name not in (None, 'us-east-1'):
    create_bucket_kwargs['CreateBucketConfiguration'] = {
        'LocationConstraint': session.region_name
    }
try:
    s3.create_bucket(**create_bucket_kwargs)
except s3.meta.client.exceptions.BucketAlreadyOwnedByYou:
    pass  # the bucket already exists in this account; nothing to do
# Keep the bucket resource as the cell's displayed value
s3.Bucket(BUCKET)

s3.Bucket(name='sagemaker-us-east-1-892313895307')

In [5]:
# Upload each local dataset under s3://BUCKET/PREFIX/, mirroring the ./DATA layout.
# S3 object keys always use '/' as the separator, so they are built with string
# formatting instead of os.path.join (which follows the local OS convention).
for relative_path in ('train/train.csv', 'test/test.csv', 'batch_test/batch_test.csv'):
    s3_key = '{}/{}'.format(PREFIX, relative_path)
    local_path = './DATA/{}'.format(relative_path)
    s3.Bucket(BUCKET).Object(s3_key).upload_file(local_path)

##### Create Pointers to the uploaded files 

In [6]:
# S3 prefixes (not single objects) that hold the uploaded train and test CSVs
train_set_location, test_set_location = (
    's3://{}/{}/train/'.format(BUCKET, PREFIX),
    's3://{}/{}/test/'.format(BUCKET, PREFIX),
)

In [7]:
# Show both S3 prefixes, one per line
print(train_set_location, test_set_location, sep='\n')

s3://sagemaker-us-east-1-892313895307/clf/train/
s3://sagemaker-us-east-1-892313895307/clf/test/


In [8]:
# Wrap each S3 prefix in a TrainingInput channel definition that reads CSV data
train_set_pointer, test_set_pointer = (
    TrainingInput(s3_data=location, content_type='csv')
    for location in (train_set_location, test_set_location)
)

In [9]:
# Inspect the channel configuration held by the TrainingInput object
print(json.dumps(vars(train_set_pointer), indent=2))

{
  "config": {
    "DataSource": {
      "S3DataSource": {
        "S3DataType": "S3Prefix",
        "S3Uri": "s3://sagemaker-us-east-1-892313895307/clf/train/",
        "S3DataDistributionType": "FullyReplicated"
      }
    },
    "ContentType": "csv"
  }
}


### 3. Train a Model using SageMaker + Built-in XGBoost Algorithm

In [10]:
# Look up the training image of the built-in XGBoost algorithm (version 1.0-1)
# for the region this notebook's boto3 session is using.
image_lookup = {
    'framework': 'xgboost',
    'region': session.region_name,
    'version': '1.0-1',
    'image_scope': 'training',
}
container_uri = sagemaker.image_uris.retrieve(**image_lookup)

In [11]:
# Model artifacts from the training job are written below this S3 prefix
model_artifacts_uri = 's3://{}/{}/model-artifacts'.format(BUCKET, PREFIX)

# Generic Estimator wired to the XGBoost training image; jobs it launches are
# named with the 'classifier' prefix and run on two ml.m5.large instances.
xgb = sagemaker.estimator.Estimator(
    image_uri=container_uri,
    role=sagemaker_execution_role,
    instance_count=2,
    instance_type='ml.m5.large',
    output_path=model_artifacts_uri,
    sagemaker_session=sagemaker_session,
    base_job_name='classifier',
)

# Binary classification that outputs a probability, trained for 100 boosting rounds
xgb_hyperparameters = {'objective': 'binary:logistic', 'num_round': 100}
xgb.set_hyperparameters(**xgb_hyperparameters)

In [None]:
# Launch the training job: the 'train' channel feeds the learner and the
# 'validation' channel is fed from the held-out test set.
data_channels = {'train': train_set_pointer, 'validation': test_set_pointer}
xgb.fit(data_channels)

### 4. Host the Trained Model as a SageMaker Endpoint (using Estimator object)

In [13]:
# Host the trained model on a real-time endpoint backed by two ml.m5.large instances
endpoint_config = {'instance_type': 'ml.m5.large', 'initial_instance_count': 2}
xgb_predictor = xgb.deploy(**endpoint_config)

---------------!

### 5. Real Time Inference from the Endpoint

In [14]:
# Wrap the deployed endpoint in a Predictor that serializes request payloads as CSV.
# The notebook's SageMaker session is passed explicitly (as it is for the Estimator)
# so the Predictor does not silently create a separate default session.
csv_serializer = CSVSerializer()
predictor = Predictor(endpoint_name=xgb_predictor.endpoint_name,
                      sagemaker_session=sagemaker_session,
                      serializer=csv_serializer)

In [15]:
# Column names are supplied explicitly; the label ('class') is the first
# column, followed by the four features.
TEST_COLUMNS = ['class', 'bmi', 'diastolic_bp_change', 'systolic_bp_change', 'respiratory_rate']
test_df = pd.read_csv('./DATA/test/test.csv', names=TEST_COLUMNS)

In [16]:
# Preview the first five rows of the test set
test_df.head(n=5)

Unnamed: 0,class,bmi,diastolic_bp_change,systolic_bp_change,respiratory_rate
0,0,-0.940089,-0.403964,-0.279542,-0.817379
1,0,-0.502614,-0.665582,0.131742,-0.36245
2,0,1.078473,0.347981,0.228029,-0.817379
3,1,-0.636164,-0.251491,0.587034,-0.817379
4,1,-0.528479,2.037253,1.383463,0.185934


In [17]:
# Draw one test row for the real-time inference demo. The random_state is fixed
# so that Restart & Run All picks the same row (and prediction) every time.
X = test_df.sample(1, random_state=42)
X

Unnamed: 0,class,bmi,diastolic_bp_change,systolic_bp_change,respiratory_rate
957,0,-0.101595,2.298871,0.061596,-0.075366


In [18]:
# Reduce the one-row frame to a 1-D array of [class, features...].
# np.atleast_2d keeps this cell safe to re-run: on a second execution X is
# already the flattened array, which has no `.values` attribute.
X = np.atleast_2d(np.asarray(X))[0]
# Everything after the leading class label is the feature vector
X[1:]

array([-0.10159542,  2.29887101,  0.06159553, -0.07536643])

In [19]:
# Request payload = the feature values only; index 0 holds the 'class' label and is dropped
payload = X[1:]

In [20]:
%%time

# The endpoint answers with raw bytes; decode them to get the probability text
predicted_class_prob = predictor.predict(payload).decode('utf-8')
# Probabilities below 0.5 are reported as the negative class
is_negative = float(predicted_class_prob) < 0.5
print('PREDICTION = NOT DIABETIC' if is_negative else 'PREDICTION = DIABETIC')
print()

PREDICTION = NOT DIABETIC

CPU times: user 12.6 ms, sys: 171 µs, total: 12.8 ms
Wall time: 176 ms


### 6. Evaluate Hosted Model for Accuracy

In [21]:
# Score every test row against the live endpoint (one request per row) and
# collect the predicted and actual labels for the metrics below.
predictions, expected = [], []
correct = 0
for row in test_df.to_numpy():
    # First value is the label, the remainder is the feature vector
    expected_class, payload = row[0], row[1:]
    predicted_class_prob = predictor.predict(payload).decode('utf-8')
    # Probabilities below 0.5 map to class 0, everything else to class 1
    predicted_class = 0 if float(predicted_class_prob) < 0.5 else 1
    correct += int(predicted_class == expected_class)
    predictions.append(predicted_class)
    expected.append(expected_class)

In [22]:
# Share of test rows whose predicted class matched the label, as a percentage
accuracy_pct = correct / len(predictions) * 100
print('Accuracy = {:.2f}%'.format(accuracy_pct))

Accuracy = 77.72%


#### Confusion Matrix

In [23]:
# Convert both label collections to Series so crosstab can tabulate them
expected, predictions = (
    pd.Series(np.array(labels)) for labels in (expected, predictions)
)
# Rows are the actual classes, columns the predicted ones; margins adds the 'All' totals
pd.crosstab(index=expected,
            columns=predictions,
            rownames=['Actual'],
            colnames=['Predicted'],
            margins=True)

Predicted,0,1,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,1909,71,1980
1.0,483,24,507
All,2392,95,2487
