## Train XGBoost Model using SageMaker Training + Serve (Host) the Model as a SageMaker Endpoint

##### Imports 

In [None]:
import sagemaker
from sagemaker.serializers import CSVSerializer
from sagemaker.inputs import TrainingInput
from sagemaker.predictor import Predictor
from sagemaker import get_execution_role
import pandas as pd
import numpy as np
import  boto3
import json
import os

### Essentials 

Get the SageMaker execution role for this notebook and create AWS sessions.

In [None]:
# Key prefix under which every object written by this notebook is stored.
PREFIX = 'clf'

# IAM role attached to this notebook; passed to the training job further down.
sagemaker_execution_role = get_execution_role()

# boto3 session plus an S3 resource handle for the bucket/object calls.
session = boto3.Session()
s3 = session.resource('s3')

# SageMaker session and the default bucket it resolves for this account/region.
sagemaker_session = sagemaker.Session()
BUCKET = sagemaker_session.default_bucket()

##### Upload Train & Test Sets to S3 and Create Pointers to Data

In [None]:
# `sagemaker_session.default_bucket()` normally creates the default bucket
# already, so this call is only a safety net and must tolerate a bucket that
# exists. Outside us-east-1, S3 requires an explicit LocationConstraint and
# reports an already-owned bucket as an error instead of succeeding.
bucket_region = session.region_name
try:
    if bucket_region in (None, 'us-east-1'):
        s3.create_bucket(Bucket=BUCKET)
    else:
        s3.create_bucket(Bucket=BUCKET,
                         CreateBucketConfiguration={'LocationConstraint': bucket_region})
except s3.meta.client.exceptions.BucketAlreadyOwnedByYou:
    # The bucket is already present in this account; nothing left to do.
    pass

In [None]:
# Upload each local CSV under ./DATA to the same relative path below PREFIX.
# S3 object keys always use '/' as the separator, so the key is built
# explicitly rather than with os.path.join, which follows the local OS
# convention (and would emit '\' on Windows). This also matches how the
# s3:// URIs for these objects are assembled later in the notebook.
for relative_path in ('train/train.csv', 'test/test.csv', 'batch_test/batch_test.csv'):
    object_key = '{}/{}'.format(PREFIX, relative_path)
    s3.Bucket(BUCKET).Object(object_key).upload_file('./DATA/{}'.format(relative_path))

##### Create Pointers to the uploaded files 

In [None]:
# S3 URI prefixes (folders, not single objects) holding the uploaded CSVs.
bucket_and_prefix = (BUCKET, PREFIX)
train_set_location = 's3://{}/{}/train/'.format(*bucket_and_prefix)
test_set_location = 's3://{}/{}/test/'.format(*bucket_and_prefix)

In [None]:
# Show both S3 prefixes, one per line, to confirm they look right.
print(train_set_location, test_set_location, sep='\n')

In [None]:
# Wrap each S3 prefix in a TrainingInput channel definition. Both channels use
# the same settings: CSV content, fully replicated to every training instance.
train_set_pointer, test_set_pointer = (
    TrainingInput(s3_data=location, content_type='csv', distribution='FullyReplicated')
    for location in (train_set_location, test_set_location)
)

In [None]:
# Pretty-print the attributes stored on the train channel object.
train_channel_attributes = vars(train_set_pointer)
print(json.dumps(train_channel_attributes, indent=2))

### Train a Model using SageMaker + Built-in XGBoost Algorithm

In [None]:
# Look up the XGBoost (version 1.0-1) training image URI for the region of the
# boto3 session created earlier.
container_uri = sagemaker.image_uris.retrieve(
    framework='xgboost',
    region=session.region_name,
    version='1.0-1',
    image_scope='training',
)

#### Distributed training <br>
Distributed training is possible whether the data is stored in a single file or in multiple files. If it’s stored in a single file, the training computations are distributed across the number of EC2 instances specified by the user.<br> <br>If the data is spread across multiple files, SageMaker can handle the data in one of two ways: <br>1) `FullyReplicated`: the data is copied in full to each of the EC2 instances specified by the user. This leads to slower training times and greater memory consumption, yet it likely produces more accurate models since each EC2 instance sees the full training data. <br>2) `ShardedByS3Key` is faster and more memory-efficient, yet slightly less accurate, since each EC2 instance only sees a portion of the total training data. For example, given 3 EC2 instances forming the distributed compute environment, the training data in S3 is split equally across the 3 EC2 instances (i.e. each EC2 instance sees roughly 1/3rd of the total training data) for fast training times.

**Note:** When you use `FullyReplicated` mode in a distributed setting, the data is copied onto all the machines; however, the training is still done using all the machines. Each machine uses part of the data for training, and after every batch the weights are synchronized across machines.

In [None]:
# Trained model artifacts are written under this S3 prefix.
model_artifacts_uri = 's3://{}/{}/model-artifacts'.format(BUCKET, PREFIX)

# Generic Estimator around the XGBoost image, running on two ml.m5.large
# instances so the training job is distributed.
xgb = sagemaker.estimator.Estimator(
    image_uri=container_uri,
    role=sagemaker_execution_role,
    instance_count=2,
    instance_type='ml.m5.large',
    output_path=model_artifacts_uri,
    sagemaker_session=sagemaker_session,
    base_job_name='classifier',
)

# Binary classification with logistic output, trained for 100 boosting rounds.
training_hyperparameters = {'objective': 'binary:logistic', 'num_round': 100}
xgb.set_hyperparameters(**training_hyperparameters)

In [None]:
# Launch the training job: the train set feeds the 'train' channel and the
# test set is supplied as the 'validation' channel.
xgb.fit(inputs={'train': train_set_pointer, 'validation': test_set_pointer})

### Host the Trained Model as a SageMaker Endpoint (using Estimator object)

In [None]:
# Deploy the trained model behind an endpoint backed by two ml.m5.large instances.
xgb_predictor = xgb.deploy(
    instance_type='ml.m5.large',
    initial_instance_count=2,
)

### Real Time Inference from the Endpoint

In [None]:
# Attach a second Predictor to the endpoint created above, this time with a
# CSV serializer so rows of feature values can be passed to predict() directly.
endpoint_name = xgb_predictor.endpoint_name
csv_serializer = CSVSerializer()
predictor = Predictor(endpoint_name=endpoint_name, serializer=csv_serializer)

In [None]:
# The CSV is read with explicit column names; the first column is the label
# ('class') and the remaining four are the features.
test_columns = ['class', 'bmi', 'diastolic_bp_change', 'systolic_bp_change', 'respiratory_rate']
test_df = pd.read_csv('./DATA/test/test.csv', names=test_columns)

In [None]:
# Preview the first rows of the loaded test set as a sanity check.
test_df.head()

In [None]:
# Draw one random row from the test set and display it.
X = test_df.sample(n=1)
X

In [None]:
# Reduce the one-row DataFrame to its single row of underlying values.
X = X.values[0]
# Display everything after the first element; the first column was named
# 'class' when the CSV was loaded, so what remains are the feature values.
X[1:]

In [None]:
# Request payload: the sampled row without its first element (the 'class'
# column), so only feature values are sent to the endpoint.
payload = X[1:]

In [None]:
%%time

predicted_class_prob = predictor.predict(payload).decode('utf-8')
if float(predicted_class_prob) < 0.5:
    print('PREDICTION = NOT DIABETIC')
else:
    print('PREDICTION = DIABETIC')
print()

### Evaluate Hosted Model for Accuracy

In [None]:
# Score every test row against the endpoint (one request per row), collecting
# the predicted and expected labels and counting the matches.
predictions, expected = [], []
correct = 0
for row in test_df.values:
    # First element is the label; the rest are the features sent to the endpoint.
    expected_class, payload = row[0], row[1:]
    predicted_class_prob = predictor.predict(payload).decode('utf-8')
    # Scores below 0.5 map to class 0, everything else to class 1.
    predicted_class = 0 if float(predicted_class_prob) < 0.5 else 1
    correct += int(predicted_class == expected_class)
    predictions.append(predicted_class)
    expected.append(expected_class)

In [None]:
# Share of test rows whose thresholded prediction matched the label, in percent.
accuracy_pct = correct / len(predictions) * 100
print('Accuracy = {:.2f}%'.format(accuracy_pct))

#### Confusion Matrix

In [None]:
# Convert both label lists to Series, then cross-tabulate actual vs. predicted
# classes; margins=True adds the row/column totals.
expected, predictions = (
    pd.Series(np.array(labels)) for labels in (expected, predictions)
)
pd.crosstab(
    index=expected,
    columns=predictions,
    rownames=['Actual'],
    colnames=['Predicted'],
    margins=True,
)