# SageMaker Classification Demo with XGBoost

## Startup

In [None]:
import os, sagemaker
import numpy as np
import pandas as pd
from sagemaker import get_execution_role

In [None]:
# Resolve the execution role; it is passed to the estimator when the model is defined.
iam_role = get_execution_role()

# One SageMaker session handle, reused by later cells.
sess = sagemaker.Session()

# The region is read from the boto session that backs the SageMaker session.
my_region = sess.boto_session.region_name

print("".join(("Success - the SageMaker instance is in the ", my_region, " region")))

## Load Data

In [None]:
# Set S3 bucket name and folder
# NOTE: `<< BUCKET NAME >>` is a placeholder, not valid Python. Replace it with
# the bucket name as a quoted string before running this cell.

bucket = << BUCKET NAME >>
# Key prefix (folder) inside the bucket under which the CSV files are read and written.
prefix = "data"
print('Using bucket ' + bucket)

In [None]:
# Build each S3 URI from the same template, then read the split straight from S3.
s3_uri = "s3://{}/{}/{}".format

data_fname = s3_uri(bucket, prefix, "train_data.csv")
train_df = pd.read_csv(data_fname)

data_fname = s3_uri(bucket, prefix, "val_data.csv")
val_df = pd.read_csv(data_fname)

data_fname = s3_uri(bucket, prefix, "test_data.csv")
test_df = pd.read_csv(data_fname)

## Explore Data

In [None]:
# Print the (rows, columns) shape of the training split, then display its summary statistics.
print(train_df.shape)
train_df.describe()

In [None]:
# Print the (rows, columns) shape of the validation split, then display its summary statistics.
print(val_df.shape)
val_df.describe()

In [None]:
# Print the (rows, columns) shape of the test split, then display its summary statistics.
print(test_df.shape)
test_df.describe()

## Prepare Data

In [None]:
# This is where data preparation steps are performed
# XGBoost expects labels to be in the first column
# Normally we need to add labels as the first column in the data, like so:
# np.insert(x, 0, y, axis=1)  # where x is (n,m), y is (n,1)
# But the wine dataset already has labels in the first column. 

In [None]:
# Preview the first two training rows.
train_df.head(n=2)

In [None]:
# Preview the first two validation rows.
val_df.head(n=2)

In [None]:
# Preview the first two test rows.
test_df.head(n=2)

### Set up data on S3 for model to access
* Note `index=False` and `header=False` arguments passed to `to_csv`
* This format is required to train XGBoost model

In [None]:
# Save each prepared split to a local CSV file, without the index and without a header row.
csv_options = {"index": False, "header": False}

train_df.to_csv('train_data.csv', **csv_options)
val_df.to_csv('val_data.csv', **csv_options)
test_df.to_csv('test_data.csv', **csv_options)

In [None]:
# Push the local CSV files to S3 under <prefix>/model_data and keep the returned S3 paths.

key_prefix = prefix + "/model_data"
upload_target = {"bucket": bucket, "key_prefix": key_prefix}

train_path = sess.upload_data(path='train_data.csv', **upload_target)
print('Train data uploaded to ' + train_path)

val_path = sess.upload_data(path='val_data.csv', **upload_target)
print('Validation data uploaded to ' + val_path)

test_path = sess.upload_data(path='test_data.csv', **upload_target)
print('Test data uploaded to ' + test_path)

## Train XGBoost Model

### Create channels for train and validation data to feed to model

In [None]:
# Wrap each uploaded CSV path in a TrainingInput channel.

make_channel = sagemaker.inputs.TrainingInput

s3_input_train = make_channel(s3_data=train_path, content_type='csv')
s3_input_val = make_channel(s3_data=val_path, content_type='csv')
s3_input_test = make_channel(s3_data=test_path, content_type='csv')

In [None]:
# S3 location that is handed to the estimator as its output path.

model_uri_template = "s3://{}/{}/model"
output_location = model_uri_template.format(bucket, prefix)

announcement = 'Training artifacts will be uploaded to: {}'
print(announcement.format(output_location))

### Create model

In [None]:
# Look up the container image URI of the built-in XGBoost algorithm for this region.
# `image_uris` is imported from the top-level `sagemaker` package, which is its
# documented public location, instead of reaching through
# `sagemaker.amazon.amazon_estimator`.
# NOTE(review): version='latest' is kept from the original; consider pinning an
# explicit XGBoost version -- confirm against the image_uris documentation.
from sagemaker import image_uris

xgb_image = image_uris.retrieve(framework="xgboost", region=my_region, version='latest')

In [None]:
# Estimator that runs the XGBoost image on a single ml.m5.xlarge instance
# and writes its output to `output_location`.
xgb_model = sagemaker.estimator.Estimator(
    xgb_image,
    iam_role,
    instance_type='ml.m5.xlarge',
    instance_count=1,
    # train_volume_size = 5,
    sagemaker_session=sess,
    output_path=output_location,
)

### Set model hyperparameters

In [None]:
# https://docs.aws.amazon.com/sagemaker/latest/dg/xgboost_hyperparameters.html
# https://github.com/dmlc/xgboost/blob/master/doc/parameter.rst#learning-task-parameters

# Multi-class softmax over 3 classes, 10 boosting rounds, tree depth capped at 10.
xgb_hyperparameters = {
    "max_depth": 10,
    "objective": "multi:softmax",
    "num_class": 3,
    "num_round": 10,
}
xgb_model.set_hyperparameters(**xgb_hyperparameters)

### Train model using train and validation data channels

In [None]:
%%time

# NOTE:  Training may take several minutes

# Launch training with the 'train' and 'validation' channels
xgb_model.fit(
    inputs={
        'train': s3_input_train,
        'validation': s3_input_val,
    }
)

### Deploy model for real-time inference

In [None]:
%%time

# NOTE:  Deployment may take several minutes

# Stand up a one-instance real-time endpoint that serializes requests as CSV.
xgb_predictor = xgb_model.deploy(
    instance_type='ml.t2.medium',
    initial_instance_count=1,
    serializer=sagemaker.serializers.CSVSerializer(),
)

## Real-Time Inference
### NOTE - This step can (and should) be done in a separate notebook/application
* For the purpose of the exercise, we will extract the endpoint from the `xgb_predictor` variable
* The actual endpoint will be available in SageMaker dashboard once the model is deployed
* Initialize a new Predictor (usually done in a separate application) with the endpoint
* Use it for prediction

In [None]:
# Print the shape of the test split, then display its first row.
print(test_df.shape)
test_df.head(n=1)

In [None]:
# Features only: remove the 'Class' label column and keep the remaining values as an array.

test_df_array = test_df.drop(columns=['Class']).values

### Using `xgb_predictor` from the previous step

In [None]:
# Get predictions from model
# Predictions are returned as byte object, so need to decode contents into string, then convert to number array

# predictions = xgb_predictor.predict(data=test_df_array).decode('utf-8') 
# predictions_array = np.fromstring(predictions, sep=',')                 

### Initializing new `Predictor` object with the endpoint, session, serializer and deserializer
* Here, `xgb_predictor.endpoint_name` is used
* In practice, the endpoint name is obtained from the SageMaker dashboard after the model has been deployed

In [None]:
# Attach a fresh Predictor to the deployed endpoint: CSV requests, raw bytes responses.
predictor_config = {
    "endpoint_name": xgb_predictor.endpoint_name,
    "sagemaker_session": sess,
    "serializer": sagemaker.serializers.CSVSerializer(),
    "deserializer": sagemaker.deserializers.BytesDeserializer(),
}
predictor = sagemaker.predictor.Predictor(**predictor_config)

In [None]:
# The endpoint response is decoded as UTF-8 text and parsed as comma-separated numbers.
predictions = predictor.predict(
    data=test_df_array,
).decode('utf-8')
predictions_array = np.fromstring(
    predictions,
    sep=',',
)

### Evaluate results

In [None]:
# Score the real-time predictions against the 'Class' labels of the test split.

from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

y_true = test_df['Class'].values
y_pred = predictions_array.astype(int)

# Predicted labels first, true labels second, one array per line.
print(y_pred, y_true, sep="\n")

print("Accuracy : %.3f" % (accuracy_score(y_true, y_pred),))

### Delete endpoint

In [None]:
# Note: There is a limit on the number of active endpoints
# Delete the real-time endpoint created by `xgb_model.deploy(...)` once it is no longer needed.

xgb_predictor.delete_endpoint()
# xgb_predictor.delete_model()

## Batch Inference

### Set S3 location for model input and output

In [None]:
# Batch inference input: the test split without its 'Class' label column.
test_nolabel_df = test_df.drop(columns='Class')

In [None]:
# Preview the first two rows of the unlabeled test data.
test_nolabel_df.head(n=2)

In [None]:
# Write the unlabeled test data directly to S3, without index or header row.
batch_uri_template = "s3://{}/{}/model_data/{}"
test_batch_path = batch_uri_template.format(bucket, prefix, "test_batch_data.csv")

test_nolabel_df.to_csv(
    test_batch_path,
    header=False,
    index=False,
)
print('Test data for batch inference uploaded to ' + test_batch_path)

In [None]:
# S3 location that is handed to the transformer as its output path.
batch_output_template = "s3://{}/{}/batch_output"
test_batch_output = batch_output_template.format(bucket, prefix)

notice = 'test outputs will be uploaded to: {}'
print(notice.format(test_batch_output))

### Start transformer job for batch inference

In [None]:
# Batch transformer built from the trained estimator: one ml.m5.large instance,
# writing to `test_batch_output`.
xgb_transformer = xgb_model.transformer(
    instance_type='ml.m5.large',
    instance_count=1,
    output_path=test_batch_output,
)

In [None]:
%%time

# Note:  The batch transform job may take several minutes

# Run the transformer over the unlabeled CSV that was written to S3.
xgb_transformer.transform(
    data=test_batch_path,
    content_type="text/csv",
)

### Evaluate results

In [None]:
# Show the S3 location that was passed to the transformer as its output path.
print(test_batch_output)

In [None]:
# Read the transform output file (input file name + ".out") back from S3; it has no header row.

fname = "{}/{}".format(
    test_batch_output,
    "test_batch_data.csv.out",
)
batch_df = pd.read_csv(
    fname,
    header=None,
)

In [None]:
# Score the batch predictions against the 'Class' labels of the test split.

y_true = test_df['Class'].values
y_pred = batch_df.values.astype(int)
# y_pred = batch_df.to_numpy()

# The predictions are transposed for printing only.
print(y_pred.transpose())
print(y_true)

print("Accuracy : %.3f" % (accuracy_score(y_true, y_pred),))

## Hyperparameter Tuning 

### Create tuning job

In [None]:
# Display the hyperparameters currently set on the estimator.
xgb_model.hyperparameters()

In [None]:
# https://docs.aws.amazon.com/sagemaker/latest/dg/xgboost-tuning.html

from sagemaker.tuner import IntegerParameter

# Search space: both hyperparameters are integer ranges from 1 to 10.
hyperparameter_ranges = dict(
    max_depth=IntegerParameter(1, 10),
    min_child_weight=IntegerParameter(1, 10),
)

# Tuner that minimizes validation:merror: 10 jobs in total, at most 5 in parallel.
Optimizer = sagemaker.tuner.HyperparameterTuner(
    estimator=xgb_model,
    objective_metric_name='validation:merror',
    objective_type='Minimize',
    hyperparameter_ranges=hyperparameter_ranges,
    max_jobs=10,
    max_parallel_jobs=5,
    base_tuning_job_name='XGBoost-Tuner',
)

In [None]:
%%time

# Note:  The tuning job may take several minutes

# Start the tuning job on the same train/validation channels used for training
Optimizer.fit(
    inputs={
        'train': s3_input_train,
        'validation': s3_input_val,
    }
)

In [None]:
# Collect the tuning job analytics into a DataFrame.

tuning_analytics = Optimizer.analytics()
tuning_results = tuning_analytics.dataframe()

### Deploy tuned model

In [None]:
%%time

# Deploy through the tuner to a one-instance endpoint that serializes requests as CSV.

tuned_model_predictor = Optimizer.deploy(
    instance_type='ml.m5.xlarge',
    initial_instance_count=1,
    serializer=sagemaker.serializers.CSVSerializer(),
)

In [None]:
# Display the hyperparameters of the tuner's best estimator.

best_xgb_estimator = Optimizer.best_estimator()
best_xgb_estimator.hyperparameters()

### Evaluate results

In [None]:
# Query the tuned endpoint, decode the response as UTF-8 text,
# and parse it as comma-separated numbers.

predictions_tuned = tuned_model_predictor.predict(
    data=test_df_array,
).decode('utf-8')
predictions_array_tuned = np.fromstring(
    predictions_tuned,
    sep=',',
)

In [None]:
# Calculate evaluation metrics for the tuned model.
# The tuned endpoint's output is held in `predictions_array_tuned`. Scoring
# `predictions_array` here would only re-report the accuracy of the untuned model.

y_true = test_df['Class'].values
y_pred = predictions_array_tuned.astype(int)

print(y_pred)
print(y_true)

print("Accuracy : %.3f" % accuracy_score(y_true,y_pred))

### Delete endpoint

In [None]:
# Delete the endpoint created by `Optimizer.deploy(...)` once evaluation is finished.
tuned_model_predictor.delete_endpoint()