# Train/Deploy Model Using AWS SageMaker (Batch transformation)
Author: Yiran Jing

Date: 28-06-2019


**AWS Requirements**

Amazon SageMaker XGBoost can train on data in either CSV or LibSVM format. For this example, we use CSV. Each CSV file must:
- Have the target variable (the label to predict) in the first column
- Not have a header row (as per AWS requirements)
- Not contain the customer_id column (it carries no predictive information)
- Contain numerical entries only (cleaned dataset as per AWS requirements)

**Data Description**
- train.csv (70%) (include target)
- validation.csv (20%) (include target) <i> : validation is the test dataset despite the confusing name </i>
- test.csv (10%) (include target) <i> : is for evaluating the model once deployed </i>
- test_data_Batch.csv (the same 10% as test.csv, with the target column removed) <i> : this is test.csv with the target column removed, used as the batch transform input </i>


In [None]:
%%time
# import useful packages
import pandas as pd
import os
import boto3
import re
import json
import sagemaker
import numpy as np
from sagemaker import get_execution_role
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sagemaker.amazon.amazon_estimator import get_image_uri
from scipy import stats
import xgboost as xgb
import sklearn as sk 
import os.path 
import warnings
warnings.filterwarnings('ignore')
import statsmodels.api as sm
from scipy.stats import norm, skew
from sklearn.externals import joblib
# seaborn provides the `sns` name used for plot styling below; without this
# import the cell stops with a NameError on the first `sns` reference.
import seaborn as sns

# Plot styling defaults for the notebook.
blue = sns.color_palette('Blues')[-2]  # second-to-last colour of the 'Blues' palette
color = sns.color_palette()            # palette returned when no name is given
sns.set_style('darkgrid')

In [2]:
# The get_execution_role function retrieves the IAM role you created when you created your notebook instance.
role = get_execution_role()

# AWS region of the current boto3 session. It is kept in its own variable
# because the appendix cell builds its boto3 client with region_name=region,
# and nothing else in the notebook defines that name.
region = boto3.Session().region_name

# get the XGBoost container so we can run the XGBModel
container = get_image_uri(region, 'xgboost')

## Train Model

In [3]:
# S3 bucket and key prefix under which the training data and the model
# artefacts for this notebook are stored.
bucket = 'taysolsdev'
prefix = 'datasets/churn'

# S3 locations of the two data channels.
train_uri = 's3://{}/{}/train/'.format(bucket, prefix)
validation_uri = 's3://{}/{}/validation/'.format(bucket, prefix)

# Wrap each location as a CSV input channel for the training job.
s3_input_train = sagemaker.s3_input(s3_data=train_uri, content_type='csv')
s3_input_validation = sagemaker.s3_input(s3_data=validation_uri, content_type='csv')


In [None]:
# The session object that manages interactions with Amazon SageMaker APIs and
# any other AWS service that the training job uses.
sess = sagemaker.Session()

# Create an instance of the sagemaker.estimator.Estimator class
# output_path – The path to the S3 bucket where Amazon SageMaker stores the training results.
# train_instance_count: generally use only a single training instance.
# NOTE: this rebinds the name `xgb`, which the import cell also uses for the
# xgboost module; from here on `xgb` is the SageMaker estimator.
xgb = sagemaker.estimator.Estimator(container,
                                    role,
                                    train_instance_count=1,
                                    train_instance_type='ml.m4.xlarge',
                                    output_path='s3://{}/{}/output'.format(bucket, prefix),
                                    sagemaker_session=sess)

# Set the hyperparameter values for the XGBoost training job.
# NOTE(review): the SageMaker built-in XGBoost documentation lists `seed`,
# `eta`, `lambda` and `num_round`; confirm which of the scikit-learn-style
# names below (random_state, learning_rate, reg_lambda, n_estimators,
# class_weight) the container actually honours.
xgb.set_hyperparameters(max_depth=3,
                        verbosity=1,
                        random_state=960428,  # was misspelled `random_stae`, so the seed was never passed under the intended name
                        gamma=0,
                        subsample=1,
                        reg_lambda=1,
                        silent=0, # silent must be integer, cannot be none
                        colsample_bytree=1,
                        min_child_weight=1,
                        learning_rate=0.02,
                        tree_method='hist',
                        n_estimators=200,
                        class_weight='balanced',
                        objective='binary:logistic',#logistic regression for binary classification, output probability
                        num_round=50 #The number of rounds for boosting (only used in the console version of XGBoost)
                        )

# start model training on the 'train' channel, evaluating on 'validation'
xgb.fit({'train': s3_input_train, 'validation': s3_input_validation}, logs=True)

## Deploy Model with Batch Transform
Batch Transform manages all necessary compute resources, including launching instances to deploy endpoints and deleting them afterward. 

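To run a batch transform job, call the `create_transform_job` method using the model that you trained before. With the SageMaker Python SDK (used below), this is done by creating a transformer object from the trained estimator and calling its `transform` method.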

https://sagemaker.readthedocs.io/en/stable/overview.html#sagemaker-batch-transform

Initial_instance_count :The initial number of instances to run in the Endpoint created from this Model.

https://docs.aws.amazon.com/batch/latest/userguide/job_states.html

In [5]:
# the batch dataset used for prediction cannot have target column
# Both locations are derived from `bucket` and `prefix` so that they follow
# the same S3 layout as the training and validation channels.
batch_input = 's3://{}/{}/batch/test_data_Batch.csv'.format(bucket, prefix) # test data used for prediction

batch_output = 's3://{}/{}/batch/batch-inference'.format(bucket, prefix) # specify the location of batch output

In [6]:
# Build a transformer from the trained estimator; results are written under
# the batch_output S3 location.
transformer = xgb.transformer(instance_count=1, instance_type='ml.m4.xlarge', output_path=batch_output)

# Start the transform job on the CSV batch input, splitting the file by line.
transform_job_args = dict(
    data=batch_input,
    data_type='S3Prefix',
    content_type='text/csv',
    split_type='Line',
)
transformer.transform(**transform_job_args)

# Wait for the transform job before reading its output.
transformer.wait()

..............................................!


### Validate Model Deployed with Batch Transform

In [7]:
# S3 locations of the labelled test set and of the batch transform result,
# derived from `bucket` and `prefix` like the other paths in this notebook.
test_data_path = 's3://{}/{}/test/test.csv'.format(bucket, prefix)
batch_output_path = 's3://{}/{}/batch/batch-inference/test_data_Batch.csv.out'.format(bucket, prefix)

# test dataset with target (the files have no header row)
test_data = pd.read_csv(test_data_path, header=None, encoding="ISO-8859-1")

# batch output based on test data (no header row either).
# NOTE: this rebinds `batch_output` from the S3 output location to the loaded
# DataFrame, which is what the scoring cell expects. Re-run the cell that
# defines the batch paths before creating another transformer.
batch_output = pd.read_csv(batch_output_path, header=None, encoding="ISO-8859-1")


In [8]:
def get_score(y_true, y_pred):
    """Compute binary-classification metrics for 0/1 labels.

    Args:
        y_true: Ground-truth labels, encoded as 0 (negative) and 1 (positive).
        y_pred: Predicted labels with the same encoding and length.

    Returns:
        A tuple ``(precision, recall, f1, accuracy, tn, fp, fn, tp)``; the
        last four entries are the confusion-matrix counts.
    """
    f1 = metrics.f1_score(y_true, y_pred)
    precision = metrics.precision_score(y_true, y_pred)
    recall = metrics.recall_score(y_true, y_pred)
    accuracy = metrics.accuracy_score(y_true, y_pred)
    # Pin the label set so the matrix is always 2x2. Without it, inputs in
    # which only one class occurs give a 1x1 matrix and the four-way
    # unpacking below raises ValueError.
    tn, fp, fn, tp = metrics.confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    return precision, recall, f1, accuracy, tn, fp, fn, tp

In [9]:
from sklearn.metrics import classification_report

# Ground truth is the first column of the labelled test set.
y_test = test_data.iloc[:, 0]
# Round the batch output to the nearest integer, i.e. a 0.5 decision threshold.
y_pred = np.round(batch_output)

# Compute the metrics and collect them in one labelled Series.
(temp_precision, temp_recall, temp_f1, temp_accuracy,
 tn, fp, fn, tp) = get_score(y_test, y_pred)
output = pd.Series({
    'precision': temp_precision,
    'recall': temp_recall,
    'f1': temp_f1,
    'accuracy': temp_accuracy,
    'tp': tp,
    'fp': fp,
    'tn': tn,
    'fn': fn,
})
print(output[['accuracy', 'tp', 'fp', 'tn', 'fn']])

# Per-class precision / recall / f1 report.
print(classification_report(y_test, y_pred))

accuracy     0.798
tp         106.000
fp          41.000
tn         455.000
fn         101.000
dtype: float64
              precision    recall  f1-score   support

           0       0.82      0.92      0.87       496
           1       0.72      0.51      0.60       207

   micro avg       0.80      0.80      0.80       703
   macro avg       0.77      0.71      0.73       703
weighted avg       0.79      0.80      0.79       703



### Clean up
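Batch Transform launches its own instances and deletes them when the job finishes, so this notebook leaves no hosted endpoint running. If you also deployed a real-time endpoint from this model elsewhere, delete it when you are done with this notebook to avoid any charges from a stray instance being left on.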

#### Appendix: Another way to train model
The version in this notebook follows the SDK format that calls to the AWS API for building an ML model. Another way to do this would be to configure the model using JSON format as below. 

In [None]:
%%time
import time
from time import gmtime, strftime

# Training job names must be unique, so a UTC timestamp is appended.
job_name = 'CHURN-xgboost-regression-' + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
print("Training job", job_name)
bucket_path = 's3://{}/{}/output'.format(bucket, prefix)

# Region for the low-level SageMaker client. It is defined here because this
# cell previously used `region` without any cell assigning it.
region = boto3.Session().region_name

# Request passed to create_training_job: the same container, role, data
# channels and output location as the SDK-based training above.
create_training_params = \
{
    "AlgorithmSpecification": {
        "TrainingImage": container,
        "TrainingInputMode": "File"
    },
    "RoleArn": role,
    "OutputDataConfig": {
        "S3OutputPath": bucket_path
    },
    "ResourceConfig": {
        "InstanceCount": 1,
        "InstanceType": "ml.m4.4xlarge",
        "VolumeSizeInGB": 5
    },
    "TrainingJobName": job_name,
    # Hyperparameter values are passed as strings in this request.
    "HyperParameters": {
        #"max_depth":"3",
        #"gamma":"0",
        #"min_child_weight":"1",
        #"silent":"None",
        "num_round":"50",
        "objective":"binary:logistic",
        "class_weight":"balanced",
        "n_estimators":"200",
        "learning_rate":"0.02",
        "tree_method":"hist",
        "random_state":"960428",
    },
    "StoppingCondition": {
        "MaxRuntimeInSeconds": 3600
    },
    "InputDataConfig": [
        {
            "ChannelName": "train",
            "DataSource": {
                "S3DataSource": {
                    "S3DataType": "S3Prefix",
                    "S3Uri" : 's3://{}/{}/train/'.format(bucket, prefix),
                    "S3DataDistributionType": "FullyReplicated"
                }
            },
            "ContentType": "csv",
            "CompressionType": "None"
        },
        {
            "ChannelName": "validation",
            "DataSource": {
                "S3DataSource": {
                    "S3DataType": "S3Prefix",
                    "S3Uri" : 's3://{}/{}/validation/'.format(bucket, prefix),
                    "S3DataDistributionType": "FullyReplicated"
                }
            },
            "ContentType": "csv",
            "CompressionType": "None"
        }
    ]
}


client = boto3.client('sagemaker', region_name=region)
client.create_training_job(**create_training_params)

# Poll once a minute until the job reaches a terminal state. 'Stopped' is
# included so that a manually stopped or timed-out job does not leave this
# loop running forever.
terminal_states = ('Completed', 'Failed', 'Stopped')

job_description = client.describe_training_job(TrainingJobName=job_name)
status = job_description['TrainingJobStatus']
print(status)
while status not in terminal_states:
    time.sleep(60)
    job_description = client.describe_training_job(TrainingJobName=job_name)
    status = job_description['TrainingJobStatus']
    print(status)

# Surface the reason when the job did not succeed.
if status == 'Failed':
    print('Training failed:', job_description.get('FailureReason', 'no reason reported'))