## SageMaker Model Building and Deployment 


In this notebook we show how to use Amazon SageMaker to develop, train, tune and deploy an XGBoost model. Synthetic customer churn data is used. 

The data is in a public AWS S3 bucket: s3://sagemaker-sample-files/datasets/tabular/synthetic/churn.txt

Sklearn Processor is used to process the raw data.

* XGBoost https://sagemaker.readthedocs.io/en/stable/frameworks/xgboost/using_xgboost.html?highlight=xgboost
* Doc https://sagemaker.readthedocs.io/en/stable/using_sklearn.html
* SDK https://sagemaker.readthedocs.io/en/stable/sagemaker.sklearn.html
* boto3 https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/sagemaker.html#client
 
**This sample is provided for demonstration purposes; make sure to conduct appropriate testing if you derive code from it for your own use cases!**

In [None]:
%matplotlib inline
import os
import time
import logging
import pandas as pd
import numpy as np
import sagemaker
import json
import boto3
from sagemaker import get_execution_role

# boto3 client for the SageMaker service API.
sm_client = boto3.client('sagemaker')

In [None]:
# SageMaker session plus the default bucket and region derived from it.
sagemaker_session = sagemaker.Session()
bucket = sagemaker_session.default_bucket()  # a hard-coded bucket name works here as well
region = sagemaker_session.boto_region_name
print(region)

# Execution role under which the SageMaker jobs in this notebook run.
role = get_execution_role()
print(f"sagemaker role arn <{role}>")

project_name = "test_pro"
project_id = "test_id"
# model_package_group_name = project_name

# The project name must not be longer than 15 characters.
assert len(project_name) <= 15

In [None]:
# Show the name of the S3 bucket retrieved from the SageMaker session.
print(bucket)

## Load Raw Data to S3

Load raw data from the public S3 bucket to your own S3 bucket.

In [None]:
#load raw data to S3 bucket

In [None]:
# Copy the public sample file into `bucket` under the key that the InputDataUrl default points to.
!aws s3 cp s3://sagemaker-sample-files/datasets/tabular/synthetic/churn.txt s3://{bucket}/sagemaker/DEMO-xgboost-churn/data/RawData.csv 

## Prepare script to process raw data

Create the preprocessing script. This script will be used by the SageMaker Processing job instance to preprocess the raw data.

In [None]:
%%writefile preprocess.py

"""Preprocess the customer churn dataset."""

import argparse
import logging
import pathlib

import boto3
import numpy as np
import pandas as pd

# Root logger that echoes INFO-level messages to the job's output stream.
logger = logging.getLogger()
logger.setLevel(logging.INFO)
logger.addHandler(logging.StreamHandler())

if __name__ == "__main__":
    # Script entry point: download the raw CSV named by --input-data, build the
    # model matrix (target column first), and write train/validation/test
    # splits as header-less CSV files under /opt/ml/processing.
    logger.info("Starting preprocessing.")
    parser = argparse.ArgumentParser()
    parser.add_argument("--input-data", type=str, required=True)
    args = parser.parse_args()

    base_dir = "/opt/ml/processing"
    # Create the download folder and every output folder up front so that the
    # to_csv calls below cannot fail on a missing directory.
    for sub_dir in ("data", "train", "validation", "test"):
        pathlib.Path(f"{base_dir}/{sub_dir}").mkdir(parents=True, exist_ok=True)

    input_data = args.input_data
    logger.info("Input data location: %s", input_data)

    # Split "s3://<bucket>/<key>" into bucket and key, rejecting anything else
    # early instead of failing later with an opaque download error.
    scheme = "s3://"
    if not input_data.startswith(scheme):
        raise ValueError(f"--input-data must be an s3:// URI, got: {input_data!r}")
    bucket, _, key = input_data[len(scheme):].partition("/")
    if not bucket or not key:
        raise ValueError(f"--input-data must include a bucket and a key, got: {input_data!r}")

    logger.info("Downloading data from bucket: %s, key: %s", bucket, key)
    fn = f"{base_dir}/data/raw-data.csv"
    s3 = boto3.resource("s3")
    s3.Bucket(bucket).download_file(key, fn)

    logger.info("Reading downloaded data.")

    # read in csv
    df = pd.read_csv(fn)

    # drop the "Phone" feature column
    df = df.drop(["Phone"], axis=1)

    # Change the data type of "Area Code" so get_dummies treats it as categorical
    df["Area Code"] = df["Area Code"].astype(object)

    # Drop several other columns
    df = df.drop(["Day Charge", "Eve Charge", "Night Charge", "Intl Charge"], axis=1)

    # Convert categorical variables into dummy/indicator variables.
    # The dtype is pinned to uint8 so the indicators are written as 0/1 on
    # every pandas version (newer releases default to bool -> "True"/"False").
    model_data = pd.get_dummies(df, dtype=np.uint8)

    # Create one binary classification target column and move it to the front
    model_data = pd.concat(
        [
            model_data["Churn?_True."],
            model_data.drop(["Churn?_False.", "Churn?_True."], axis=1),
        ],
        axis=1,
    )

    # Shuffle deterministically, then cut into 70% / 20% / 10% splits.
    shuffled = model_data.sample(frac=1, random_state=1729)
    train_end = int(0.7 * len(shuffled))
    validation_end = int(0.9 * len(shuffled))
    train_data = shuffled.iloc[:train_end]
    validation_data = shuffled.iloc[train_end:validation_end]
    test_data = shuffled.iloc[validation_end:]

    # Header-less, index-less CSV with the target in the first column.
    train_data.to_csv(f"{base_dir}/train/train.csv", header=False, index=False)
    validation_data.to_csv(
        f"{base_dir}/validation/validation.csv", header=False, index=False
    )
    test_data.to_csv(f"{base_dir}/test/test.csv", header=False, index=False)


## Prepare data for model training

In [None]:
from sagemaker.workflow.parameters import ParameterInteger, ParameterString

In [None]:
# Named workflow parameters, each carrying the default value used in this notebook.
processing_instance_count = ParameterInteger(
    name="ProcessingInstanceCount",
    default_value=1,
)
processing_instance_type = ParameterString(
    name="ProcessingInstanceType",
    default_value="ml.m5.xlarge",
)

training_instance_type = ParameterString(
    name="TrainingInstanceType",
    default_value="ml.m5.xlarge",
)

# A default of "Approved" can be used if you don't want manual approval.
model_approval_status = ParameterString(
    name="ModelApprovalStatus",
    default_value="Approved",
)

# Change the default to point to the S3 location of your raw input data.
input_data = ParameterString(
    name="InputDataUrl",
    default_value=f"s3://{sagemaker_session.default_bucket()}/sagemaker/DEMO-xgboost-churn/data/RawData.csv",
)

A SageMaker Processing instance with the scikit-learn image is used to process the raw data.

In [None]:
from sagemaker.sklearn.processing import SKLearnProcessor

# Processor based on the scikit-learn container image.
# The processor is run directly from this notebook rather than inside a
# SageMaker Pipeline, so it is given the plain default values of the workflow
# parameters instead of the parameter objects themselves.
sklearn_processor = SKLearnProcessor(
    framework_version="0.23-1",
    role=role,
    instance_type=processing_instance_type.default_value,
    instance_count=processing_instance_count.default_value,
)

The processed data is saved back to the S3 bucket.

In [None]:
from sagemaker.processing import ProcessingInput, ProcessingOutput

# Run preprocess.py as a processing job. Each ProcessingOutput maps a folder
# written by the script to a named output of the job.
sklearn_processor.run(
    code="preprocess.py",
    outputs=[
        ProcessingOutput(output_name="train", source="/opt/ml/processing/train"),
        ProcessingOutput(output_name="validation", source="/opt/ml/processing/validation"),
        ProcessingOutput(output_name="test", source="/opt/ml/processing/test"),
    ],
    # The job is started directly (not through a pipeline), so the container
    # argument must be the concrete S3 URI string, not the parameter object.
    arguments=["--input-data", input_data.default_value],
)

# Description of the job that was just run; the output S3 locations are read from it.
preprocessing_job_description = sklearn_processor.jobs[-1].describe()

## Model Training

Get training and validation data paths.

In [None]:
# Select the output by its name rather than by list position, so the lookup
# keeps working if the outputs of the processing job are reordered.
s3_input_train = next(
    output["S3Output"]["S3Uri"]
    for output in preprocessing_job_description["ProcessingOutputConfig"]["Outputs"]
    if output["OutputName"] == "train"
)

In [None]:
# Select the output by its name rather than by list position, so the lookup
# keeps working if the outputs of the processing job are reordered.
s3_input_validation = next(
    output["S3Output"]["S3Uri"]
    for output in preprocessing_job_description["ProcessingOutputConfig"]["Outputs"]
    if output["OutputName"] == "validation"
)

Define XGBoost model

In [None]:
from sagemaker.inputs import TrainingInput

# Both channels point at the processed CSV data in S3.
content_type = "csv"
train_input = TrainingInput(s3_data=s3_input_train, content_type=content_type)
validation_input = TrainingInput(s3_data=s3_input_validation, content_type=content_type)


In [None]:
import sagemaker
from sagemaker.serializers import CSVSerializer

In [None]:
# S3 key prefix; used further down to build the estimator's output path.
prefix = 'sagemaker/xgboost_cutomer_churn'

In [None]:
# Look up the XGBoost 1.2-1 container image URI for the current region.
container = sagemaker.image_uris.retrieve(
    framework="xgboost",
    region=region,
    version="1.2-1",
)
print(container)

In [None]:
# Hyperparameters handed to the estimator; every value is given as a string.
hyperparameters = dict(
    max_depth="5",
    eta="0.2",
    gamma="4",
    min_child_weight="6",
    subsample="0.7",
    objective="binary:logistic",
    num_round="50",
)

In [None]:
# Estimator built on the XGBoost container image; model output goes to
# s3://<bucket>/<prefix>/output.
xgb = sagemaker.estimator.Estimator(
    image_uri=container,
    role=role,
    instance_count=1,
    instance_type="ml.m4.xlarge",
    hyperparameters=hyperparameters,
    output_path="s3://{}/{}/output".format(bucket, prefix),
    sagemaker_session=sagemaker_session,
)

Train the XGBoost model

In [None]:
# Fit the estimator using the "train" and "validation" input channels.
xgb.fit({'train': train_input, 'validation': validation_input})

In [None]:
# boto3 SageMaker client used in the next cell to describe the training job.
sm_boto3 = boto3.client("sagemaker")

In [None]:
# Describe the latest training job and read the S3 location of its model artifact.
training_job_description = sm_boto3.describe_training_job(
    TrainingJobName=xgb.latest_training_job.name
)
artifact = training_job_description["ModelArtifacts"]["S3ModelArtifacts"]

print(f"Model artifact persisted at {artifact}")

## Create Endpoint

Create an endpoint using SageMaker SDK

In [None]:
# Deploy the trained model to an endpoint backed by one ml.m4.xlarge instance;
# request payloads are serialized as CSV.
xgb_predictor = xgb.deploy(
    initial_instance_count=1,
    instance_type="ml.m4.xlarge",
    serializer=CSVSerializer(),
)

In [None]:
# Print the name of the endpoint created by the deploy call.
print(f'Endpoint name: {xgb_predictor.endpoint_name}')

## Invoke Endpoint

In [None]:
# The test split is written to S3 by the processing job and is not present on
# the local disk, so fetch it next to the notebook before loading it.
from sagemaker.s3 import S3Downloader

test_output_uri = next(
    output["S3Output"]["S3Uri"]
    for output in preprocessing_job_description["ProcessingOutputConfig"]["Outputs"]
    if output["OutputName"] == "test"
)
S3Downloader.download(
    s3_uri=f"{test_output_uri.rstrip('/')}/test.csv",
    local_path=".",
    sagemaker_session=sagemaker_session,
)

# The file has no header row (it was written with header=False).
test_data = pd.read_csv("test.csv", header=None)

In [None]:
def predict(data, rows=500, predictor=None):
    """Score ``data`` in batches and return the predictions as one array.

    Args:
        data: 2-D NumPy array with one feature row per record.
        rows: Batch-size divisor; the data is split into
            ``int(len(data) / rows + 1)`` roughly equal batches.
        predictor: Object exposing ``predict(batch)`` that returns UTF-8
            encoded bytes. Defaults to the notebook-level ``xgb_predictor``.

    Returns:
        1-D float NumPy array with every value parsed from the responses, in
        request order. Empty when ``data`` has no rows.
    """
    if predictor is None:
        predictor = xgb_predictor

    n_batches = int(data.shape[0] / float(rows) + 1)
    values = []
    for batch in np.array_split(data, n_batches):
        # Nothing to score: avoid sending an empty payload to the endpoint.
        if len(batch) == 0:
            continue
        response = predictor.predict(batch).decode("utf-8")
        # Accept comma- as well as newline/whitespace-separated responses.
        values.extend(response.replace(",", " ").split())

    return np.array(values, dtype=float)

# Score only the first test row, leaving out column 0 of the test file.
first_row_features = test_data.to_numpy()[:1, 1:]
predictions = predict(first_row_features)
predictions