In [None]:
# Regression with Amazon SageMaker XGBoost algorithm

# Introduction

Source of this notebook: https://sagemaker-examples.readthedocs.io/en/latest/introduction_to_amazon_algorithms/xgboost_abalone/xgboost_abalone.html

Please refer to this document for citations and detailed explanations.

This notebook has been adapted to run inside Studio

# Requirements 

- preinstalled SageMaker Studio image Python3 for DataScience (tested on DataScience 3.0)
- packages: sagemaker

# Initialisation

In [2]:
%%time

import boto3
import sagemaker

# IAM role passed to SageMaker as RoleArn / ExecutionRoleArn in the training and hosting calls below.
role = sagemaker.get_execution_role()
# Region of the default boto3 session; reused for the container image lookup and the SageMaker clients.
region = boto3.Session().region_name
# S3 client used to copy the sample dataset into the output bucket.
s3_client = boto3.client("s3")


CPU times: user 875 ms, sys: 171 ms, total: 1.05 s
Wall time: 1.18 s


# Data Preparation

Note: the data is not split in this notebook. The same file is used as the train and test datasets.

TODO: check the helper functions at the end of this notebook and implement the data split.

In [22]:
%%time

import os
import re

# S3 bucket where the training data is located.
data_bucket = "sagemaker-sample-files"
data_prefix = "datasets/tabular/uci_abalone"
data_bucket_path = f"s3://{data_bucket}"

# S3 bucket for saving code and model artifacts.
# Feel free to specify a different bucket and prefix
output_bucket = sagemaker.Session().default_bucket()
output_prefix = "stage-labbench/abalone/xgboost-default"
output_bucket_path = f"s3://{output_bucket}"

# Local scratch directory for the downloaded files; created if it does not exist yet.
output_folder="generated"
os.makedirs(output_folder, exist_ok=True)

# Copy each dataset file from the sample bucket to the output bucket, going through a local file.
# The same relative layout (<prefix>/<category>/abalone.<category>) is used on both sides.
for data_category in ["train", "test", "validation"]:
    # Object key of the file in the sample bucket.
    data_key = f"{data_prefix}/{data_category}/abalone.{data_category}"
    # Object key of the copy in the output bucket; the training cells read <output_prefix>/train and
    # <output_prefix>/validation as S3 prefixes.
    output_key = f"{output_prefix}/{data_category}/abalone.{data_category}"
    data_filename = os.path.join(output_folder, f"abalone.{data_category}")
    print(f"download {data_bucket} {data_key} -> {data_filename}")
    s3_client.download_file(data_bucket, data_key, data_filename)
    print(f"upload {data_filename} -> {output_bucket} {output_key}")
    s3_client.upload_file(data_filename, output_bucket, output_key)


download sagemaker-sample-files datasets/tabular/uci_abalone/train/abalone.train -> generated/abalone.train
upload generated/abalone.train -> sagemaker-eu-west-1-102959664345 stage-labbench/abalone/xgboost-default/train/abalone.train
download sagemaker-sample-files datasets/tabular/uci_abalone/test/abalone.test -> generated/abalone.test
upload generated/abalone.test -> sagemaker-eu-west-1-102959664345 stage-labbench/abalone/xgboost-default/test/abalone.test
download sagemaker-sample-files datasets/tabular/uci_abalone/validation/abalone.validation -> generated/abalone.validation
upload generated/abalone.validation -> sagemaker-eu-west-1-102959664345 stage-labbench/abalone/xgboost-default/validation/abalone.validation
CPU times: user 226 ms, sys: 30.6 ms, total: 257 ms
Wall time: 1.6 s


# Training the XGBoost model

Training takes between 5 and 6 minutes.

Training can be done by either calling SageMaker Training with a set of hyperparameters values to train with, or by leveraging hyperparameter tuning (HPO) which finds the best version of a model by running many training jobs on your dataset using the algorithm and ranges of hyperparameters that you specify. It then chooses the hyperparameter values that result in a model that performs the best, as measured by a metric that you choose.

In this notebook, both methods are used for demonstration purposes, but the model that the HPO job creates is the one that is eventually hosted. You can instead choose to deploy the model created by the standalone training job by changing the below variable deploy_amt_model to False.


In [6]:
# Initializing common variables

# URI of the SageMaker XGBoost container image (version "1.5-1") for the current region; it is used both
# as the training image and as the hosting image of the model.
container = sagemaker.image_uris.retrieve("xgboost", region, "1.5-1")
# SageMaker API client used to create and describe training jobs, tuning jobs, models and endpoints.
client = boto3.client("sagemaker", region_name=region)
# True: host the best model found by the tuning job. False: host the model of the standalone training job.
deploy_amt_model = True


## Training with hyperparameters

In [7]:
%%time
import boto3
from time import gmtime, strftime
import time

# Job name made unique with a UTC timestamp.
training_job_name = f"DEMO-xgboost-regression-{strftime('%Y-%m-%d-%H-%M-%S', gmtime())}"

# Ensure that the training and validation data folders generated above are reflected in the "InputDataConfig" parameter below.

# Keyword arguments for client.create_training_job(): one ml.m5.2xlarge instance running the XGBoost
# container with fixed hyperparameters, reading the "train" and "validation" channels from the output bucket.
create_training_params = {
    "AlgorithmSpecification": {"TrainingImage": container, "TrainingInputMode": "File"},
    "RoleArn": role,
    "OutputDataConfig": {"S3OutputPath": f"{output_bucket_path}/{output_prefix}/single-xgboost"},
    "ResourceConfig": {"InstanceCount": 1, "InstanceType": "ml.m5.2xlarge", "VolumeSizeInGB": 5},
    "TrainingJobName": training_job_name,
    # Hyperparameter values are passed to the API as strings.
    "HyperParameters": {
        "max_depth": "5",
        "eta": "0.2",
        "gamma": "4",
        "min_child_weight": "6",
        "subsample": "0.7",
        # "reg:squarederror" is the current name of the squared-error regression objective;
        # "reg:linear" is its deprecated alias in XGBoost.
        "objective": "reg:squarederror",
        "num_round": "50",
        "verbosity": "2",
    },
    # Hard limit of one hour on the job's runtime.
    "StoppingCondition": {"MaxRuntimeInSeconds": 3600},
    "InputDataConfig": [
        {
            "ChannelName": "train",
            "DataSource": {
                "S3DataSource": {
                    "S3DataType": "S3Prefix",
                    "S3Uri": f"{output_bucket_path}/{output_prefix}/train",
                    "S3DataDistributionType": "FullyReplicated",
                }
            },
            "ContentType": "libsvm",
            "CompressionType": "None",
        },
        {
            "ChannelName": "validation",
            "DataSource": {
                "S3DataSource": {
                    "S3DataType": "S3Prefix",
                    "S3Uri": f"{output_bucket_path}/{output_prefix}/validation",
                    "S3DataDistributionType": "FullyReplicated",
                }
            },
            "ContentType": "libsvm",
            "CompressionType": "None",
        },
    ],
}

print(f"Creating a training job with name: {training_job_name}. It will take between 5 and 6 minutes to complete.")
client.create_training_job(**create_training_params)

# Poll once per minute while the job is still running. Looping on the non-terminal statuses (instead of
# waiting for "Completed"/"Failed" only) also ends the loop when the job is stopped.
job_description = client.describe_training_job(TrainingJobName=training_job_name)
status = job_description["TrainingJobStatus"]
print(status)
while status in ("InProgress", "Stopping"):
    time.sleep(60)
    job_description = client.describe_training_job(TrainingJobName=training_job_name)
    status = job_description["TrainingJobStatus"]
    print(status)

# Surface the reason when the job did not complete, so the problem is visible in the cell output.
if status != "Completed":
    failure_reason = job_description.get("FailureReason", "no failure reason reported")
    print(f"Training job {training_job_name} ended with status {status}: {failure_reason}")


Creating a training job with name: DEMO-xgboost-regression-2022-12-26-20-10-20. It will take between 5 and 6 minutes to complete.
InProgress
InProgress
InProgress
Completed
CPU times: user 52.6 ms, sys: 13.6 ms, total: 66.2 ms
Wall time: 3min


Note that the “validation” channel has been initialized too. The SageMaker XGBoost algorithm actually calculates RMSE and writes it to the CloudWatch logs on the data passed to the “validation” channel.



## Tuning with SageMaker Automatic Model Tuning

To create a tuning job using the AWS SageMaker Automatic Model Tuning API, you need to define 3 attributes.

- the tuning job name (string)
- the tuning job config (to specify settings for the hyperparameter tuning job - JSON object)
- training job definition (to configure the training jobs that the tuning job launches - JSON object).

To learn more about that, refer to the Configure and Launch a Hyperparameter Tuning Job documentation.

Note that the tuning job will take 12-17 minutes to complete.



In [23]:
%%time
from time import gmtime, strftime, sleep

# Tuning job name made unique with a UTC day/time suffix.
# NOTE(review): the suffix is shorter than the one used for the other job names, presumably to stay
# within the length limit of tuning job names — confirm before lengthening it.
tuning_job_name = "DEMO-xgboost-reg-HPO-" + strftime("%d-%H-%M-%S", gmtime())

# Settings of the hyperparameter tuning job: the search space, the resource limits, the search strategy
# and the objective metric. Range bounds are passed to the API as strings.
tuning_job_config = {
    "ParameterRanges": {
        # No categorical hyperparameter is tuned.
        "CategoricalParameterRanges": [],
        "ContinuousParameterRanges": [
            {
                "MaxValue": "0.5",
                "MinValue": "0.1",
                "Name": "eta",
            },
            {
                "MaxValue": "5",
                "MinValue": "0",
                "Name": "gamma",
            },
            {
                "MaxValue": "120",
                "MinValue": "0",
                "Name": "min_child_weight",
            },
            {
                "MaxValue": "1",
                "MinValue": "0.5",
                "Name": "subsample",
            },
            {
                "MaxValue": "2",
                "MinValue": "0",
                "Name": "alpha",
            },
        ],
        "IntegerParameterRanges": [
            {
                "MaxValue": "10",
                "MinValue": "0",
                "Name": "max_depth",
            },
            {
                "MaxValue": "4000",
                "MinValue": "1",
                "Name": "num_round",
            }
        ],
    },
    # SageMaker sets the following default limits for resources used by automatic model tuning:
    # https://docs.aws.amazon.com/sagemaker/latest/dg/automatic-model-tuning-limits.html
    "ResourceLimits": {
        # Increase the max number of training jobs for increased accuracy (and training time).
        "MaxNumberOfTrainingJobs": 6,
        # Change parallel training jobs run by AMT to reduce total training time. Constrained by your account limits.
        # If the number of parallel jobs equals the max number of jobs, Bayesian search degrades to random search.
        "MaxParallelTrainingJobs": 2
    },
    "Strategy": "Bayesian",
    # The tuning job minimizes the RMSE reported on the "validation" channel.
    "HyperParameterTuningJobObjective": {"MetricName": "validation:rmse", "Type": "Minimize"},
}

# Template of the training jobs launched by the tuning job: same container, instance type and data
# channels as the standalone training job; only the hyperparameters listed here stay fixed.
training_job_definition = {
    "AlgorithmSpecification": {"TrainingImage": container, "TrainingInputMode": "File"},
    "InputDataConfig": [
        {
            "ChannelName": "train",
            "DataSource": {
                "S3DataSource": {
                    "S3DataType": "S3Prefix",
                    "S3Uri": f"{output_bucket_path}/{output_prefix}/train",
                    "S3DataDistributionType": "FullyReplicated",
                }
            },
            "ContentType": "libsvm",
            "CompressionType": "None",
        },
        {
            "ChannelName": "validation",
            "DataSource": {
                "S3DataSource": {
                    "S3DataType": "S3Prefix",
                    "S3Uri": f"{output_bucket_path}/{output_prefix}/validation",
                    "S3DataDistributionType": "FullyReplicated",
                }
            },
            "ContentType": "libsvm",
            "CompressionType": "None",
        },
    ],
    # NOTE(review): the tuning jobs write their artifacts under the same "single-xgboost" folder as the
    # standalone training job — confirm whether a dedicated folder is wanted.
    "OutputDataConfig": {"S3OutputPath": f"{output_bucket_path}/{output_prefix}/single-xgboost"},
    "ResourceConfig": {"InstanceCount": 1, "InstanceType": "ml.m5.2xlarge", "VolumeSizeInGB": 5},
    "RoleArn": role,
    "StaticHyperParameters": {
        # "reg:squarederror" is the current name of the squared-error regression objective;
        # "reg:linear" is its deprecated alias in XGBoost.
        "objective": "reg:squarederror",
        "verbosity": "2",
    },
    # Hard limit of 12 hours on the runtime of each training job.
    "StoppingCondition": {"MaxRuntimeInSeconds": 43200},
}

print(f"Creating a tuning job with name: {tuning_job_name}. It will take between 12 and 17 minutes to complete.")
client.create_hyper_parameter_tuning_job(
    HyperParameterTuningJobName=tuning_job_name,
    HyperParameterTuningJobConfig=tuning_job_config,
    TrainingJobDefinition=training_job_definition,
)

# Poll once per minute while the tuning job is still running. Looping on the non-terminal statuses
# (instead of waiting for "Completed"/"Failed" only) also ends the loop when the job is stopped.
# `sleep` is the name imported at the top of this cell, so the cell does not depend on another
# cell having run `import time`.
tuning_job_description = client.describe_hyper_parameter_tuning_job(
    HyperParameterTuningJobName=tuning_job_name
)
status = tuning_job_description["HyperParameterTuningJobStatus"]
print(status)
while status in ("InProgress", "Stopping"):
    sleep(60)
    tuning_job_description = client.describe_hyper_parameter_tuning_job(
        HyperParameterTuningJobName=tuning_job_name
    )
    status = tuning_job_description["HyperParameterTuningJobStatus"]
    print(status)

# Surface the reason when the job did not complete, so the problem is visible in the cell output.
if status != "Completed":
    failure_reason = tuning_job_description.get("FailureReason", "no failure reason reported")
    print(f"Tuning job {tuning_job_name} ended with status {status}: {failure_reason}")

Creating a tuning job with name: DEMO-xgboost-reg-HPO-26-20-37-34. It will take between 12 and 17 minutes to complete.
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
Completed
CPU times: user 105 ms, sys: 7.34 ms, total: 112 ms
Wall time: 6min 1s


# Set up hosting for the model

In order to set up hosting, we have to import the model from training to hosting.




## Import model into hosting
Register the model with hosting. This allows the flexibility of importing models trained elsewhere.


In [10]:
%%time
import boto3
from time import gmtime, strftime

# Pick the training job whose artifacts get hosted: the best job of the tuning run, or the
# standalone training job when deploy_amt_model is False.
if deploy_amt_model:
    tuning_job_description = client.describe_hyper_parameter_tuning_job(
        HyperParameterTuningJobName=tuning_job_name
    )
    training_of_model_to_be_hosted = tuning_job_description["BestTrainingJob"]["TrainingJobName"]
else:
    training_of_model_to_be_hosted = training_job_name

model_name = f"{training_of_model_to_be_hosted}-model"
print(model_name)

# Look up the S3 location of the model artifacts produced by the selected training job.
info = client.describe_training_job(TrainingJobName=training_of_model_to_be_hosted)
model_data = info["ModelArtifacts"]["S3ModelArtifacts"]
print(model_data)

# The model is served with the same container image that was used for training.
primary_container = {"Image": container, "ModelDataUrl": model_data}

create_model_response = client.create_model(
    ModelName=model_name, ExecutionRoleArn=role, PrimaryContainer=primary_container
)

print(create_model_response["ModelArn"])


DEMO-xgboost-reg-26-20-13-21-005-582c76ff-model
s3://sagemaker-eu-west-1-102959664345/stage-labbench/abalone/xgboost-default/single-xgboost/DEMO-xgboost-reg-26-20-13-21-005-582c76ff/output/model.tar.gz
arn:aws:sagemaker:eu-west-1:102959664345:model/demo-xgboost-reg-26-20-13-21-005-582c76ff-model
CPU times: user 25.3 ms, sys: 3.03 ms, total: 28.3 ms
Wall time: 527 ms


## Create endpoint configuration

SageMaker supports configuring REST endpoints in hosting with multiple models, e.g. for A/B testing purposes. In order to support this, customers create an endpoint configuration, that describes the distribution of traffic across the models, whether split, shadowed, or sampled in some way. In addition, the endpoint configuration describes the instance type required for model deployment.



In [11]:
from time import gmtime, strftime

# Configuration name made unique with a UTC timestamp.
endpoint_config_name = f"DEMO-XGBoostEndpointConfig-{strftime('%Y-%m-%d-%H-%M-%S', gmtime())}"
print(f"Creating endpoint config with name: {endpoint_config_name}.")

# A single variant receives all the traffic: one ml.m5.xlarge instance serving the registered model.
all_traffic_variant = {
    "VariantName": "AllTraffic",
    "ModelName": model_name,
    "InstanceType": "ml.m5.xlarge",
    "InitialInstanceCount": 1,
    "InitialVariantWeight": 1,
}
create_endpoint_config_response = client.create_endpoint_config(
    ProductionVariants=[all_traffic_variant],
    EndpointConfigName=endpoint_config_name,
)

print(f"Endpoint Config Arn: {create_endpoint_config_response['EndpointConfigArn']}")



Creating endpoint config with name: DEMO-XGBoostEndpointConfig-2022-12-26-20-23-03.
Endpoint Config Arn: arn:aws:sagemaker:eu-west-1:102959664345:endpoint-config/demo-xgboostendpointconfig-2022-12-26-20-23-03


## Create endpoint 

Lastly, the customer creates the endpoint that serves up the model, through specifying the name and configuration defined above. The end result is an endpoint that can be validated and incorporated into production applications. This takes 9-11 minutes to complete.



In [12]:
%%time
import time

# Endpoint name made unique with a UTC timestamp.
endpoint_name = f'DEMO-XGBoostEndpoint-{strftime("%Y-%m-%d-%H-%M-%S", gmtime())}'
print(f"Creating endpoint with name: {endpoint_name}. This will take between 9 and 11 minutes to complete.")
create_endpoint_response = client.create_endpoint(
    EndpointConfigName=endpoint_config_name,
    EndpointName=endpoint_name,
)
print(create_endpoint_response["EndpointArn"])

# Check the endpoint once per minute and leave the loop as soon as it is no longer "Creating".
while True:
    resp = client.describe_endpoint(EndpointName=endpoint_name)
    status = resp["EndpointStatus"]
    if status != "Creating":
        break
    print(f"Status: {status}")
    time.sleep(60)

# Report the final state of the endpoint.
print(f"Arn: {resp['EndpointArn']}")
print(f"Status: {status}")



Creating endpoint with name: DEMO-XGBoostEndpoint-2022-12-26-20-23-26. This will take between 9 and 11 minutes to complete.
arn:aws:sagemaker:eu-west-1:102959664345:endpoint/demo-xgboostendpoint-2022-12-26-20-23-26
Status: Creating
Status: Creating
Status: Creating
Arn: arn:aws:sagemaker:eu-west-1:102959664345:endpoint/demo-xgboostendpoint-2022-12-26-20-23-26
Status: InService
CPU times: user 53.6 ms, sys: 15 ms, total: 68.6 ms
Wall time: 3min


# Validate the model for use

Finally, the customer can now validate the model for use. They can obtain the endpoint from the client library using the result from previous operations, and generate predictions from the trained model using that endpoint.


In [15]:
# Client for the SageMaker runtime API; the prediction cells below use it to call invoke_endpoint.
runtime_client = boto3.client("runtime.sagemaker", region_name=region)


## Download test data

In [16]:
import os

# Name of the test file, both in the sample bucket and in the local scratch directory.
FILE_TEST = "abalone.test"
s3 = boto3.client("s3")

# Local scratch directory for the downloaded file; created if it does not exist yet.
output_folder="generated"
os.makedirs(output_folder, exist_ok=True)
# The test file is read from the sample bucket (data_bucket / data_prefix defined in the data preparation cell).
test_data_key = f"{data_prefix}/test/{FILE_TEST}"
# Local path of the test file; the prediction cells below read it.
test_data_filename = os.path.join(output_folder, FILE_TEST)
print(f"download {data_bucket} {test_data_key} -> {test_data_filename}")
s3.download_file(data_bucket, test_data_key, test_data_filename)


download sagemaker-sample-files datasets/tabular/uci_abalone/test/abalone.test -> generated/abalone.test


## Start with a single prediction.

In [17]:
!head -1 generated/abalone.test > generated/abalone.single.test

In [18]:
%%time
import json
from itertools import islice
import math
import struct

# File holding the single libsvm record extracted with `head -1` in the previous cell.
single_test_data_filename = os.path.join(output_folder, "abalone.single.test")
file_name = single_test_data_filename
with open(file_name, "r") as single_record_file:
    payload = single_record_file.read().strip()

# Send the record to the endpoint as libsvm text.
response = runtime_client.invoke_endpoint(
    EndpointName=endpoint_name,
    ContentType="text/x-libsvm",
    Body=payload,
)

# The response body is decoded, split on commas, and every value is rounded up to an integer.
raw_predictions = response["Body"].read().decode("utf-8").split(",")
result = list(map(lambda raw_value: math.ceil(float(raw_value)), raw_predictions))

# The label is the first whitespace-separated token of the record.
label = payload.strip(" ").split()[0]
print(f"Label: {label}\nPrediction: {result[0]}")


Label: 12
Prediction: 19
CPU times: user 13.8 ms, sys: 4.31 ms, total: 18.1 ms
Wall time: 121 ms


## Run prediction against the batch file and compute the predictions accuracy.

The following functions are helpers to run the prediction against each item.

In [19]:
import sys
import math

def do_predict(data, endpoint_name, content_type):
    """Send one batch of records to the endpoint and return the rounded-up predictions.

    Args:
        data: iterable of strings, one record per item; the items are joined with newlines
            to form the request body.
        endpoint_name: name of the endpoint to invoke.
        content_type: value passed as the request ContentType.

    Returns:
        A list of integers: each line of the response body parsed as a float and rounded
        up with math.ceil.
    """
    request_body = "\n".join(data)
    response = runtime_client.invoke_endpoint(
        EndpointName=endpoint_name,
        ContentType=content_type,
        Body=request_body,
    )
    response_text = response["Body"].read().decode("utf-8")
    # One prediction per line; surrounding newlines are dropped before splitting.
    prediction_lines = response_text.strip("\n").split("\n")
    return [math.ceil(float(line)) for line in prediction_lines]


def batch_predict(data, batch_size, endpoint_name, content_type):
    """Run do_predict over `data` in consecutive slices of at most `batch_size` records.

    A "." is written to standard output after each slice as a progress indicator.

    Args:
        data: sequence of records; sliced and passed to do_predict.
        batch_size: maximum number of records sent per request.
        endpoint_name: name of the endpoint to invoke.
        content_type: value passed as the request ContentType.

    Returns:
        A list with the predictions of all the slices, in input order.
    """
    predictions = []
    total = len(data)

    for start in range(0, total, batch_size):
        # The last slice is simply shorter when `total` is not a multiple of `batch_size`.
        stop = min(start + batch_size, total)
        predictions += do_predict(data[start:stop], endpoint_name, content_type)
        sys.stdout.write(".")
    return predictions


The following helps us calculate the Median Absolute Percent Error (MdAPE) on the batch dataset.


In [20]:
%%time
import json
import numpy as np

with open(test_data_filename, "r") as test_file:
    payload = test_file.read().strip()

# One libsvm record per line; the label is the first space-separated token of each record.
test_data = payload.split("\n")
labels = [int(record.split(" ")[0]) for record in test_data]
preds = batch_predict(test_data, 100, endpoint_name, "text/x-libsvm")

# Absolute error of each prediction relative to its label; the median of these ratios is the MdAPE.
label_values = np.array(labels)
absolute_percent_errors = np.abs(label_values - np.array(preds)) / label_values
print(
    "\n Median Absolute Percent Error (MdAPE) = ",
    np.median(absolute_percent_errors),
)

.......
 Median Absolute Percent Error (MdAPE) =  0.14285714285714285
CPU times: user 23.8 ms, sys: 6.31 ms, total: 30.1 ms
Wall time: 197 ms


# Delete Endpoint

Once you are done using the endpoint, you can use the following to delete it.


In [21]:
# Delete the hosted endpoint. The API response is the cell's last expression, so it is displayed as output.
# This cell does not delete the endpoint configuration or the model created earlier.
client.delete_endpoint(EndpointName=endpoint_name)

{'ResponseMetadata': {'RequestId': 'e14fb2f2-99e8-490d-aa9f-85a676cf96a9',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'e14fb2f2-99e8-490d-aa9f-85a676cf96a9',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '0',
   'date': 'Mon, 26 Dec 2022 20:33:47 GMT'},
  'RetryAttempts': 0}}

# Data Preparation

TODO: the data used in this notebook is not split.

## Data split and upload

The following functions split the data into train/test/validation datasets and upload the files to S3.

In [None]:
import io
import boto3
import random


def data_split(
    FILE_DATA,
    FILE_TRAIN,
    FILE_VALIDATION,
    FILE_TEST,
    PERCENT_TRAIN,
    PERCENT_VALIDATION,
    PERCENT_TEST,
    seed=None,
):
    """Randomly split the lines of a text file into train, validation and test files.

    Each line of FILE_DATA is assigned to at most one output file. The size of each split is
    the given percentage of the number of input lines, truncated to an integer, so a few
    lines may be left out of all three files.

    Args:
        FILE_DATA: path of the input file, one record per line.
        FILE_TRAIN: path of the train file to write.
        FILE_VALIDATION: path of the validation file to write.
        FILE_TEST: path of the test file to write.
        PERCENT_TRAIN: percentage (0-100) of the lines written to FILE_TRAIN.
        PERCENT_VALIDATION: percentage (0-100) of the lines written to FILE_VALIDATION.
        PERCENT_TEST: percentage (0-100) of the lines written to FILE_TEST.
        seed: optional seed for a private random generator, which makes the split
            reproducible. When None (the default), the module-level `random` generator
            is used.

    Raises:
        ValueError: if the requested splits need more lines than the input file contains.
            The check runs before any output file is opened, so nothing is overwritten.
    """
    # Read the whole input up front; the context manager closes the file even on error.
    with open(FILE_DATA, "r") as data_file:
        data = data_file.readlines()

    num_of_data = len(data)
    # A negative percentage yields an empty split.
    split_sizes = [
        max(0, int((percent / 100.0) * num_of_data))
        for percent in (PERCENT_TRAIN, PERCENT_VALIDATION, PERCENT_TEST)
    ]
    if sum(split_sizes) > num_of_data:
        raise ValueError(
            f"requested splits need {sum(split_sizes)} lines but {FILE_DATA} only has {num_of_data}"
        )

    rng = random if seed is None else random.Random(seed)

    # Draw without replacement: every picked line is removed from the pool, so a line can
    # never end up in two splits.
    split_data = []
    for split_size in split_sizes:
        split_data.append(
            [data.pop(rng.randint(0, len(data) - 1)) for _ in range(split_size)]
        )

    # Every output file is written (possibly empty) and closed by its context manager.
    for path, lines in zip((FILE_TRAIN, FILE_VALIDATION, FILE_TEST), split_data):
        with open(path, "w") as split_file:
            split_file.writelines(lines)


def write_to_s3(fobj, bucket, key):
    """Upload a file object to S3 under the given bucket and key.

    A new boto3 session is created for the notebook's `region` on every call.

    Args:
        fobj: file-like object passed to upload_fileobj.
        bucket: name of the destination bucket.
        key: key of the destination object.

    Returns:
        Whatever upload_fileobj returns.
    """
    session = boto3.Session(region_name=region)
    target_object = session.resource("s3").Bucket(bucket).Object(key)
    return target_object.upload_fileobj(fobj)


def upload_to_s3(bucket, channel, filename):
    """Upload a local file to S3 as the object `<prefix>/<channel>`.

    `prefix` is the notebook-level variable; it must be defined before this function is called.
    The local file name is not part of the object key, so the logged URL shows the key that is
    actually written.

    Args:
        bucket: name of the destination bucket.
        channel: channel name (e.g. "train"), used as the last component of the object key.
        filename: path of the local file to upload.
    """
    key = f"{prefix}/{channel}"
    url = f"s3://{bucket}/{key}"
    print(f"Writing to {url}")
    # The context manager closes the local file once the upload has finished or failed.
    with open(filename, "rb") as fobj:
        write_to_s3(fobj, bucket, key)


## Data ingestion

Next, we read the dataset from the existing repository into memory, for preprocessing prior to training. This processing could be done in situ by Amazon Athena, Apache Spark in Amazon EMR, Amazon Redshift, etc., assuming the dataset is present in the appropriate location. Then, the next step would be to transfer the data to S3 for use in training. For small datasets, such as this one, reading into memory isn’t onerous, though it would be for larger datasets.

In [None]:
%%time
s3 = boto3.client("s3")

# Destination of the split files: the session's default bucket, under a dedicated prefix.
bucket = sagemaker.Session().default_bucket()
prefix = "sagemaker/DEMO-xgboost-abalone-default"

# Download the full dataset (libsvm format) from the sample bucket to a local file.
FILE_DATA = "abalone"
s3.download_file(
    Bucket="sagemaker-sample-files",
    Key="datasets/tabular/uci_abalone/abalone.libsvm",
    Filename=FILE_DATA,
)

# Names of the local split files and the share of the data each one receives.
FILE_TRAIN = "abalone.train"
FILE_VALIDATION = "abalone.validation"
FILE_TEST = "abalone.test"
PERCENT_TRAIN = 70
PERCENT_VALIDATION = 15
PERCENT_TEST = 15
data_split(
    FILE_DATA=FILE_DATA,
    FILE_TRAIN=FILE_TRAIN,
    FILE_VALIDATION=FILE_VALIDATION,
    FILE_TEST=FILE_TEST,
    PERCENT_TRAIN=PERCENT_TRAIN,
    PERCENT_VALIDATION=PERCENT_VALIDATION,
    PERCENT_TEST=PERCENT_TEST,
)

# Upload each split file to its channel in the S3 bucket.
for channel, split_file in (
    ("train", FILE_TRAIN),
    ("validation", FILE_VALIDATION),
    ("test", FILE_TEST),
):
    upload_to_s3(bucket, channel, split_file)