In [1]:
import numpy as np
import pandas as pd

# Define IAM Roles
import boto3
import re
import sagemaker
from sagemaker import get_execution_role

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


In [2]:
# Name of the S3 bucket used by this notebook (create the bucket in S3 first).
bucket_name = "ani-sagemaker-edu"

# Object keys (paths inside the bucket) of the train / validation / test CSVs.
training_file_key, validation_file_key, test_file_key = (
    "biketrain/bike_train.csv",
    "biketrain/bike_val.csv",
    "biketrain/bike_test.csv",
)

In [3]:
# Full S3 URIs built from the bucket name and the object keys.
# The model location is a prefix under which training output is written.
s3_model_output_location = f"s3://{bucket_name}/biketrain/model"
s3_training_file_location = f"s3://{bucket_name}/{training_file_key}"
s3_validation_file_location = f"s3://{bucket_name}/{validation_file_key}"
s3_test_file_location = f"s3://{bucket_name}/{test_file_key}"


In [4]:
# Sanity check: show the resolved S3 URIs, one per line.
print(
    s3_model_output_location,
    s3_training_file_location,
    s3_validation_file_location,
    s3_test_file_location,
    sep="\n",
)

s3://ani-sagemaker-edu/biketrain/model
s3://ani-sagemaker-edu/biketrain/bike_train.csv
s3://ani-sagemaker-edu/biketrain/bike_val.csv
s3://ani-sagemaker-edu/biketrain/bike_test.csv


In [5]:
# S3 naming Conventions
# files = objects in S3
# filename = key name in S3

def write_to_s3(filename, bucket, key, session=None):
    """Upload a local file to S3 as the object ``s3://<bucket>/<key>``.

    Args:
        filename: Path of the local file to upload.
        bucket: Name of the destination S3 bucket.
        key: Object key, i.e. the "file name" inside the bucket.
        session: Optional session object exposing ``resource('s3')`` (for
            example a ``boto3.Session``). When omitted, a new
            ``boto3.Session()`` is created for this call, as before. Passing
            one lets several uploads share a session and lets tests inject
            a stand-in.

    Returns:
        Whatever ``upload_fileobj`` returns for the target object.

    Raises:
        OSError: If ``filename`` cannot be opened for reading.
    """
    with open(filename, 'rb') as f:  # binary mode: upload the bytes unchanged
        if session is None:
            session = boto3.Session()
        s3_object = session.resource('s3').Bucket(bucket).Object(key)
        return s3_object.upload_fileobj(f)

In [6]:
# Upload the three local CSV splits to their object keys in the bucket.
write_to_s3(filename='bike_train.csv', bucket=bucket_name, key=training_file_key)
write_to_s3(filename='bike_val.csv', bucket=bucket_name, key=validation_file_key)
write_to_s3(filename='bike_test.csv', bucket=bucket_name, key=test_file_key)

### Training Algorithm Docker Image


AWS maintains a separate Docker image for each combination of region and algorithm.

In [7]:
# Registry paths for the algorithms provided by SageMaker:
# https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-algo-docker-registry-paths.html
#
# Maps an AWS region name to the XGBoost image URI for that region.
# Only us-east-1 is listed, so looking up any other region raises KeyError.
containers = {
    # N. Virginia
    'us-east-1': '683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-xgboost:1.5-1',
}

In [8]:
# IAM execution role of this notebook; it is handed to the estimator below.
role = sagemaker.get_execution_role()

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


In [9]:
# This role carries the permissions needed to train and deploy models;
# the SageMaker service is trusted to assume it.
# NOTE(review): the printed ARN contains the AWS account ID. Consider clearing
# this cell's output before sharing the notebook.
print(role)

arn:aws:iam::205653679088:role/service-role/AmazonSageMaker-ExecutionRole-20231012T130147


### Build Model

In [10]:
# SageMaker session object; passed to the estimators below as sagemaker_session.
sess = sagemaker.Session()

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


In [11]:
# Configure the training job:
#   * the algorithm container image registered for the current region,
#   * how many instances to train on and which machine type to use,
#   * where in S3 the trained model artifacts need to be stored,
#   * optionally, a base name for the training job (base_job_name).
# Reference: http://sagemaker.readthedocs.io/en/latest/estimators.html
#
# NOTE(review): `estimator` is assigned again by the Debugger-enabled cell
# that follows, so this plain configuration is superseded when both cells run.
estimator = sagemaker.estimator.Estimator(
    image_uri=containers[boto3.Session().region_name],
    role=role,
    instance_count=1,
    instance_type='ml.m4.xlarge',
    output_path=s3_model_output_location,
    sagemaker_session=sess,
    base_job_name='xgboost-biketrain-vl',
)

In [35]:
from sagemaker.debugger import rule_configs, Rule, DebuggerHookConfig, CollectionConfig
from sagemaker.estimator import Estimator
# Value sent as "save_interval" to every Debugger collection configured below.
save_interval = 5

# Estimator with SageMaker Debugger enabled:
#   * the hook saves the "metrics", "feature_importance", "full_shap" and
#     "average_shap" collections, each with the same save_interval,
#   * the built-in loss_not_decreasing rule watches the "metrics" collection
#     with num_steps set to twice the save interval.
estimator = sagemaker.estimator.Estimator(
    containers[boto3.Session().region_name],
    role,
    instance_count=1,
    instance_type='ml.m4.xlarge',
    output_path=s3_model_output_location,
    sagemaker_session=sess,
    base_job_name='xgboost-biketrain-vl',
    debugger_hook_config=DebuggerHookConfig(
        s3_output_path=s3_model_output_location,  # Required
        collection_configs=[
            CollectionConfig(
                name=collection_name,
                parameters={"save_interval": str(save_interval)},
            )
            for collection_name in (
                "metrics",
                "feature_importance",
                "full_shap",
                "average_shap",
            )
        ],
    ),
    rules=[
        Rule.sagemaker(
            rule_configs.loss_not_decreasing(),
            rule_parameters={
                "collection_names": "metrics",
                "num_steps": str(save_interval * 2),
            },
        ),
    ],
)

In [36]:
# Specify the hyperparameters that are appropriate for the training algorithm
# (XGBoost training parameters).
# Previously tried: max_depth = 5, eta = 0.1, subsample = 0.7, num_round = 150
#
# NOTE(review): XGBoost documents "reg:linear" as a deprecated alias of
# "reg:squarederror"; consider switching after confirming against the
# container version in use.
estimator.set_hyperparameters(
    max_depth=5,
    objective="reg:linear",
    eta=0.1,
    subsample=0.7,
    num_round=200,
)
estimator.hyperparameters()

{'max_depth': 5,
 'objective': 'reg:linear',
 'eta': 0.1,
 'subsample': 0.7,
 'num_round': 200}

In [20]:
# not for execution
# How to specify hyperparameters for autotuning

# hyperparameter_ranges_linear = {
# 'alpha':ContinuousParameterRange(0.01, 10, scaling_type = "Linear"),
# 'lambda':ContinuousParameterRange(0.01, 10, scaling_type = "Linear"),
# 'num_round':ContinuousParameterRange(1, 200, scaling_type = "Linear")
# }


# tuner_linear = HyperparameterTuner(
#                 estimator,
#                 objective_metric_name,
#                 hyperparameter_ranges_linear,
#                 max_jobs = 20,
#                 max_parallel_jobs = 10,
#                 strategy= 'Random')


# tuner_linear.fit({'train': s3_input_train, 'validation': s3_input_validation}, include_cls_metadata = False)


### Specify Training Data Location and Optionally, Validation Data Location

In [37]:
# Describe the training and validation channels for the training job.
# XGBoost accepts "libsvm" or "csv" as the content type; these files are CSV.
# (Earlier revisions of this cell built the same inputs with
# sagemaker.session.s3_input(s3_data=..., content_type="csv").)
training_input_config, validation_input_config = (
    sagemaker.inputs.TrainingInput(s3_data=s3_location, content_type="csv")
    for s3_location in (s3_training_file_location, s3_validation_file_location)
)

In [38]:
# Show the channel definitions that will be sent with the training request.
print(
    training_input_config.config,
    validation_input_config.config,
    sep="\n",
)

{'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix', 'S3Uri': 's3://ani-sagemaker-edu/biketrain/bike_train.csv', 'S3DataDistributionType': 'FullyReplicated'}}, 'ContentType': 'csv'}
{'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix', 'S3Uri': 's3://ani-sagemaker-edu/biketrain/bike_val.csv', 'S3DataDistributionType': 'FullyReplicated'}}, 'ContentType': 'csv'}


### Train the model

In [39]:
# Launch the training job. XGBoost supports the "train" and "validation"
# channels; each one is mapped to its input configuration here.
estimator.fit(
    inputs={
        'train': training_input_config,
        'validation': validation_input_config,
    }
)

INFO:sagemaker.image_uris:Defaulting to the only supported framework/algorithm version: latest.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker:Creating training-job with name: xgboost-biketrain-vl-2023-10-27-19-28-14-340


2023-10-27 19:28:14 Starting - Starting the training job...LossNotDecreasing: InProgress
......
2023-10-27 19:29:32 Starting - Preparing the instances for training......
2023-10-27 19:30:45 Downloading - Downloading input data......
2023-10-27 19:31:32 Training - Downloading the training image...
2023-10-27 19:32:12 Training - Training image download completed. Training in progress...[34m[2023-10-27 19:32:19.248 ip-10-2-195-249.ec2.internal:7 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2023-10-27 19:32:19.281 ip-10-2-195-249.ec2.internal:7 INFO profiler_config_parser.py:111] User has disabled profiler.[0m
[34m[2023-10-27:19:32:19:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2023-10-27:19:32:19:INFO] Failed to parse hyperparameter objective value reg:linear to Json.[0m
[34mReturning the value itself[0m
[34m[2023-10-27:19:32:19:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2023-10-27:19:32:19:INFO] Running XGBoost Sag

[34m[91]#011train-rmse:0.25196#011validation-rmse:0.29434[0m
[34m[92]#011train-rmse:0.25152#011validation-rmse:0.29397[0m
[34m[93]#011train-rmse:0.25092#011validation-rmse:0.29361[0m
[34m[94]#011train-rmse:0.25028#011validation-rmse:0.29339[0m
[34m[95]#011train-rmse:0.24956#011validation-rmse:0.29306[0m
[34m[96]#011train-rmse:0.24867#011validation-rmse:0.29278[0m
[34m[97]#011train-rmse:0.24796#011validation-rmse:0.29256[0m
[34m[98]#011train-rmse:0.24741#011validation-rmse:0.29235[0m
[34m[99]#011train-rmse:0.24705#011validation-rmse:0.29221[0m
[34m[100]#011train-rmse:0.24681#011validation-rmse:0.29229[0m
[34m[101]#011train-rmse:0.24603#011validation-rmse:0.29180[0m
[34m[102]#011train-rmse:0.24534#011validation-rmse:0.29154[0m
[34m[103]#011train-rmse:0.24486#011validation-rmse:0.29125[0m
[34m[104]#011train-rmse:0.24422#011validation-rmse:0.29078[0m
[34m[105]#011train-rmse:0.24354#011validation-rmse:0.29057[0m
[34m[106]#011train-rmse:0.24292#011validation-rm

In [63]:
# import pickle as pkl
# model_file = 'xgboost-model'

# booster = pkl.load(open(model_file, 'rb'))
# booster.get_score()
# booster.get_fscore()

In [64]:
# to get the feature importance from the xgboost model trained


# import s3fs
# import pickle
# import tarfile



# model_path = 's3://ani-sagemaker-edu/biketrain/model/xgboost-biketrain-vl-2023-10-27-19-28-14-340/output/model.tar.gz'

# fs = s3fs.S3FileSystem()

# with fs.open(model_path, 'rb') as f:
#     with tarfile.open(fileobj=f, mode='r') as tar_f:
#         with tar_f.extractfile('xgboost-model') as extracted_f:
#             xgbooster = pickle.load(extracted_f, 'rb')

# xgbooster.get_fscore()

### Deploy Model

In [66]:
# Deploy the trained model behind a real-time endpoint on a single instance.
# NOTE(review): the endpoint name is fixed and no delete_endpoint call is
# visible in this notebook. Re-running this cell presumably requires removing
# the existing endpoint first, and the endpoint stays up until deleted.
predictor = estimator.deploy(
    initial_instance_count=1,
    instance_type='ml.m4.xlarge',
    endpoint_name='xgboost-biketrain-vl',
)

INFO:sagemaker:Creating model with name: xgboost-biketrain-vl-2023-10-27-19-56-13-868
INFO:sagemaker:Creating endpoint-config with name xgboost-biketrain-vl
INFO:sagemaker:Creating endpoint with name xgboost-biketrain-vl


-------!

### Run predictions

In [27]:
#from sagemaker.predictor import csv_serializer, json_deserializer


# predictor.content_type = 'text/csv'
# predictor.serializer = CSVSerializer
# predictor.deserializer = CSVDeserializer

In [67]:
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import JSONDeserializer

# Send feature rows to the endpoint as CSV text.
predictor.serializer = CSVSerializer()

# Parse responses as JSON. The cells below index the response as
# result['predictions'][i]['score'], which needs a parsed dict. Previously that
# worked only because a JSON deserializer had been attached by a cell that sits
# further down in the notebook but was executed earlier. Setting it here makes
# a top-to-bottom run (Restart & Run All) behave the same way.
predictor.deserializer = JSONDeserializer()


In [94]:
# Score a single observation: one row of 13 feature values.
predictor.predict([[3,0,1,2,28.7,33.335,79,12.998,2011,7,7,3,8]])

{'predictions': [{'score': 6.008012294769287}]}

In [107]:
# Score two observations in one request; the rows are sent in this order.
result = predictor.predict(
    [
        [3, 0, 1, 2, 28.7, 33.335, 79, 12.998, 2011, 7, 7, 3, 8],
        [3, 0, 1, 1, 32.8, 34.85, 33, 7.0015, 2012, 8, 13, 0, 14],
    ]
)

In [109]:
# Take the score at index 1 of the response, i.e. the second prediction.
score = result['predictions'][1]['score']
score

5.5548529624938965

In [101]:
# Coerce the score to a built-in Python float before further arithmetic.
score = float(score)

In [102]:
# NOTE(review): the output recorded below (6.008...) differs from the index-1
# score shown above (5.554...). The execution counts indicate these cells were
# run out of order, so the stored outputs are stale relative to the code.
score

6.008012294769287

In [103]:
# Map the prediction back to the original scale with expm1 (inverse of log1p).
# Presumably the training target was log1p-transformed; confirm against the
# data-preparation notebook.
np.expm1(score)  # since log transformations were applied

405.6741679912885

In [76]:
from sagemaker.predictor import Predictor
from sagemaker.content_types import CONTENT_TYPE_CSV, CONTENT_TYPE_JSON
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import JSONDeserializer

# Attach to the endpoint by name (the same name used in the deploy cell)
# instead of relying on the object returned by estimator.deploy().
predictor = Predictor('xgboost-biketrain-vl')
# Request body is CSV; the response is requested as JSON.
# NOTE(review): whether content_type / accept can be assigned depends on the
# installed SageMaker SDK version. Confirm before changing statement order.
predictor.content_type = CONTENT_TYPE_CSV
predictor.accept = CONTENT_TYPE_JSON
predictor.serializer = CSVSerializer()

# Example CSV payload kept for reference. It lists 12 values, while the
# predict calls in this notebook send 13 per row.
#payload = '3,0,1,2,28.7,33.335,79,12.998,2011,7,7,3'
# Parse the JSON response body into Python objects.
predictor.deserializer = JSONDeserializer()

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


In [77]:
# Score one observation (13 feature values) through the re-attached predictor
# and print the parsed response.
print(predictor.predict([[3,0,1,1,32.8,34.85,33,7.0015,2012,8,13,0,14]]))

{'predictions': [{'score': 5.5548529624938965}]}


In [105]:
# Response copied from the endpoint output shown above, hard-coded so the
# score can be extracted without calling the endpoint again.
result = {
    "predictions": [
        {"score": 5.5548529624938965},
    ],
}
score = result["predictions"][0]["score"]
score

5.5548529624938965

In [106]:
# Convert the prediction back to the original scale with expm1 and compare it
# with the known true value for this row.
np.expm1(score)  # actual was 248

257.4889541549934