### XGBoost Built-in Algorithm - Bike Rental Regression Example 

In [None]:
import numpy as np
import pandas as pd

import boto3
import re

import sagemaker
from sagemaker import get_execution_role

##### Upload Data to S3

In [None]:
# Name of the S3 bucket that holds the datasets and the model artifacts.
# Replace this with a bucket you own.
bucket_name = 'bsb557254-bp-pred-bucket'

# Key prefixes (relative to the bucket) for each dataset.
# The trailing slash is kept so a file name can be appended directly.
training_folder = 'bikerental/training/'
validation_folder = 'bikerental/validation/'
test_folder = 'bikerental/test/'

# Fully qualified S3 URIs derived from the bucket name and the prefixes above.
s3_model_output_location = f's3://{bucket_name}/bikerental/model'
s3_training_file_location = f's3://{bucket_name}/{training_folder}'
s3_validation_file_location = f's3://{bucket_name}/{validation_folder}'
s3_test_file_location = f's3://{bucket_name}/{test_folder}'

In [None]:
# Show the resolved S3 locations, one per line, as a quick sanity check.
print(
    s3_model_output_location,
    s3_training_file_location,
    s3_validation_file_location,
    s3_test_file_location,
    sep='\n',
)

In [None]:
def write_to_s3(filename, bucket, key):
    """Upload a local file to S3.

    Args:
        filename: Path of the local file to upload.
        bucket: Name of the destination S3 bucket.
        key: Object key (path inside the bucket) to write to.

    Returns:
        The result of ``upload_fileobj``, passed through unchanged.
    """
    # Binary mode: the bytes are streamed to S3 as-is, with no text decoding.
    with open(filename, 'rb') as source:
        s3 = boto3.Session().resource('s3')
        target = s3.Bucket(bucket).Object(key)
        return target.upload_fileobj(source)

In [None]:
# Each local CSV is uploaded under its dataset prefix, keeping the file name.
uploads = (
    ('bike_train.csv', training_folder),
    ('bike_validation.csv', validation_folder),
    ('bike_test.csv', test_folder),
)

for local_file, folder in uploads:
    write_to_s3(local_file, bucket_name, folder + local_file)

##### Training Algorithm Docker Image - SageMaker maintains a separate image for each algorithm and region

In [None]:
# Train on spot instances
use_spot_instances = True
max_run = 3600  # in seconds

# max_wait (in seconds) is only set when spot instances are requested
if use_spot_instances:
    max_wait = 7200
else:
    max_wait = None

job_name = 'xgboost-bikerental-v1'

# A checkpoint location is only configured for spot training
checkpoint_s3_uri = (
    f's3://{bucket_name}/bikerental/checkpoints/{job_name}'
    if use_spot_instances
    else None
)

print(f'Checkpoint uri: {checkpoint_s3_uri}')

In [None]:
# Establish a session with AWS.
# This session is handed to the Estimator and is also used to look up the
# current region when retrieving the XGBoost container image.
sess = sagemaker.Session()

In [None]:
# Look up the execution role of this notebook environment; it is passed to
# the Estimator when the training job is configured.
role = get_execution_role()

In [None]:
# This role contains the permissions needed to train and deploy models.
# The SageMaker service is trusted to assume this role.
print(role)

In [None]:
# Resolve the XGBoost image URI for the region of the current session.
container = sagemaker.image_uris.retrieve(
    framework="xgboost",
    region=sess.boto_region_name,
    version="1.2-2",
)
print(f'Using XGBoost Container {container}')

##### Build Model

In [None]:
# Configure the training job:
#   - which container image to run and which role to run it under
#   - the type and number of instances to use
#   - the S3 location where the final artifacts are stored
#   - spot-instance settings (time limits and checkpoint location)
estimator = sagemaker.estimator.Estimator(
    image_uri=container,
    role=role,
    sagemaker_session=sess,
    base_job_name=job_name,
    instance_type='ml.m5.xlarge',
    instance_count=1,
    output_path=s3_model_output_location,
    use_spot_instances=use_spot_instances,
    max_run=max_run,
    max_wait=max_wait,
    checkpoint_s3_uri=checkpoint_s3_uri,
)

In [None]:
# Specify hyperparameters that are appropriate for the training algorithm.
# See the XGBoost training parameter reference for the meaning of each entry.
xgb_hyperparameters = {
    'max_depth': 5,
    'objective': "reg:squarederror",
    'eta': 0.1,
    'num_round': 150,
}

estimator.set_hyperparameters(**xgb_hyperparameters)

In [None]:
# Display the hyperparameters currently set on the estimator (cell output).
estimator.hyperparameters()

##### Specify Training Data Location and, Optionally, Validation Data Location

In [None]:
# Both channels read CSV files from an S3 prefix.
# (For XGBoost the content type can be libsvm or csv.)
csv_prefix_options = {
    'content_type': 'csv',
    's3_data_type': 'S3Prefix',
}

training_input_config = sagemaker.session.TrainingInput(
    s3_data=s3_training_file_location, **csv_prefix_options)

validation_input_config = sagemaker.session.TrainingInput(
    s3_data=s3_validation_file_location, **csv_prefix_options)

# Channel name -> input configuration, as passed to estimator.fit()
data_channels = dict(
    train=training_input_config,
    validation=validation_input_config,
)

In [None]:
# Show the channel configuration built for each input, one per line.
print(
    training_input_config.config,
    validation_input_config.config,
    sep='\n',
)

##### Train the model

In [None]:
# Start training with the 'train' and 'validation' channels from data_channels.
estimator.fit(data_channels)

##### Deploy Model

In [None]:
# Deploy the trained model to an endpoint; the endpoint reuses the job name.
predictor = estimator.deploy(
    endpoint_name=job_name,
    instance_type='ml.m5.xlarge',
    initial_instance_count=1,
)

##### Run Predictions

In [None]:
from sagemaker.serializers import CSVSerializer

# A single observation to score.
# NOTE(review): the feature order presumably matches the columns of the
# training CSV - confirm against the data preparation step.
sample_rows = [[3,0,1,2,28.7,33.335,79,12.998,2011,7,7,3]]

# Requests are sent to the endpoint serialized as CSV.
predictor.serializer = CSVSerializer()
predictor.predict(sample_rows)