In [None]:
import numpy as np
import pandas as pd

import boto3
import re

import sagemaker
from sagemaker import get_execution_role

## Upload Data to S3

In [None]:
# S3 layout for this notebook: the bucket, the object key of the training
# file, and the URIs derived from them for model output and training input.
bucket_name = 'bsb4018-ml-sagemaker'
training_file_key = 'biketrain/bike_train_numeric_columns.recordio'

s3_model_output_location = f's3://{bucket_name}/biketrain/model'
s3_training_file_location = f's3://{bucket_name}/{training_file_key}'

In [None]:
# Show the resolved S3 URIs (model output first, then training input), one per line.
print(s3_model_output_location, s3_training_file_location, sep='\n')

In [None]:
def write_to_s3(filename, bucket, key):
    """Upload a local file to an S3 object.

    Args:
        filename: Path of the local file to upload.
        bucket: Name of the destination S3 bucket.
        key: Object key the file is stored under in that bucket.

    Returns:
        The value returned by the S3 object's ``upload_fileobj`` call.
    """
    # Open in binary mode so the file's bytes are handed to S3 as-is.
    with open(filename, 'rb') as source:
        s3_object = boto3.Session().resource('s3').Bucket(bucket).Object(key)
        return s3_object.upload_fileobj(source)

In [None]:
# Upload the local training file to its key in the bucket.
write_to_s3(
    filename='bike_train_numeric_columns.recordio',
    bucket=bucket_name,
    key=training_file_key,
)

## Training Algorithm Docker Image
### AWS maintains a separate image for every region and algorithm

In [None]:
# Spot-training settings for the job.
use_spot_instances = True
max_run = 3600  # in seconds
max_wait = None  # in seconds; only set when spot instances are used
if use_spot_instances:
    max_wait = 3600

job_name = 'pca-biketrain-v1'

# A checkpoint location is configured only when spot instances are used.
# NOTE(review): this prefix is 'bikerental' while the other S3 paths in this
# notebook use 'biketrain' -- confirm that is intentional.
checkpoint_s3_uri = (
    f's3://{bucket_name}/bikerental/checkpoints/{job_name}'
    if use_spot_instances
    else None
)

print(f'Checkpoint uri: {checkpoint_s3_uri}')

In [None]:
# SageMaker session; reused below for the region lookup and passed to the estimator.
sess = sagemaker.Session()

In [None]:
# Execution role that is handed to the estimator below.
role = get_execution_role()

In [None]:
# SDK 2 uses image_uris.retrieve to look up the container image location
# for the given algorithm in the session's region.
container = sagemaker.image_uris.retrieve(
    framework="pca",
    region=sess.boto_region_name,
)

print(f'Using pca Container {container}')

## Build Model

In [None]:
# Configure the training job: which container to run, on what instance,
# where the model artifact goes, and the spot/checkpoint settings defined above.
estimator = sagemaker.estimator.Estimator(
    image_uri=container,
    role=role,
    instance_count=1,
    instance_type='ml.m5.xlarge',
    output_path=s3_model_output_location,
    sagemaker_session=sess,
    base_job_name=job_name,
    use_spot_instances=use_spot_instances,
    max_run=max_run,
    max_wait=max_wait,
    checkpoint_s3_uri=checkpoint_s3_uri,
)

In [None]:
# Specify hyperparameters that are appropriate for the training algorithm.
# feature_dim=4 matches the four values per row sent to the endpoint later in this notebook.
estimator.set_hyperparameters(feature_dim=4,
                        num_components=3,
                        subtract_mean=False,
                        algorithm_mode='regular',
                        mini_batch_size=200)

In [None]:
# Display the hyperparameters currently set on the estimator.
estimator.hyperparameters()

### Train the model

In [None]:
# Only the "train" channel is supplied; it points at the .recordio training file uploaded to S3 above.
estimator.fit({'train':s3_training_file_location})

## Deploy Model

In [None]:
# Deploy the trained model to a single-instance endpoint whose name is job_name.
predictor = estimator.deploy(initial_instance_count=1,
                             instance_type='ml.m5.xlarge',
                             endpoint_name = job_name)

## Run Predictions

In [None]:
# SDK 2.0 serializers: requests go through CSVSerializer and responses
# through JSONDeserializer.
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import JSONDeserializer
predictor.serializer = CSVSerializer()
predictor.deserializer = JSONDeserializer()

In [None]:
# Send one row of four feature values (feature_dim=4 above) to the endpoint.
predictor.predict([[-1.333660693,-1.092736969,0.993213054,1.567753667]])

In [None]:
# Delete the endpoint created by the deploy step to prevent unnecessary charges.
predictor.delete_endpoint()