In [3]:
# Adding imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

# Define IAM role
import boto3
import re
import sagemaker
from sagemaker import get_execution_role

In [4]:
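# Utility helpers for moving files between the notebook instance and S3
# Reference: http://boto3.readthedocs.io/en/latest/guide/s3.html
# filename: Path of the local file to read from / write to
# bucket: Name of the S3 bucket
# key: Object key (path of the file inside the bucket)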


def write_to_s3(filename, bucket, key):
    """Upload a local file to S3.

    filename: path of the local file to upload
    bucket: name of the destination S3 bucket
    key: object key the file is stored under inside the bucket

    Returns whatever the S3 object's ``upload_fileobj`` call returns.
    """
    # Binary mode so the bytes are handed to S3 unchanged
    with open(filename, 'rb') as local_file:
        s3 = boto3.Session().resource('s3')
        target = s3.Bucket(bucket).Object(key)
        outcome = target.upload_fileobj(local_file)
    return outcome


def download_from_s3(filename, bucket, key):
    """Download an S3 object into a local file.

    filename: path of the local file to write (opened in binary write mode,
              so it is created or truncated before the download starts)
    bucket: name of the S3 bucket holding the object
    key: object key of the file inside the bucket

    Returns whatever the S3 object's ``download_fileobj`` call returns.
    """
    with open(filename, 'wb') as local_file:
        s3 = boto3.Session().resource('s3')
        source = s3.Bucket(bucket).Object(key)
        outcome = source.download_fileobj(local_file)
    return outcome

In [5]:
# Fetch the raw Iris dataset from S3 onto the notebook instance.
# The same name is used for the S3 object key and for the local copy.
raw_data_bucket = 'bornshrewd-aws-sagemaker-demo'
raw_data_file = 'iris_all.csv'
download_from_s3(raw_data_file, raw_data_bucket, raw_data_file)

# Load the downloaded CSV into a DataFrame
df = pd.read_csv(raw_data_file)

# Preview the first two rows to check what the data looks like
df.head(2)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa


In [6]:
# Goal: predict `class` (dependent variable) from sepal_length, sepal_width,
# petal_length and petal_width (independent variables).
# `class` holds string labels, so each label is mapped to an integer code
# with sklearn's LabelEncoder before training.

iris_classes = ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']
le = preprocessing.LabelEncoder().fit(iris_classes)

# Store the integer codes next to the original string labels
encoded_labels = le.transform(df['class'])
df['encoded_class'] = encoded_labels
df.head(2)


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class,encoded_class
0,5.1,3.5,1.4,0.2,Iris-setosa,0
1,4.9,3.0,1.4,0.2,Iris-setosa,0


In [7]:
# Column order of the exported files: the encoded label first, then the four features.
# The files are written without a header row, so consumers rely on this order
# (the training log below reports label_column=0).
columns = ['encoded_class','sepal_length','sepal_width','petal_length','petal_width']

# Randomising the dataset with a fixed seed so the split is reproducible.
# - `iloc` is positional, so shuffle row positions rather than index labels.
# - Start from index order every time: this cell rebinds `df`, and without the
#   sort a second run would shuffle the already-shuffled frame and produce a
#   different split. On a fresh kernel (default 0..n-1 index) the sort is a no-op.
np.random.seed(5)
shuffled_positions = list(range(len(df)))
np.random.shuffle(shuffled_positions)
df = df.sort_index().iloc[shuffled_positions]

# Generating training and validation row counts: 70% for training, the rest for validation
rows = df.shape[0]
train = int(.7 * rows)
test = rows - train  # always equals len(df[train:]), even when rows is not a multiple of 10

# Write Training Set to file without header
df[:train].to_csv('iris_train.csv',
                  index=False, header=False,
                  columns=columns)

# Write Validation Set to file without header
df[train:].to_csv('iris_validation.csv',
                  index=False, header=False,
                  columns=columns)

# Write Column List so the column order of the header-less files is recorded
with open('iris_train_column_list.txt','w') as f:
    f.write(','.join(columns))

In [8]:
# SageMaker training instances take their data from S3, so upload the
# training and validation files there first.

bucket_name = 'bornshrewd-aws-sagemaker-demo'
training_file_key = 'iris/iris_train.csv'
validation_file_key = 'iris/iris_validation.csv'

# S3 URIs for the model artifacts and for the two data files
s3_object_uri = r's3://{0}/{1}'
s3_model_output_location = r's3://{0}/iris/model'.format(bucket_name)
s3_training_file_location = s3_object_uri.format(bucket_name, training_file_key)
s3_validation_file_location = s3_object_uri.format(bucket_name, validation_file_key)

# Upload each local file under its S3 key (training first, then validation)
for local_file, s3_key in (('iris_train.csv', training_file_key),
                           ('iris_validation.csv', validation_file_key)):
    write_to_s3(local_file, bucket_name, s3_key)

In [9]:
# AWS ships its built-in ML algorithms as Docker images stored in a container registry.
# Each image is identified by a registry path, and a SageMaker training job is told
# which algorithm to run by giving it that path.
# https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-algo-docker-registry-paths.html

# (region, registry path of the XGBoost image in that region)
xgboost_registry_paths = [
    ('us-west-2', '433757028032.dkr.ecr.us-west-2.amazonaws.com/xgboost:latest'),
    ('us-east-1', '811284229777.dkr.ecr.us-east-1.amazonaws.com/xgboost:latest'),
    ('us-east-2', '825641698319.dkr.ecr.us-east-2.amazonaws.com/xgboost:latest'),
    ('eu-west-1', '685385470294.dkr.ecr.eu-west-1.amazonaws.com/xgboost:latest'),
]

# containers maps a region name to its registry path; only the regions listed above are covered
containers = dict(xgboost_registry_paths)

In [10]:
# IAM role that was given to this notebook instance at launch to grant it the
# required permissions; it is passed to the estimator below.
# NOTE(review): the printed ARN includes the AWS account id - consider clearing
# this cell's output before sharing the notebook.
role = get_execution_role()
print(role)

arn:aws:iam::138934935150:role/service-role/AmazonSageMaker-ExecutionRole-20190126T125419


In [11]:
# Establishing a SageMaker Session
sess = sagemaker.Session()

# Registry path of the XGBoost image for the region the current boto3 session uses.
# Raises KeyError when that region is not one of the keys of `containers`.
training_image = containers[boto3.Session().region_name]

# Creating the estimator. Reference: http://sagemaker.readthedocs.io/en/latest/estimators.html
estimator_settings = dict(
    train_instance_count=1,                # how many instances to use for training
    train_instance_type='ml.m4.xlarge',    # what type of machine to train on
    output_path=s3_model_output_location,  # where the trained model artifacts are stored
    sagemaker_session=sess,
    base_job_name='xgboost-iris-v1',       # name prefix for the training job
)

# `role` is the role the estimator can assume to access our data files and resources
estimator = sagemaker.estimator.Estimator(training_image, role, **estimator_settings)

In [12]:
# Specifying hyperparameters that are appropriate for the training algorithm
# XGBoost Training Parameter Reference: https://github.com/dmlc/xgboost/blob/master/doc/parameter.md
xgboost_hyperparameters = {
    'max_depth': 5,
    'objective': "multi:softmax",
    'num_class': 3,   # matches the three class labels given to the LabelEncoder
    'num_round': 50,
}
estimator.set_hyperparameters(**xgboost_hyperparameters)

# Display the hyperparameters now registered on the estimator
estimator.hyperparameters()

{'max_depth': 5, 'objective': 'multi:softmax', 'num_class': 3, 'num_round': 50}

In [13]:
# Describe both datasets as CSV inputs (XGBoost accepts libsvm or csv content).
# One s3_input is built per S3 location: training first, then validation.
training_input_config, validation_input_config = (
    sagemaker.session.s3_input(s3_data=s3_location, content_type="csv")
    for s3_location in (s3_training_file_location, s3_validation_file_location)
)

In [14]:
# Show the request configuration generated for each input channel
for input_config in (training_input_config, validation_input_config):
    print(input_config.config)

{'DataSource': {'S3DataSource': {'S3DataDistributionType': 'FullyReplicated', 'S3DataType': 'S3Prefix', 'S3Uri': 's3://bornshrewd-aws-sagemaker-demo/iris/iris_train.csv'}}, 'ContentType': 'csv'}
{'DataSource': {'S3DataSource': {'S3DataDistributionType': 'FullyReplicated', 'S3DataType': 'S3Prefix', 'S3Uri': 's3://bornshrewd-aws-sagemaker-demo/iris/iris_validation.csv'}}, 'ContentType': 'csv'}


In [15]:
# XGBoost supports the "train" and "validation" channels
# Algorithm reference: https://docs.aws.amazon.com/sagemaker/latest/dg/xgboost.html
data_channels = {
    'train': training_input_config,
    'validation': validation_input_config,
}

# Training the model (the call streams the training job log into the cell output)
estimator.fit(data_channels)

INFO:sagemaker:Creating training-job with name: xgboost-iris-v1-2019-02-10-12-31-31-826


2019-02-10 12:31:31 Starting - Starting the training job...
2019-02-10 12:31:34 Starting - Launching requested ML instances......
2019-02-10 12:32:41 Starting - Preparing the instances for training.........
2019-02-10 12:34:28 Downloading - Downloading input data
2019-02-10 12:34:28 Training - Downloading the training image.
[31mArguments: train[0m
[31m[2019-02-10:12:34:33:INFO] Running standalone xgboost training.[0m
[31m[2019-02-10:12:34:33:INFO] File size need to be processed in the node: 0.0mb. Available memory size in the node: 8443.91mb[0m
[31m[2019-02-10:12:34:33:INFO] Determined delimiter of CSV input is ','[0m
[31m[12:34:33] S3DistributionType set as FullyReplicated[0m
[31m[12:34:33] 105x4 matrix with 420 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[31m[2019-02-10:12:34:33:INFO] Determined delimiter of CSV input is ','[0m
[31m[12:34:33] S3DistributionType set as FullyReplicated[0m
[31m[12:34:33] 45x4 matrix with 180 en


2019-02-10 12:34:42 Uploading - Uploading generated training model
2019-02-10 12:34:42 Completed - Training job completed
Billable seconds: 39


In [16]:
# Deploying the model behind a real-time endpoint
# Ref: http://sagemaker.readthedocs.io/en/latest/estimators.html
# NOTE(review): no cell in this view deletes the endpoint afterwards - consider
# adding a cleanup step once predictions are done.
hosting_settings = {
    'initial_instance_count': 1,         # number of compute instances hosting the model
    'instance_type': 'ml.m4.xlarge',     # type of hosting instance
    'endpoint_name': 'xgboost-iris-v1',  # name of the endpoint to be created
}
predictor = estimator.deploy(**hosting_settings)

INFO:sagemaker:Creating model with name: xgboost-2019-02-10-13-15-55-760
INFO:sagemaker:Creating endpoint with name xgboost-iris-v1


---------------------------------------------------------------------------!

In [17]:
# Prepare the predictor for running predictions
from sagemaker.predictor import csv_serializer, json_deserializer

# Requests: rows are serialized with csv_serializer and sent as text/csv
predictor.serializer = csv_serializer
predictor.content_type = 'text/csv'
# Responses: no deserializer is set (the prediction output below is a bytes value)
predictor.deserializer = None

In [19]:
# Two sample rows in the feature order used for training:
# sepal_length, sepal_width, petal_length, petal_width
sample_features = [[4.8,3.4,1.6,0.2],[5.8,2.7,4.1,1.0]]
predictor.predict(sample_features)

b'0.0,1.0'