In [None]:
!pip install yapf

## Iris Multi-class Classifier on Sagemaker

### Imports 

In [88]:
from sklearn.model_selection import train_test_split
import pandas as pd 
import boto3

### Load iris data and create train and test splits 

In [89]:
cols = ['sepal length', 'sepal width', 'petal length', 'petal width', 'label']
data = pd.read_csv('iris.data', names=cols)
data.head(5)

Unnamed: 0,sepal length,sepal width,petal length,petal width,label
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [90]:
print(data.shape)

(150, 5)


In [91]:
# random sample (shuffle) data | here frac=1 means 100% of data
data = data.sample(frac=1).reset_index(drop=True)

In [92]:
data.head(5)

Unnamed: 0,sepal length,sepal width,petal length,petal width,label
0,5.0,2.3,3.3,1.0,Iris-versicolor
1,5.9,3.0,5.1,1.8,Iris-virginica
2,5.7,3.8,1.7,0.3,Iris-setosa
3,4.3,3.0,1.1,0.1,Iris-setosa
4,5.8,2.7,5.1,1.9,Iris-virginica


In [93]:
train, test = train_test_split(data, test_size=0.3)

In [94]:
test.shape

(45, 5)

In [95]:
train.to_csv('./input/train.csv')
test.to_csv('./input/test.csv')

### Upload train csv to S3 

In [96]:
bucket = 'arunprsh-sg-scikit-example'
region = 'us-east-1'
s3_session = boto3.Session().resource('s3')
s3_session.create_bucket(Bucket=bucket)
s3_session.Bucket(bucket).Object('train/train.csv').upload_file('./input/train.csv')

### Prep the Model script 

In [97]:
%%file sg_iris_train.py

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.externals import joblib
import pandas as pd
import numpy as np
import argparse
import os


# Dictionary to convert labels to indices
LABEL_TO_INDEX = {'Iris-virginica': 0, 'Iris-versicolor': 1, 'Iris-setosa': 2}
# Dictionary to convert indices to labels
INDEX_TO_LABEL = {0: 'Iris-virginica', 1: 'Iris-versicolor', 2: 'Iris-setosa'}


def model_fn(model_dir):
    """
    :param model_dir: (string) specifies location of saved model.
    
    This function is used by AWS Sagemaker to load the model for deployment. 
    
    It does this by simply loading the model that was saved at the end of the 
    __main__ training block above and returning it to be used by the predict_fn
    function below.
    """
    model = joblib.load(os.path.join(model_dir, "model.joblib"))
    return model


def input_fn(request_body, request_content_type):
    """
    :param request_body: the body of the request sent to the model. The type can vary.
    :param request_content_type: (string) specifies the format/variable type of the request.
    
    This function is used by AWS Sagemaker to format a request body that is sent to 
    the deployed model.
    
    In order to do this, we must transform the request body into a numpy array and
    return that array to be used by the predict_fn function below.
    
    Note: Often times, you will have need to handle other request_content_types. 
    However, in this simple case, we are only going to accept text/csv and raise an error 
    for all other formats.
    """
    if request_content_type == 'text/csv':
        samples = []
        for r in request_body.split('|'):
            samples.append(list(map(float, r.split(','))))
        return np.array(samples)
    else:
        raise ValueError("The model only supports text/csv input")


def predict_fn(input_data, model):
    """
    :param input_data: (numpy array) returned array from input_fn above. 
    :param model (sklearn model) returned model loaded from model_fn above.
    
    This function is used by AWS Sagemaker to make the prediction on the data
    formatted by the input_fn above using the trained model.
    """
    return model.predict(input_data)


def output_fn(prediction, content_type):
    """
    :param prediction: the returned value from predict_fn above.
    :param content_type: (string) the content type the endpoint expects to be returned.
    
    This function reformats the predictions returned from predict_fn to the final
    format that will be returned as the API call response.
    
    Note: Often times, you will have to handle other request_content_types. 
    """
    return '|'.join([INDEX_TO_LABEL[idx] for idx in prediction])
    


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--output-data-dir', type=str, default=os.environ.get('SM_OUTPUT_DATA_DIR'))
    parser.add_argument('--model-dir', type=str, default=os.environ.get('SM_MODEL_DIR'))
    parser.add_argument('--train', type=str, default=os.environ.get('SM_CHANNEL_TRAIN'))
    parser.add_argument('--test', type=str, default=os.environ.get('SM_CHANNEL_TEST'))
    args = parser.parse_args()
    print(args.output_data_dir)
    print(args.model_dir)
    print(args.train)
    print(args.test)
    
    # Load data from the location specified by args.train (In this case, an S3 bucket)
    data = pd.read_csv(os.path.join(args.train,'train.csv'), index_col=0, engine="python")

    # Separate input variables and labels
    train_X = data[[col for col in data.columns if col != 'label']]
    train_Y = data[['label']]

    # Convert labels from text to indices
    train_Y_enc = train_Y['label'].map(LABEL_TO_INDEX)
    print(train_X.head(5))
    print(train_Y_enc.head(5))
    
    # Train the logistic regression model using the fit method
    model = LogisticRegression().fit(train_X, train_Y_enc)
    
    # Save the model to the location specified by args.model_dir
    joblib.dump(model, os.path.join(args.model_dir, "model.joblib"))

Overwriting sg_iris_train.py


In [98]:
%%!

python sg_iris_train.py --output-data-dir ./output/ --model-dir ./model/ --train ./input/ --test ./input/

 './output/',
 './model/',
 './input/',
 './input/',
 '     sepal length  sepal width  petal length  petal width',
 '104           6.4          2.7           5.3          1.9',
 '12            7.7          2.8           6.7          2.0',
 '11            6.0          2.9           4.5          1.5',
 '144           6.9          3.1           5.1          2.3',
 '130           4.6          3.2           1.4          0.2',
 '104    0',
 '12     0',
 '11     1',
 '144    0',
 '130    2',
 'Name: label, dtype: int64']

### Train the model using Sagemaker Estimator 

In [99]:
from sagemaker.sklearn.estimator import SKLearn
from sagemaker import get_execution_role

In [100]:
role = get_execution_role()

In [101]:
# Create the SKLearn estimator by directing it to the sg_iris_train.py script
iris_estimator = SKLearn(entry_point='sg_iris_train.py',
                         train_instance_type='ml.m4.xlarge',
                         role=role)
iris_estimator.__dict__

{'role': 'arn:aws:iam::892313895307:role/service-role/AmazonSageMaker-ExecutionRole-20200609T132696',
 'train_instance_count': 1,
 'train_instance_type': 'ml.m4.xlarge',
 'train_volume_size': 30,
 'train_volume_kms_key': None,
 'train_max_run': 86400,
 'input_mode': 'File',
 'tags': None,
 'metric_definitions': None,
 'model_uri': None,
 'model_channel_name': 'model',
 'code_uri': None,
 'code_channel_name': 'code',
 'sagemaker_session': <sagemaker.session.Session at 0x7f60eaaf95c0>,
 'base_job_name': None,
 '_current_job_name': None,
 'output_path': None,
 'output_kms_key': None,
 'latest_training_job': None,
 'jobs': [],
 'deploy_instance_type': None,
 '_compiled_models': {},
 'subnets': None,
 'security_group_ids': None,
 'encrypt_inter_container_traffic': False,
 'train_use_spot_instances': False,
 'train_max_wait': None,
 'checkpoint_s3_uri': None,
 'checkpoint_local_path': None,
 'rules': None,
 'debugger_hook_config': None,
 'tensorboard_output_config': None,
 'debugger_rule_con

In [102]:
# Train the model by passing the path to the S3 bucket containing the training data
iris_estimator.fit({'train': 's3://arunprsh-sg-scikit-example/train'})

2020-06-11 01:03:13 Starting - Starting the training job...
2020-06-11 01:03:15 Starting - Launching requested ML instances......
2020-06-11 01:04:35 Starting - Preparing the instances for training......
2020-06-11 01:05:27 Downloading - Downloading input data...
2020-06-11 01:06:00 Training - Downloading the training image..[34m2020-06-11 01:06:21,266 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2020-06-11 01:06:21,269 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2020-06-11 01:06:21,281 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2020-06-11 01:06:21,850 sagemaker-containers INFO     Module sg_iris_train does not provide a setup.py. [0m
[34mGenerating setup.py[0m
[34m2020-06-11 01:06:21,850 sagemaker-containers INFO     Generating setup.cfg[0m
[34m2020-06-11 01:06:21,850 sagemaker-containers INFO     Generating MANIFEST.in[0m
[34m2020-06-11 01:06

### Deploy the model and create an endpoint

In [103]:
# Deploy model
iris_predictor = iris_estimator.deploy(instance_type='ml.m4.xlarge', 
                                           initial_instance_count=1)

# Print the endpoint to test in next step
print(iris_predictor.endpoint)

# Uncomment and run to terminate the endpoint after you are finished
# predictor.delete_endpoint()

---------------!sagemaker-scikit-learn-2020-06-11-01-03-12-645


### Test optional functions

In [104]:
from sklearn.externals import joblib
import numpy as np
import sagemaker
import tarfile
import boto3



def input_fn(request_body, request_content_type):
    if request_content_type == 'text/csv':
        samples = []
        for r in request_body.split('|'):
            samples.append(list(map(float, r.split(','))))
        return np.array(samples)
    else:
        raise ValueError("The model only supports text/csv input")
        


def predict_fn(input_data, model):
    return model.predict(input_data)



def output_fn(prediction, content_type):
    return '|'.join([INDEX_TO_LABEL[idx] for idx in prediction])




def download_s3_model_to_local(bucket, key, local_model_name):    
    s3 = boto3.resource('s3')  
    s3.Object(bucket, key).download_file(local_model_name)

bucket = 'sagemaker-us-east-1-892313895307'
key = 'sagemaker-scikit-learn-2020-06-11-01-03-12-645/output/model.tar.gz'

local_model_name = "iris-model.tar.gz"

download_s3_model_to_local(bucket, key, local_model_name)

# uncompress
tar = tarfile.open(local_model_name, 'r:gz')
tar.extractall()
tar.close()

# load model with joblib
trained_model = joblib.load('model.joblib')



INDEX_TO_LABEL = {0: 'Iris-virginica', 1: 'Iris-versicolor', 2: 'Iris-setosa'}
print(request_body)
input_fn_out = input_fn(request_body=request_body, request_content_type=content_type)
print(input_fn_out)
predict_fn_out = predict_fn(input_fn_out, trained_model)
print(predict_fn_out)
output_fn_out = output_fn(prediction=predict_fn_out, content_type='text/csv')
output_fn_out

5.5,2.6,4.4,1.2|5.6,2.9,3.6,1.3|5.6,2.8,4.9,2.0|6.8,2.8,4.8,1.4|5.4,3.9,1.7,0.4|5.7,2.5,5.0,2.0|4.8,3.4,1.6,0.2|5.1,3.8,1.9,0.4|5.7,2.9,4.2,1.3|5.4,3.9,1.3,0.4|5.8,2.7,5.1,1.9|5.6,2.5,3.9,1.1|6.3,2.5,4.9,1.5|6.0,2.9,4.5,1.5|6.7,3.3,5.7,2.5|7.0,3.2,4.7,1.4|6.3,3.3,4.7,1.6|6.5,3.0,5.5,1.8|5.9,3.0,4.2,1.5|6.8,3.0,5.5,2.1|5.1,3.5,1.4,0.2|6.4,3.1,5.5,1.8|5.7,4.4,1.5,0.4|6.1,3.0,4.9,1.8|4.6,3.6,1.0,0.2|5.8,2.6,4.0,1.2|6.1,3.0,4.6,1.4|7.3,2.9,6.3,1.8|4.6,3.1,1.5,0.2|6.9,3.1,5.4,2.1|5.5,4.2,1.4,0.2|7.2,3.2,6.0,1.8|6.1,2.9,4.7,1.4|4.6,3.2,1.4,0.2|6.5,2.8,4.6,1.5|4.4,3.0,1.3,0.2|5.2,3.4,1.4,0.2|6.4,2.7,5.3,1.9|4.8,3.1,1.6,0.2|5.8,2.7,5.1,1.9|5.5,2.3,4.0,1.3|6.9,3.1,4.9,1.5|5.0,3.0,1.6,0.2|4.4,3.2,1.3,0.2|6.7,3.0,5.2,2.3
[[5.5 2.6 4.4 1.2]
 [5.6 2.9 3.6 1.3]
 [5.6 2.8 4.9 2. ]
 [6.8 2.8 4.8 1.4]
 [5.4 3.9 1.7 0.4]
 [5.7 2.5 5.  2. ]
 [4.8 3.4 1.6 0.2]
 [5.1 3.8 1.9 0.4]
 [5.7 2.9 4.2 1.3]
 [5.4 3.9 1.3 0.4]
 [5.8 2.7 5.1 1.9]
 [5.6 2.5 3.9 1.1]
 [6.3 2.5 4.9 1.5]
 [6.  2.9 4.5 1.5]
 [6.7 3.3 5.7 



'Iris-versicolor|Iris-versicolor|Iris-virginica|Iris-versicolor|Iris-setosa|Iris-virginica|Iris-setosa|Iris-setosa|Iris-versicolor|Iris-setosa|Iris-virginica|Iris-versicolor|Iris-versicolor|Iris-versicolor|Iris-virginica|Iris-versicolor|Iris-versicolor|Iris-virginica|Iris-versicolor|Iris-virginica|Iris-setosa|Iris-virginica|Iris-setosa|Iris-virginica|Iris-setosa|Iris-versicolor|Iris-versicolor|Iris-virginica|Iris-setosa|Iris-virginica|Iris-setosa|Iris-virginica|Iris-versicolor|Iris-setosa|Iris-versicolor|Iris-setosa|Iris-setosa|Iris-virginica|Iris-setosa|Iris-virginica|Iris-versicolor|Iris-versicolor|Iris-setosa|Iris-setosa|Iris-virginica'

### Test the endpoint using test df

In [105]:
import pandas as pd
import boto3

In [106]:
# Load in the deploy_test data
test_data = pd.read_csv('./input/test.csv').values.tolist()

In [107]:
# Format test data features
request_body = ''
for row in test_data:
    row = [str(item) for item in row]
    row = row[1:-1]
    row_string = ','.join(row)
    request_body += row_string + '|'
request_body = request_body[:-1]

In [108]:
request_body

'5.7,3.0,4.2,1.2|5.6,2.7,4.2,1.3|5.6,2.8,4.9,2.0|6.2,2.2,4.5,1.5|6.7,3.0,5.2,2.3|6.0,2.2,4.0,1.0|5.9,3.2,4.8,1.8|5.0,3.4,1.6,0.4|6.4,3.2,5.3,2.3|5.7,3.8,1.7,0.3|6.3,2.8,5.1,1.5|6.0,3.0,4.8,1.8|4.9,2.5,4.5,1.7|7.0,3.2,4.7,1.4|6.5,2.8,4.6,1.5|6.2,2.9,4.3,1.3|4.8,3.4,1.9,0.2|6.8,3.0,5.5,2.1|5.5,2.4,3.7,1.0|5.1,3.7,1.5,0.4|6.9,3.1,5.4,2.1|5.8,2.8,5.1,2.4|6.0,3.4,4.5,1.6|4.6,3.4,1.4,0.3|4.7,3.2,1.6,0.2|5.1,3.5,1.4,0.3|7.4,2.8,6.1,1.9|7.1,3.0,5.9,2.1|7.7,2.6,6.9,2.3|5.0,3.0,1.6,0.2|5.8,2.7,3.9,1.2|6.4,3.1,5.5,1.8|4.4,3.2,1.3,0.2|5.1,3.8,1.5,0.3|6.5,3.0,5.5,1.8|5.1,3.8,1.6,0.2|5.0,3.5,1.6,0.6|5.7,2.9,4.2,1.3|6.7,2.5,5.8,1.8|6.3,2.5,5.0,1.9|4.3,3.0,1.1,0.1|5.2,3.4,1.4,0.2|5.0,2.0,3.5,1.0|6.7,3.0,5.0,1.7|5.6,3.0,4.1,1.3'

In [120]:
# Create Sagemaker client using boto3
client = boto3.client('sagemaker-runtime')
# Specify endpoint and content_type
endpoint_name = 'sagemaker-scikit-learn-2020-06-11-01-03-12-645'
content_type = 'text/csv'

In [121]:
# Make call to endpoint
response = client.invoke_endpoint(
    EndpointName=endpoint_name,
    ContentType=content_type,
    Body=request_body
    )

In [122]:
response

{'ResponseMetadata': {'RequestId': '58a76e62-7172-4c99-8b65-eb2537e0ab73',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '58a76e62-7172-4c99-8b65-eb2537e0ab73',
   'x-amzn-invoked-production-variant': 'AllTraffic',
   'date': 'Thu, 11 Jun 2020 01:18:05 GMT',
   'content-type': 'text/html; charset=utf-8',
   'content-length': '645'},
  'RetryAttempts': 0},
 'ContentType': 'text/html; charset=utf-8',
 'InvokedProductionVariant': 'AllTraffic',
 'Body': <botocore.response.StreamingBody at 0x7f60ea2de278>}

In [123]:
import json 

# Print out expected and returned labels
print(f"Expected: {'|'.join([row[-1] for row in test_data])}")
print("Returned:")
print()
result = response['Body'].read()
result

Expected: Iris-versicolor|Iris-versicolor|Iris-virginica|Iris-versicolor|Iris-virginica|Iris-versicolor|Iris-versicolor|Iris-setosa|Iris-virginica|Iris-setosa|Iris-virginica|Iris-virginica|Iris-virginica|Iris-versicolor|Iris-versicolor|Iris-versicolor|Iris-setosa|Iris-virginica|Iris-versicolor|Iris-setosa|Iris-virginica|Iris-virginica|Iris-versicolor|Iris-setosa|Iris-setosa|Iris-setosa|Iris-virginica|Iris-virginica|Iris-virginica|Iris-setosa|Iris-versicolor|Iris-virginica|Iris-setosa|Iris-setosa|Iris-virginica|Iris-setosa|Iris-setosa|Iris-versicolor|Iris-virginica|Iris-virginica|Iris-setosa|Iris-setosa|Iris-versicolor|Iris-versicolor|Iris-versicolor
Returned:



b'Iris-versicolor|Iris-versicolor|Iris-virginica|Iris-versicolor|Iris-virginica|Iris-versicolor|Iris-virginica|Iris-setosa|Iris-virginica|Iris-setosa|Iris-virginica|Iris-virginica|Iris-virginica|Iris-versicolor|Iris-versicolor|Iris-versicolor|Iris-setosa|Iris-virginica|Iris-versicolor|Iris-setosa|Iris-virginica|Iris-virginica|Iris-virginica|Iris-setosa|Iris-setosa|Iris-setosa|Iris-virginica|Iris-virginica|Iris-virginica|Iris-setosa|Iris-versicolor|Iris-virginica|Iris-setosa|Iris-setosa|Iris-virginica|Iris-setosa|Iris-setosa|Iris-versicolor|Iris-virginica|Iris-virginica|Iris-setosa|Iris-setosa|Iris-versicolor|Iris-versicolor|Iris-versicolor'