### Create Train Script 

In [1]:
%%file train.py

from sklearn.neighbors import KNeighborsClassifier
from io import BytesIO
import pandas as pd
import numpy as np
import argparse
import pickle
import json
import os


def model_fn(model_dir):
    """Load the pickled model stored as ``<model_dir>/model``.

    Args:
        model_dir: Directory that contains the serialized model file named ``model``.

    Returns:
        The object obtained by unpickling that file.
    """
    print('[-------------- INSIDE MODEL FN --------------]')
    print(f'MODEL DIR: {model_dir}')
    # NOTE: unpickling can execute arbitrary code; only load trusted artifacts.
    # The context manager guarantees the handle is closed, even if unpickling fails.
    with open(os.path.join(model_dir, 'model'), 'rb') as model_file:
        model = pickle.load(model_file)
    return model


def input_fn(request_body, request_content_type):
    """Decode an ``application/x-npy`` request body into a NumPy array.

    Args:
        request_body: Raw request bytes in NumPy ``.npy`` format.
        request_content_type: Content type declared for the request.

    Returns:
        The array produced by ``np.load`` on the request bytes.

    Raises:
        ValueError: If the content type is not ``application/x-npy``.
    """
    print('[-------------- INSIDE INPUT FN --------------]')
    print(f'REQUEST BODY: {request_body}')
    print(f'REQUEST CONTENT TYPE: {request_content_type}')
    # Guard clause: refuse unsupported content types before touching the body.
    if request_content_type != 'application/x-npy':
        raise ValueError('Content type must be application/x-npy')
    return np.load(BytesIO(request_body))


def predict_fn(input_data, model):
    """Predict for a single sample.

    Args:
        input_data: NumPy array holding one sample's feature values.
        model: Object exposing a ``predict`` method.

    Returns:
        The result of ``model.predict`` on the one-row feature matrix.
    """
    print('[-------------- INSIDE PREDICT FN --------------]')
    print(f'INPUT DATA: {input_data}')
    print(f'MODEL: {model}')
    # reshape(1, -1) flattens whatever shape arrives into exactly one row,
    # so every request is treated as a single sample.
    single_row = input_data.reshape(1, -1)
    return model.predict(single_row)


def output_fn(prediction, content_type):
    """Serialize a prediction as ``application/x-npy`` bytes.

    Args:
        prediction: Array-like value to serialize with ``np.save``.
        content_type: Response content type requested by the caller.

    Returns:
        The ``.npy``-encoded bytes of ``prediction``.

    Raises:
        ValueError: If ``content_type`` is not ``application/x-npy``.
    """
    print('[-------------- INSIDE OUTPUT FN --------------]')
    print(f'PREDICTION: {prediction}')
    print(f'CONTENT TYPE: {content_type}')
    # Guard clause: only the NumPy binary format is supported for responses.
    if content_type != 'application/x-npy':
        raise ValueError('Accept header must be application/x-npy')
    serialized = BytesIO()
    np.save(serialized, prediction)
    return serialized.getvalue()


def train():
    """Fit a k-NN classifier on the train channel, save it, and score it on the test channel.

    Paths come from command-line flags whose defaults are read from the
    ``SM_OUTPUT_DATA_DIR``, ``SM_MODEL_DIR``, ``SM_CHANNEL_TRAIN`` and
    ``SM_CHANNEL_TEST`` environment variables. The fitted model is pickled to
    ``<model-dir>/model``, and the accuracy on ``test.csv`` is printed.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--output-data-dir', type=str, default=os.environ.get('SM_OUTPUT_DATA_DIR'))
    parser.add_argument('--model-dir', type=str, default=os.environ.get('SM_MODEL_DIR'))
    parser.add_argument('--train', type=str, default=os.environ.get('SM_CHANNEL_TRAIN'))
    parser.add_argument('--test', type=str, default=os.environ.get('SM_CHANNEL_TEST'))
    args = parser.parse_args()

    # Column names are supplied explicitly: label first, then the four features.
    feature_columns = ['mass', 'width', 'height', 'color_score']
    column_names = ['class'] + feature_columns

    # ------------------------- YOUR MODEL TRAINING LOGIC STARTS HERE -------------------------
    # Load train.csv from the directory given by args.train.
    print("------- [STARTING TRAINING] -------")
    train_df = pd.read_csv(os.path.join(args.train, 'train.csv'), names=column_names)
    X_train = train_df[feature_columns]
    y_train = train_df['class']
    knn = KNeighborsClassifier()
    knn.fit(X_train, y_train)
    # Save the trained model inside the container; the context manager
    # flushes and closes the file so the artifact is complete on disk.
    with open(os.path.join(args.model_dir, 'model'), 'wb') as model_file:
        pickle.dump(knn, model_file)
    print("------- [TRAINING COMPLETE!] -------")

    print("------- [STARTING EVALUATION] -------")
    test_df = pd.read_csv(os.path.join(args.test, 'test.csv'), names=column_names)
    # Score on the held-out test frame; scoring on train_df would report
    # training accuracy rather than generalization accuracy.
    X_test = test_df[feature_columns]
    y_test = test_df['class']
    acc = knn.score(X_test, y_test)
    print('Accuracy = {:.2f}%'.format(acc * 100))
    print("------- [EVALUATION DONE!] -------")

# Train only when this file is executed as a script, so importing it for
# model_fn / input_fn / predict_fn / output_fn does not start a training run.
if __name__ == '__main__':
    train()

Overwriting train.py


### Imports 

In [2]:
from sagemaker.sklearn.estimator import SKLearn
from sagemaker import get_execution_role
import pandas as pd

### Essentials 

In [3]:
# Execution role of the current environment; passed to the SKLearn estimator as `role`.
role = get_execution_role()

### Train Model (Local Mode)

In [4]:
# Estimator that runs train.py as its entry point with instance type 'local'
# and a single instance.
# NOTE(review): train_instance_type / train_instance_count look like SageMaker
# Python SDK v1 argument names (the deploy cell logs a v2 rename warning) —
# confirm the installed SDK version before upgrading.
estimator = SKLearn(entry_point='train.py',
                    train_instance_type='local',
                    train_instance_count=1,
                    framework_version='0.23-1',
                    role=role)

In [5]:
# Channels 'train' and 'test' presumably surface in train.py as SM_CHANNEL_TRAIN /
# SM_CHANNEL_TEST (the defaults of its --train / --test flags) — verify.
# NOTE(review): these URIs point at the CSV files themselves, while train.py joins
# 'train.csv' / 'test.csv' onto the channel path — presumably local mode places
# each file inside its channel directory; verify if the paths ever change.
estimator.fit({'train': 'file://.././DATA/train/train.csv', 'test': 'file://.././DATA/test/test.csv'})

Creating tmpxky4mx6n_algo-1-tch93_1 ... 
[1BAttaching to tmpxky4mx6n_algo-1-tch93_12mdone[0m
[36malgo-1-tch93_1  |[0m 2020-11-05 04:41:46,587 sagemaker-training-toolkit INFO     Imported framework sagemaker_sklearn_container.training
[36malgo-1-tch93_1  |[0m 2020-11-05 04:41:46,590 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
[36malgo-1-tch93_1  |[0m 2020-11-05 04:41:46,601 sagemaker_sklearn_container.training INFO     Invoking user training script.
[36malgo-1-tch93_1  |[0m 2020-11-05 04:41:47,777 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
[36malgo-1-tch93_1  |[0m 2020-11-05 04:41:47,794 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
[36malgo-1-tch93_1  |[0m 2020-11-05 04:41:47,807 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
[36malgo-1-tch93_1  |[0m 2020-11-05 04:41:47,817 sagemaker-training-toolkit INFO     Invoking us

### Deploy Model as Local Endpoint

In [6]:
# Deploy the fitted estimator with instance type 'local' under the endpoint name
# 'byos-sklearn'; the first positional argument (1) is presumably the instance count.
predictor = estimator.deploy(1, 'local', endpoint_name='byos-sklearn')

Parameter image will be renamed to image_uri in SageMaker Python SDK v2.


Attaching to tmpf0mztj4u_algo-1-gagtj_1
[36malgo-1-gagtj_1  |[0m 2020-11-05 04:41:52,757 INFO - sagemaker-containers - No GPUs detected (normal if no gpus installed)
[36malgo-1-gagtj_1  |[0m 2020-11-05 04:41:52,760 INFO - sagemaker-containers - No GPUs detected (normal if no gpus installed)
[36malgo-1-gagtj_1  |[0m 2020-11-05 04:41:52,761 INFO - sagemaker-containers - nginx config: 
[36malgo-1-gagtj_1  |[0m worker_processes auto;
[36malgo-1-gagtj_1  |[0m daemon off;
[36malgo-1-gagtj_1  |[0m pid /tmp/nginx.pid;
[36malgo-1-gagtj_1  |[0m error_log  /dev/stderr;
[36malgo-1-gagtj_1  |[0m 
[36malgo-1-gagtj_1  |[0m worker_rlimit_nofile 4096;
[36malgo-1-gagtj_1  |[0m 
[36malgo-1-gagtj_1  |[0m events {
[36malgo-1-gagtj_1  |[0m   worker_connections 2048;
[36malgo-1-gagtj_1  |[0m }
[36malgo-1-gagtj_1  |[0m 
[36malgo-1-gagtj_1  |[0m http {
[36malgo-1-gagtj_1  |[0m   include /etc/nginx/mime.types;
[36malgo-1-gagtj_1  |[0m   default_type application/octet-stream;
[

In [7]:
predictor.endpoint

'byos-sklearn'

### Evaluate Real Time Inference Locally 

In [8]:
# test.csv is read with header=None, so columns are labelled 0..4.
df = pd.read_csv('.././DATA/test/test.csv', header=None)
# Pick one random row to send to the endpoint. No random_state is set, so a
# re-run may select a different row than the one shown in the saved outputs.
test_df = df.sample(1)

In [9]:
# Remove the first column (the 'class' label in train.py's column order) so only
# the four feature columns are sent to the endpoint. The frame is rebuilt from
# `df` using the sampled row's index instead of dropping in place, so re-running
# this cell is idempotent and never strips a feature column.
test_df = df.loc[test_df.index].drop(columns=df.columns[0])
test_df

Unnamed: 0,1,2,3,4
12,0.342857,0.382353,0.553846,0.970588


In [10]:
test_df.values

array([[0.34285714, 0.38235294, 0.55384615, 0.97058824]])

In [11]:
# Send the single sampled row (a 1x4 float array, per the previous cell's output)
# to the local endpoint; the container log shows it arriving as application/x-npy.
prediction = predictor.predict(test_df.values)

[36malgo-1-gagtj_1  |[0m 2020-11-05 04:42:01,432 INFO - sagemaker-containers - No GPUs detected (normal if no gpus installed)
[36malgo-1-gagtj_1  |[0m [-------------- INSIDE MODEL FN --------------]
[36malgo-1-gagtj_1  |[0m MODEL DIR: /opt/ml/model
[36malgo-1-gagtj_1  |[0m [-------------- INSIDE INPUT FN --------------]
[36malgo-1-gagtj_1  |[0m REQUEST BODY: b"\x93NUMPY\x01\x00v\x00{'descr': '<f8', 'fortran_order': False, 'shape': (1, 4), }                                                          \n\x15_\xf1\x15_\xf1\xd5?|xxxxx\xd8?\x92\x1b\xb9\x91\x1b\xb9\xe1?\x10\x0f\x0f\x0f\x0f\x0f\xef?"
[36malgo-1-gagtj_1  |[0m REQUEST CONTENT TYPE: application/x-npy
[36malgo-1-gagtj_1  |[0m [-------------- INSIDE PREDICT FN --------------]
[36malgo-1-gagtj_1  |[0m INPUT DATA: [[0.34285714 0.38235294 0.55384615 0.97058824]]
[36malgo-1-gagtj_1  |[0m MODEL: KNeighborsClassifier()
[36malgo-1-gagtj_1  |[0m [-------------- INSIDE OUTPUT FN --------------]
[36malgo-1-gagtj_1  |[0m P

In [12]:
prediction

array([0])