## Train a Scikit-Learn Model using SageMaker Script Mode
#### Bring Your Own Script (BYOS)

### Create Train Script 

In [1]:
%%file train.py
from sklearn.neighbors import KNeighborsClassifier
from os.path import join
from io import BytesIO
import pandas as pd
import numpy as np
import argparse
import logging
import pickle
import time
import json
import sys
import os

# Module-level logger: records at DEBUG and above are echoed to stdout.
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
logger.addHandler(logging.StreamHandler(sys.stdout))

# When SAGEMAKER_METRICS_DIRECTORY is set, additionally write every record to
# <that directory>/metrics.json using a dict-like line layout.
if 'SAGEMAKER_METRICS_DIRECTORY' in os.environ:
    log_file_handler = logging.FileHandler(join(os.environ['SAGEMAKER_METRICS_DIRECTORY'], "metrics.json"))
    # setFormatter() needs a logging.Formatter instance. Passing the bare string
    # makes Handler.format() call str.format(record), which raises on the
    # literal braces, so no record would ever reach the file.
    # NOTE(review): the layout uses single quotes, so the lines are not strict
    # JSON despite the file name - confirm what the consumer expects.
    log_file_handler.setFormatter(
        logging.Formatter("{'time':'%(asctime)s', 'name': '%(name)s', 'level': '%(levelname)s', 'message': '%(message)s'}")
    )
    logger.addHandler(log_file_handler)
    
    
def model_fn(model_dir):
    """Load the pickled model stored as ``<model_dir>/model``.

    Args:
        model_dir: Directory containing the file named ``model`` that
            ``train()`` writes with ``pickle.dump``.

    Returns:
        The unpickled object (the fitted estimator saved by ``train()``).

    Raises:
        OSError: If the ``model`` file cannot be opened.
    """
    print('[-------------- INSIDE MODEL FN --------------]')
    print(f'MODEL DIR: {model_dir}')
    model_path = os.path.join(model_dir, 'model')
    # SECURITY NOTE: pickle.load executes arbitrary code embedded in the file.
    # Only point this at artifacts produced by this script's own train().
    # The context manager closes the file handle, which the bare open() leaked.
    with open(model_path, 'rb') as model_file:
        model = pickle.load(model_file)
    return model


def input_fn(request_body, request_content_type):
    """Decode an NPY-encoded request payload into a NumPy array.

    Args:
        request_body: Raw bytes of the request, in ``np.save`` format.
        request_content_type: Content type of the request; only
            ``'application/x-npy'`` is accepted.

    Returns:
        The array produced by ``np.load`` on the payload.

    Raises:
        ValueError: If the content type is anything other than
            ``'application/x-npy'``.
    """
    print('[-------------- INSIDE INPUT FN --------------]')
    print(f'REQUEST BODY: {request_body}')
    print(f'REQUEST CONTENT TYPE: {request_content_type}')
    # Guard clause: reject unsupported content types before touching the payload.
    if request_content_type != 'application/x-npy':
        raise ValueError('Content type must be application/x-npy')
    payload = BytesIO(request_body)
    return np.load(payload)


def predict_fn(input_data, model):
    """Run the model on the deserialized request data.

    Args:
        input_data: Array-like of feature values. A 0-D or 1-D input is
            treated as a single sample; for 2-D and higher input the first
            axis is the sample axis and the remaining axes are flattened
            into features.
        model: Object exposing ``predict(X)`` (here, what ``model_fn``
            returns).

    Returns:
        Whatever ``model.predict`` returns for the 2-D feature matrix.
    """
    print('[-------------- INSIDE PREDICT FN --------------]')
    print(f'INPUT DATA: {input_data}')
    print(f'MODEL: {model}')
    samples = np.asarray(input_data)
    if samples.ndim < 2:
        # Single observation: present it as one row.
        X = samples.reshape(1, -1)
    else:
        # Keep one row per sample. The previous unconditional reshape(1, -1)
        # collapsed a multi-row batch into a single, over-long sample.
        X = samples.reshape(samples.shape[0], -1)
    prediction = model.predict(X)
    return prediction


def output_fn(prediction, content_type):
    """Encode a prediction as an NPY payload.

    Args:
        prediction: Object to serialize with ``np.save``.
        content_type: Requested response type; only
            ``'application/x-npy'`` is accepted.

    Returns:
        A ``(bytes, 'application/x-npy')`` tuple holding the serialized
        prediction and its content type.

    Raises:
        ValueError: If ``content_type`` is not ``'application/x-npy'``.
    """
    print('[-------------- INSIDE OUTPUT FN --------------]')
    print(f'PREDICTION: {prediction}')
    print(f'CONTENT TYPE: {content_type}')
    # Guard clause: bail out early for any response type we cannot produce.
    if content_type != 'application/x-npy':
        raise ValueError('Accept header must be application/x-npy')
    serialized = BytesIO()
    np.save(serialized, prediction)
    return serialized.getvalue(), 'application/x-npy'


def train():
    """Fit a k-nearest-neighbours classifier, save it, and report test accuracy.

    Command-line arguments (each directory argument defaults to the matching
    ``SM_*`` environment variable):
        --output-data-dir: defaults to ``SM_OUTPUT_DATA_DIR`` (parsed but not
            used by this function).
        --model-dir: directory where the pickled model is written as ``model``.
        --train: directory expected to contain ``train.csv``.
        --test: directory expected to contain ``test.csv``.
        --nneighbors: ``n_neighbors`` for ``KNeighborsClassifier`` (default 5).

    Both CSV files are read without a header row; the first column is the
    class label and the remaining four are the features.

    Side effects:
        Writes ``<model-dir>/model`` and prints/logs progress and accuracy.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--output-data-dir', type=str, default=os.environ.get('SM_OUTPUT_DATA_DIR'))
    parser.add_argument('--model-dir', type=str, default=os.environ.get('SM_MODEL_DIR'))
    parser.add_argument('--train', type=str, default=os.environ.get('SM_CHANNEL_TRAIN'))
    parser.add_argument('--test', type=str, default=os.environ.get('SM_CHANNEL_TEST'))
    # hyperparameters
    parser.add_argument('--nneighbors', type=int, default=5)
    args = parser.parse_args()

    # Column layout shared by train.csv and test.csv (neither has a header row).
    feature_columns = ['bmi', 'diastolic_bp_change', 'systolic_bp_change', 'respiratory_rate']
    column_names = ['class'] + feature_columns

    # ------------------------- YOUR MODEL TRAINING LOGIC STARTS HERE -------------------------
    # args.train is a directory path; train.csv is read from inside it.
    print("------- [STARTING TRAINING] -------")
    train_df = pd.read_csv(os.path.join(args.train, 'train.csv'), names=column_names)
    X_train = train_df[feature_columns]
    y_train = train_df['class']
    knn = KNeighborsClassifier(n_neighbors=args.nneighbors)
    knn.fit(X_train, y_train)
    # Save the trained model into the model directory; the context manager
    # guarantees the file is flushed and closed before the function moves on.
    with open(os.path.join(args.model_dir, 'model'), 'wb') as model_file:
        pickle.dump(knn, model_file)
    print("------- [TRAINING COMPLETE!] -------")

    print("------- [STARTING EVALUATION] -------")
    test_df = pd.read_csv(os.path.join(args.test, 'test.csv'), names=column_names)
    # Evaluate on the held-out test set. This previously sliced train_df,
    # so the reported "test" accuracy was actually training accuracy.
    X_test = test_df[feature_columns]
    y_test = test_df['class']
    acc = knn.score(X_test, y_test)
    print('Accuracy = {:.4f}%'.format(acc * 100))
    logger.info('Test Accuracy: {:.4f}%'.format(acc * 100))
    print("------- [EVALUATION DONE!] -------")

# Run training only when this file is executed as a script. Importing it as a
# module (e.g. to reach model_fn / input_fn / predict_fn / output_fn) does not
# trigger a training run.
if __name__ == '__main__':
    train()

Overwriting train.py


### Imports 

In [2]:
from sagemaker.sklearn.estimator import SKLearn
from sagemaker import get_execution_role
import pandas as pd
import sagemaker

### Essentials: Execution Role and SageMaker Session

In [3]:
# Execution role obtained from the SageMaker SDK; handed to the SKLearn estimator further down.
role = get_execution_role()
# SageMaker session object; used further down to upload the train/test data.
session = sagemaker.Session()

### Train using SageMaker

In [4]:
# Local root of the dataset, relative to this notebook; expected to hold `train/` and `test/` sub-directories.
WORK_DIRECTORY = '.././DATA'

# Upload each split under its own key prefix and keep the value returned by
# upload_data(); these are passed as the input channels to estimator.fit().
train_data_s3_pointer = session.upload_data(f'{WORK_DIRECTORY}/train', key_prefix='byos-sklearn/train')
test_data_s3_pointer = session.upload_data(f'{WORK_DIRECTORY}/test', key_prefix='byos-sklearn/test')

In [5]:
# Configure the scikit-learn estimator that runs train.py on a single ml.m5.large instance.
# No hyperparameters are supplied here, so train.py uses its own default of --nneighbors=5.
estimator = SKLearn(entry_point='train.py',
                    instance_type='ml.m5.large',
                    instance_count=1,
                    framework_version='0.23-1',
                    role=role)

In [6]:
# Launch the training job with two input channels, 'train' and 'test'.
# NOTE(review): train.py reads its data directories from SM_CHANNEL_TRAIN / SM_CHANNEL_TEST,
# which presumably map to these channel names - confirm against the SageMaker docs.
estimator.fit({'train': train_data_s3_pointer, 'test': test_data_s3_pointer})

2020-12-09 23:26:33 Starting - Starting the training job...
2020-12-09 23:26:57 Starting - Launching requested ML instancesProfilerReport-1607556393: InProgress
.........
2020-12-09 23:28:18 Starting - Preparing the instances for training...
2020-12-09 23:29:02 Downloading - Downloading input data...
2020-12-09 23:29:19 Training - Downloading the training image...
2020-12-09 23:29:59 Training - Training image download completed. Training in progress..[34m2020-12-09 23:29:59,835 sagemaker-training-toolkit INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2020-12-09 23:29:59,837 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2020-12-09 23:29:59,846 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2020-12-09 23:30:00,208 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2020-12-09 23:30:00,908 sagemaker-training-toolkit INFO     No GPUs detec

### Deploy Trained Model as SageMaker Endpoint

In [7]:
# Deploy the trained model behind a real-time endpoint on one ml.m5.large instance.
# NOTE(review): no cleanup step (e.g. predictor.delete_endpoint()) is visible in this
# notebook - confirm the endpoint is torn down elsewhere.
predictor = estimator.deploy(instance_type='ml.m5.large', 
                             initial_instance_count=1)

---------------!

### Test Real-Time Inference

In [8]:
# Read the local test split (no header row) from the same WORK_DIRECTORY used for the upload,
# rather than repeating the path literal.
df = pd.read_csv(f'{WORK_DIRECTORY}/test/test.csv', header=None)
# Draw one row to send to the endpoint; the fixed seed makes Restart & Run All pick the same row every time.
test_df = df.sample(1, random_state=42)

In [9]:
# Remove the first column (the class label) so only the feature columns remain.
# The row is rebuilt from the untouched `df` instead of being mutated in place, so
# re-running this cell cannot strip a further (feature) column.
test_df = df.loc[test_df.index].drop(columns=df.columns[0])
test_df

Unnamed: 0,1,2,3,4
2455,1.021647,0.685836,0.613173,-0.817379


In [10]:
# Show the sampled feature row as the raw NumPy array that is sent to the endpoint.
test_df.values

array([[ 1.02164732,  0.68583564,  0.61317348, -0.81737924]])

In [11]:
# Invoke the deployed endpoint with the feature array and keep the returned prediction.
prediction = predictor.predict(test_df.values)

In [12]:
# Display the value returned by the endpoint for the sampled row.
prediction

array([0])