# SageMaker with scikit-learn

I use a [k-nearest neighbors (kNN) classifier](https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html#sklearn.neighbors.KNeighborsClassifier) from scikit-learn to predict the species of iris flowers (based on the [iris dataset](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_iris.html?highlight=iris#sklearn.datasets.load_iris)). The trained kNN model is then deployed as an API endpoint on AWS SageMaker.

In [None]:
# import modules
import numpy as np
from sklearn.datasets import load_iris
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd
from sagemaker.session import Session
from sagemaker.sklearn.estimator import SKLearn
from sagemaker.sklearn.model import SKLearnModel
from sagemaker.session import get_execution_role
from sagemaker.tuner import HyperparameterTuner, CategoricalParameter

## Prepare and upload iris data to AWS S3

In [None]:
# load the iris dataset and pull out the pieces used below:
# feature matrix, target vector, and the feature/target names
iris = load_iris()
X, y = iris['data'], iris['target']
feature_names, target_names = iris['feature_names'], iris['target_names']

In [None]:
# rescale the features with a MinMaxScaler (fit and transform in one step)
# NOTE(review): the scaler is fitted on the full dataset, i.e. before the
# train/test split that follows, so test rows influence the scaling --
# confirm this is intended.
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

In [None]:
# split into training and test sets: 66 % of the rows are used for training,
# the rest for testing; random_state is fixed so the split is reproducible
split = train_test_split(
    X,
    y,
    train_size=0.66,
    random_state=1,
)
X_train, X_test, y_train, y_test = split

In [None]:
# SageMaker defaults: the session, its default S3 bucket and the execution role
sagemaker_session = Session()
default_bucket = sagemaker_session.default_bucket()
role = get_execution_role()

# show which session and bucket are in use
session_message = 'Current SageMaker session: {}'
bucket_message = 'Current SageMaker default bucket: {}'
print(session_message.format(sagemaker_session))
print(bucket_message.format(default_bucket))

In [None]:
# write the training set to a local CSV file (target in the first column,
# followed by the features; no header row, no index column) and upload that
# file to the default bucket under the given key prefix
data_name = 'iris-train.csv'
prefix = 'iris-scikit-learn'

train_frame = pd.concat(
    [pd.DataFrame(y_train), pd.DataFrame(X_train)],
    axis=1,
)
train_frame.to_csv(data_name, header=False, index=False)

input_data = sagemaker_session.upload_data(
    data_name,
    bucket=default_bucket,
    key_prefix=prefix,
)
print(input_data)

## knn model

In [None]:
# hyperparameters handed to the training script
knn_hyperparameters = {
    'param_n_neighbors': 11,
    'param_weight': 'uniform',
    'param_p': 2,
    'param_cv': 5,
}

# scikit-learn estimator that runs source/train.py on one ml.c4.xlarge instance
estimator = SKLearn(
    entry_point='train.py',
    source_dir='source',
    framework_version='0.23-1',
    py_version='py3',
    role=role,
    instance_count=1,
    instance_type='ml.c4.xlarge',
    hyperparameters=knn_hyperparameters,
)

In [None]:
# run the training job; the uploaded CSV is passed as the 'train' channel
train_channels = {'train': input_data}
estimator.fit(train_channels)

In [None]:
# build a deployable scikit-learn model from the artifact that the
# training job of `estimator` produced
model_settings = {
    'model_data': estimator.model_data,
    'role': role,
    'entry_point': 'train.py',
    'source_dir': 'source',
    'framework_version': '0.23-1',
    'py_version': 'py3',
}
model = SKLearnModel(**model_settings)

In [None]:
# deploy the model to an endpoint backed by a single ml.t2.medium instance
predictor = model.deploy(instance_type='ml.t2.medium', initial_instance_count=1)

In [None]:
# send the held-out test features to the endpoint and collect its predictions
y_pred = predictor.predict(data=X_test)

In [None]:
# accuracy of the endpoint predictions on the test set, printed as a
# percentage rounded to one decimal place
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
accuracy_percent = round(accuracy * 100, 1)
print('accuracy: {} %'.format(accuracy_percent))

## Hyperparameter tuning

In [None]:
# candidate values the tuner may pick for each hyperparameter
n_neighbors_options = list(np.arange(3, 13, 2))  # 3, 5, 7, 9, 11
weight_options = ['uniform', 'distance']
p_options = list(np.arange(2, 8, 1))  # 2, 3, 4, 5, 6, 7

ranges = {
    'param_n_neighbors': CategoricalParameter(n_neighbors_options),
    'param_weight': CategoricalParameter(weight_options),
    'param_p': CategoricalParameter(p_options),
}

In [None]:
# metric definition for the tuner: the metric is named 'accuracy' and its
# value is the number captured by the regex group after 'accuracy: '
# (presumably matched against the training job's log output -- confirm
# against train.py)
metrics = dict(
    Name='accuracy',
    Regex='accuracy: ([0-9\\.]+)',
)

In [None]:
# set up the tuning job: it varies the hyperparameters in `ranges` on top of
# `estimator`, uses the 'accuracy' metric as objective, and runs at most
# 10 training jobs in total with up to 3 of them in parallel
tuner = HyperparameterTuner(
    estimator=estimator,
    hyperparameter_ranges=ranges,
    objective_metric_name='accuracy',
    metric_definitions=[metrics],
    max_jobs=10,
    max_parallel_jobs=3,
)

In [None]:
# start hyperparameter tuning job on the uploaded training data
tuner.fit({'train': input_data})

# block until the tuning job has finished: the best training job is looked up
# right afterwards, which requires a completed tuning job (fit() does not
# wait in every SDK release; wait() returns immediately if it already did)
tuner.wait()

In [None]:
# best training job model artifact
# Ask SageMaker where the best training job stored its model artifact instead
# of assembling the S3 path by hand; a hand-built path is only correct when
# the estimator writes to the root of the default bucket.
best_job_name = tuner.best_training_job()
best_job_description = sagemaker_session.sagemaker_client.describe_training_job(
    TrainingJobName=best_job_name
)
best_model_data = best_job_description['ModelArtifacts']['S3ModelArtifacts']
print(best_model_data)

In [None]:
# build a deployable scikit-learn model from the artifact of the best
# training job found by the tuner
best_model_settings = {
    'model_data': best_model_data,
    'role': role,
    'entry_point': 'train.py',
    'source_dir': 'source',
    'framework_version': '0.23-1',
    'py_version': 'py3',
}
best_model = SKLearnModel(**best_model_settings)

In [None]:
# deploy the tuned model to an endpoint backed by a single ml.t2.medium instance
best_predictor = best_model.deploy(instance_type='ml.t2.medium', initial_instance_count=1)

In [None]:
# send the held-out test features to the tuned model's endpoint
best_y_pred = best_predictor.predict(data=X_test)

In [None]:
# accuracy of the tuned model's predictions on the test set, printed as a
# percentage rounded to one decimal place
best_accuracy = accuracy_score(y_true=y_test, y_pred=best_y_pred)
best_accuracy_percent = round(best_accuracy * 100, 1)
print('accuracy: {} %'.format(best_accuracy_percent))