# Model knn

In [None]:
import numpy as np
from sklearn.metrics import accuracy_score
import pandas as pd
from sagemaker.session import Session
from sagemaker.sklearn.estimator import SKLearn
from sagemaker.sklearn.model import SKLearnModel
from sagemaker.session import get_execution_role
from sagemaker.tuner import HyperparameterTuner, CategoricalParameter

In [None]:
# resolve the execution role, the SageMaker session and its default S3 bucket
role = get_execution_role()
sagemaker_session = Session()
default_bucket = sagemaker_session.default_bucket()
print(
    'Current SageMaker session: {}'.format(sagemaker_session),
    'Current SageMaker default bucket: {}'.format(default_bucket),
    sep='\n',
)

In [None]:
# upload the local train/test data (csv without header and index) to s3,
# reusing each local path as the key prefix inside the default bucket
data_path = ['train-tf-500-1', 'test-tf-500-1']
input_data = [
    sagemaker_session.upload_data(folder, bucket=default_bucket, key_prefix=folder)
    for folder in data_path
]

print(input_data)

In [None]:
# scikit-learn estimator running 'train-knn.py' from the 'source' directory
# with the baseline hyperparameters
estimator = SKLearn(
    entry_point='train-knn.py',
    source_dir='source',
    framework_version='0.23-1',
    py_version='py3',
    role=role,
    instance_type='ml.c4.xlarge',
    instance_count=1,
    hyperparameters=dict(
        param_n_neighbors=5,
        param_weight='uniform',
        param_p=2,
    ),
)

In [None]:
# train the baseline model; first uploaded path is the train channel,
# second one the test channel
estimator.fit(dict(train=input_data[0], test=input_data[1]))

# Hyperparameter tuning

This serves as a baseline case for the development of a tuning function.

In [None]:
# search space for the tuner; every hyperparameter is swept as a category
ranges = dict(
    param_n_neighbors=CategoricalParameter(list(np.arange(3, 13, 2))),
    param_weight=CategoricalParameter(['uniform', 'distance']),
    param_p=CategoricalParameter(list(np.arange(2, 8, 1))),
)

In [None]:
# objective metric: name plus the regex that captures its value
# from a 'test-accuracy: <number>' line
metrics = dict(
    Name='test-accuracy',
    Regex='test-accuracy: ([0-9\\.]+)',
)

In [None]:
# tuner around the baseline estimator: at most 10 training jobs, 3 at a time,
# ranked by the 'test-accuracy' metric
tuner = HyperparameterTuner(
    estimator=estimator,
    hyperparameter_ranges=ranges,
    objective_metric_name='test-accuracy',
    metric_definitions=[metrics],
    max_jobs=10,
    max_parallel_jobs=3,
)

In [None]:
# launch the tuning job on the same train/test channels as the baseline fit
tuner.fit(dict(train=input_data[0], test=input_data[1]))

In [None]:
# s3 uri of the model artifact written by the best training job of the tuner
best_model_data = 's3://{}/{}/output/model.tar.gz'.format(
    default_bucket,
    tuner.best_training_job(),
)
print(best_model_data)

In [None]:
# wrap the best training artifact in a deployable scikit-learn model,
# using the same entry point and framework version as for training
best_model = SKLearnModel(
    entry_point='train-knn.py',
    source_dir='source',
    framework_version='0.23-1',
    py_version='py3',
    role=role,
    model_data=best_model_data,
)

In [None]:
# host the best model on a single 'ml.t2.medium' endpoint instance
best_predictor = best_model.deploy(instance_type='ml.t2.medium', initial_instance_count=1)

In [None]:
# load the local validation csv; first column is the label, the rest are features
validation_data = pd.read_csv('{}/{}'.format('validation-tf-500-1', 'validation.csv'))
validation_y, validation_X = validation_data.iloc[:, 0], validation_data.iloc[:, 1:]

In [None]:
# query the deployed endpoint with the validation features
best_pred_y = best_predictor.predict(
    validation_X
)

In [None]:
# accuracy of the endpoint predictions on the validation labels, shown in percent
best_accuracy = accuracy_score(validation_y, best_pred_y)
print('accuracy: {} %'.format(round(100 * best_accuracy, 1)))

# Function to test different datasets

In [None]:
def test_data(dataset_list, base_hyperparameter_dict, sweep_hyperparameters_dict):
    """Tune, deploy and score a KNN model once per dataset.

    For every entry of ``dataset_list`` the local train/test data is uploaded
    to the default SageMaker bucket, a hyperparameter tuning job is run, the
    best model is deployed to a temporary endpoint, and its accuracy on the
    local validation csv is computed.

    NOTE(review): the original cell was unfinished (it did not parse), so the
    dataset convention below is an assumption derived from the baseline cells
    of this notebook ('train-tf-500-1', 'test-tf-500-1',
    'validation-tf-500-1/validation.csv'): each entry of ``dataset_list`` is
    taken to be the suffix shared by the local paths 'train-<dataset>',
    'test-<dataset>' and 'validation-<dataset>', e.g. 'tf-500-1'. Confirm
    before relying on it.

    Args:
        dataset_list: iterable of dataset suffixes (see note above).
        base_hyperparameter_dict: static hyperparameters passed to the
            SKLearn estimator.
        sweep_hyperparameters_dict: hyperparameter ranges passed to the
            HyperparameterTuner.

    Returns:
        List with one accuracy_score result per dataset, in the order of
        ``dataset_list`` (empty if ``dataset_list`` is empty).
    """
    print('define some sagemaker base parameters...', end='')

    # default sagemaker parameters
    role = get_execution_role()
    sagemaker_session = Session()
    default_bucket = sagemaker_session.default_bucket()

    # metric definition
    metrics = {
        'Name': 'test-accuracy',
        'Regex': 'test-accuracy: ([0-9\\.]+)'
    }

    # create scikit-learn estimator
    estimator = SKLearn(
        role=role,
        instance_count=1,
        instance_type='ml.c4.xlarge',
        entry_point='train-knn.py',
        source_dir='source',
        framework_version='0.23-1',
        py_version='py3',
        hyperparameters=base_hyperparameter_dict
    )

    # configure hyperparameter tuning
    tuner = HyperparameterTuner(
        estimator=estimator,
        objective_metric_name='test-accuracy',
        hyperparameter_ranges=sweep_hyperparameters_dict,
        metric_definitions=[metrics],
        max_parallel_jobs=3,
        max_jobs=10
    )

    print('done')

    # upload data to s3, must be done before any fitting happens
    print('upload data to s3...', end='')
    input_data = []
    for dataset in dataset_list:
        channels = {}
        for channel in ('train', 'test'):
            local_path = '{}-{}'.format(channel, dataset)
            channels[channel] = sagemaker_session.upload_data(
                local_path, bucket=default_bucket, key_prefix=local_path
            )
        input_data.append(channels)
    print('done')

    accuracy_list = []
    for dataset, channels in zip(dataset_list, input_data):

        # start hyperparameter tuning job for this dataset
        tuner.fit(channels)
        # block until the tuning job has finished; returns immediately if
        # fit() already waited for it
        tuner.wait()

        # best training job model artifact
        best_model_data = 's3://{}/{}/output/model.tar.gz'.format(default_bucket, tuner.best_training_job())

        # create scikit-learn model from training artifacts
        best_model = SKLearnModel(
            model_data=best_model_data,
            role=role,
            entry_point='train-knn.py',
            source_dir='source',
            framework_version='0.23-1',
            py_version='py3'
        )

        # read validation data before deploying, so that a missing file
        # fails before an endpoint is created
        validation_data = pd.read_csv('{}/{}'.format('validation-{}'.format(dataset), 'validation.csv'))
        validation_y = validation_data.iloc[:, 0]
        validation_X = validation_data.iloc[:, 1:]

        # deploy endpoint
        best_predictor = best_model.deploy(
            initial_instance_count=1,
            instance_type='ml.t2.medium'
        )

        try:
            # make predictions based on validation data
            best_pred_y = best_predictor.predict(validation_X)
        finally:
            # the predictor is not returned to the caller, so the endpoint
            # would otherwise stay up with nothing left to delete it
            best_predictor.delete_endpoint()

        # get metrics
        best_accuracy = accuracy_score(validation_y, best_pred_y)
        print('dataset: {}'.format(dataset))
        print('accuracy: {} %'.format(round(best_accuracy * 100, 1)))
        accuracy_list.append(best_accuracy)

    return accuracy_list
                                      