# Model knn

In [None]:
import os
import numpy as np
from sklearn.metrics import accuracy_score
import pandas as pd
from sagemaker.session import Session
from sagemaker.sklearn.estimator import SKLearn
from sagemaker.sklearn.model import SKLearnModel
from sagemaker.session import get_execution_role
from sagemaker.tuner import HyperparameterTuner, CategoricalParameter

# Function to validate different datasets

`validate_data` can serve as a base function for validating other machine learning models as well.

In [None]:
def validate_data(dataset_list, base_hyperparameter_dict, sweep_hyperparameters_dict, jobs=10, parallel_jobs=3):
    """Tune, deploy and score a scikit-learn model once per dataset.

    For every name in ``dataset_list`` the local folders ``train-<name>``,
    ``test-<name>`` and ``validation-<name>`` are uploaded to the session's
    default S3 bucket. A hyperparameter tuning job is then run on the train
    and test channels, the best training job's artifact is deployed to a
    temporary endpoint, and its predictions on ``validation.csv`` are scored
    with ``accuracy_score``.

    Args:
        dataset_list: Dataset names; each is used as the folder suffix, the
            tuning job base name and the model name.
        base_hyperparameter_dict: Hyperparameters passed to the base estimator.
        sweep_hyperparameters_dict: Hyperparameter ranges explored by the tuner.
        jobs: Maximum number of training jobs per tuning job.
        parallel_jobs: Maximum number of training jobs run in parallel.

    Returns:
        A list with one validation accuracy per dataset, in the order of
        ``dataset_list``.
    """

    print('define some sagemaker base parameters...', end='')

    # default sagemaker parameters
    role = get_execution_role()
    sagemaker_session = Session()
    default_bucket = sagemaker_session.default_bucket()

    # metric definition: the tuner extracts the objective from the training log
    metrics = {
        'Name': 'test-accuracy',
        'Regex': 'test-accuracy: ([0-9\\.]+)'
    }

    # create scikit-learn estimator
    estimator = SKLearn(
        role=role,
        instance_count=1,
        instance_type='ml.c4.xlarge',
        entry_point='train-knn.py',
        source_dir='source',
        framework_version='0.23-1',
        py_version='py3',
        hyperparameters=base_hyperparameter_dict
    )

    print('done')

    accuracy_list = []
    for dataset in dataset_list:

        print('evaluate model {}'.format(dataset))

        # upload data to s3, must be done before any fitting happens
        print('upload data to s3...', end='')
        input_data = []
        prefix_list = ['train', 'test', 'validation']
        for prefix in prefix_list:
            # the local folder name doubles as the S3 key prefix
            specific_path = '{}-{}'.format(prefix, dataset)
            tmp_path = sagemaker_session.upload_data(specific_path, bucket=default_bucket, key_prefix=specific_path)
            input_data.append(tmp_path)

        print('done')

        # configure hyperparameter tuning
        tuner = HyperparameterTuner(
            estimator=estimator,
            objective_metric_name='test-accuracy',
            hyperparameter_ranges=sweep_hyperparameters_dict,
            metric_definitions=[metrics],
            max_parallel_jobs=parallel_jobs,
            max_jobs=jobs,
            base_tuning_job_name=dataset
        )

        # start hyperparameter tuning job (validation data is held back for scoring)
        print('start hyperparameters tunig...', end='')
        tuner.fit({'train': input_data[0], 'test': input_data[1]})
        print('done')

        # best training job model artifact
        best_model_data = 's3://{}/{}/output/model.tar.gz'.format(default_bucket, tuner.best_training_job())

        # create scikit-learn model from training artifacts
        best_model = SKLearnModel(
            model_data=best_model_data,
            role=role,
            entry_point='train-knn.py',
            source_dir='source',
            framework_version='0.23-1',
            py_version='py3',
            name=dataset
        )

        # deploy endpoint
        print('deploy best model...', end='')
        best_predictor = best_model.deploy(
            initial_instance_count=1,
            instance_type='ml.t2.medium'
        )
        print('done')

        try:
            # read validation data; first column is the label, the rest are features
            validation_data = pd.read_csv('{}/{}'.format(input_data[2], 'validation.csv'))
            validation_y = validation_data.iloc[:, 0]
            validation_X = validation_data.iloc[:, 1:]

            # make predictions based on validation data
            best_pred_y = best_predictor.predict(validation_X)

            # get accuracy metrics
            best_accuracy = accuracy_score(validation_y, best_pred_y)
            accuracy_list.append(best_accuracy)
            print('validate model')
            print('accuracy: {} %'.format(round(best_accuracy * 100, 1)))
        finally:
            # remove resources even if scoring fails, so no endpoint is left running
            best_predictor.delete_endpoint()

    # return only after every dataset has been evaluated
    return accuracy_list
                                      

### Define parameters for initial estimator and hyperparameter tuning ranges

In [None]:
# fixed hyperparameters for the initial estimator
base = dict(
    param_n_neighbors=5,
    param_weight='uniform',
    param_p=2,
)

In [None]:
# hyperparameter values explored by the tuner, all treated as categorical choices
ranges = dict(
    param_n_neighbors=CategoricalParameter(list(np.arange(start=3, stop=13, step=2))),
    param_weight=CategoricalParameter(['uniform', 'distance']),
    param_p=CategoricalParameter(list(np.arange(2, 8))),
)

### Select data to be validated

In [None]:
# dataset names to evaluate, in the order their results are reported
data = [
    'tf-5000-250-1',
    'tf-5000-500-1',
    'tf-idf-5000-250-1',
    'tf-idf-5000-500-1',
]

### Run the `validate_data` function

In [None]:
# run at most 3 training jobs per dataset, all 3 in parallel
accuracy_list = validate_data(
    data,
    base,
    ranges,
    jobs=3,
    parallel_jobs=3,
)

### Write results to file system

In [None]:
def save_model_results(accuracy_list, data_list, path, file):
    """Write the accuracy of each dataset to a CSV file.

    The CSV has two columns, ``data`` and ``accuracy``, with one row per
    dataset. The target directory is created (including missing parents)
    if it does not exist yet.

    Args:
        accuracy_list: Accuracy values, one per entry of ``data_list``.
        data_list: Dataset names, in the same order as ``accuracy_list``.
        path: Directory the file is written to; an empty string means the
            current working directory.
        file: Name of the CSV file inside ``path``.

    Raises:
        ValueError: If ``accuracy_list`` and ``data_list`` differ in length
            (raised by the ``pandas.DataFrame`` constructor).
    """
    if path:
        # makedirs handles nested paths and an already existing directory
        os.makedirs(path, exist_ok=True)
    data_path = os.path.join(path, file)

    save_dict = {
        'data': data_list,
        'accuracy': accuracy_list
    }

    # pandas has no top-level ``from_dict``; build the frame via DataFrame
    accuracy_df = pd.DataFrame(save_dict)
    accuracy_df.to_csv(data_path, index=False)
    print('save done')

In [None]:
# persist the per-dataset accuracies as evaluate/knn.csv
save_model_results(
    accuracy_list=accuracy_list,
    data_list=data,
    path='evaluate',
    file='knn.csv',
)