# Machine learning models based on scikit-learn framework

In [None]:
# define libraries
import os
import numpy as np
from sklearn.metrics import accuracy_score
import pandas as pd
from sagemaker.session import Session
from sagemaker.sklearn.estimator import SKLearn
from sagemaker.sklearn.model import SKLearnModel
from sagemaker.session import get_execution_role
from sagemaker.tuner import HyperparameterTuner, CategoricalParameter, IntegerParameter, ContinuousParameter

In [None]:
def save_model_results(accuracy, data, path):
    """
    Save accuracy result from validation data on local instance.

    The result is written as a one-row CSV file named {data}.csv with the
    columns 'data' and 'accuracy'. An already existing file is overwritten.

    Args:
    - accuracy (float): Accuracy result.
    - data (str): Defines the validated dataset; also used as file name.
    - path (str): Directory where file is saved. Missing directories
      (including parents) are created.

    Returns:
    - None: Writes file directly on local filesystem.
    """

    # makedirs also creates missing parent directories and does not fail
    # when the directory already exists (no exists()/mkdir() race)
    os.makedirs(path, exist_ok=True)
    data_path = os.path.join(path, '{}.csv'.format(data))

    accuracy_df = pd.DataFrame([{'data': data, 'accuracy': accuracy}])
    accuracy_df.to_csv(data_path, index=False)
    print('save done')

In [None]:
def validate_data(model, data_prefix, dataset_list, base_dict, sweep_dict, n_jobs=10, parallel_jobs=3):
    """
    Perform hyperparameter search and use best model to predict accuracy on validation data.

    For every dataset a hyperparameter tuning job is run (the test data is used
    for the search; the objective is the 'test-accuracy' value parsed from the
    training logs). The model of the best training job is deployed to a
    temporary endpoint, the validation data is predicted in batches and the
    resulting accuracy is stored in a local directory called validation-{model}.
    The endpoint is always deleted again, even if prediction fails.

    Args:
    - model (str): Model identifier; selects the entry point train-{model}.py in directory 'source'.
    - data_prefix (str): Path in the default SageMaker s3 bucket where training, test and validation data is found.
    - dataset_list (list of str): Datasets to be tested as list.
    - base_dict (dict): Default hyperparameters passed to the estimator.
    - sweep_dict (dict): Hyperparameter dictionary for search.
    - n_jobs (int): Number of hyperparameter combinations.
    - parallel_jobs (int): Number of searches executed at the same time.

    Returns:
    - None: Accuracy result is stored directly in a file.
    """

    print('define some SageMaker base parameters...', end='')

    # default sagemaker parameters
    role = get_execution_role()
    sagemaker_session = Session()
    default_bucket = sagemaker_session.default_bucket()

    # metric definition: the tuner extracts the objective from the training logs
    metrics = {
        'Name': 'test-accuracy',
        'Regex': 'test-accuracy: ([0-9\\.]+)'
    }

    # create estimator
    set_entry_point = 'train-{}.py'.format(model)
    estimator = SKLearn(
        role=role,
        instance_count=1,
        instance_type='ml.c4.xlarge',
        entry_point=set_entry_point,
        source_dir='source',
        framework_version='0.23-1',
        py_version='py3',
        hyperparameters=base_dict
    )

    print('done')

    # upper bound for the number of prediction requests per dataset
    max_batches = 100

    for dataset in dataset_list:

        model_name = '{}-{}'.format(model, dataset)
        print('evaluate model {}...'.format(model_name))

        # define input data: one s3 prefix per data type, in the order train, test, validation
        input_data = [
            's3://{}/{}/{}-{}'.format(default_bucket, data_prefix, data_type, dataset)
            for data_type in ('train', 'test', 'validation')
        ]

        # configure hyperparameter tuning
        tuner = HyperparameterTuner(
            estimator=estimator,
            objective_metric_name='test-accuracy',
            hyperparameter_ranges=sweep_dict,
            metric_definitions=[metrics],
            max_parallel_jobs=parallel_jobs,
            max_jobs=n_jobs,
        )

        # start hyperparameter tuning job
        print('start hyperparameters tunig', end='')
        tuner.fit({'train': input_data[0], 'test': input_data[1]})

        # best training job model artifact
        # NOTE(review): assumes the training output is written to
        # s3://<default bucket>/<training job name>/output/ - confirm if an
        # explicit output_path is ever set on the estimator
        best_model_data = 's3://{}/{}/output/model.tar.gz'.format(default_bucket, tuner.best_training_job())

        # create model from training artifacts
        best_model = SKLearnModel(
            model_data=best_model_data,
            role=role,
            entry_point=set_entry_point,
            source_dir='source',
            framework_version='0.23-1',
            py_version='py3',
        )

        # deploy endpoint
        print('deploy best model', end='')
        best_predictor = best_model.deploy(
            initial_instance_count=1,
            instance_type='ml.t2.medium'
        )

        # everything that uses the endpoint runs inside try/finally so the
        # endpoint is removed even when reading or predicting fails
        try:
            # read validation data: first column is the label, the rest are features
            validation_data = pd.read_csv('{}/{}'.format(input_data[2], 'validation.csv'))
            validation_y = validation_data.iloc[:, 0]
            validation_X = validation_data.iloc[:, 1:]

            # we have a lot of validation data, so we'll split it into at most
            # 100 batches; never more batches than rows, otherwise empty
            # batches would be sent to the endpoint
            n_batches = max(1, min(max_batches, len(validation_X)))

            # split row positions and slice with iloc, so every batch stays a
            # DataFrame independent of how NumPy treats DataFrame inputs
            row_chunks = np.array_split(np.arange(len(validation_X)), n_batches)

            print('')
            print('perform prediction', end='')
            prediction_batches = []
            for iteration, rows in enumerate(row_chunks, start=1):
                # progress indicator: one dot per 10 batches
                if iteration % 10 == 0:
                    print('.', end='')
                prediction_batches.append(best_predictor.predict(validation_X.iloc[rows]))

            print('done')
        finally:
            # remove resources
            best_predictor.delete_endpoint()

        # merge batches together
        best_pred_y = np.concatenate(prediction_batches)

        # get accuracy metrics
        best_accuracy = accuracy_score(validation_y, best_pred_y)
        print('validate model...accuracy: {} %'.format(round(best_accuracy * 100, 1)))

        # save model results continuously into files in case something crashes, we have at least old results
        validation_prefix = 'validation-{}'.format(model) # add prefix where data will be stored
        save_model_results(best_accuracy, dataset, validation_prefix)
                                      

## Parameters for all models

In [None]:
# data to be validated
#data = [
#    'tf-5000-500-1', 'tf-idf-5000-500-1', 
#    'tf-5000-1000-1', 'tf-idf-5000-1000-1', 
#    'tf-5000-5000-1', 'tf-idf-5000-5000-1', 
#    'tf-5000-500-2', 'tf-idf-5000-500-2', 
#    'tf-5000-1000-2', 'tf-idf-5000-1000-2', 
#    'tf-5000-5000-2', 'tf-idf-5000-5000-2'
#]

In [None]:
# datasets evaluated by the validation runs below (one entry per dataset name)
data = [
    'tf-idf-5000-500-1',
]

In [None]:
# budget of every hyperparameter search
parallel_jobs = 4  # number of tuning jobs executed at the same time
jobs = 20  # total number of combinations tried in the parameter space

## k nearest neighbors model

In [None]:
# default hyperparameters handed to the k nearest neighbors estimator
base = {'param_n_neighbors': 5, 'param_weight': 'uniform', 'param_p': 2}

In [None]:
# search space of the k nearest neighbors hyperparameter tuning
ranges = {
    'param_n_neighbors': IntegerParameter(min_value=3, max_value=15),
    'param_weight': CategoricalParameter(values=['uniform', 'distance']),
    'param_p': IntegerParameter(min_value=1, max_value=8),
}

In [None]:
# tune, deploy and score the knn model for every dataset in `data`
validate_data(
    model='knn',
    data_prefix='data',
    dataset_list=data,
    base_dict=base,
    sweep_dict=ranges,
    n_jobs=jobs,
    parallel_jobs=parallel_jobs,
)

## Support vector model

In [None]:
# default hyperparameters handed to the support vector estimator
base = {'param_random_state': 1, 'param_kernel': 'poly', 'param_C': 1.0, 'param_degree': 3}

In [None]:
# search space of the support vector hyperparameter tuning
ranges = {
    'param_C': ContinuousParameter(min_value=0.001, max_value=3.0),
    'param_degree': IntegerParameter(min_value=2, max_value=3),
}

In [None]:
# tune, deploy and score the svc model for every dataset in `data`
validate_data(
    model='svc',
    data_prefix='data',
    dataset_list=data,
    base_dict=base,
    sweep_dict=ranges,
    n_jobs=jobs,
    parallel_jobs=parallel_jobs,
)

## Logistic regression model

In [None]:
# default hyperparameters handed to the logistic regression estimator
base = {'param_max_iter': 10000, 'param_C': 1.0}

In [None]:
# search space of the logistic regression hyperparameter tuning
ranges = {
    'param_C': ContinuousParameter(min_value=0.001, max_value=3.0),
}

In [None]:
# tune, deploy and score the log model for every dataset in `data`
validate_data(
    model='log',
    data_prefix='data',
    dataset_list=data,
    base_dict=base,
    sweep_dict=ranges,
    n_jobs=jobs,
    parallel_jobs=parallel_jobs,
)

## Gradient boosting model

In [None]:
# default hyperparameters handed to the gradient boosting estimator
base = {
    'param_learning_rate': 0.1, 'param_n_estimators': 100,
    'param_random_state': 1, 'param_max_depth': 3,
}

In [None]:
# search space of the gradient boosting hyperparameter tuning
ranges = {
    'param_learning_rate': ContinuousParameter(min_value=0.001, max_value=0.5),
    'param_n_estimators': IntegerParameter(min_value=100, max_value=1000),
    'param_max_depth': IntegerParameter(min_value=2, max_value=10),
}

In [None]:
# tune, deploy and score the gbc model for every dataset in `data`
validate_data(
    model='gbc',
    data_prefix='data',
    dataset_list=data,
    base_dict=base,
    sweep_dict=ranges,
    n_jobs=jobs,
    parallel_jobs=parallel_jobs,
)

## Multi-layer perceptron (feed-forward neural network) model

In [None]:
# default hyperparameters handed to the mlp estimator
base = {
    'param_hidden_layer_size': 3, 'param_start_hidden_layer': 100, 'param_end_hidden_layer': 10,
    'param_learning_rate': 0.001, 'param_random_state': 1, 'param_max_iter': 200,
    'param_activation': 'relu',
}

In [None]:
# search space of the mlp hyperparameter tuning
ranges = {
    'param_hidden_layer_size': IntegerParameter(min_value=2, max_value=10),
    'param_start_hidden_layer': IntegerParameter(min_value=10, max_value=1000),
    'param_end_hidden_layer': IntegerParameter(min_value=10, max_value=1000),
    'param_learning_rate': ContinuousParameter(min_value=0.0001, max_value=0.1),
    'param_max_iter': IntegerParameter(min_value=100, max_value=10000),
    'param_activation': CategoricalParameter(values=['identity', 'logistic', 'relu', 'tanh']),
}

In [None]:
# tune, deploy and score the mlp model for every dataset in `data`
validate_data(
    model='mlp',
    data_prefix='data',
    dataset_list=data,
    base_dict=base,
    sweep_dict=ranges,
    n_jobs=jobs,
    parallel_jobs=parallel_jobs,
)