# Machine learning models based on scikit-learn framework

## Libraries

In [None]:
import joblib
import os
import numpy as np
from sklearn.metrics import accuracy_score
import pandas as pd
from sagemaker.session import Session
from sagemaker.sklearn.estimator import SKLearn
from sagemaker.sklearn.model import SKLearnModel
from sagemaker.session import get_execution_role
from sagemaker.tuner import HyperparameterTuner, CategoricalParameter, IntegerParameter, ContinuousParameter

## Definition of training job functions

In [None]:
def save_model_results(validation_accuracy, test_accuracy, data, path):
    """
    Save the validation and test accuracy of one dataset as a single-row csv file.

    The file is written to {path}/{data}.csv with the columns 'data',
    'validation_accuracy' and 'test_accuracy'. An existing file is overwritten.

    Args:
    - validation_accuracy (float): Accuracy achieved on the validation data.
    - test_accuracy (float): Accuracy achieved on the test data.
    - data (str): Defines the validated dataset, also used as filename (without .csv).
    - path (str): Folder where the file is saved; created (including parents) if missing.

    Returns:
    - None: Writes file directly on local filesystem.
    """

    # makedirs also creates missing parent folders and does not fail when the
    # folder already exists (no check-then-create race as with exists() + mkdir())
    os.makedirs(path, exist_ok=True)
    data_path = os.path.join(path, '{}.csv'.format(data))

    accuracy_df = pd.DataFrame([{
        'data': data,
        'validation_accuracy': validation_accuracy,
        'test_accuracy': test_accuracy,
    }])
    accuracy_df.to_csv(data_path, index=False)
    print('accuracy save done')

In [None]:
def save_best_model_parameters(model_data_dict, data, path):
    """
    Saves the model hyperparameters as a single-row csv file {path}/{data}.csv.

    The first column 'data' holds the dataset name, followed by one column per
    entry of model_data_dict. An existing file is overwritten.

    Args:
    - model_data_dict (dict): Dictionary of model hyperparameters; not modified.
    - data (str): Filename (without .csv).
    - path (str): Folder where file will be stored; created (including parents) if missing.

    Returns:
    - None: A file will be saved.
    """

    # makedirs also creates missing parent folders and does not fail when the
    # folder already exists (no check-then-create race as with exists() + mkdir())
    os.makedirs(path, exist_ok=True)
    data_path = os.path.join(path, '{}.csv'.format(data))

    # 'data' comes first; a 'data' key in model_data_dict would override it
    save_dict = {'data': data}
    save_dict.update(model_data_dict)

    model_data_df = pd.DataFrame([save_dict])
    model_data_df.to_csv(data_path, index=False)
    print('best model parameter save done')

In [None]:
def send_predictions_to_model(status_type, predictor, rows, X):
    """
    Send data to an estimator endpoint in batches and collect the predictions.

    X is split into ceil(n / rows) nearly equal batches, so no batch holds more
    than `rows` rows and no empty batch is sent as long as X is not empty.
    A progress dot is printed after every 10th batch.

    Args:
    - status_type (str): Status ['test', 'validation'] to be identified in output.
    - predictor (SageMaker predictor endpoint): SageMaker endpoint.
    - rows (int): Maximum number of rows of data to be sent to endpoint per request.
    - X (numpy array): Complete numpy array where prediction should be made of.

    Return:
    - y_pred (numpy array): Predicted class labels, in the row order of X.
    """

    print('perform {} prediction'.format(status_type), end='')

    # ceil division: the former int(n / rows + 1) produced one batch too many whenever
    # rows divides n, which ends in an empty batch once the count exceeds n (e.g. rows=1)
    n_batches = max(1, int(-(-X.shape[0] // rows)))

    prediction_batches = []
    for iteration, batch in enumerate(np.array_split(X, n_batches), start=1):
        if iteration % 10 == 0:
            print('.', end='')
        prediction_batches.append(predictor.predict(batch))

    print('done')

    # merge batches together
    return np.concatenate(prediction_batches)

In [None]:
def download_and_extract_model_data(bucket, model_data, session, extract_model_data):
    """
    Read the model data from s3 storage and extracts some parameters.

    Downloads everything below {model_data}/output from the bucket into a temporary
    folder, unpacks model.tar.gz there, loads model.joblib from the archive and reads
    the requested entries of the estimator's get_params(). The temporary folder is
    removed afterwards, also when a step fails.

    Args:
    - bucket (str): SageMaker s3 bucket name.
    - model_data (str): Model identifier (training job) from SageMaker.
    - session (SageMaker session): SageMaker session.
    - extract_model_data (iterable of str): Names (keys) of the parameters to be extracted.

    Returns:
    - information (dict): Dictionary with extracted information as requested by the keys extract_model_data.

    Raises:
    - KeyError: If a requested key is not part of the estimator parameters.
    """

    import shutil
    import tarfile
    import tempfile

    # a fresh temporary folder avoids unpacking into (and deleting from) the working directory
    work_dir = tempfile.mkdtemp(prefix='model-data-')
    try:
        session.download_data(work_dir, bucket, key_prefix='{}/output'.format(model_data))

        with tarfile.open(os.path.join(work_dir, 'model.tar.gz'), 'r:gz') as archive:
            archive.extractall(path=work_dir)

        # NOTE: joblib.load unpickles the file, which can execute arbitrary code;
        # only use this with artifacts of your own training jobs
        estimator_params = joblib.load(os.path.join(work_dir, 'model.joblib')).get_params()
    finally:
        # clean up
        shutil.rmtree(work_dir, ignore_errors=True)

    information = {select_key: estimator_params[select_key] for select_key in extract_model_data}

    return information

In [None]:
def validate_data(model, data_prefix, dataset_list, base_dict, sweep_dict, n_jobs=10, parallel_jobs=3):
    """
    Perform a hyperparameter search and use the best model to predict accuracy on test and validation data.

    For every dataset a tuning job is run on the 'train' and 'test' channels with 'test-accuracy'
    as objective metric. The best model is deployed to a temporary endpoint, test and validation
    accuracy are evaluated and stored in a directory called validation-{model}; the tuned
    parameters of the best model are stored in a directory called tuned-model-{model}.

    Args:
    - model (str): Model identifier, one of 'gbc', 'knn', 'log', 'mlp', 'svc'.
    - data_prefix (str): Path on s3 where training, test and validation data is found.
    - dataset_list (list of str): Datasets to be tested as list.
    - base_dict (dict): Default hyperparameter for the model.
    - sweep_dict (dict): Hyperparameter dictionary for search.
    - n_jobs (int): Number of hyperparameter combinations.
    - parallel_jobs (int): Number of searches executed at the same time.

    Returns:
    - None: Accuracy result is stored directly in a file.

    Raises:
    - ValueError: If model is not a supported model identifier.
    """

    # tuned parameters to be tracked per model; checked first so that an unknown
    # model fails before any SageMaker job is started
    tracked_parameters = {
        'gbc': ['max_depth', 'random_state', 'n_estimators', 'learning_rate'],
        'knn': ['p', 'weights', 'n_neighbors'],
        'log': ['C', 'max_iter'],
        'mlp': ['activation', 'hidden_layer_sizes', 'max_iter', 'random_state', 'learning_rate_init'],
        'svc': ['random_state', 'degree', 'kernel', 'C'],
    }
    if model not in tracked_parameters:
        raise ValueError(
            'unknown model {!r}, expected one of {}'.format(model, sorted(tracked_parameters))
        )
    extract_parameters = tracked_parameters[model]

    print('define some SageMaker base parameters...', end='')

    # default sagemaker parameters
    role = get_execution_role()
    sagemaker_session = Session()
    default_bucket = sagemaker_session.default_bucket()

    # metric definition: the regex picks the metric value out of the training job log
    metrics = {
        'Name': 'test-accuracy',
        'Regex': 'test-accuracy: ([0-9\\.]+)'
    }

    # create estimator
    set_entry_point = 'train-{}.py'.format(model)
    estimator = SKLearn(
        role=role,
        instance_count=1,
        instance_type='ml.c5.2xlarge',
        entry_point=set_entry_point,
        source_dir='source',
        framework_version='0.23-1',
        py_version='py3',
        hyperparameters=base_dict
    )

    print('done')

    # prefixes (local folders) where results and best model data will be stored
    validation_prefix = 'validation-{}'.format(model)
    tuned_model_prefix = 'tuned-model-{}'.format(model)

    for dataset in dataset_list:

        model_name = '{}-{}'.format(model, dataset)
        print('evaluate model {}...'.format(model_name))

        # define input data: [train, test, validation] s3 locations of this dataset
        input_data = [
            's3://{}/{}/{}-{}'.format(default_bucket, data_prefix, data_type, dataset)
            for data_type in ['train', 'test', 'validation']
        ]

        # configure hyperparameter tuning
        tuner = HyperparameterTuner(
            estimator=estimator,
            objective_metric_name='test-accuracy',
            hyperparameter_ranges=sweep_dict,
            metric_definitions=[metrics],
            max_parallel_jobs=parallel_jobs,
            max_jobs=n_jobs,
        )

        # start hyperparameter tuning job
        print('start tuning', end='')
        tuner.fit({'train': input_data[0], 'test': input_data[1]})

        # best training job model artifact
        best_training_job = tuner.best_training_job()
        best_model_data = 's3://{}/{}/output/model.tar.gz'.format(default_bucket, best_training_job)

        # download model data and track required tuned parameters
        best_model_parameters = download_and_extract_model_data(
            default_bucket,
            best_training_job,
            sagemaker_session,
            extract_parameters
        )

        # create model from training artifacts
        best_model = SKLearnModel(
            model_data=best_model_data,
            role=role,
            entry_point=set_entry_point,
            source_dir='source',
            framework_version='0.23-1',
            py_version='py3',
        )

        # deploy endpoint
        print('deploy best model', end='')
        best_predictor = best_model.deploy(
            initial_instance_count=1,
            instance_type='ml.t2.large'
        )
        print('')

        try:
            # read validation data (first column holds the labels)
            validation_data = pd.read_csv('{}/{}'.format(input_data[2], 'validation.csv'))
            validation_y = validation_data.iloc[:, 0]
            validation_X = validation_data.iloc[:, 1:]

            # read test data (first column holds the labels)
            test_data = pd.read_csv('{}/{}'.format(input_data[1], 'test.csv'))
            test_y = test_data.iloc[:, 0]
            test_X = test_data.iloc[:, 1:]

            # there is a lot of data, so it is sent to the prediction endpoint in batches of 100 rows
            test_pred_y = send_predictions_to_model('test', best_predictor, 100, test_X.to_numpy())
            validation_pred_y = send_predictions_to_model('validation', best_predictor, 100, validation_X.to_numpy())
        finally:
            # remove resources, also when reading the data or the prediction failed,
            # so that no endpoint is left running
            best_predictor.delete_endpoint()

        # get accuracy metrics
        test_accuracy = accuracy_score(test_y, test_pred_y)
        print('test model...accuracy: {} %'.format(round(test_accuracy * 100, 1)))
        validation_accuracy = accuracy_score(validation_y, validation_pred_y)
        print('validate model...accuracy: {} %'.format(round(validation_accuracy * 100, 1)))

        # save model results continuously into files in case something crashes, we have at least old results
        save_model_results(validation_accuracy, test_accuracy, dataset, validation_prefix)
        save_best_model_parameters(best_model_parameters, dataset, tuned_model_prefix)
                                      

## Parameters for all models

In [None]:
# names of the datasets to be validated; each entry is handed to validate_data,
# which evaluates them in this order
data = [
    'tf-44898-250-1',
    'tf-idf-44898-250-1',
    'tf-44898-125-1',
    'tf-idf-44898-125-1',
    'tf-44898-250-2',
    'tf-idf-44898-250-2',
    'tf-44898-125-2',
    'tf-idf-44898-125-2',
]

In [None]:
# hyperparameter search job definition (passed to validate_data as n_jobs / parallel_jobs)
# NOTE(review): with parallel_jobs == jobs all searches start at once, so no search can
# build on the result of an earlier one - confirm this is intended
jobs = 8 # we search for 8 combinations in parameter space
parallel_jobs = 8 # in parallel, we execute all 8 jobs

## k nearest neighbors model

In [None]:
# default hyperparameters of the k nearest neighbors estimator (used where not tuned)
base = dict(
    param_n_neighbors=5,
    param_weight='uniform',
    param_p=2,
)

In [None]:
# search space of the k nearest neighbors hyperparameter tuning
ranges = dict(
    param_n_neighbors=IntegerParameter(3, 15),
    param_weight=CategoricalParameter(['uniform', 'distance']),
    param_p=IntegerParameter(1, 8),
)

In [None]:
# tune and validate the k nearest neighbors model on all datasets
validate_data(
    model='knn',
    data_prefix='data',
    dataset_list=data,
    base_dict=base,
    sweep_dict=ranges,
    n_jobs=jobs,
    parallel_jobs=parallel_jobs,
)

## Support vector model

In [None]:
# default hyperparameters of the support vector estimator (used where not tuned)
base = dict(
    param_random_state=1,
    param_kernel='poly',
    param_C=1.0,
    param_degree=3,
)

In [None]:
# search space of the support vector hyperparameter tuning
ranges = dict(
    param_C=ContinuousParameter(0.001, 3.0),
    param_degree=IntegerParameter(2, 3),
)

In [None]:
# tune and validate the support vector model on all datasets
validate_data(
    model='svc',
    data_prefix='data',
    dataset_list=data,
    base_dict=base,
    sweep_dict=ranges,
    n_jobs=jobs,
    parallel_jobs=parallel_jobs,
)

## Logistic regression model

In [None]:
# default hyperparameters of the logistic regression estimator (used where not tuned)
base = dict(
    param_max_iter=10000,
    param_C=1.0,
)

In [None]:
# search space of the logistic regression hyperparameter tuning
ranges = dict(
    param_C=ContinuousParameter(0.001, 3.0),
)

In [None]:
# tune and validate the logistic regression model on all datasets
validate_data(
    model='log',
    data_prefix='data',
    dataset_list=data,
    base_dict=base,
    sweep_dict=ranges,
    n_jobs=jobs,
    parallel_jobs=parallel_jobs,
)

## Gradient boosting model

In [None]:
# default hyperparameters of the gradient boosting estimator (used where not tuned)
base = dict(
    param_learning_rate=0.1,
    param_n_estimators=100,
    param_random_state=1,
    param_max_depth=3,
)

In [None]:
# search space of the gradient boosting hyperparameter tuning
ranges = dict(
    param_learning_rate=ContinuousParameter(0.001, 0.5),
    param_n_estimators=IntegerParameter(100, 1000),
    param_max_depth=IntegerParameter(2, 10),
)

In [None]:
# tune and validate the gradient boosting model on all datasets
validate_data(
    model='gbc',
    data_prefix='data',
    dataset_list=data,
    base_dict=base,
    sweep_dict=ranges,
    n_jobs=jobs,
    parallel_jobs=parallel_jobs,
)

## Multi-layer perceptron (neural network) model

In [None]:
# initial estimator parameters
# 'param_start_hidden_layer' and 'param_end_hidden_layer' were listed twice; in a dict
# literal the last value wins, so only the effective values (125 and 2) are kept
base = {
    'param_hidden_layer_size': 2,
    'param_start_hidden_layer': 125,
    'param_end_hidden_layer': 2,
    'param_learning_rate': 0.001,
    'param_random_state': 1,
    'param_max_iter': 1000,
    'param_activation': 'relu',
}

In [None]:
# search space of the 'mlp' model hyperparameter tuning
ranges = dict(
    param_hidden_layer_size=IntegerParameter(1, 5),
    param_learning_rate=ContinuousParameter(0.001, 0.1),
)

In [None]:
# tune and validate the 'mlp' model on all datasets
validate_data(
    model='mlp',
    data_prefix='data',
    dataset_list=data,
    base_dict=base,
    sweep_dict=ranges,
    n_jobs=jobs,
    parallel_jobs=parallel_jobs,
)