# Machine learning model

In [17]:
# import libraries
import os
import numpy as np
from sklearn.metrics import accuracy_score
import pandas as pd
from sagemaker.session import Session
from sagemaker.sklearn.estimator import SKLearn
from sagemaker.sklearn.model import SKLearnModel
from sagemaker.session import get_execution_role
from sagemaker.tuner import HyperparameterTuner, CategoricalParameter

In [62]:
def save_model_results(accuracy, data, path):
    """
    Save accuracy result from validation data on local instance.

    The result is written as a one-row CSV named '<data>.csv' inside `path`.
    An existing file with the same name is overwritten.

    Args:
    - accuracy (float): Accuracy result.
    - data (str): Defines the validated dataset; also used as the file name.
    - path (str): Directory where the file is saved. It is created, including
      missing parent directories, if it does not exist yet.

    Returns:
    - None: Writes file directly on local filesystem.
    """

    # makedirs also creates missing parents and, with exist_ok, does not fail
    # when the directory is already there (os.mkdir breaks on nested paths)
    os.makedirs(path, exist_ok=True)
    data_path = os.path.join(path, '{}.csv'.format(data))

    # one row holding the dataset name and its accuracy
    accuracy_df = pd.DataFrame([{'data': data, 'accuracy': accuracy}])
    accuracy_df.to_csv(data_path, index=False)
    print('save done')

In [63]:
def create_estimator(model, role, base_dict):
    """
    Create the SKLearn estimator for the requested model type.

    Args:
    - model (str): 'knn' selects 'train-knn.py', 'svc' selects 'train-svc.py';
      every other value selects 'train-gbc.py'.
    - role: Role handed to the estimator.
    - base_dict (dict): Hyperparameters handed to the estimator.

    Returns:
    - SKLearn: The configured estimator.
    """

    # known model names and their training scripts; anything else falls back to gbc
    script_by_model = (
        ('knn', 'train-knn.py'),
        ('svc', 'train-svc.py'),
    )
    training_script = next(
        (script for name, script in script_by_model if model == name),
        'train-gbc.py',
    )

    return SKLearn(
        role=role,
        instance_count=1,
        instance_type='ml.c4.xlarge',
        entry_point=training_script,
        source_dir='source',
        framework_version='0.23-1',
        py_version='py3',
        hyperparameters=base_dict,
    )

In [64]:
def create_model(model, role, best_model_data):
    """
    Create a deployable SKLearnModel from a trained model artifact.

    Args:
    - model (str): 'knn' selects 'train-knn.py', 'svc' selects 'train-svc.py';
      every other value selects 'train-gbc.py'.
    - role: Role handed to the model.
    - best_model_data (str): Location of the model artifact (model.tar.gz).

    Returns:
    - SKLearnModel: The configured model, ready to be deployed.
    """

    if model == 'knn':
        set_entry_point = 'train-knn.py'
    elif model == 'svc':
        set_entry_point = 'train-svc.py'
    else:
        set_entry_point = 'train-gbc.py'

    # use the script that matches the model type; previously 'train-knn.py' was
    # hard-coded here, so the selection above had no effect
    # NOTE(review): assumes each training script also provides the inference
    # hooks the serving container needs - confirm against the files in 'source'
    sklearn_model = SKLearnModel(
        model_data=best_model_data,
        role=role,
        entry_point=set_entry_point,
        source_dir='source',
        framework_version='0.23-1',
        py_version='py3',
    )

    return sklearn_model

In [65]:
def validate_data(model, data_prefix, dataset_list, valid_prefix, base_dict, sweep_dict, n_jobs=10, parallel_jobs=3):
    """
    Perform hyperparameter search and use best model to predict accuracy on validation data. For hyperparameter search,
    the test data is used.

    For every dataset a tuning job is run, the best training job is deployed to an endpoint, the validation data is
    predicted in batches, the accuracy is saved to a file and the endpoint is deleted again.

    Args:
    - model (str): Model type, forwarded to create_estimator and create_model.
    - data_prefix (str): Path in the default SageMaker s3 bucket where training, test and validation data is found.
    - dataset_list (list of str): Datasets to be tested as list.
    - valid_prefix (str): Accuracy results are stored in path defined as valid_prefix.
    - base_dict (dict): Default hyperparameters for the estimator.
    - sweep_dict (dict): Hyperparameter dictionary for search.
    - n_jobs (int): Number of hyperparameter combinations.
    - parallel_jobs (int): Number of searches executed at the same time.

    Returns:
    - None: Accuracy result is stored directly in a file.
    """

    print('define some SageMaker base parameters...', end='')

    # default sagemaker parameters
    role = get_execution_role()
    sagemaker_session = Session()
    default_bucket = sagemaker_session.default_bucket()

    # metric definition: the tuner reads the objective from the training log using this regex
    metrics = {
        'Name': 'test-accuracy',
        'Regex': 'test-accuracy: ([0-9\\.]+)'
    }

    # create estimator
    estimator = create_estimator(model, role, base_dict)

    print('done')

    for dataset in dataset_list:

        model_name = '{}-{}'.format(valid_prefix, dataset)
        print('evaluate model {}...'.format(model_name))

        # define input data: one s3 prefix per data type
        train_input, test_input, validation_input = [
            's3://{}/{}/{}-{}'.format(default_bucket, data_prefix, data_type, dataset)
            for data_type in ('train', 'test', 'validation')
        ]

        # configure hyperparameter tuning
        tuner = HyperparameterTuner(
            estimator=estimator,
            objective_metric_name='test-accuracy',
            hyperparameter_ranges=sweep_dict,
            metric_definitions=[metrics],
            max_parallel_jobs=parallel_jobs,
            max_jobs=n_jobs,
        )

        # start hyperparameter tuning job
        print('start hyperparameters tunig', end='')
        tuner.fit({'train': train_input, 'test': test_input})

        # best training job model artifact
        best_model_data = 's3://{}/{}/output/model.tar.gz'.format(default_bucket, tuner.best_training_job())

        # create model from training artifacts
        best_model = create_model(model, role, best_model_data)

        # deploy endpoint
        print('deploy best model', end='')
        best_predictor = best_model.deploy(
            initial_instance_count=1,
            instance_type='ml.t2.medium'
        )

        # from here on an endpoint exists; make sure it is removed even if
        # reading, predicting or saving fails
        try:
            # read validation data: first column is the label, the rest are features
            validation_data = pd.read_csv('{}/{}'.format(validation_input, 'validation.csv'))
            validation_y = validation_data.iloc[:, 0]
            validation_X = validation_data.iloc[:, 1:]

            # we have a lot of validation data, so we'll split it into 100 batches
            # and evaluate each batch using the prediction endpoint
            print('perform prediction', end='')
            prediction_batches = []
            for iteration, batch in enumerate(np.array_split(validation_X, 100), start=1):
                if iteration % 10 == 0:
                    print('.')
                # array_split yields empty batches when there are fewer than 100 rows;
                # skip them instead of sending an empty request
                if len(batch) == 0:
                    continue
                prediction_batches.append(best_predictor.predict(batch))

            print('done')

            # merge batches together
            best_pred_y = np.concatenate(prediction_batches)

            # get accuracy metrics
            best_accuracy = accuracy_score(validation_y, best_pred_y)
            print('validate model...accuracy: {} %'.format(round(best_accuracy * 100, 1)))

            # save model results continuously into files in case something crashes, we have at least old results
            save_model_results(best_accuracy, dataset, valid_prefix)
        finally:
            # remove resources
            best_predictor.delete_endpoint()

## Parameters for all models

In [57]:
# datasets to be validated (passed to validate_data as dataset_list)
data = [
    'tf-5000-1000-1',
    'tf-idf-5000-1000-1',
]

## k-nearest neighbors model

In [42]:
# hyperparameters the knn estimator is created with (passed as base_dict)
base = dict(
    param_n_neighbors=5,
    param_weight='uniform',
    param_p=2,
)

In [43]:
# search space for the knn tuning job (passed as sweep_dict)
ranges = dict(
    param_n_neighbors=CategoricalParameter(list(np.arange(3, 13, 2))),
    param_weight=CategoricalParameter(['uniform', 'distance']),
    param_p=CategoricalParameter(list(np.arange(2, 8, 1))),
)

In [45]:
# run tuning and validation for the knn model with a single tuning job
validate_data(
    model='knn',
    data_prefix='data',
    dataset_list=data,
    valid_prefix='validation-knn',
    base_dict=base,
    sweep_dict=ranges,
    n_jobs=1,
    parallel_jobs=1,
)

define some SageMaker base parameters...done
evaluate model validation-knn-tf-5000-1000-1...
start hyperparameters tunig.................................................................!
done
deploy best model...-----------------!done
perform prediction...
batch 10/100
batch 20/100
batch 30/100
batch 40/100
batch 50/100
batch 60/100
batch 70/100
batch 80/100
batch 90/100
batch 100/100
done
validate model...
accuracy: 79.3 %
save done
evaluate model validation-knn-tf-idf-5000-1000-1...
start hyperparameters tunig............................................................................!
done
deploy best model...-------------------!done
perform prediction...
batch 10/100
batch 20/100
batch 30/100
batch 40/100
batch 50/100
batch 60/100
batch 70/100
batch 80/100
batch 90/100
batch 100/100
done
validate model...
accuracy: 61.5 %
save done


## Support vector model

In [66]:
# hyperparameters the svc estimator is created with (passed as base_dict)
base = dict(
    param_C=1,
    param_kernel='rbf',
)

In [69]:
# search space for the svc tuning job (passed as sweep_dict)
ranges = dict(
    param_C=CategoricalParameter(list(np.arange(1, 4, 1))),
    param_kernel=CategoricalParameter(['linear', 'poly', 'rbf', 'sigmoid']),
)

In [None]:
# run tuning and validation for the svc model with a single tuning job
validate_data(
    model='svc',
    data_prefix='data',
    dataset_list=data,
    valid_prefix='validation-svc',
    base_dict=base,
    sweep_dict=ranges,
    n_jobs=1,
    parallel_jobs=1,
)

define some SageMaker base parameters...done
evaluate model validation-svc-tf-5000-1000-1...
start hyperparameters tunig.......................................................!
done
deploy best model...----------------