In [None]:
import sagemaker
from sagemaker import get_execution_role

# Key prefix under which this notebook's files are uploaded to S3.
prefix = 'Scikit-iris'

# Session shared by the upload and estimator cells further down.
sagemaker_session = sagemaker.Session()

# SageMaker-compatible role attached to this Notebook Instance.
role = get_execution_role()

In [None]:
import os

import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split

# Fixed seed so the train/test split is identical after every
# "Restart Kernel -> Run All"; without it each run tunes on different data.
SEED = 42

# Local folders holding the CSV files; the same names are reused as the
# last component of the S3 key prefix when uploading.
WORK_DIRECTORY = 'data'
TEST_DIRECTORY = 'test'

# One row per sample: the label first, then the feature columns.
CSV_FORMAT = '%1.1f, %1.3f, %1.3f, %1.3f, %1.3f'

# Load Iris dataset, then join labels and features (label becomes column 0).
iris = datasets.load_iris()
joined_iris = np.insert(iris.data, 0, iris.target, axis=1)

train, test = train_test_split(joined_iris, test_size=0.25, random_state=SEED)
# Sanity check: the split must not lose or duplicate rows.
assert len(train) + len(test) == len(joined_iris)

# Create each directory and write its csv.
for directory, rows in ((WORK_DIRECTORY, train), (TEST_DIRECTORY, test)):
    os.makedirs(directory, exist_ok=True)
    np.savetxt(os.path.join(directory, 'iris.csv'), rows, delimiter=',', fmt=CSV_FORMAT)

# Upload both folders to S3 and keep the returned locations for tuner.fit().
train_input = sagemaker_session.upload_data(WORK_DIRECTORY, key_prefix="{}/{}".format(prefix, WORK_DIRECTORY))
test_input = sagemaker_session.upload_data(TEST_DIRECTORY, key_prefix="{}/{}".format(prefix, TEST_DIRECTORY))
print('train: ' + train_input)
print('test: ' + test_input)

In [None]:
from sagemaker.tuner import ContinuousParameter, IntegerParameter, CategoricalParameter
from sagemaker.tuner import HyperparameterTuner
from sagemaker.sklearn.estimator import SKLearn

# Entry-point script handed to the SKLearn estimator.
script_path = 'scikit_learn_your_mission.py'

sklearn = SKLearn(entry_point=script_path,
                  train_instance_type="ml.c4.xlarge",
                  role=role,
                  sagemaker_session=sagemaker_session)

# Search space explored by the tuner: one integer range and two categorical choices.
hyperparameter_ranges = dict(
    max_leaf_nodes=IntegerParameter(2, 7),
    criterion=CategoricalParameter(['gini', 'entropy']),
    splitter=CategoricalParameter(['random', 'best']),
)

# Objective metric and the regex whose capture group yields its value.
objective_metric_name = 'Score-accuracy'
metric_definitions = [dict(Name='Score-accuracy', Regex='score:([0-9\\.]+)')]

# Budget: total number of training jobs, and how many may run at once.
max_parallel_jobs = 3
max_jobs = 3

tuner = HyperparameterTuner(
    estimator=sklearn,
    objective_metric_name=objective_metric_name,
    hyperparameter_ranges=hyperparameter_ranges,
    metric_definitions=metric_definitions,
    max_jobs=max_jobs,
    max_parallel_jobs=max_parallel_jobs,
)

# Start the tuning job with the two uploaded channels.
tuner.fit(dict(train=train_input, test=test_input))

In [None]:
# Keep the name of the tuning job that was just launched and echo it.
job_name = tuner.latest_tuning_job.name
print('HPO jobname: %s' % job_name)

In [None]:
# Wait on the tuning job started above before analysing its results
# (presumably blocks until the job has finished -- confirm against the SDK docs).
tuner.wait()

In [None]:
import pandas as pd

# Don't truncate TrainingJobName when the frame is displayed.
# None means "no limit"; the old sentinel -1 was deprecated in pandas 1.0.
pd.set_option('display.max_colwidth', None)

# One row per training job of the tuning job launched above.
tuner_analysis = sagemaker.HyperparameterTuningJobAnalytics(tuner.latest_tuning_job.name)
full_df = tuner_analysis.dataframe()

# Bind `df` unconditionally so the display below (and the plotting cells that
# read `df`) never hit a NameError when no training job has been recorded yet.
df = full_df
if len(full_df) > 0:
    # Drop jobs whose objective is -inf, i.e. that reported no usable value.
    df = full_df[full_df['FinalObjectiveValue'] > -float('inf')]

if len(df) > 0:
    # Best job first.
    df = df.sort_values('FinalObjectiveValue', ascending=False)
    print("Number of training jobs with valid objective: %d" % len(df))
    print({"lowest": df['FinalObjectiveValue'].min(), "highest": df['FinalObjectiveValue'].max()})
else:
    print("No training jobs have reported valid results yet.")

df

In [None]:
import bokeh
import bokeh.io
from bokeh.models import HoverTool
from bokeh.plotting import figure, show

# Route bokeh output to the notebook.
bokeh.io.output_notebook()

class HoverHelper():
    """Builds the tool list (hover tooltip plus standard tools) for the plots.

    The tooltip shows the objective value, the training job name and one
    entry per key of ``tuning_analytics.tuning_ranges``.
    """

    def __init__(self, tuning_analytics):
        """Store the analytics object whose ``tuning_ranges`` drive the tooltips."""
        self.tuner = tuning_analytics

    def hovertool(self):
        """Return a ``HoverTool`` with the fixed fields followed by one field per tuned hyperparameter."""
        fixed_fields = [
            ("FinalObjectiveValue", "@FinalObjectiveValue"),
            ("TrainingJobName", "@TrainingJobName"),
        ]
        # "@{name}" refers to the data-source column called `name`.
        hyperparameter_fields = [
            (name, "@{%s}" % name) for name in self.tuner.tuning_ranges.keys()
        ]
        return HoverTool(tooltips=fixed_fields + hyperparameter_fields)

    def tools(self, standard_tools='pan,crosshair,wheel_zoom,zoom_in,zoom_out,undo,reset'):
        """Return ``[hover tool, standard_tools]``, suitable for ``figure(tools=...)``."""
        hover_tool = self.hovertool()
        return [hover_tool, standard_tools]

# Tool factory reused by every figure in the following cells.
hover = HoverHelper(tuner_analysis)

# Objective value of each training job against the time it started.
p = figure(
    plot_width=900,
    plot_height=400,
    tools=hover.tools(),
    x_axis_type='datetime',
)
p.circle(x='TrainingStartTime', y='FinalObjectiveValue', source=df)
show(p)

In [None]:
# Imported explicitly: the cell must not depend on bokeh.layouts having been
# loaded as a side effect of another bokeh import.
from bokeh.layouts import column


def is_num(x):
    """Return True when ``x`` can be converted to a float, False otherwise."""
    try:
        float(x)
    except (TypeError, ValueError):
        # Only conversion failures mean "not a number"; anything else should surface.
        return False
    return True


# One scatter plot per tuned hyperparameter: objective value vs. hyperparameter value.
ranges = tuner_analysis.tuning_ranges
figures = []
for hp_name, hp_range in ranges.items():
    categorical_args = {}
    vals = hp_range.get('Values')
    if vals:
        # This is marked as categorical.  Check if all options are actually numbers.
        if all(is_num(v) for v in vals):
            # Bokeh has issues plotting a "categorical" range that's actually numeric, so plot as numeric
            print("Hyperparameter %s is tuned as categorical, but all values are numeric" % hp_name)
        else:
            # Genuinely categorical: give bokeh the category list as the x range.
            categorical_args['x_range'] = vals

    # Now plot it; the y axis is labelled with the objective metric configured
    # for the tuner rather than a placeholder string.
    p = figure(plot_width=500, plot_height=500,
               title="Objective vs %s" % hp_name,
               tools=hover.tools(),
               x_axis_label=hp_name, y_axis_label=objective_metric_name,
               **categorical_args)
    p.circle(source=df, x=hp_name, y='FinalObjectiveValue')
    figures.append(p)

# Stack all figures vertically in a single output.
show(column(*figures))