# Hyperparameter Tuning using HyperDrive

In [1]:
# import dependencies
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.core import Workspace, Experiment, Dataset
from azureml.widgets import RunDetails
from azureml.data.dataset_factory import TabularDatasetFactory
#from azureml.train.hyperdrive import uniform, choice
from azureml.core import ScriptRunConfig
from azureml.core import Environment
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import uniform, choice
import os
import shutil

# Load the Azure ML workspace from its saved configuration; `ws` is the handle
# later cells use to reach the compute target, default datastore and experiment.
ws = Workspace.from_config()

In [2]:
# Provision (or reuse) the CPU cluster that the training runs are submitted to.
# Sizing constraints: vm_size = "Standard_D2_V2", max_nodes no greater than 4.
# source: https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.core.compute.amlcompute(class)?view=azure-ml-py

cpu_cluster_name = "cpu-cluster"

try:
    # Looking up a target that does not exist raises ComputeTargetException
    cpu_cluster = ComputeTarget(workspace=ws, name=cpu_cluster_name)
except ComputeTargetException:
    # No cluster with this name yet: create one
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_D2_V2',
        max_nodes=4,
    )
    cpu_cluster = ComputeTarget.create(ws, cpu_cluster_name, compute_config)
else:
    print('Found existing cluster, use it.')

# Block until the cluster is ready, echoing the provisioning status
cpu_cluster.wait_for_completion(show_output=True)

Found existing cluster, use it.
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


## Dataset

This project uses the data from a DrivenData competition - [Pump it Up: Data Mining the Water Table](https://www.drivendata.org/competitions/7/pump-it-up-data-mining-the-water-table/).

The training data is divided into two files: one with the target variable (labels) and one with the other variables (values). The target variable describes the functioning status of each pump (*functional*, *functional needs repair* and *non functional*). Descriptive variables include waterpoint location, its founder, water quality and quantity, waterpoint type, etc.

Because one needs to be logged in to DrivenData in order to access the data, it cannot be downloaded via direct links and was stored as .csv files in the *data* folder. The original data is uploaded to the Azure datastore, merged into a single data set and registered as a dataset.

In [3]:
# Paths of the raw training files. They are relative to the working directory
# and, after the upload below, identical inside the datastore.
path_labels = "data/train_labels.csv"
path_values = "data/train_values.csv"

# Push the whole local `data` folder to the workspace's default datastore
datastore = ws.get_default_datastore()
datastore.upload(src_dir='data', target_path='data', overwrite=True)

# Tabular datasets that point at the uploaded copies
ds_labels = Dataset.Tabular.from_delimited_files(path=[(datastore, path_labels)])
ds_values = Dataset.Tabular.from_delimited_files(path=[(datastore, path_values)])

# Materialise both datasets as pandas frames
df_labels = ds_labels.to_pandas_dataframe()
df_values = ds_values.to_pandas_dataframe()

# Attach the target variable to the descriptive variables, matching on `id`
df_joined = df_values.join(
    df_labels.set_index('id'),
    on='id',
)

Uploading an estimated of 3 files
Uploading data/train_labels.csv
Uploaded data/train_labels.csv, 1 files out of an estimated total of 3
Uploading data/train_values.csv
Uploaded data/train_values.csv, 2 files out of an estimated total of 3
Uploading data/train_pump.csv
Uploaded data/train_pump.csv, 3 files out of an estimated total of 3
Uploaded 3 files


In [4]:
# remove some columns
# `errors='ignore'` keeps this cell re-runnable: `df_joined` is rebound to the
# reduced frame, so on a second execution the columns are already gone and a
# plain drop would raise KeyError.
exclude_vars = ['id', 'recorded_by']
df_joined = df_joined.drop(columns=exclude_vars, errors='ignore')

# store the merged data locally
path_df_joined = "data/train_pump.csv"
df_joined.to_csv(path_df_joined, index=False)

# upload the local `data` folder (now containing the merged file) to the
# same target path in the datastore, then reference the merged file there
datastore.upload(src_dir='data', target_path='data', overwrite=True)
ds_joined = Dataset.Tabular.from_delimited_files(path=[(datastore, path_df_joined)])

# register dataset (a new version is created on every run)
# source: https://docs.microsoft.com/en-us/azure/machine-learning/how-to-create-register-datasets
ds_joined = ds_joined.register(workspace=ws,
                               name='train_pump',
                               description='Training data for the Pump it Up project',
                               create_new_version=True)

Uploading an estimated of 3 files
Uploading data/train_labels.csv
Uploaded data/train_labels.csv, 1 files out of an estimated total of 3
Uploading data/train_pump.csv
Uploaded data/train_pump.csv, 2 files out of an estimated total of 3
Uploading data/train_values.csv
Uploaded data/train_values.csv, 3 files out of an estimated total of 3
Uploaded 3 files


In [5]:
# Create a file dataset that references the merged training CSV already uploaded
# to the datastore; it is mounted on the compute target for the training script.
ds_file = Dataset.File.from_files(path = [(datastore, (path_df_joined))])

In [6]:
# Experiment under which the HyperDrive runs are grouped in the workspace
experiment_name = 'pump_up'
experiment = Experiment(workspace=ws, name=experiment_name)

## Hyperdrive Configuration

Various boosting approaches showed high performance during the AutoML training. This motivated applying a gradient boosting approach and tuning multiple hyperparameters using HyperDrive:

* number of boosting stages
* learning rate
* maximum depth

Random sampling of the parameter space was chosen for its relative computational efficiency (in comparison to Bayesian sampling) and its ability to explore a parameter space with both continuous and discrete values. Random sampling is also compatible with an early stopping policy, which has the potential to lower computation time and costs. The performance metric is evaluated every time the script reports it, and the Bandit policy is configured to terminate any training run whose metric falls outside the slack factor of 0.15 relative to the best-performing run (see details in the [documentation](https://docs.microsoft.com/en-us/python/api/azureml-train-core/azureml.train.hyperdrive.banditpolicy?view=azure-ml-py)). The total number of runs was set to 12.

To achieve comparability with the AutoML models, weighted AUC was chosen as the target performance metric.


The `SKLearn` estimator is deprecated, so `ScriptRunConfig` from `azureml.core.script_run_config` was used instead. This also requires setting up an environment and mounting the dataset from the Datastore.

In [7]:
# source: https://docs.microsoft.com/en-us/azure/machine-learning/how-to-tune-hyperparameters

# Early termination: Bandit policy with a slack factor of 0.15, applied at every
# evaluation interval after the first 10 intervals have been skipped.
early_termination_policy = BanditPolicy(
    slack_factor=0.15,
    evaluation_interval=1,
    delay_evaluation=10,
)

# Random search over the three gradient boosting hyperparameters
search_space = {
    'n_estimators': choice(100, 200, 500),
    'learning_rate': uniform(0.1, 1.0),
    'max_depth': choice(1, 3, 5),
}
param_sampling = RandomParameterSampling(search_space)

# Environment the training script runs in
# https://docs.microsoft.com/en-us/azure/machine-learning/how-to-train-scikit-learn?view=azure-ml-py
sklearn_env = Environment.get(workspace=ws, name='AzureML-Tutorial')

# The training file is mounted on the compute target and handed to train.py
# through its --data_path argument.
mounted_input = ds_file.as_named_input('input').as_mount()

# Single-run configuration that HyperDrive repeats with sampled hyperparameters
run_config = ScriptRunConfig(
    source_directory='.',
    script='train.py',
    arguments=['--data_path', mounted_input],
    compute_target=cpu_cluster,
    environment=sklearn_env,
)

# Sweep configuration: maximise weighted AUC over 12 runs, at most 4 at a time
hyperdrive_run_config = HyperDriveConfig(
    run_config=run_config,
    hyperparameter_sampling=param_sampling,
    policy=early_termination_policy,
    primary_metric_name='AUC_weighted',
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    max_total_runs=12,
    max_concurrent_runs=4,
)

In [8]:
# Submit the HyperDrive configuration to the experiment; `run` is the handle
# used by the following cells to monitor the sweep and look up its best child run.
run = experiment.submit(config=hyperdrive_run_config)

## Run Details


In [9]:
# Display the notebook widget that tracks the submitted HyperDrive run
RunDetails(run).show()

_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

## Best Model

This section reports the best-performing model, then registers and downloads it.

In [18]:
# source: https://docs.microsoft.com/en-us/azure/machine-learning/how-to-tune-hyperparameters

# Make the cell safe under "Restart & Run All": block until the sweep has
# finished so the best run is chosen among all child runs. Returns immediately
# if the sweep is already done; `raise_on_error=False` still allows inspecting
# the best completed child run of a sweep that did not finish cleanly.
run.wait_for_completion(show_output=False, raise_on_error=False)

best_run = run.get_best_run_by_primary_metric()
if best_run is None:
    # Fail with a clear message instead of an AttributeError on None below
    raise RuntimeError("HyperDrive returned no best run: no child run reported 'AUC_weighted'.")

# Fetch the run details once; each call is a round trip to the service
best_run_details = best_run.get_details()
print('Best model runId: ' + str(best_run_details['runId']),
      'Best model hyperparameters: ' + str(best_run_details['runDefinition']['arguments']),
      'Best model  weighted AUC: ' + str(best_run.get_metrics()['AUC_weighted']), sep = '\n\n')

Best model runId: HD_6b990919-50b1-4929-bfac-3e98eef93c13_7

Best model hyperparameters: ['--data_path', 'DatasetConsumptionConfig:input', '--learning_rate', '0.3241192191641484', '--max_depth', '5', '--n_estimators', '500']

Best model  weighted AUC: 0.878499583427918


In [13]:
# Register the best model and save a local copy.
# The run's 'outputs/model.joblib' artifact is registered under the given name,
# then downloaded into the local 'outputs_hyperdrive' folder.
model = best_run.register_model(model_name='pump_it_up_hyperdrive_model', model_path='outputs/model.joblib')
model.download(target_dir='outputs_hyperdrive', exist_ok=True)

'outputs_hyperdrive/model.joblib'