# Hyperparameter Tuning using HyperDrive

TODO: Import Dependencies. In the cell below, import all the dependencies that you will need to complete the project.

In [1]:
from azureml.core import Workspace, Experiment
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.core import Environment
from azureml.core import ScriptRunConfig
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive import BayesianParameterSampling
from azureml.train.hyperdrive import uniform, choice
from azureml.core.environment import Environment
from azureml.core.conda_dependencies import CondaDependencies
from azureml.widgets import RunDetails

import azureml.core

In [2]:
print("SDK version:", azureml.core.VERSION)

SDK version: 1.22.0


In [4]:
# Greate and check workspace
ws = Workspace.from_config()
print('Workspace name: ' + ws.name, 
      'Azure region: ' + ws.location, 
      'Subscription id: ' + ws.subscription_id, 
      'Resource group: ' + ws.resource_group, sep = '\n')

Workspace name: final-prj
Azure region: westus2
Subscription id: 0c66ad45-500d-48af-80d3-0039ebf1975e
Resource group: rgp


In [5]:
# Greate and check workspace
cluster_name = "cmp"

try:
    compute_target = ComputeTarget(workspace=ws, name=cluster_name)
    print('Found existing compute target')
except ComputeTargetException:
    print('Creating a new compute target...')
    compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_D2_V2', max_nodes=4)
    # create the cluster
    compute_target = ComputeTarget.create(ws, cluster_name, compute_config)

Found existing compute target


## Dataset

TODO: Get data. In the cell below, write code to access the data you will be using in this project. Remember that the dataset needs to be external.

In [6]:
ws = Workspace.from_config()
experiment_name = 'camels-exp'
project_folder = './dmik'
experiment=Experiment(ws, experiment_name)
dataset = ws.datasets['camels'] 
df = dataset.to_pandas_dataframe()
df.describe()

Unnamed: 0,Target,EQTA,EQTL,LLRTA,LLRGL,OEXTA,INCEMP,ROA,ROE,TDTL,TDTA,TATA
count,7020.0,7020.0,7020.0,7020.0,7020.0,7020.0,7014.0,7020.0,7020.0,7020.0,7020.0,7020.0
mean,0.019516,0.107825,8.02595,0.01232,0.021934,0.02402,33.65851,0.00202,-0.234058,44.756417,0.835683,0.176412
std,0.138338,0.048877,573.594468,0.009366,0.16089,0.030903,1156.779875,0.015031,11.39799,3147.677966,0.080119,0.142363
min,0.0,-0.160659,-0.195857,0.0,0.0,-0.012004,-3639.467742,-0.29575,-887.458333,0.0,0.0,0.0
25%,0.0,0.087487,0.125263,0.007216,0.012119,0.018253,3.084559,0.000907,0.009412,1.126635,0.805493,0.066298
50%,0.0,0.101018,0.156656,0.01004,0.015915,0.022036,18.162698,0.004832,0.045176,1.273882,0.850135,0.148018
75%,0.0,0.121013,0.212105,0.014293,0.022124,0.0264,34.348039,0.008417,0.078245,1.527407,0.883593,0.258563
max,1.0,0.968116,47829.25,0.161906,12.25,2.164806,73600.0,0.173673,21.9631,260238.5,1.151905,0.868327


**Note from Azure Docs**: Every hyperparameter run restarts the training from scratch, including rebuilding the model and all the data loaders. You can minimize this cost by using an Azure Machine Learning pipeline or manual process to do as much data preparation as possible prior to your training runs.


In [15]:
# Consider further speed up by breaking functionlity into scaling and trining:

# from sklearn.preprocessing import StandardScaler
# from sklearn.model_selection import train_test_split
# df.dropna(inplace=True)
# X = df[['EQTA', 'EQTL', 'LLRTA', 'LLRGL', 'OEXTA', 'INCEMP', 'ROA', 'ROE', 'TDTL', 'TDTA', 'TATA']].copy()
# y = df["Target"].values.reshape(-1, 1)
# X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=123)
# scaler = StandardScaler()
# X_scaler = scaler.fit(X_train)
# X_train_scaled = X_scaler.transform(X_train)
# X_test_scaled = X_scaler.transform(X_test)

### Prepare environment and do 1 test run

In [7]:
# to install required packages
env = Environment('sklearn-env')
cd = CondaDependencies.create(pip_packages=['azureml-dataset-runtime[pandas,fuse]', 'azureml-defaults'], conda_packages = ['scikit-learn==0.22.1'])

env.python.conda_dependencies = cd

# Register environment to re-use later
env.register(workspace = ws)

{
    "databricks": {
        "eggLibraries": [],
        "jarLibraries": [],
        "mavenLibraries": [],
        "pypiLibraries": [],
        "rcranLibraries": []
    },
    "docker": {
        "arguments": [],
        "baseDockerfile": null,
        "baseImage": "mcr.microsoft.com/azureml/intelmpi2018.3-ubuntu16.04:20210104.v1",
        "baseImageRegistry": {
            "address": null,
            "password": null,
            "registryIdentity": null,
            "username": null
        },
        "enabled": false,
        "platform": {
            "architecture": "amd64",
            "os": "Linux"
        },
        "sharedVolumes": true,
        "shmSize": null
    },
    "environmentVariables": {
        "EXAMPLE_ENV_VAR": "EXAMPLE_VALUE"
    },
    "inferencingStackVersion": null,
    "name": "sklearn-env",
    "python": {
        "baseCondaEnvironment": null,
        "condaDependencies": {
            "channels": [
                "anaconda",
                "conda-forge"


In [11]:
args = ['--learning_rate', 0.1, '--n_estimators', 20, '--max_features', 5,  '--max_depth', 2]
src = ScriptRunConfig(source_directory=project_folder,
                      script='helpers.py',
                      arguments=args,
                      compute_target=compute_target,
                      environment=env)


In [12]:
run = experiment.submit(src)
RunDetails(run).show()

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

In [13]:
run.wait_for_completion(show_output=True)

RunId: camels-exp_1616191224_bc26a312
Web View: https://ml.azure.com/experiments/camels-exp/runs/camels-exp_1616191224_bc26a312?wsid=/subscriptions/0c66ad45-500d-48af-80d3-0039ebf1975e/resourcegroups/rgp/workspaces/final-prj

Execution Summary
RunId: camels-exp_1616191224_bc26a312
Web View: https://ml.azure.com/experiments/camels-exp/runs/camels-exp_1616191224_bc26a312?wsid=/subscriptions/0c66ad45-500d-48af-80d3-0039ebf1975e/resourcegroups/rgp/workspaces/final-prj



{'runId': 'camels-exp_1616191224_bc26a312',
 'target': 'cmp',
 'status': 'Completed',
 'startTimeUtc': '2021-03-19T22:00:36.707852Z',
 'endTimeUtc': '2021-03-19T22:01:12.378765Z',
 'properties': {'_azureml.ComputeTargetType': 'amlcompute',
  'ContentSnapshotId': 'e5ae11a2-e3cd-46b8-80bd-b930301b3c43',
  'ProcessInfoFile': 'azureml-logs/process_info.json',
  'ProcessStatusFile': 'azureml-logs/process_status.json'},
 'inputDatasets': [],
 'outputDatasets': [],
 'runDefinition': {'script': 'helpers.py',
  'command': '',
  'useAbsolutePath': False,
  'arguments': ['--learning_rate',
   '0.1',
   '--n_estimators',
   '20',
   '--max_features',
   '5',
   '--max_depth',
   '2'],
  'sourceDirectoryDataStore': None,
  'framework': 'Python',
  'communicator': 'None',
  'target': 'cmp',
  'dataReferences': {},
  'data': {},
  'outputData': {},
  'jobName': None,
  'maxRunDurationSeconds': 2592000,
  'nodeCount': 1,
  'priority': None,
  'credentialPassthrough': False,
  'identity': None,
  'envi

## Hyperdrive Configuration

TODO: Explain the model you are using and the reason for chosing the different hyperparameters, termination policy and config settings.

In [None]:
# import os
# project_folder = './sklearn-iris'
# os.makedirs(project_folder, exist_ok=True)

In [14]:
# Create the different params that you will be using during training, no policy Bayesian sampling.
param_sampling = BayesianParameterSampling( {
        "learning_rate": choice(0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95),
        "n_estimators" : choice(20, 30, 40, 50),
        "max_features": choice(2, 3, 4, 5),
        "max_depth" : choice(2, 3, 4, 5)
        }
)

# Specify the primary metric - 'recall' is warranted to minimize classification Type II  
primary_metric_name="norm_macro_recall" # similar to norm_macro_recall in AutoML
primary_metric_goal=PrimaryMetricGoal.MAXIMIZE

#Create the estimator
# src = ScriptRunConfig(source_directory=project_folder,
#                       script='helpers.py',
#                       arguments=args,
#                       compute_target=compute_target,
#                       environment=env)

#Create the hyperdrive config
hd_config = HyperDriveConfig(run_config=src,
                             hyperparameter_sampling=param_sampling,
                             policy=None,
                             primary_metric_name=primary_metric_name,
                             primary_metric_goal=primary_metric_goal,
                             max_total_runs=100,
                             max_concurrent_runs=2)

In [15]:
# Start the HyperDrive run
hyperdrive_run = experiment.submit(hd_config)

## Run Details

OPTIONAL: Write about the different models trained and their performance. Why do you think some models did better than others?

TODO: In the cell below, use the `RunDetails` widget to show the different experiments.

In [16]:
from azureml.widgets import RunDetails
RunDetails(hyperdrive_run).show()

_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

In [18]:
hyperdrive_run.wait_for_completion(show_output=True)

RunId: HD_a9edafb6-48d2-4955-b066-48ac824e3d30
Web View: https://ml.azure.com/experiments/camels-exp/runs/HD_a9edafb6-48d2-4955-b066-48ac824e3d30?wsid=/subscriptions/0c66ad45-500d-48af-80d3-0039ebf1975e/resourcegroups/rgp/workspaces/final-prj

Streaming azureml-logs/hyperdrive.txt

"<START>[2021-03-19T22:02:18.108803][API][INFO]Experiment created<END>\n""<START>[2021-03-19T22:02:18.718442][GENERATOR][INFO]Trying to sample '2' jobs from the hyperparameter space<END>\n""<START>[2021-03-19T22:02:19.018447][GENERATOR][INFO]Successfully sampled '2' jobs, they will soon be submitted to the execution target.<END>\n"<START>[2021-03-19T22:02:19.1283175Z][SCHEDULER][INFO]The execution environment is being prepared. Please be patient as it can take a few minutes.<END><START>[2021-03-19T22:02:50.4462648Z][SCHEDULER][INFO]Scheduling job, id='HD_a9edafb6-48d2-4955-b066-48ac824e3d30_1'<END><START>[2021-03-19T22:02:50.4767231Z][SCHEDULER][INFO]The execution environment was successfully prepared.<END><

In [21]:
assert(hyperdrive_run.get_status() == "Completed")

## Best Model

TODO: In the cell below, get the best model from the hyperdrive experiments and display all the properties of the model.

In [39]:
print(hyperdrive_run.get_metrics())

{'HD_a9edafb6-48d2-4955-b066-48ac824e3d30_99': {'Learning rate:': 0.4, 'Number of estimators:': 20, 'Number of features:': 4, 'Max tree depth:': 2, 'Accuracy': 0.9843230403800475}, 'HD_a9edafb6-48d2-4955-b066-48ac824e3d30_98': {'Learning rate:': 0.4, 'Number of estimators:': 30, 'Number of features:': 2, 'Max tree depth:': 5, 'Accuracy': 0.9667458432304038}, 'HD_a9edafb6-48d2-4955-b066-48ac824e3d30_96': {'Learning rate:': 0.6, 'Number of estimators:': 30, 'Number of features:': 2, 'Max tree depth:': 3, 'Accuracy': 0.9795724465558194}, 'HD_a9edafb6-48d2-4955-b066-48ac824e3d30_95': {'Learning rate:': 0.2, 'Number of estimators:': 50, 'Number of features:': 3, 'Max tree depth:': 3, 'Accuracy': 0.9833729216152018}, 'HD_a9edafb6-48d2-4955-b066-48ac824e3d30_94': {'Learning rate:': 0.95, 'Number of estimators:': 30, 'Number of features:': 2, 'Max tree depth:': 2, 'Accuracy': 0.9539192399049882}, 'HD_a9edafb6-48d2-4955-b066-48ac824e3d30_92': {'Learning rate:': 0.05, 'Number of estimators:': 20

In [43]:
print(hyperdrive_run.get_file_names())

['azureml-logs/hyperdrive.txt']


### Register the best model

In [52]:
# Get your best run and save the model from that run.
best_run = hyperdrive_run.get_best_run_by_primary_metric()
best_run.get_metrics()

AttributeError: 'NoneType' object has no attribute 'get_metrics'

In [51]:
best_run.get_metrics()

AttributeError: 'NoneType' object has no attribute 'get_metrics'

In [None]:
# Get your best run and save the model from that run.
best_run = hyperdrive_run.get_best_run_by_primary_metric()
best_run_metrics = best_run.get_metrics()
parameter_values = best_run.get_details()['runDefinition']['arguments']
best_run

In [38]:
print(hyperdrive_run.get_file_names())

['azureml-logs/55_azureml-execution-tvmps_dbd0699a1e0687f9ab0d1e7d5997cd0a3a47b6ed43c991822726de86856909ed_d.txt', 'azureml-logs/65_job_prep-tvmps_dbd0699a1e0687f9ab0d1e7d5997cd0a3a47b6ed43c991822726de86856909ed_d.txt', 'azureml-logs/70_driver_log.txt', 'azureml-logs/75_job_post-tvmps_dbd0699a1e0687f9ab0d1e7d5997cd0a3a47b6ed43c991822726de86856909ed_d.txt', 'azureml-logs/process_info.json', 'azureml-logs/process_status.json', 'logs/azureml/101_azureml.log', 'logs/azureml/job_prep_azureml.log', 'logs/azureml/job_release_azureml.log']
