# Hyperparameter Tuning using HyperDrive


In [1]:
import os
import joblib
import pandas as pd
import numpy as np

In [2]:
from azureml.core.run import Run
from azureml.widgets import RunDetails
from azureml.data.dataset_factory import TabularDatasetFactory
from azureml.core import Dataset
from azureml.core import Workspace, Experiment
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.core.webservice import Webservice
from azureml.core.environment import Environment
from azureml.core.conda_dependencies import CondaDependencies
from azureml.core.model import InferenceConfig

In [4]:
ws = Workspace.from_config()
experiment_name = 'Bank_ruptcy_hyperdrive'
experiment = Experiment(ws, experiment_name)

run = experiment.start_logging()

In [5]:
# Creating a compute cluster or using an existing one
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

cluster_name = "cpu-cluster" 

try:
    compute_target = ComputeTarget(workspace=ws, name =cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_D2_V2',
                                                           max_nodes=4)
    compute_target = ComputeTarget.create(ws, cluster_name, compute_config)

compute_target.wait_for_completion()

## Dataset

In [6]:
data = Dataset.get_by_name(ws, name = 'bankruptcy_dataset')
data = data.to_pandas_dataframe()
data.head(3)

Unnamed: 0,Bankrupt?,ROA(C) before interest and depreciation before interest,ROA(A) before interest and % after tax,ROA(B) before interest and depreciation after tax,operating gross margin,realized sales gross margin,operating profit rate,tax Pre-net interest rate,after-tax net interest rate,non-industry income and expenditure/revenue,...,net income to total assets,total assets to GNP price,No-credit interval,Gross profit to Sales,Net income to stockholder's Equity,liability to equity,Degree of financial leverage (DFL),Interest coverage ratio( Interest expense to EBIT ),one if net income was negative for the last two year zero otherwise,equity to liability
0,1,0.370594257300249,0.424389446140427,0.40574977247176,0.601457213277793,0.601457213277793,0.998969203197885,0.796887145860514,0.808809360876843,0.302646433889668,...,0.716845,0.009219,0.622879,0.601453,0.82789,0.290202,0.026601,0.56405,1,0.016469
1,1,0.464290937454297,0.53821412996075,0.516730017666899,0.610235085544617,0.610235085544617,0.998945978205482,0.797380191277827,0.809300725667939,0.303556430290771,...,0.795297,0.008323,0.623652,0.610237,0.839969,0.283846,0.264577,0.570175,1,0.020794
2,1,0.426071271876371,0.499018752725687,0.472295090743616,0.601450006486113,0.601363524985947,0.998857353483229,0.796403369254357,0.808387521469543,0.302035177342951,...,0.77467,0.040003,0.623841,0.601449,0.836774,0.290189,0.026555,0.563706,1,0.016474


## Hyperdrive Configuration

**Model**   
Here Logistic Regression algorithm is used, which is a supervisied binary classification algorithm that predicts the probability of a target varaible, returning either 1 or 0 (yes or no).  

**Parameter Sampler**  
* Here we use RandomParamaterSampler to determine the best values of the hyperparameters: **regularization strength, C** and **maximum number of iterations, max_iter**. 
* In this sampling algorithm, parameter values are randomly chosen from a set of discrete values or a distribution over a continuous range. Random Sampling is a great sampler to avoid bias, usually achieves great performance and it helps in discovering new hyperparameter values.
* Regularization strength is sampled over a uniform distribution with a minimum value of 0.5 and max value of 1, while the maximum number of iteration is sampled from a dicrete set of values which are 16, 32, 64 or 128.


**Early Stopping Policy**  
For this pipeline, Bandit Policy has been used, which is an early termination policy based on slack criteria, and the evaluation interval.
* Slack_factor is the ratio used to calculate the allowed distance from the best performing experiment run.  
* Evaluation_interval is the frequency for applying the policy.    
*The benefits of this stopping policy* is that any run that doesn't fall within the slack factor will be terminated so this helps us in making sure the experiment doesn't run for too long and burn up a lot of resources while trying to find the optimal paramater value. 

**Hyperdrive Configuration Settings**  
The HyperDriveConfig was configured using the chosen parameter sampler, early stopping policy, primary metric which is the *accuracy* and an estimator created for the training script *train.py*.


In [14]:
from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import uniform
from azureml.train.hyperdrive import choice
from azureml.core import ScriptRunConfig
from azureml.core import Environment

# Specify a Policy
early_termination_policy = BanditPolicy(evaluation_interval=2,slack_factor=0.1)

# Specify parameter sampler
ps =  RandomParameterSampling( {
        "--C": uniform(0.5, 1.5),
        "--max_iter": choice(16, 32, 64, 128)
    }
)


if "training" not in os.listdir():
    os.mkdir("./training")
env = Environment.get(workspace=ws, name= "AzureML-Tutorial")


source = ScriptRunConfig(source_directory='.',
                      script ='train.py',
                      compute_target = compute_target,
                      environment = env)


hyperdrive_config = HyperDriveConfig(run_config = source,
                             hyperparameter_sampling=ps,
                             policy = early_termination_policy,
                             primary_metric_name = "average_precision_score_weighted",
                             primary_metric_goal = PrimaryMetricGoal.MAXIMIZE,
                             max_total_runs = 8,
                             max_concurrent_runs = 4)





In [15]:
hyperdrive_run = experiment.submit(hyperdrive_config)

## Run Details

In [16]:
RunDetails(hyperdrive_run).show()

_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

## Best Model


In [17]:
# Get your best run and save the model from that run.
best_run = hyperdrive_run.get_best_run_by_primary_metric()
best_run_metrics = best_run.get_metrics()

# Details about the best run
print('Best Run Id: ', best_run.id)
print('\n Accuracy:', best_run_metrics['average_precision_score_weighted'])
print(best_run.get_details()['runDefinition']['arguments'])

os.makedirs('outputs', exist_ok=True)

# Registering the model
best_run.download_file("outputs/model.joblib","./outputs/model.joblib")
model=best_run.register_model(model_name='model',model_path='outputs/model.joblib',tags={'Training context':'Hyperdrive'},
                        properties={'Precision': best_run_metrics['average_precision_score_weighted']})

AttributeError: 'NoneType' object has no attribute 'get_metrics'

In [18]:
best_run()

TypeError: 'NoneType' object is not callable