# Azure ML Hyperparameter Search Pipeline for COVID-CXR
This notebook defines an Azure machine learning pipeline for a hyperparameter search and submits the pipeline as an experiment to be run on an Azure virtual machine.

In [87]:
# Import statements
import azureml.core
from azureml.core import Experiment
from azureml.core import Workspace, Datastore
from azureml.data.data_reference import DataReference
from azureml.pipeline.core import PipelineData
from azureml.pipeline.core import Pipeline
from azureml.pipeline.steps import PythonScriptStep, EstimatorStep, HyperDriveStep, HyperDriveStepRun
from azureml.train.dnn import TensorFlow
from azureml.train.estimator import Estimator
from azureml.train.hyperdrive import RandomParameterSampling, BanditPolicy, HyperDriveConfig, PrimaryMetricGoal
from azureml.train.hyperdrive import choice, uniform, loguniform
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.core.environment import Environment
from azureml.core.runconfig import RunConfiguration
from azureml.widgets import RunDetails
import shutil
import math

In [88]:
# Settings shared by the rest of the notebook

# Name of the compute cluster to use (or create)
CT_NAME = 'd2-cpu-cluster'

# Azure VM size requested when the cluster has to be created
VM_SIZE = 'STANDARD_D2_V2'

# Upper bound on the number of compute nodes in the cluster
MAX_NODES = 3

# Total number of training runs in the hyperparameter search
TOTAL_RUNS = 12

# Metric that the hyperparameter search optimizes
PRIMARY_METRIC = 'validation_auc'

# HyperDriveRuns that have already been run and should guide this experiment (none by default)
WARM_START_RUNS = []

### Register the workspace and configure its Python environment.

In [89]:
# Load the workspace described by the local config file and print a one-line summary of it
ws = Workspace.from_config("./ws_config.json")
workspace_summary = (ws.name, ws.resource_group, ws.location, ws.subscription_id)
print(*workspace_summary, sep=' -- ')

# Build the Python environment from the project's pip requirements and register it with the workspace
env = Environment.from_pip_requirements(
    name="covid-cxr_env",
    file_path="./../requirements.txt",
)
env.register(workspace=ws)

# Reuse the environment's conda dependencies for the run configuration, and show them
conda_deps = env.python.conda_dependencies
runconfig = RunConfiguration(conda_dependencies=conda_deps)
print(conda_deps.serialize_to_string())

# Copy the AML ignore file into the project root folder; keep the destination path so the
# copy can be moved back after the pipeline has been submitted
aml_ignore_path = shutil.copy('./.amlignore', './../.amlignore')

covid-cxr -- res_grp1 -- canadacentral -- 52f01a1a-c766-41a4-9062-a73cffbef727
# Conda environment specification. The dependencies defined in this file will
# be automatically provisioned for runs with userManagedDependencies=False.

# Details about the Conda environment file format:
# https://conda.io/docs/user-guide/tasks/manage-environments.html#create-env-file-manually

name: project_environment
dependencies:
  # The python interpreter version.
  # Currently Azure ML only supports 3.5.2 and later.
- python=3.6.2

- pip:
  - pyyaml==5.2
  - matplotlib==3.1.1
  - tqdm==4.40.2
  - opencv_python==4.1.0.25
  - tensorflow_gpu==2.0.1
  - dill==0.3.0
  - scikit_image==0.15.0
  - numpy==1.17.4
  - pandas==0.25.3
  - pydicom==1.4.2
  - imbalanced_learn==0.6.1
  - tensorboard==2.0.2
  - lime==0.1.1.37
channels:
- anaconda
- conda-forge



### Create references to persistent and intermediate data
Create DataReference objects that point to our raw data on the blob. Configure PipelineData objects that point to the preprocessed images and to the hyperparameter search metrics, both stored on the blob.

In [90]:
# Look up the workspace's blob datastore by name
blob_store = Datastore(ws, name='covidcxrdata0')

# Persistent inputs: folders on the blob that hold the raw datasets
covid_data_dr = DataReference(datastore=blob_store,
                              data_reference_name="raw_covid_data",
                              path_on_datastore="data/covid-chestxray-dataset/")
rsna_data_dr = DataReference(datastore=blob_store,
                             data_reference_name="raw_rsna_data",
                             path_on_datastore="data/rsna/")

# Intermediate pipeline storage: output of the preprocessing step
processed_pd = PipelineData("processed_data",
                            datastore=blob_store,
                            output_name="processed_data")

# Intermediate pipeline storage: metrics of the hyperparameter search, exposed as the
# pipeline output named "hparam_metric_data"
metrics_pd = PipelineData(name='hparam_metrics_data',
                          datastore=blob_store,
                          pipeline_output_name="hparam_metric_data")

### Compute Target
Specify and configure the compute target for this workspace. If a compute cluster by the name we specified does not exist, create a new compute cluster.

In [91]:
# Set up the compute target for this workspace: reuse the cluster named CT_NAME if it
# already exists, otherwise provision a new one.
try:
    compute_target = AmlCompute(ws, CT_NAME)
    print("Found existing compute target.")
except ComputeTargetException:
    print("Creating new compute target")
    # The keyword must be the lowercase `vm_size`; passing `VM_SIZE=` is rejected as an
    # unexpected keyword argument, so the cluster could never be created.
    provisioning_config = AmlCompute.provisioning_configuration(vm_size=VM_SIZE,
                                                                min_nodes=1,
                                                                max_nodes=MAX_NODES)
    compute_target = ComputeTarget.create(ws, CT_NAME, provisioning_config)  # Create the compute cluster

    # Wait (up to 20 minutes) for the cluster to be provisioned
    compute_target.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)

print("Azure Machine Learning Compute attached")
print("Compute targets: ", ws.compute_targets)

# Re-read the target from the workspace so both branches end with the same reference
compute_target = ws.compute_targets[CT_NAME]

Found existing compute target.
Azure Machine Learning Compute attached
Compute targets:  {'my-gpu-cluster': AmlCompute(workspace=Workspace.create(name='covid-cxr', subscription_id='52f01a1a-c766-41a4-9062-a73cffbef727', resource_group='res_grp1'), name=my-gpu-cluster, id=/subscriptions/52f01a1a-c766-41a4-9062-a73cffbef727/resourceGroups/res_grp1/providers/Microsoft.MachineLearningServices/workspaces/covid-cxr/computes/my-gpu-cluster, type=AmlCompute, provisioning_state=Failed, location=canadacentral, tags=None), 'nc6-gpu-cluster': AmlCompute(workspace=Workspace.create(name='covid-cxr', subscription_id='52f01a1a-c766-41a4-9062-a73cffbef727', resource_group='res_grp1'), name=nc6-gpu-cluster, id=/subscriptions/52f01a1a-c766-41a4-9062-a73cffbef727/resourceGroups/res_grp1/providers/Microsoft.MachineLearningServices/workspaces/covid-cxr/computes/nc6-gpu-cluster, type=AmlCompute, provisioning_state=Failed, location=canadacentral, tags=None), 'nc6s-gpu-cluster': AmlCompute(workspace=Workspace.

### Configure hyperparameter search experiment
We will launch multiple runs on our compute cluster to run our hyperparameter search. Below we define the ranges over which to search for hyperparameters. We will randomly sample over the defined ranges and pass the samples to each training run as arguments to the training script. Then we specify the primary metric to optimize. Finally, we specify a termination policy to prevent resource wastage during poorly performing runs.

In [92]:
# Search space: one sampling expression per hyperparameter passed to the training script
search_space = {
    "INIT_FILTERS": choice(8, 16, 32),
    "FILTER_EXP_BASE": choice(range(2, 4)),
    "CONV_BLOCKS": choice(range(3, 6)),
    "NODES_DENSE0": choice(32, 64, 128, 256, 512),
    "LR": loguniform(math.log(1e-5), math.log(1e-2)),
    "OPTIMIZER": choice('adam'),
    "DROPOUT": uniform(0.25, 0.5),
    "L2_LAMBDA": loguniform(math.log(1e-5), math.log(5e-3)),
    "PATIENCE": choice(range(5, 11)),
    "IMB_STRATEGY": choice('class_weight'),
}
param_sampling = RandomParameterSampling(search_space)

# Loss-like metrics are minimized; every other primary metric is maximized
if 'loss' in PRIMARY_METRIC:
    primary_metric_goal = PrimaryMetricGoal.MINIMIZE
else:
    primary_metric_goal = PrimaryMetricGoal.MAXIMIZE

# Early termination policy for poorly performing runs
early_termination_policy = BanditPolicy(slack_factor=0.1, evaluation_interval=2)

### Define pipeline and submit experiment.
Define the steps of an Azure machine learning pipeline. Create an Azure Experiment that will run our pipeline. Submit the experiment to the execution environment.

In [93]:
# Build the two-step pipeline (preprocessing, then hyperparameter search) and submit it.
# The whole cell runs inside try/finally: the copy of the AML ignore file that was placed in
# the project root (aml_ignore_path) must be moved back even when building, validating or
# submitting the pipeline raises, otherwise a stray .amlignore is left in the root folder.
try:
    # Define preprocessing step the ML pipeline
    step1 = PythonScriptStep(name="preprocess_step",
                             script_name="azure/preprocess_step/preprocess_step.py",
                             arguments=["--coviddatadir", covid_data_dr, "--rsnadatadir", rsna_data_dr, "--preprocesseddir",
                                        processed_pd],
                             inputs=[covid_data_dr, rsna_data_dr],
                             outputs=[processed_pd],
                             compute_target=compute_target,
                             source_directory="./../",
                             runconfig=runconfig,
                             allow_reuse=True)

    # Define hyperparameter search step in the ML pipeline
    est = Estimator(source_directory='./../',
                    script_params=None,
                    compute_target=compute_target,
                    entry_script='azure/hparam_train_step/hparam_train_step.py',
                    pip_packages=['tensorflow', 'tensorboard', 'pandas', 'dill', 'numpy', 'imblearn'],
                    pip_requirements_file='./requirements.txt')
    hd_config = HyperDriveConfig(estimator=est,
                                 hyperparameter_sampling=param_sampling,
                                 policy=early_termination_policy,
                                 primary_metric_name=PRIMARY_METRIC,
                                 primary_metric_goal=primary_metric_goal,
                                 max_total_runs=TOTAL_RUNS,
                                 max_concurrent_runs=MAX_NODES,  # one concurrent run per cluster node
                                 resume_from=WARM_START_RUNS)
    step2 = HyperDriveStep(name="hyperdrive_step",
                           hyperdrive_config=hd_config,
                           estimator_entry_script_arguments=["--preprocesseddir", processed_pd],
                           inputs=[processed_pd],
                           metrics_output=metrics_pd)

    # Construct the ML pipeline from the steps
    steps = [step1, step2]
    hparams_pipeline = Pipeline(workspace=ws, steps=steps)
    hparams_pipeline.validate()

    # Define a new experiment and submit a new pipeline run to the compute target.
    experiment = Experiment(workspace=ws, name='HyperDriveExperiment')
    hyperdrive_run = experiment.submit(hparams_pipeline, regenerate_outputs=False)
    print("HyperDrive pipeline is submitted for execution")
finally:
    # Move AML ignore file back to original folder, whether or not submission succeeded
    aml_ignore_path = shutil.move(aml_ignore_path, './.amlignore')

Step preprocess_step is ready to be created [49eb82de]
Step hyperdrive_step is ready to be created [febcee89]
Created step preprocess_step [49eb82de][337fb3c6-3465-48f5-a7da-44d5db9ea7c6], (This step will run and generate new outputs)
Created step hyperdrive_step [febcee89][0c4a9d86-f654-42b4-a753-cf345bfcb282], (This step will run and generate new outputs)
Using data reference raw_covid_data for StepId [342783b2][e504c00e-cf0c-408e-8bc5-ebe6819f8135], (Consumers of this data are eligible to reuse prior runs.)
Using data reference raw_rsna_data for StepId [c77d6b0d][89ae9f7a-275f-42f8-aec8-f22978bb298c], (Consumers of this data are eligible to reuse prior runs.)
Submitted PipelineRun bab37375-51c3-4328-ab75-4413271c81f1
Link to Azure Machine Learning Portal: https://ml.azure.com/experiments/HyperDriveExperiment/runs/bab37375-51c3-4328-ab75-4413271c81f1?wsid=/subscriptions/52f01a1a-c766-41a4-9062-a73cffbef727/resourcegroups/res_grp1/workspaces/covid-cxr
HyperDrive pipeline is submitted 

### Retrieve results
Get the metrics of the pipeline run and download them to the local project. Then retrieve the best run of the hyperparameter search and print its metrics.

In [95]:
# Block until the submitted pipeline run has finished
hyperdrive_run.wait_for_completion()

# Wrap the hyperparameter search step's run and display it with the Azure RunDetails widget
step_runs = hyperdrive_run.find_step_run("hyperdrive_step")
hd_step_run = HyperDriveStepRun(step_run=step_runs[0])
RunDetails(hd_step_run).show()

# Pick the run that scored best on the primary metric
best_run = hd_step_run.get_best_run_by_primary_metric()

# Download the search metrics (pipeline output "hparam_metric_data") into the local results folder
metrics_output = hyperdrive_run.get_pipeline_output("hparam_metric_data")
num_file_downloaded = metrics_output.download('./../results/logs/hparam_search/', show_progress=True)

# Print every metric logged by the best run, one "name: value" line each
best_run_metrics = best_run.get_metrics()
print("Best model's metrics:")
for metric_name, metric_value in best_run_metrics.items():
    print(str(metric_name) + ': ' + str(metric_value))

PipelineRunId: bab37375-51c3-4328-ab75-4413271c81f1
Link to Azure Machine Learning Portal: https://ml.azure.com/experiments/HyperDriveExperiment/runs/bab37375-51c3-4328-ab75-4413271c81f1?wsid=/subscriptions/52f01a1a-c766-41a4-9062-a73cffbef727/resourcegroups/res_grp1/workspaces/covid-cxr
PipelineRun Status: NotStarted
PipelineRun Status: Running


StepRunId: 0fc2607b-2fbe-44ec-b191-3eb7279fd44d
Link to Azure Machine Learning Portal: https://ml.azure.com/experiments/HyperDriveExperiment/runs/0fc2607b-2fbe-44ec-b191-3eb7279fd44d?wsid=/subscriptions/52f01a1a-c766-41a4-9062-a73cffbef727/resourcegroups/res_grp1/workspaces/covid-cxr
StepRun( preprocess_step ) Status: NotStarted
StepRun( preprocess_step ) Status: Running

Streaming azureml-logs/55_azureml-execution-tvmps_003afd1c713823f85b96b8a8f56055cc40b7d76421536a158262ffcc93df7f08_d.txt
2020-04-09T19:44:04Z Starting output-watcher...
2020-04-09T19:44:04Z IsDedicatedCompute == True, won't poll for Low Pri Preemption
Login Succeeded
Using d

 12%|█▏        | 103/862 [00:06<00:44, 17.04it/s]
 12%|█▏        | 105/862 [00:06<00:43, 17.22it/s]
 12%|█▏        | 107/862 [00:07<00:42, 17.78it/s]
 13%|█▎        | 109/862 [00:07<00:49, 15.30it/s]
 13%|█▎        | 111/862 [00:07<00:47, 15.83it/s]
 13%|█▎        | 113/862 [00:07<00:50, 14.77it/s]
 13%|█▎        | 115/862 [00:07<00:47, 15.84it/s]
 14%|█▎        | 117/862 [00:07<00:49, 15.05it/s]
 14%|█▍        | 119/862 [00:07<00:47, 15.68it/s]
 14%|█▍        | 121/862 [00:07<00:45, 16.20it/s]
 14%|█▍        | 123/862 [00:08<00:57, 12.91it/s]
 15%|█▍        | 125/862 [00:08<00:54, 13.54it/s]
 15%|█▍        | 127/862 [00:08<00:55, 13.17it/s]
 15%|█▍        | 129/862 [00:08<00:56, 12.94it/s]
 15%|█▌        | 131/862 [00:08<00:57, 12.64it/s]
 15%|█▌        | 133/862 [00:08<00:55, 13.13it/s]
 16%|█▌        | 135/862 [00:09<00:51, 14.08it/s]
 16%|█▌        | 137/862 [00:09<00:47, 15.41it/s]
 16%|█▌        | 139/862 [00:09<00:46, 15.47it/s]
 16%|█▋        | 141/862 [00:09<00:43, 16.45it/s]


 58%|█████▊    | 501/862 [00:33<00:25, 14.05it/s]
 58%|█████▊    | 504/862 [00:34<00:22, 15.83it/s]
 59%|█████▊    | 506/862 [00:34<00:23, 15.18it/s]
 59%|█████▉    | 508/862 [00:34<00:22, 15.68it/s]
 59%|█████▉    | 510/862 [00:34<00:21, 16.43it/s]
 59%|█████▉    | 512/862 [00:34<00:20, 17.16it/s]
 60%|█████▉    | 514/862 [00:34<00:19, 17.47it/s]
 60%|█████▉    | 517/862 [00:34<00:18, 18.62it/s]
 60%|██████    | 520/862 [00:34<00:17, 19.60it/s]
 61%|██████    | 523/862 [00:35<00:18, 18.30it/s]
 61%|██████    | 526/862 [00:35<00:16, 20.26it/s]
 61%|██████▏   | 529/862 [00:35<00:16, 19.84it/s]
 62%|██████▏   | 532/862 [00:35<00:17, 19.03it/s]
 62%|██████▏   | 534/862 [00:35<00:22, 14.51it/s]
 62%|██████▏   | 536/862 [00:35<00:22, 14.22it/s]
 62%|██████▏   | 538/862 [00:36<00:21, 15.03it/s]
 63%|██████▎   | 540/862 [00:36<00:22, 14.61it/s]
 63%|██████▎   | 543/862 [00:36<00:19, 16.57it/s]
 63%|██████▎   | 545/862 [00:36<00:19, 16.57it/s]
 63%|██████▎   | 547/862 [00:36<00:30, 10.41it/s]


 19%|█▉        | 16/85 [00:01<00:04, 15.60it/s]
 21%|██        | 18/85 [00:01<00:04, 15.53it/s]
 25%|██▍       | 21/85 [00:01<00:03, 16.45it/s]
 27%|██▋       | 23/85 [00:01<00:03, 16.86it/s]
 29%|██▉       | 25/85 [00:01<00:03, 16.50it/s]
 32%|███▏      | 27/85 [00:01<00:03, 17.33it/s]
 34%|███▍      | 29/85 [00:01<00:03, 16.63it/s]
 36%|███▋      | 31/85 [00:01<00:03, 17.41it/s]
 39%|███▉      | 33/85 [00:02<00:03, 14.38it/s]
 41%|████      | 35/85 [00:02<00:03, 13.87it/s]
 44%|████▎     | 37/85 [00:02<00:03, 14.95it/s]
 46%|████▌     | 39/85 [00:02<00:02, 15.76it/s]
 48%|████▊     | 41/85 [00:02<00:03, 14.13it/s]
 51%|█████     | 43/85 [00:02<00:02, 15.08it/s]
 53%|█████▎    | 45/85 [00:02<00:03, 12.22it/s]
 55%|█████▌    | 47/85 [00:03<00:02, 12.74it/s]
 58%|█████▊    | 49/85 [00:03<00:02, 13.00it/s]
 60%|██████    | 51/85 [00:03<00:02, 13.16it/s]
 62%|██████▏   | 53/85 [00:03<00:02, 14.26it/s]
 65%|██████▍   | 55/85 [00:03<00:02, 14.23it/s]
 67%|██████▋   | 57/85 [00:03<00:01, 14.




StepRunId: 645a6f26-6df0-4448-8a53-7236e94729de
Link to Azure Machine Learning Portal: https://ml.azure.com/experiments/HyperDriveExperiment/runs/645a6f26-6df0-4448-8a53-7236e94729de?wsid=/subscriptions/52f01a1a-c766-41a4-9062-a73cffbef727/resourcegroups/res_grp1/workspaces/covid-cxr
StepRun( hyperdrive_step ) Status: NotStarted
StepRun( hyperdrive_step ) Status: Running

StepRun(hyperdrive_step) Execution Summary
StepRun( hyperdrive_step ) Status: Finished
{'runId': '645a6f26-6df0-4448-8a53-7236e94729de', 'status': 'Completed', 'startTimeUtc': '2020-04-09T19:50:57.692606Z', 'endTimeUtc': '2020-04-09T20:50:40.955511Z', 'properties': {'azureml.runsource': 'azureml.StepRun', 'ContentSnapshotId': '96ddd516-b142-4818-8012-d346336d82e8', 'StepType': 'HyperDriveStep', 'ComputeTargetType': 'HyperDrive', 'azureml.pipelinerunid': 'bab37375-51c3-4328-ab75-4413271c81f1'}, 'inputDatasets': [], 'logFiles': {'logs/azureml/executionlogs.txt': 'https://covidcxr7833484075.blob.core.windows.net/azure

_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

Downloading azureml/645a6f26-6df0-4448-8a53-7236e94729de/hparam_metrics_data
Downloaded azureml/645a6f26-6df0-4448-8a53-7236e94729de/hparam_metrics_data, 1 files out of an estimated total of 1
Best model's metrics:
validation_auc: [0.9469895958900452, 0.9155709147453308, 0.9447751045227051, 0.9251211285591125, 0.9890658259391785, 0.9568166732788086, 0.9702422618865967]
test_loss: 41.10596684047154
test_accuracy: 0.9433962106704712
test_precision: 1.0
test_recall: 0.4000000059604645
test_auc: 0.9505161046981812
test_f1score: 0.5714285969734192
ROC: aml://artifactId/ExperimentRun/dcid.HD_e5a8ba4c-a1dc-4b1c-b89a-8ae36a57e1d8_2/ROC_1586463208.png
Confusion matrix: aml://artifactId/ExperimentRun/dcid.HD_e5a8ba4c-a1dc-4b1c-b89a-8ae36a57e1d8_2/Confusion matrix_1586463208.png
