# Set up workspace

In [1]:
import os
import shutil

from azureml.core.workspace import Workspace
from azureml.core import Experiment

from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.train.dnn import PyTorch
from azureml.widgets import RunDetails

In [2]:
# First-time setup only: build the workspace handle from explicit identifiers
# and call write_config() so that later sessions can use
# Workspace.from_config() below.
# subscription_id = '<subscription id>'
# resource_grp = '<resource grp>'
# ws_name = '<ws name>'
# ws  =  Workspace(subscription_id,resource_grp, ws_name)
# ws.write_config()

# Load the workspace from the previously written configuration.
ws = Workspace.from_config()
# Bare expression so the notebook shows the workspace as the cell output.
ws

Workspace.create(name='SubstrateIntelligenceNLR-WS2', subscription_id='42ae47bd-b19b-42c1-b0b9-19fd5be9d51b', resource_group='bert-base')

# Create experiment

In [3]:
# Directory later passed to the estimator as its source_directory; './' is the
# current working directory.
project_folder = './'
os.makedirs(project_folder, exist_ok=True)  # exist_ok: do not fail if it already exists

# Experiment in workspace `ws`; both the single training run and the
# HyperDrive sweep are submitted through this object.
experiment_name = 'Testing-AML-hyperdrive'
experiment = Experiment(ws, name=experiment_name)

# Extract and upload data

In [4]:
# Disabled one-time step: clone the GAP coreference dataset locally.
#!git clone https://github.com/google-research-datasets/gap-coreference.git
# Default datastore of the workspace, plus references to two paths inside it.
ds = ws.get_default_datastore()
blob_raw = ds.path('raw')
# NOTE(review): 'precessed' looks like a typo for 'processed' -- confirm against
# the actual folder name in the datastore before changing it.
blob_processed = ds.path('precessed')
# Disabled one-time step: upload the cloned dataset folder to the datastore.
#ds.upload('gap-coreference')

# Create a compute target

In [5]:
# Name of the compute cluster to reuse, or to create when the lookup fails.
cluster_name = "gpucluster"

try:
    # Looking the target up by name raises ComputeTargetException on failure.
    compute_target = ComputeTarget(workspace=ws, name=cluster_name)
except ComputeTargetException:
    print('Creating a new compute target...')
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_NC24s_v3',
        max_nodes=4,
        vm_priority='lowpriority',
    )
    compute_target = ComputeTarget.create(ws, cluster_name, compute_config)
    # Wait for provisioning, giving up after 20 minutes.
    compute_target.wait_for_completion(
        show_output=True,
        min_node_count=None,
        timeout_in_minutes=20,
    )
else:
    print('Found existing compute target')

Found existing compute target


# Preprocess
Follow the preprocessing steps in `run_aml`.

# Train

## Single node 

### FP16

In [26]:
# Arguments handed to the entry script. '--isaml' and '--fp16' carry empty
# values (presumably plain on/off switches in TrainGAP.py -- verify).
# NOTE(review): 'epochs' is also sampled by the HyperDrive search space further
# down; confirm which value the sweep's child runs actually receive.
script_params = {
    '--input_dir': 'processed_data',
    '--isaml': "",
    '--epochs': 1,
    '--fp16': "",
}

# Public Docker image used as the run environment.
image_name = 'krishansubudhi/transformers_pytorch:1.3'

estimator = PyTorch(
    source_directory=project_folder,
    entry_script='TrainGAP.py',
    script_params=script_params,
    compute_target=compute_target,
    use_gpu=True,
    # pip_packages=['transformers'],

    # Docker image
    use_docker=True,
    custom_docker_image=image_name,
    user_managed=True,
)

# Use the Python interpreter at this path inside the image (presumably the
# 'amlbert' conda environment baked into it -- verify).
estimator._estimator_config.environment.python.interpreter_path = '/opt/miniconda/envs/amlbert/bin/python'

# Submit the single-node run and show its progress widget.
run = experiment.submit(estimator)
RunDetails(run).show()

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

# Hyperdrive
See the Azure ML guide on hyperparameter tuning: https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-tune-hyperparameters

## Random

#### Sampling space

In [19]:
from azureml.train.hyperdrive import GridParameterSampling, RandomParameterSampling
from azureml.train.hyperdrive.parameter_expressions import choice

# Random search over a discrete grid: every hyperparameter is drawn from a
# fixed list of candidate values.
parameter_sampling = RandomParameterSampling({
    'lr': choice(1e-6, 1e-5, 1e-4),
    'per_gpu_batch_size': choice(16, 32, 64),
    'model_type': choice('mul', 'concat'),
    'bert_type': choice('bert-base-uncased', 'bert-large-uncased'),
    'epochs': choice(2, 3, 4),
})

#### Primary metric

In [20]:
from azureml.train.hyperdrive import PrimaryMetricGoal

# Metric the sweep optimises and the direction of optimisation.
# NOTE(review): assumes the training script logs a metric named "val_acc" -- confirm.
primary_metric_name = "val_acc"
primary_metric_goal = PrimaryMetricGoal.MAXIMIZE

#### Early termination

In [21]:
from azureml.train.hyperdrive import MedianStoppingPolicy
# Early-termination policy for the sweep, configured with evaluation_interval=1.
early_termination_policy = MedianStoppingPolicy(evaluation_interval=1)

#### Resources

In [22]:
# Sweep budget passed to HyperDriveConfig: total number of runs and how many
# of them may be active at once.
max_total_runs = 20
max_concurrent_runs = 4

#### Experiment

In [23]:
from azureml.train.hyperdrive import HyperDriveConfig

# Bundle the estimator, search space, target metric, run budget and
# early-termination policy into one sweep configuration.
hyperdrive_run_config = HyperDriveConfig(
    estimator=estimator,
    hyperparameter_sampling=parameter_sampling,
    policy=early_termination_policy,
    primary_metric_name=primary_metric_name,
    primary_metric_goal=primary_metric_goal,
    max_total_runs=max_total_runs,
    max_concurrent_runs=max_concurrent_runs,
)

In [24]:
from azureml.widgets import RunDetails

# Launch the sweep under the experiment and display its progress widget.
hyperdrive_run = experiment.submit(hyperdrive_run_config)
RunDetails(hyperdrive_run).show()

dict_keys(['--input_dir', '--isaml', 1])


_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

## Bug in hyperdrive code


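To work around it, edit the installed SDK file

C:\users\krkusuk\AppData\Local\Continuum\miniconda3\envs\pytorch\lib\site-packages\azureml\train\hyperdrive\runconfig.py

so that the parameter check only calls `lstrip` on string arguments:

```python
if isinstance(param,str) and param.lstrip("-") in parameter_space:
```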

## Errors

Runs that use `bert-large-uncased` with `per_gpu_batch_size` of 64 fail.