# Set up workspace

In [1]:
import os
import shutil

from azureml.core.workspace import Workspace
from azureml.core import Experiment

from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.train.dnn import PyTorch
from azureml.widgets import RunDetails

In [2]:
# One-time setup: on the very first run, uncomment the lines below to build the
# Workspace handle from explicit identifiers and save it with write_config().
# subscription_id = '<subscription id>'
# resource_grp = '<resource grp>'
# ws_name = '<ws name>'
# ws  =  Workspace(subscription_id,resource_grp, ws_name)
# ws.write_config()

# Load the workspace from the previously written config.
ws = Workspace.from_config()
# Bare expression: the notebook displays the workspace as the cell output.
ws

Workspace.create(name='SubstrateIntelligenceNLR-WS2', subscription_id='42ae47bd-b19b-42c1-b0b9-19fd5be9d51b', resource_group='bert-base')

# Create experiment

In [3]:
# Folder later passed to the estimator as its source directory; created if absent.
project_folder = './'
os.makedirs(project_folder, exist_ok=True)

# Experiment that groups every run submitted from this notebook.
experiment_name = 'Testing-AML-hyperdrive'
experiment = Experiment(workspace=ws, name=experiment_name)

# Extract and upload data

In [4]:
# Optional first-time step: clone the GAP coreference dataset locally.
#!git clone https://github.com/google-research-datasets/gap-coreference.git
# Default datastore of the workspace and path references into it.
ds = ws.get_default_datastore()
blob_raw = ds.path('raw')
# NOTE(review): 'precessed' looks like a typo for 'processed' — confirm against the
# folder name actually used in the datastore before changing it.
blob_processed = ds.path('precessed')
# Optional first-time step: upload the cloned dataset folder to the datastore.
#ds.upload('gap-coreference')

# Create a compute target

In [5]:
cluster_name = "gpucluster"

try:
    # Reuse the cluster when one with this name is already registered.
    compute_target = ComputeTarget(workspace=ws, name=cluster_name)
    print('Found existing compute target')
except ComputeTargetException:
    # Lookup failed, so provision a new low-priority GPU cluster.
    print('Creating a new compute target...')
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_NC24s_v3',
        max_nodes=4,
        vm_priority='lowpriority',
    )
    compute_target = ComputeTarget.create(ws, cluster_name, compute_config)
    # Wait for provisioning (20-minute timeout) while showing progress output.
    compute_target.wait_for_completion(
        show_output=True,
        min_node_count=None,
        timeout_in_minutes=20,
    )

Found existing compute target


# Preprocess
Follow the preprocessing steps in run_aml.

# Train

## Estimator

In [55]:

from azureml.core.runconfig import MpiConfiguration

# Command-line arguments handed to the entry script. Entries whose value is an
# empty string are intended as value-less flags.
script_params = {
    # '--input_dir': blob_processed.as_mount(),
    '--backend': "nccl",
    '--input_dir': 'processed_data',
    '--is_distributed': "",
    '--isaml': "",
    '--per_gpu_batch_size': 8,
    '--val_batch_size': 32,
    '--output_dir': './outputs',  # saving output now
}

image_name = 'krishansubudhi/transformers_pytorch:1.3'

# MPI launch configuration: four processes on each node.
mpi = MpiConfiguration()
mpi.process_count_per_node = 4

estimator = PyTorch(
    source_directory=project_folder,
    script_params=script_params,
    compute_target=compute_target,
    entry_script='TrainGAP.py',
    node_count=1,
    use_gpu=True,
    distributed_training=mpi,
    # Docker image
    use_docker=True,
    custom_docker_image=image_name,
    user_managed=True,
)
# Override the interpreter used for the run (presumably the conda env shipped
# in the custom image — verify against the image contents).
estimator._estimator_config.environment.python.interpreter_path = '/opt/miniconda/envs/amlbert/bin/python'


#run2 = experiment.submit(estimator)
#RunDetails(run2).show()

# Hyperdrive
https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-tune-hyperparameters

## Random

#### Sampling space

In [56]:
from azureml.train.hyperdrive import GridParameterSampling, RandomParameterSampling
from azureml.train.hyperdrive.parameter_expressions import choice

# Discrete search space: random sampling over the listed choices for each hyperparameter.
parameter_sampling = RandomParameterSampling({
    'lr': choice(1E-5, 3E-5, 1E-4),
    'per_gpu_batch_size': choice(16, 32, 64),
    'model_type': choice('mul', 'concat'),
    'bert_type': choice('bert-base-uncased', 'bert-large-uncased'),
    'epochs': choice(2, 3, 4),
})

#### Primary metric

In [57]:
from azureml.train.hyperdrive import PrimaryMetricGoal

# Metric name and optimisation direction handed to HyperDriveConfig.
primary_metric_name = "val_acc"
primary_metric_goal = PrimaryMetricGoal.MAXIMIZE

#### Early termination

In [58]:
from azureml.train.hyperdrive import MedianStoppingPolicy
# Early-termination policy later passed to HyperDriveConfig as `policy`,
# configured with an evaluation interval of 1.
early_termination_policy = MedianStoppingPolicy(evaluation_interval=1)

#### Resources

In [59]:
# Sweep budget: total number of runs, and how many of them may run concurrently.
max_total_runs = 20
max_concurrent_runs = 4

#### Leverage previous experiment knowledge
Warm-starting from a previous HyperDrive run is good for early termination.
Skip this cell if running for the first time.

In [60]:
from azureml.train.hyperdrive import HyperDriveRun
# Handle to an earlier HyperDrive run of this experiment, looked up by its run id.
warmstart_parent_1 = HyperDriveRun(experiment, "Testing-AML-hyperdrive_1572883987357281")
# List form, as passed to HyperDriveConfig via `resume_from`.
warmstart_parents_to_resume_from = [warmstart_parent_1]

#### Runconfig and run

In [61]:
from azureml.train.hyperdrive import HyperDriveConfig

# The warm-start cell is optional ("Skip if running for the first time"), so on a
# fresh kernel the parent list may not exist. Fall back to None (no warm start)
# instead of failing with a NameError.
try:
    resume_from_runs = warmstart_parents_to_resume_from
except NameError:
    resume_from_runs = None

# Bundle the estimator, search space, metric, budget and early-termination
# policy into one HyperDrive configuration.
hyperdrive_run_config = HyperDriveConfig(
    estimator=estimator,
    hyperparameter_sampling=parameter_sampling,
    primary_metric_name=primary_metric_name,
    primary_metric_goal=primary_metric_goal,
    max_total_runs=max_total_runs,
    max_concurrent_runs=max_concurrent_runs,
    policy=early_termination_policy,
    resume_from=resume_from_runs,
)

In [62]:
from azureml.widgets import RunDetails

# Submit the HyperDrive configuration and show the run-monitoring widget.
hyperdrive_run = experiment.submit(hyperdrive_run_config)
RunDetails(hyperdrive_run).show()

dict_keys(['--backend', '--input_dir', '--is_distributed', '--per_gpu_batch_size', '--val_batch_size', '--output_dir'])


The same input parameter(s) are specified in estimator/run_config script params and HyperDrive parameter space. HyperDrive parameter space definition will override these duplicate entries. ['--per_gpu_batch_size'] is the list of overridden parameter(s).


_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

# Best run

In [31]:
# Child run with the best value of the primary metric.
best_run = hyperdrive_run.get_best_run_by_primary_metric()
if best_run is None:
    # No child run has reported the primary metric yet (e.g. the sweep is still
    # starting or every child failed); fail with a clear message instead of an
    # AttributeError on None.
    raise RuntimeError(
        'No best run available: no child run has logged the primary metric yet.'
    )

# Logged metrics and the command-line arguments the best run was started with.
best_run_metrics = best_run.get_metrics()
parameter_values = best_run.get_details()['runDefinition']['arguments']

print('Best Run Id: ', best_run.id)
print('\n Accuracy:', best_run_metrics['val_acc'])
print('\n params:',parameter_values)

Best Run Id:  Testing-AML-hyperdrive_1572883987357281_8

 Accuracy: [0.9030837004405287, 0.8744493392070485, 0.8898678414096917, 0.9096916299559471]

 params: ['--input_dir', 'processed_data', '--isaml', '--epochs', '1', '--fp16', '--val_batch_size', '32', '--bert_type', 'bert-large-uncased', '--epochs', '4', '--lr', '3E-05', '--model_type', 'mul', '--per_gpu_batch_size', '32']


In [32]:
# List the file names recorded for the best run (shown as the cell output).
best_run.get_file_names()

['azureml-logs/55_azureml-execution-tvmps_c8b35982b4ea56bd12404fec3b61f7efa949d7d7ca867abaf9385c59defdf3cc_p.txt',
 'azureml-logs/65_job_prep-tvmps_c8b35982b4ea56bd12404fec3b61f7efa949d7d7ca867abaf9385c59defdf3cc_p.txt',
 'azureml-logs/70_driver_log.txt',
 'azureml-logs/75_job_post-tvmps_c8b35982b4ea56bd12404fec3b61f7efa949d7d7ca867abaf9385c59defdf3cc_p.txt',
 'azureml-logs/process_info.json',
 'azureml-logs/process_status.json',
 'logs/azureml/126_azureml.log',
 'logs/azureml/azureml.log']

In [63]:
# TODO: upload the best model to blob storage.
# TODO: run prediction and upload the results to Kaggle.