# Set up workspace

In [1]:
import os
import shutil

from azureml.core.workspace import Workspace
from azureml.core import Experiment

from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.train.dnn import PyTorch
from azureml.widgets import RunDetails

In [18]:
# Only required for the first time.
# subscription_id = '<subscription id>'
# resource_grp = '<resource grp>'
# ws_name = '<ws name>'
# ws  =  Workspace(subscription_id,resource_grp, ws_name)
# ws.write_config()

ws = Workspace.from_config()

ds = ws.get_default_datastore()
blob_raw = ds.path('raw')
blob_processed = ds.path('precessed')

# Create experiment

In [21]:
project_folder = './'
os.makedirs(project_folder, exist_ok=True)

experiment_name = 'Testing-AML'
experiment = Experiment(ws, name=experiment_name)

# Create a compute target

In [22]:
cluster_name = "gpucluster"

try:
    compute_target = ComputeTarget(workspace=ws, name=cluster_name)
    print('Found existing compute target')
except ComputeTargetException:
    print('Creating a new compute target...')
    compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_NC24s_v3', 
                                                           max_nodes=4,vm_priority='lowpriority')

    compute_target = ComputeTarget.create(ws, cluster_name, compute_config)

    compute_target.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)

Found existing compute target


# Preprocess
Follow run_aml

# Train

## Estimator

In [41]:

from azureml.core.runconfig import MpiConfiguration
script_params = {
    '--input_dir':blob_processed.as_mount(),
    '--backend':"nccl",
    '--is_distributed':"",
    '--isaml':"",
    '--val_batch_size':32,
    '--epochs':4,
    '--output_dir': './outputs', #saving output now
    '--fp16':"",
    '--bert_type':'bert-large-uncased'
}

image_name = 'krishansubudhi/transformers_pytorch:1.3'
mpi=MpiConfiguration()
mpi.process_count_per_node = 4

estimator = PyTorch(source_directory=project_folder, 
                    script_params=script_params,
                    compute_target=compute_target,
                    entry_script='TrainGAP.py',
                    node_count=1,
                    use_gpu=True,
                    distributed_training = mpi,                
    
                    #Docker image
                    use_docker=True,
                    custom_docker_image=image_name,
                    user_managed=True)
estimator._estimator_config.environment.python.interpreter_path = '/opt/miniconda/envs/amlbert/bin/python'


#run2 = experiment.submit(estimator)
#RunDetails(run2).show()

In [42]:
torch.nn.functional.softmax( torch.tensor([[2.711256980895996094e+00,3.645849227905273438e+00,2.910761356353759766e+00]]), dim = 1)

tensor([[0.2098, 0.5341, 0.2561]])

In [43]:
run2.get_file_names()

['azureml-logs/55_azureml-execution-tvmps_3abd023539818271cd1df36be6fc6a887099bc0df61fb2531b0374e5660e0c63_p.txt',
 'azureml-logs/65_job_prep-tvmps_3abd023539818271cd1df36be6fc6a887099bc0df61fb2531b0374e5660e0c63_p.txt',
 'azureml-logs/70_driver_log_0.txt',
 'azureml-logs/70_driver_log_1.txt',
 'azureml-logs/70_driver_log_2.txt',
 'azureml-logs/70_driver_log_3.txt',
 'azureml-logs/70_mpi_log.txt',
 'azureml-logs/75_job_post-tvmps_3abd023539818271cd1df36be6fc6a887099bc0df61fb2531b0374e5660e0c63_p.txt',
 'azureml-logs/process_info.json',
 'azureml-logs/process_status.json',
 'logs/azureml/0_164_azureml.log',
 'logs/azureml/1_181_azureml.log',
 'logs/azureml/2_186_azureml.log',
 'logs/azureml/3_185_azureml.log',
 'logs/azureml/azureml.log',
 'outputs/amp_checkpoint.pt',
 'outputs/evaluate_labels.csv',
 'outputs/evaluate_logits.csv',
 'outputs/model_checkpoint.pt']

# Hyperdrive
https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-tune-hyperparameters

#### Sampling space

In [44]:
import math
from azureml.train.hyperdrive import GridParameterSampling,RandomParameterSampling
from azureml.train.hyperdrive.parameter_expressions import choice,loguniform
parameter_sampling = RandomParameterSampling(
    {
        'lr':loguniform( math.log(1E-5), math.log(1E-4) ),
        'per_gpu_batch_size':choice(4,8,16),
        'model_type':choice('mul','concat'),
        'bert_type':choice('bert-base-uncased','bert-large-uncased'),
    }
)

#### Primary metric

In [45]:
from  azureml.train.hyperdrive import PrimaryMetricGoal
primary_metric_name="val_acc"
primary_metric_goal=PrimaryMetricGoal.MAXIMIZE

#### Early termination

In [46]:
from azureml.train.hyperdrive import MedianStoppingPolicy
early_termination_policy = MedianStoppingPolicy(evaluation_interval=1)

#### Resources

In [47]:
max_total_runs=20
max_concurrent_runs=10

#### Runconfig and run

In [48]:
from azureml.train.hyperdrive import HyperDriveConfig
hyperdrive_run_config = HyperDriveConfig(estimator = estimator,
                                        hyperparameter_sampling = parameter_sampling,
                                        primary_metric_name = primary_metric_name,
                                        primary_metric_goal= primary_metric_goal,
                                        max_total_runs = max_total_runs,
                                        max_concurrent_runs = max_concurrent_runs,
                                        policy=early_termination_policy)

In [49]:
experiment_name = 'Testing-AML-hyperdrive'
experiment = Experiment(ws, name=experiment_name)

hyperdrive_run = experiment.submit(hyperdrive_run_config)
from azureml.widgets import RunDetails
RunDetails(hyperdrive_run).show()

dict_keys(['--input_dir', '--backend', '--is_distributed', '--val_batch_size', '--epochs', '--output_dir', '--fp16'])


_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

# Best run

In [51]:
best_run = hyperdrive_run.get_best_run_by_primary_metric()
best_run_metrics = best_run.get_metrics()
parameter_values = best_run.get_details()['runDefinition']['arguments']

print('Best Run Id: ', best_run.id)
print('\n Accuracy:', best_run_metrics['val_acc'])
print('\n params:',parameter_values)

AttributeError: 'NoneType' object has no attribute 'get_metrics'

In [None]:
best_run.get_file_names()

# Test data as validation file