# Set up workspace

In [1]:
import os
import shutil

from azureml.core.workspace import Workspace
from azureml.core import Experiment

from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.train.dnn import PyTorch
from azureml.widgets import RunDetails

In [2]:
# One-time setup (only required the first time): build the Workspace from
# explicit identifiers and save them with write_config(), so that later
# sessions can simply call Workspace.from_config().
# subscription_id = '<subscription id>'
# resource_grp = '<resource grp>'
# ws_name = '<ws name>'
# ws  =  Workspace(subscription_id,resource_grp, ws_name)
# ws.write_config()

# Load the workspace from the saved configuration.
ws = Workspace.from_config()
# Bare expression so the notebook displays the loaded workspace.
ws

Workspace.create(name='SubstrateIntelligenceNLR-WS2', subscription_id='42ae47bd-b19b-42c1-b0b9-19fd5be9d51b', resource_group='bert-base')

# Create experiment

In [3]:
# Directory passed as source_directory to every estimator in this notebook.
project_folder = './'
os.makedirs(project_folder, exist_ok=True)

# All runs submitted from this notebook are recorded under this experiment.
experiment_name = 'Testing-AML'
experiment = Experiment(workspace=ws, name=experiment_name)

# Extract and upload data

In [9]:
# Optional one-time step: fetch the GAP coreference dataset locally.
#!git clone https://github.com/google-research-datasets/gap-coreference.git
# References into the workspace's default datastore.
ds = ws.get_default_datastore()
blob_raw = ds.path('raw')
# NOTE(review): 'precessed' looks like a typo for 'processed'. Within this
# notebook the location is only ever reached through blob_processed, so it is
# self-consistent; renaming it would point at a different datastore folder --
# confirm whether existing data lives under 'precessed' before changing it.
blob_processed = ds.path('precessed')
# Optional one-time step: upload the cloned dataset folder to the datastore.
#ds.upload('gap-coreference')

# Create a compute target

In [5]:
cluster_name = "gpucluster"

try:
    # Reuse the cluster if one with this name is already attached to the workspace.
    compute_target = ComputeTarget(workspace=ws, name=cluster_name)
except ComputeTargetException:
    print('Creating a new compute target...')
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_NC24s_v3',
        vm_priority='lowpriority',
        max_nodes=4,
    )
    compute_target = ComputeTarget.create(ws, cluster_name, compute_config)

    # Wait for provisioning to finish (20 minute timeout) before continuing.
    compute_target.wait_for_completion(
        show_output=True,
        min_node_count=None,
        timeout_in_minutes=20,
    )
else:
    print('Found existing compute target')

Found existing compute target


# Preprocess

In [None]:
# The mounted datastore path is handed to the script as --output_dir.
script_params = {'--output_dir': blob_processed.as_mount()}

# Estimator that runs PreprocessGapData.py from the project folder on the cluster.
estimator = PyTorch(
    source_directory=project_folder,
    entry_script='PreprocessGapData.py',
    script_params=script_params,
    compute_target=compute_target,
    pip_packages=['transformers'],
    use_gpu=True,
)

run = experiment.submit(estimator)


In [None]:
# Show the run-details widget for the most recently submitted `run`.
RunDetails(run).show()

# Train

## Single node 

In [16]:
# Command-line arguments for TrainGAP.py; '--isaml' is passed with an empty value.
script_params = {
    '--input_dir': 'processed_data',
    '--isaml': "",
    '--epochs': 3,
}

# Single-node training run on the GPU cluster.
estimator = PyTorch(
    source_directory=project_folder,
    entry_script='TrainGAP.py',
    script_params=script_params,
    compute_target=compute_target,
    pip_packages=['transformers'],
    use_gpu=True,
)

run = experiment.submit(estimator)

# Show the run-details widget for the submitted run.
RunDetails(run).show()



_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

### FP16

**TODO:** Test with BERT Large.

In [19]:
# Same arguments as the single-node run, with one epoch and the '--fp16' flag added.
script_params = {
    '--input_dir': 'processed_data',
    '--isaml': "",
    '--epochs': 1,
    '--fp16': "",
}

# Public Docker image used as the run environment (also reused by later cells).
image_name = 'krishansubudhi/transformers_pytorch:1.3'

estimator = PyTorch(
    source_directory=project_folder,
    entry_script='TrainGAP.py',
    script_params=script_params,
    compute_target=compute_target,
    use_gpu=True,
    # pip_packages=['transformers'] is intentionally not passed here.
    # Custom Docker image with a user-managed Python environment.
    use_docker=True,
    custom_docker_image=image_name,
    user_managed=True,
)
# Point the run at the Python interpreter inside the image's conda environment.
estimator._estimator_config.environment.python.interpreter_path = '/opt/miniconda/envs/amlbert/bin/python'
run = experiment.submit(estimator)

RunDetails(run).show()

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

## Multi node

### Horovod

In [48]:
from azureml.core.runconfig import MpiConfiguration

# Arguments for a Horovod run; input data is read from the mounted datastore path.
script_params = {
    '--use_horovod': "",
    '--is_distributed': "",
    # Alternative input location: '--input_dir': 'processed_data'
    '--input_dir': blob_processed.as_mount(),
    '--isaml': "",
    '--per_gpu_batch_size': 8,
    '--gradient_accumulation': 1,
}

# MPI launch settings: 4 processes per node (presumably one per GPU -- confirm).
mpi = MpiConfiguration()
mpi.process_count_per_node = 4

# Two-node distributed training run launched through MPI.
estimator = PyTorch(
    source_directory=project_folder,
    entry_script='TrainGAP.py',
    script_params=script_params,
    compute_target=compute_target,
    node_count=2,
    distributed_training=mpi,
    pip_packages=['transformers'],
    use_gpu=True,
)

run2 = experiment.submit(estimator)
RunDetails(run2).show()



#### Horovod FP16

**Not working.** The run fails with: `User program failed with ModuleNotFoundError: No module named 'horovod'`.


In [52]:
from azureml.core.runconfig import MpiConfiguration

# Horovod arguments plus the '--fp16' flag.
script_params = {
    '--use_horovod': "",
    '--is_distributed': "",
    # Alternative input location: '--input_dir': 'processed_data'
    '--input_dir': blob_processed.as_mount(),
    '--isaml': "",
    '--per_gpu_batch_size': 8,
    '--gradient_accumulation': 1,
    '--fp16': "",
}

# MPI launch settings: 4 processes per node (presumably one per GPU -- confirm).
mpi = MpiConfiguration()
mpi.process_count_per_node = 4

# Single-node MPI run inside the custom Docker image (image_name is set in the
# single-node FP16 cell).
estimator = PyTorch(
    source_directory=project_folder,
    entry_script='TrainGAP.py',
    script_params=script_params,
    compute_target=compute_target,
    node_count=1,
    distributed_training=mpi,
    use_gpu=True,
    # Custom Docker image with a user-managed Python environment.
    use_docker=True,
    custom_docker_image=image_name,
    user_managed=True,
)
# Point the run at the Python interpreter inside the image's conda environment.
estimator._estimator_config.environment.python.interpreter_path = '/opt/miniconda/envs/amlbert/bin/python'

run2 = experiment.submit(estimator)
RunDetails(run2).show()

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

### DDP

In [46]:
from azureml.core.runconfig import MpiConfiguration

# Arguments for a distributed run using the "nccl" backend.
script_params = {
    # Alternative input location: '--input_dir': blob_processed.as_mount()
    '--backend': "nccl",
    '--input_dir': 'processed_data',
    '--is_distributed': "",
    '--isaml': "",
    '--per_gpu_batch_size': 8,
    '--gradient_accumulation': 1,
}

# MPI launch settings: 4 processes per node (presumably one per GPU -- confirm).
mpi = MpiConfiguration()
mpi.process_count_per_node = 4

# Two-node estimator; it is submitted in the next cell.
estimator = PyTorch(
    source_directory=project_folder,
    entry_script='TrainGAP.py',
    script_params=script_params,
    compute_target=compute_target,
    node_count=2,
    distributed_training=mpi,
    pip_packages=['transformers'],
    use_gpu=True,
)





In [47]:
# Submit the current `estimator` to the experiment and show its run-details widget.
run2 = experiment.submit(estimator)
RunDetails(run2).show()

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

#### FP16

**Not working on a single node.** The run fails with: `User program failed with KeyError: 'AZ_BATCH_MASTER_NODE'`.


In [54]:
from azureml.core.runconfig import MpiConfiguration

# Arguments for the FP16 variant of the distributed "nccl" run.
script_params = {
    # Alternative input location: '--input_dir': blob_processed.as_mount()
    '--backend': "nccl",
    '--input_dir': 'processed_data',
    '--is_distributed': "",
    '--isaml': "",
    '--per_gpu_batch_size': 8,
    '--gradient_accumulation': 1,
    # Fix: this cell is the FP16 variant, but the flag was missing, so the run
    # never enabled FP16. The other FP16 cells in this notebook pass '--fp16'.
    '--fp16': "",
}

# MPI launch settings: 4 processes per node (presumably one per GPU -- confirm).
mpi = MpiConfiguration()
mpi.process_count_per_node = 4

# Two-node run inside the custom Docker image (image_name is set in the
# single-node FP16 cell).
estimator = PyTorch(
    source_directory=project_folder,
    entry_script='TrainGAP.py',
    script_params=script_params,
    compute_target=compute_target,
    node_count=2,
    distributed_training=mpi,
    use_gpu=True,
    # Custom Docker image with a user-managed Python environment.
    use_docker=True,
    custom_docker_image=image_name,
    user_managed=True,
)
# Point the run at the Python interpreter inside the image's conda environment.
estimator._estimator_config.environment.python.interpreter_path = '/opt/miniconda/envs/amlbert/bin/python'

run2 = experiment.submit(estimator)
RunDetails(run2).show()

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

## Issues

Setting `cache_dir = 'cache0'` downloads everything into the source directory, and this was causing an error, probably because of a size limitation.