In [None]:
import os
import sys
import uuid

import requests

# AzureML libraries
import azureml.core
from azureml.core import Experiment, Workspace, Datastore, Run
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.core.conda_dependencies import CondaDependencies
from azureml.core.container_registry import ContainerRegistry
from azureml.core.runconfig import MpiConfiguration, RunConfiguration, DEFAULT_GPU_IMAGE
from azureml.train.dnn import PyTorch
from azureml.train.estimator import Estimator
from azureml.widgets import RunDetails

# Report which azureml-core SDK version this kernel is using
sdk_version = azureml.core.VERSION
print("SDK version:", sdk_version)

In [None]:
# Consult microsoft-onnxruntime-example/nvidia-bert/README.md for instructions prior to running this notebook.

# Retrieve the Azure Machine Learning workspace used by every later cell.
# see https://docs.microsoft.com/en-us/python/api/overview/azure/ml/?view=azure-ml-py
ws = Workspace.get(
    name="myworkspace",
    subscription_id='<azure-subscription-id>',
    resource_group='myresourcegroup',
)

# Summarize the workspace, one attribute per line
workspace_summary = [
    'Workspace name: ' + ws.name,
    'Workspace region: ' + ws.location,
    'Subscription id: ' + ws.subscription_id,
    'Resource group: ' + ws.resource_group,
]
print(*workspace_summary, sep='\n')

In [None]:
# Create a datastore from blob storage containing training data.
# Consult README.md for instructions downloading and uploading training data.
#
# The storage account name and key are read from the AZURE_STORAGE_ACCOUNT and
# AZURE_STORAGE_KEY environment variables when they are set, so the real secret
# does not have to be typed into (and saved with) the notebook. If a variable is
# not set, the placeholder below is used and must be filled in by hand.
ds = Datastore.register_azure_blob_container(
    workspace=ws,
    datastore_name='<datastore-name>',
    account_name=os.environ.get('AZURE_STORAGE_ACCOUNT', '<storage-account-name>'),
    account_key=os.environ.get('AZURE_STORAGE_KEY', '<storage-account-key>'),
    container_name='<storage-container-name>')

In [None]:
# Summarize the registered datastore, one attribute per line
datastore_summary = [
    'Datastore name: ' + ds.name,
    'Container name: ' + ds.container_name,
    'Datastore type: ' + ds.datastore_type,
    'Workspace name: ' + ds.workspace.name,
]
print(*datastore_summary, sep='\n')

In [None]:
# Create GPU cluster (or reuse it when a compute target with this name already exists)
gpu_cluster_name = "ndv2scus"
try:
    # Looking up a missing target raises ComputeTargetException, which is the
    # signal to provision a new cluster below.
    gpu_compute_target = ComputeTarget(workspace=ws, name=gpu_cluster_name)
    print('Found existing compute target.')
except ComputeTargetException:
    print('Creating a new compute target...')
    # min_nodes=0 lets the cluster scale down to zero when no run is active, so
    # the GPU nodes are not billed while idle. max_nodes=4 matches the
    # node_count requested by the training estimators.
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='Standard_ND40rs_v2',
        min_nodes=0,
        max_nodes=4)
    gpu_compute_target = ComputeTarget.create(ws, gpu_cluster_name, compute_config)
    gpu_compute_target.wait_for_completion(show_output=True)

In [None]:
# Phase 1 pretraining runs are grouped under this experiment
experiment_name = 'nvbert-ort-pretraining-phase1'
experiment = Experiment(workspace=ws, name=experiment_name)

In [None]:
project_folder = '../workspace'

# see README.md for instructions pushing Docker image with onnxruntime build
image_name = 'bert-onnxruntime:latest'

# Credentials to the registry containing the above Docker image.
# Username and password are read from the ACR_USERNAME / ACR_PASSWORD
# environment variables when set, so real secrets need not be saved in the
# notebook; otherwise the placeholders below must be filled in by hand.
cr = ContainerRegistry()
cr.address = '<registry-name>.azurecr.io'
cr.username = os.environ.get('ACR_USERNAME', '<registry-username>')
cr.password = os.environ.get('ACR_PASSWORD', '<registry-password>')

# set MPI configuration
# set processes per node to be equal to GPU count on SKU.
mpi = MpiConfiguration()
mpi.process_count_per_node = 8

# Unique suffix for this notebook session's output directory (also reused by the
# phase 2 estimator). uuid4 is random; uuid1 would embed the host's network
# address and a timestamp in the output path.
output_id = uuid.uuid4().hex

# Define training estimator for phase 1
# Consult https://docs.microsoft.com/en-us/azure/machine-learning/how-to-train-ml-models
# Fill in blob path to phase 1 training data in argument below
estimator_ph1 = Estimator(source_directory=project_folder,

                    # Compute configuration
                    compute_target = gpu_compute_target,
                    node_count=4,
                    # NOTE(review): process_count_per_node is also set to 8 on `mpi`
                    # above; confirm which of the two values the SDK honors.
                    process_count_per_node=1,  # separate MPI jobs
                    distributed_training = mpi,
                    use_gpu = True,

                    # supply Docker image
                    use_docker = True,
                    custom_docker_image = image_name,
                    image_registry_details = cr,
                    user_managed = True,

                    # Training script parameters
                    script_params = {
                        "--config_file": "bert_config.json",
                        '--input_dir' : ds.path('<blob-path-to-phase1-training-data>').as_mount(), 
                        '--output_dir': ds.path(f'output/{experiment_name}/{output_id}/').as_mount(),
                        '--bert_model' : 'bert-large-uncased',
                        '--train_batch_size' : 4096,
                        '--max_seq_length': 128,
                        '--max_predictions_per_seq': 20,
                        '--max_steps' : 7038,
                        '--warmup_proportion' : '0.2843',
                        '--num_steps_per_checkpoint' : 200,
                        '--learning_rate' : '6e-3',
                        '--seed': 42,
                        '--fp16' : '',
                        '--gradient_accumulation_steps' : 32,
                        '--allreduce_post_accumulation' : '',
                        '--allreduce_post_accumulation_fp16' : '',
                        '--do_train' : '',
                        '--use_ib' : '', # pass if infiniband available on SKU
                        '--gpu_memory_limit_gb' : 32 # set to per GPU memory in GB (check SKU)
                    },

                    entry_script = 'run_pretraining_ort.py',
                    inputs = [ds.path('').as_mount()]
                   )

In [None]:
# Launch the phase 1 run; its logs are under the "Outputs + logs" tab of the portal link printed below
run = experiment.submit(estimator_ph1)

# Show the run-monitoring widget in the notebook
run_widget = RunDetails(run)
run_widget.show()

# Link to the run in the Azure ML portal
portal_url = run.get_portal_url()
print(portal_url)

In [None]:
# Phase 2 pretraining runs are grouped under their own experiment.
# Note: this rebinds `experiment_name` and `experiment` from the phase 1 cells.
experiment_name = 'nvbert-ort-pretraining-phase2'
experiment = Experiment(workspace=ws, name=experiment_name)

In [None]:

# Define training estimator for phase 2
# Fill in blob path to phase 2 training data as well as phase 1 checkpoint in arguments below.
# Relies on project_folder, image_name, cr, mpi and output_id from the phase 1
# estimator cell, and on experiment_name from the phase 2 experiment cell.
estimator_ph2 = Estimator(source_directory=project_folder,

                    # Compute configuration
                    compute_target = gpu_compute_target,
                    node_count=4, 
                    # NOTE(review): process_count_per_node is also set to 8 on `mpi`;
                    # confirm which of the two values the SDK honors.
                    process_count_per_node=1, # separate MPI jobs
                    distributed_training = mpi,
                    use_gpu = True,

                    #Docker image
                    use_docker = True,
                    custom_docker_image = image_name,
                    image_registry_details = cr,
                    user_managed = True,

                    # Training script parameters
                    script_params = {
                        # Required Params
                        "--config_file": "bert_config.json",
                        '--input_dir' : ds.path('<blob-path-to-phase2-training-data>').as_mount(), 
                        '--output_dir': ds.path(f'output/{experiment_name}/{output_id}/').as_mount(),
                        '--bert_model' : 'bert-large-uncased',
                        '--train_batch_size' : 4096,
                        '--max_seq_length': 512,
                        '--max_predictions_per_seq': 80,
                        '--max_steps' : 1563,
                        '--warmup_proportion' : '0.128',
                        '--num_steps_per_checkpoint' : 200,
                        '--learning_rate' : '4e-3',
                        '--seed': 42,
                        '--fp16' : '',
                        '--gradient_accumulation_steps' : 256,
                        '--allreduce_post_accumulation' : '',
                        '--allreduce_post_accumulation_fp16' : '',
                        '--do_train' : '',
                        '--phase2' : '',
                        '--resume_from_checkpoint' : '',
                        # same value as --max_steps in the phase 1 estimator
                        '--phase1_end_step' : '7038',
                        # explicit mount mode, matching the other data references in this dict
                        '--init_checkpoint' : ds.path('<path-to-checkpoint-from-phase-1>').as_mount(),
                        '--use_ib' : '', # pass if infiniband available on SKU
                        '--gpu_memory_limit_gb' : 32 # set to per GPU memory in GB (check SKU)
                    },

                    entry_script='run_pretraining_ort.py',
                    inputs=[ds.path('').as_mount()])

In [None]:
# Launch the phase 2 run; its logs are under the "Outputs + logs" tab of the portal link printed below
run = experiment.submit(estimator_ph2)

# Show the run-monitoring widget in the notebook
run_widget = RunDetails(run)
run_widget.show()

# Link to the run in the Azure ML portal
portal_url = run.get_portal_url()
print(portal_url)