In [83]:
import os 

import azureml.core
from azureml.core import Workspace, Dataset, Datastore, Experiment, Environment, ScriptRunConfig
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
#from azureml.core.runconfig import PyTorchConfiguration
from azureml.core.runconfig import PyTorchConfiguration, DockerConfiguration
from azureml.core.conda_dependencies import CondaDependencies

from azureml.data import OutputFileDatasetConfig
from azureml.telemetry import set_diagnostics_collection

from azureml.widgets import RunDetails

# Turn on SDK diagnostics collection for this session.
set_diagnostics_collection(send_diagnostics=True)

# Record the installed SDK version alongside the notebook output.
print(f"SDK version: {azureml.core.VERSION}")

Turning diagnostics collection on. 
SDK version: 1.38.0


In [84]:
# Relative path of the folder holding the DINO sources.
# The folder is not created here; it has to be present before the next cells run.
project_folder = "../dino"

In [85]:
# List the entries currently present in the project folder.
os.listdir(project_folder)

['.amlignore',
 '.amlignore.amltmp',
 '.config',
 '.git',
 '.github',
 '.ipynb_aml_checkpoints',
 '.ipynb_checkpoints',
 'attn-head0.png',
 'attn-head1.png',
 'attn-head2.png',
 'attn-head3.png',
 'attn-head4.png',
 'attn-head5.png',
 'checkpoint.pth',
 'eval_copy_detection.py',
 'eval_image_retrieval.py',
 'eval_knn.py',
 'eval_linear.py',
 'eval_linear.py.amltmp',
 'eval_video_segmentation.py',
 'exp1-pytorchDist-Premium.ipynb',
 'hubconf.py',
 'img.png',
 'LICENSE',
 'main_dino.py',
 'main_dino.py.amltmp',
 'README.md',
 'run_with_submitit.py',
 'utils.py',
 'utils.py.amltmp',
 'video_generation.py',
 'vision_transformer.py',
 'visualize_attention.py',
 'vit_small-pytorchDist-Premium.ipynb',
 'vit_small-pytorchdist-premium.ipynb.amltmp',
 '__pycache__']

In [86]:
# Load the workspace from the local configuration, then look up its default
# datastore and the registered dataset that is later passed as --data_path.
ws = Workspace.from_config()
datastore = ws.get_default_datastore()
dataset = Dataset.get_by_name(
    workspace=ws,
    name='imagenet_2015_premium_west_europe',
)


In [64]:
# Name of the existing compute cluster the run is submitted to.
cluster_name = 'A100-8x2'
try:
    compute_target = ComputeTarget(workspace=ws, name=cluster_name)
    print('Found existing compute target.')
except ComputeTargetException:
    # Stop here instead of falling through: continuing would either raise a
    # NameError on `compute_target` below or silently reuse a stale value
    # left in the kernel by an earlier execution of this cell.
    print('Cannot Find the compute cluster')
    raise

# use get_status() to get a detailed status for the current AmlCompute.
print(compute_target.get_status().serialize())

Found existing compute target.
{'currentNodeCount': 0, 'targetNodeCount': 0, 'nodeStateCounts': {'preparingNodeCount': 0, 'runningNodeCount': 0, 'idleNodeCount': 0, 'unusableNodeCount': 0, 'leavingNodeCount': 0, 'preemptedNodeCount': 0}, 'allocationState': 'Steady', 'allocationStateTransitionTime': '2022-03-06T08:52:56.538000+00:00', 'errors': None, 'creationTime': '2022-03-03T17:35:34.717374+00:00', 'modifiedTime': '2022-03-06T09:37:28.087633+00:00', 'provisioningState': 'Succeeded', 'provisioningStateTransitionTime': None, 'scaleSettings': {'minNodeCount': 0, 'maxNodeCount': 2, 'nodeIdleTimeBeforeScaleDown': 'PT120S'}, 'vmPriority': 'Dedicated', 'vmSize': 'STANDARD_ND96AMSR_A100_V4'}


In [6]:
experiment_name = 'exp-ViTS16'
experiment = Experiment(ws, name=experiment_name)

myenv = Environment(name = "myenv")
myenv.docker.enabled = True
dockerfile = r"""
FROM mcr.microsoft.com/azureml/openmpi4.1.0-cuda11.1-cudnn8-ubuntu18.04
RUN apt-get update && apt-get install -y libgl1-mesa-glx 
RUN echo "Hello from custom container!"
"""
myenv.docker.base_image = None
myenv.docker.base_dockerfile = dockerfile

!wget https://github.com/parasailteam/sccl-presynth


In [87]:
# Conda specification carrying the one extra pip dependency (timm).
conda = CondaDependencies()
conda.add_pip_package('timm')

# Fetch the curated PyTorch environment from the workspace and set the
# download-concurrency variable on it before it is cloned.
curated_env_name = 'AzureML-pytorch-1.10-ubuntu18.04-py38-cuda11-gpu'
pytorch_env = Environment.get(workspace=ws, name=curated_env_name)
pytorch_env.environment_variables = {"AZUREML_DOWNLOAD_CONCURRENCY": 384}

# Work on a separately named clone and attach the conda specification to it.
# NOTE(review): this assignment replaces whatever conda dependencies the
# curated environment carried — confirm that is intended.
dino_env = pytorch_env.clone("dino_env")
dino_env.python.conda_dependencies = conda

# Docker runtime settings for the run: use Docker with shm_size '256g'.
docker_config = DockerConfiguration(use_docker=True, shm_size='256g')






In [66]:
# Training settings forwarded to the training script and recorded as run tags.
patch_size, batch_size_per_gpu, epochs = 16, 64, 100

# Layout of the distributed job: node count, process count (tagged as "gpus"
# on the run) and the communication backend.
node_count, process_count = 1, 8
communication_backend = 'NCCL'

In [67]:
# Folder name encoding the node count, process count and per-GPU batch size.
# The 'bacthsize' spelling is kept as written so the destination path is unchanged.
output_folder = f'Output_node{node_count}_ViTs_gpus{process_count}_bacthsize{batch_size_per_gpu}'

# Output dataset whose destination is that folder on the default datastore.
output = OutputFileDatasetConfig(destination=(datastore, output_folder))

In [68]:
# Display the composed output folder name.
output_folder

'Output_node1_ViTs_gpus8_bacthsize64'

In [11]:
# Distributed job settings: communication backend, process count and node count.
distr_config = PyTorchConfiguration(
    communication_backend=communication_backend,
    process_count=process_count,
    node_count=node_count,
)

# Options for main_dino.py as (flag, values) pairs, listed in the order they
# are passed. Two-value options carry both values in one tuple.
script_options = [
    ("--arch", ("vit_small",)),
    ("--data_path", (dataset.as_download(),)),
    ("--patch_size", (patch_size,)),
    ("--norm_last_layer", (False,)),
    ("--warmup_teacher_temp", (0.04,)),
    ("--teacher_temp", (0.04,)),
    ("--use_fp16", (False,)),
    ("--weight_decay", (0.04,)),
    ("--weight_decay_end", (0.4,)),
    ("--clip_grad", (3.0,)),
    ("--batch_size_per_gpu", (batch_size_per_gpu,)),
    ("--epochs", (epochs,)),
    ("--freeze_last_layer", (1,)),
    ("--lr", (0.0005,)),
    ("--warmup_epochs", (10,)),
    ("--warmup_teacher_temp_epochs", (0,)),
    ("--min_lr", (1e-05,)),
    ("--local_crops_number", (6,)),
    ("--seed", (0,)),
    ("--num_workers", (10,)),
    ("--optimizer", ('adamw',)),
    ("--momentum_teacher", (0.996,)),
    ("--use_bn_in_head", (False,)),
    ("--out_dim", (65536,)),
    ("--drop_path_rate", (0.1,)),
    ("--global_crops_scale", (0.25, 1.0)),
    ("--local_crops_scale", (0.05, 0.25)),
    ("--saveckp_freq", (5,)),
    ("--output_dir", (output.as_mount(),)),
]

# Flatten the pairs into the flat argument list: flag first, then its values.
args = [token for flag, values in script_options for token in (flag, *values)]

# Reference command line kept from the original notebook for comparison:
#   cd /code; python -m torch.distributed.launch --nproc_per_node=8 main_dino.py
#     --data_path /dataset/imagenet-raw/train --output_dir ./exp_dino/
#     --freeze_last_layer 1 --lr 0.0005 --weight_decay 0.04 --batch_size_per_gpu 64
#     --drop_path_rate 0.1 --epochs 100 --warmup_epochs 10
#     --out_dim 65536 --norm_last_layer false --use_bn_in_head false
#     --teacher_temp 0.04 --warmup_teacher_temp 0.04 --warmup_teacher_temp_epochs 0
#     --use_fp16 false --clip_grad 3.0 --momentum_teacher 0.996
#     --global_crops_scale 0.25 1 --local_crops_scale 0.05 0.25 --local_crops_number 6

print(args)

['--arch', 'vit_small', '--data_path', <azureml.data.dataset_consumption_config.DatasetConsumptionConfig object at 0x7fa6bc03e550>, '--patch_size', 16, '--norm_last_layer', False, '--warmup_teacher_temp', 0.04, '--teacher_temp', 0.04, '--use_fp16', False, '--weight_decay', 0.04, '--weight_decay_end', 0.4, '--clip_grad', 3.0, '--batch_size_per_gpu', 64, '--epochs', 100, '--freeze_last_layer', 1, '--lr', 0.0005, '--warmup_epochs', 10, '--warmup_teacher_temp_epochs', 0, '--min_lr', 1e-05, '--local_crops_number', 6, '--seed', 0, '--num_workers', 10, '--optimizer', 'adamw', '--momentum_teacher', 0.996, '--use_bn_in_head', False, '--out_dim', 65536, '--drop_path_rate', 0.1, '--global_crops_scale', 0.25, 1.0, '--local_crops_scale', 0.05, 0.25, '--saveckp_freq', 4, '--output_dir', <azureml.data.output_dataset_config.OutputFileDatasetConfig object at 0x7fa6a02b33d0>]


In [12]:
# Everything the run needs, gathered in one place: source folder, entry script,
# its arguments, the compute target, the environment, and the distributed and
# Docker runtime settings.
run_settings = dict(
    source_directory=project_folder,
    script='main_dino.py',
    arguments=args,
    compute_target=compute_target,
    environment=dino_env,
    distributed_job_config=distr_config,
    docker_runtime_config=docker_config,
)
src = ScriptRunConfig(**run_settings)



In [13]:
# Submit the configured run to the experiment.
run = experiment.submit(src)

# Tags attached to the run, applied in this order. Keys are kept exactly as
# originally written, including the 'envoirnment' spelling.
run_tags = {
    "author": "AFS",
    "storage": "premium",
    "envoirnment": 'dino_env',
    "dataset": "download",
    "batch_size_per_gpu": str(batch_size_per_gpu),
    "patch_size": str(patch_size),
    "epochs": str(epochs),
    "ENV": str(curated_env_name),
    "communication_backend": str(communication_backend),
    "gpus": str(process_count),
    "nodes": str(node_count),
}
for tag_key, tag_value in run_tags.items():
    run.tag(tag_key, tag_value)


Submitting /mnt/batch/tasks/shared/LS_root/mounts/clusters/ds3-4cores/code/Users/aghasemi/dino directory for run. The size of the directory >= 25 MB, so it can take a few minutes.


In [14]:
# Print the run summary, then show the run-details widget for it.
print(run)
run_details = RunDetails(run)
run_details.show()

Run(Experiment: exp-ViTS16,
Id: exp-ViTS16_1646437533_f301af75,
Type: azureml.scriptrun,
Status: Queued)


_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': True, 'log_level': 'INFO', 's…