In [1]:
from __future__ import print_function

from datetime import datetime
import sys

from azure.storage.file import FileService
import azure.mgmt.batchai.models as models

# utilities.py contains helper functions used by different notebooks
#sys.path.append('../../')
import utilities

cfg = utilities.Configuration('configuration.json')
client = utilities.create_batchai_client(cfg)

Keyring cache token has failed: (1783, 'CredWrite', 'The stub received bad data')


In [2]:
utilities.create_resource_group(cfg)
_ = client.workspaces.create(cfg.resource_group, cfg.workspace, cfg.location).result()

Keyring cache token has failed: (1783, 'CredWrite', 'The stub received bad data')


In [3]:
nodes_count = 2
cluster_name = 'nc6'

parameters = models.ClusterCreateParameters(
    location=cfg.location,
    vm_size='STANDARD_NC6',
    scale_settings=models.ScaleSettings(
        manual=models.ManualScaleSettings(target_node_count=nodes_count)
    ),
    user_account_settings=models.UserAccountSettings(
        admin_user_name=cfg.admin,
        admin_user_password=cfg.admin_password or None,
        admin_user_ssh_public_key=cfg.admin_ssh_key or None,
    )
)

location is not a known attribute of class <class 'azure.mgmt.batchai.models.cluster_create_parameters_py3.ClusterCreateParameters'> and will be ignored


In [4]:
_ = client.clusters.create(cfg.resource_group, cfg.workspace, cluster_name, parameters).result()

In [5]:
cluster = client.clusters.get(cfg.resource_group, cfg.workspace, cluster_name)
utilities.print_cluster_status(cluster)

Cluster state: steady Target: 2; Allocated: 2; Idle: 2; Unusable: 0; Running: 0; Preparing: 0; Leaving: 0


In [9]:
azure_file_share = 'afs'
parameters = models.JobCreateParameters(
     #location=cfg.location,
     cluster=models.ResourceId(id=cluster.id),
     node_count=2,
     std_out_err_path_prefix="$AZ_BATCHAI_JOB_MOUNT_ROOT/{0}".format(azure_file_share),
     mount_volumes=models.MountVolumes(
            azure_file_shares=[
                models.AzureFileShareReference(
                    account_name=cfg.storage_account_name,
                    credentials=models.AzureStorageCredentialsInfo(
                        account_key=cfg.storage_account_key),
                    azure_file_url='https://{0}.file.core.windows.net/{1}'.format(
                        cfg.storage_account_name, 'bloodcell'),
                    relative_mount_path=azure_file_share)
            ]
        ), 
     container_settings=models.ContainerSettings(
          image_source_registry=models.ImageSourceRegistry(image='pytorch/pytorch:0.4_cuda9_cudnn7')),
     py_torch_settings = models.PyTorchSettings(
         python_script_file_path='$AZ_BATCHAI_JOB_MOUNT_ROOT/{0}/{1}/gan.py'.format(azure_file_share, 'gans'),
         command_line_args='--data $AZ_BATCHAI_JOB_MOUNT_ROOT/afs/gans/data/ --result $AZ_BATCHAI_JOB_MOUNT_ROOT/afs/gans/output --epoch 1 --model_path $AZ_BATCHAI_JOB_MOUNT_ROOT/afs/gans/saved_model/',
         communication_backend='gloo'))

In [10]:
experiment_name = 'pytorch_experiment'
experiment = client.experiments.create(cfg.resource_group, cfg.workspace, experiment_name).result()
job_name = datetime.utcnow().strftime('pytorch_%m_%d_%Y_%H%M%S')
job = client.jobs.create(cfg.resource_group, cfg.workspace, experiment_name, job_name, parameters).result()
print('Created Job {0} in Experiment {1}'.format(job.name, experiment.name))

Created Job pytorch_07_06_2018_120524 in Experiment pytorch_experiment


In [11]:
utilities.wait_for_job_completion(client, cfg.resource_group, cfg.workspace, 
                                  experiment_name, job_name, cluster_name,'stdouterr', 'stdout-0.txt')

Cluster state: steady Target: 2; Allocated: 2; Idle: 0; Unusable: 0; Running: 2; Preparing: 0; Leaving: 0
Job state: running ExitCode: None
Waiting for job output to become available...
Files already downloaded and verified
[0/1][0/782] Loss_D: 1.6274 Loss_G: 4.2939
[0/1][1/782] Loss_D: 1.3343 Loss_G: 6.1785
[0/1][2/782] Loss_D: 0.8022 Loss_G: 6.1877
[0/1][3/782] Loss_D: 0.7474 Loss_G: 5.7476
[0/1][4/782] Loss_D: 0.9917 Loss_G: 7.1261
[0/1][5/782] Loss_D: 0.8377 Loss_G: 6.9019
[0/1][6/782] Loss_D: 0.9394 Loss_G: 6.3998
[0/1][7/782] Loss_D: 0.6853 Loss_G: 8.5311
[0/1][8/782] Loss_D: 0.4510 Loss_G: 8.3736
[0/1][9/782] Loss_D: 0.7114 Loss_G: 9.4136
[0/1][10/782] Loss_D: 0.6751 Loss_G: 9.1559
[0/1][11/782] Loss_D: 0.6312 Loss_G: 10.0657
[0/1][12/782] Loss_D: 0.5034 Loss_G: 8.2565
[0/1][13/782] Loss_D: 1.1764 Loss_G: 13.7405
[0/1][14/782] Loss_D: 0.7809 Loss_G: 10.0186
[0/1][15/782] Loss_D: 0.4193 Loss_G: 9.3496
[0/1][16/782] Loss_D: 0.9187 Loss_G: 14.0149
[0/1][17/782] Loss_D: 0.3208 Loss_

[0/1][179/782] Loss_D: 0.1580 Loss_G: 5.1888
[0/1][180/782] Loss_D: 0.3258 Loss_G: 7.4804
[0/1][181/782] Loss_D: 0.3895 Loss_G: 4.6344
[0/1][182/782] Loss_D: 0.3753 Loss_G: 9.7142
[0/1][183/782] Loss_D: 0.2903 Loss_G: 7.2297
[0/1][184/782] Loss_D: 0.0884 Loss_G: 6.5268
[0/1][185/782] Loss_D: 0.2343 Loss_G: 7.9627
[0/1][186/782] Loss_D: 0.1838 Loss_G: 6.8893
[0/1][187/782] Loss_D: 0.1808 Loss_G: 5.9103
[0/1][188/782] Loss_D: 0.2147 Loss_G: 6.7284
[0/1][189/782] Loss_D: 0.1586 Loss_G: 6.5104
[0/1][190/782] Loss_D: 0.1137 Loss_G: 6.3454
[0/1][191/782] Loss_D: 0.1874 Loss_G: 5.4201
[0/1][192/782] Loss_D: 0.2004 Loss_G: 5.6003
[0/1][193/782] Loss_D: 0.1180 Loss_G: 6.1221
[0/1][194/782] Loss_D: 0.2185 Loss_G: 4.9555
[0/1][195/782] Loss_D: 0.3021 Loss_G: 10.5053
[0/1][196/782] Loss_D: 0.2237 Loss_G: 9.2143
[0/1][197/782] Loss_D: 0.1085 Loss_G: 6.3717
[0/1][198/782] Loss_D: 0.1075 Loss_G: 4.8385
[0/1][199/782] Loss_D: 0.2770 Loss_G: 9.5217
[0/1][200/782] Loss_D: 0.1069 Loss_G: 8.9767
[0/1][201

[0/1][361/782] Loss_D: 0.5257 Loss_G: 3.2797
[0/1][362/782] Loss_D: 0.5595 Loss_G: 3.6181
[0/1][363/782] Loss_D: 0.8240 Loss_G: 2.3005
[0/1][364/782] Loss_D: 0.8799 Loss_G: 7.6350
[0/1][365/782] Loss_D: 1.7874 Loss_G: 2.1606
[0/1][366/782] Loss_D: 0.9549 Loss_G: 4.7882
[0/1][367/782] Loss_D: 1.4598 Loss_G: 3.5105
[0/1][368/782] Loss_D: 0.7651 Loss_G: 4.6186
[0/1][369/782] Loss_D: 0.5488 Loss_G: 2.8362
[0/1][370/782] Loss_D: 0.7846 Loss_G: 7.0497
[0/1][371/782] Loss_D: 1.2714 Loss_G: 2.4134
[0/1][372/782] Loss_D: 0.7683 Loss_G: 5.8591
[0/1][373/782] Loss_D: 0.5057 Loss_G: 3.3221
[0/1][374/782] Loss_D: 0.5089 Loss_G: 3.3484
[0/1][375/782] Loss_D: 0.4768 Loss_G: 5.0421
[0/1][376/782] Loss_D: 0.6507 Loss_G: 2.8540
[0/1][377/782] Loss_D: 0.6408 Loss_G: 4.6383
[0/1][378/782] Loss_D: 0.6814 Loss_G: 2.4300
[0/1][379/782] Loss_D: 0.6692 Loss_G: 4.8547
[0/1][380/782] Loss_D: 0.5270 Loss_G: 3.8762
[0/1][381/782] Loss_D: 0.4155 Loss_G: 3.5598
[0/1][382/782] Loss_D: 0.6149 Loss_G: 4.0354
[0/1][383/

[0/1][543/782] Loss_D: 0.1854 Loss_G: 3.7705
[0/1][544/782] Loss_D: 0.2248 Loss_G: 4.1196
[0/1][545/782] Loss_D: 0.1259 Loss_G: 4.6195
[0/1][546/782] Loss_D: 0.1949 Loss_G: 3.8642
[0/1][547/782] Loss_D: 0.1585 Loss_G: 4.7033
[0/1][548/782] Loss_D: 0.1214 Loss_G: 4.6883
[0/1][549/782] Loss_D: 0.1376 Loss_G: 4.4405
[0/1][550/782] Loss_D: 0.1045 Loss_G: 4.5657
[0/1][551/782] Loss_D: 0.2043 Loss_G: 5.1943
[0/1][552/782] Loss_D: 0.1682 Loss_G: 5.4402
[0/1][553/782] Loss_D: 0.2820 Loss_G: 4.0802
[0/1][554/782] Loss_D: 0.5837 Loss_G: 7.0607
[0/1][555/782] Loss_D: 0.4744 Loss_G: 4.1358
[0/1][556/782] Loss_D: 0.7905 Loss_G: 10.1122
[0/1][557/782] Loss_D: 1.1575 Loss_G: 4.4846
[0/1][558/782] Loss_D: 1.1339 Loss_G: 10.2608
[0/1][559/782] Loss_D: 0.5657 Loss_G: 7.5546
[0/1][560/782] Loss_D: 0.1774 Loss_G: 4.2393
[0/1][561/782] Loss_D: 0.9276 Loss_G: 9.1488
[0/1][562/782] Loss_D: 1.1121 Loss_G: 5.2790
[0/1][563/782] Loss_D: 0.2001 Loss_G: 4.5878
[0/1][564/782] Loss_D: 0.5864 Loss_G: 4.5896
[0/1][56

[0/1][725/782] Loss_D: 0.1649 Loss_G: 6.4172
[0/1][726/782] Loss_D: 0.2723 Loss_G: 4.9231
[0/1][727/782] Loss_D: 0.1822 Loss_G: 3.2724
[0/1][728/782] Loss_D: 0.5678 Loss_G: 4.6663
[0/1][729/782] Loss_D: 0.4660 Loss_G: 4.9954
[0/1][730/782] Loss_D: 0.4430 Loss_G: 2.7776
[0/1][731/782] Loss_D: 0.5558 Loss_G: 5.8589
[0/1][732/782] Loss_D: 0.6236 Loss_G: 2.6891
[0/1][733/782] Loss_D: 0.7915 Loss_G: 7.6380
[0/1][734/782] Loss_D: 0.9256 Loss_G: 4.4885
[0/1][735/782] Loss_D: 0.2290 Loss_G: 3.9426
[0/1][736/782] Loss_D: 0.3472 Loss_G: 6.8906
[0/1][737/782] Loss_D: 0.1715 Loss_G: 5.7005
[0/1][738/782] Loss_D: 0.2703 Loss_G: 3.7478
[0/1][739/782] Loss_D: 0.5278 Loss_G: 6.4739
[0/1][740/782] Loss_D: 0.7232 Loss_G: 1.7880
[0/1][741/782] Loss_D: 1.5406 Loss_G: 9.4184
[0/1][742/782] Loss_D: 2.1855 Loss_G: 0.7920
[0/1][743/782] Loss_D: 2.0922 Loss_G: 8.5793
[0/1][744/782] Loss_D: 2.1902 Loss_G: 3.8452
[0/1][745/782] Loss_D: 0.4099 Loss_G: 2.5268
[0/1][746/782] Loss_D: 0.5249 Loss_G: 4.4524
[0/1][747/

In [12]:
files = client.jobs.list_output_files(cfg.resource_group, cfg.workspace, experiment_name, job_name,
                                      models.JobsListOutputFilesOptions(outputdirectoryid='stdouterr')) 
for f in list(files):
    print(f.name, f.download_url or 'directory')

execution-tvm-3657382398_1-20180705t070726z.log https://zbatchaistorage.file.core.windows.net/bloodcell/60be94cf-bd71-4d05-b7ce-d05fb5968d66/batchairg/workspaces/zbatchai/experiments/pytorch_experiment/jobs/pytorch_07_06_2018_120524/7b8f0692-4e67-4bf9-b664-d0f6f8b74f0a/stdouterr/execution-tvm-3657382398_1-20180705t070726z.log?sv=2016-05-31&sr=f&sig=ndKhUz8oE3ObPciQmN%2BiIftgbWgsIakjdtci%2B%2FvrBf0%3D&se=2018-07-06T13%3A35%3A57Z&sp=rl
execution-tvm-3657382398_2-20180705t070726z.log https://zbatchaistorage.file.core.windows.net/bloodcell/60be94cf-bd71-4d05-b7ce-d05fb5968d66/batchairg/workspaces/zbatchai/experiments/pytorch_experiment/jobs/pytorch_07_06_2018_120524/7b8f0692-4e67-4bf9-b664-d0f6f8b74f0a/stdouterr/execution-tvm-3657382398_2-20180705t070726z.log?sv=2016-05-31&sr=f&sig=JAtjpAU0wBhztu461hh29oGSEyhKVsWmC8H0fBbO3VE%3D&se=2018-07-06T13%3A35%3A57Z&sp=rl
stderr-0.txt https://zbatchaistorage.file.core.windows.net/bloodcell/60be94cf-bd71-4d05-b7ce-d05fb5968d66/batchairg/workspaces/zba

In [13]:
_ = client.jobs.delete(cfg.resource_group, cfg.workspace, experiment_name, job_name)

In [14]:
_ = client.clusters.delete(cfg.resource_group, cfg.workspace, cluster_name)