In [1]:
from __future__ import print_function

from datetime import datetime
import sys

from azure.storage.file import FileService
import azure.mgmt.batchai.models as models

# utilities.py contains helper functions used by different notebooks
#sys.path.append('../../')
import utilities

# Load the notebook settings from configuration.json and build the Batch AI
# client through the shared helpers in utilities.py. Both `cfg` and `client`
# are used by every later cell.
cfg = utilities.Configuration('configuration.json')
client = utilities.create_batchai_client(cfg)

Keyring cache token has failed: (1783, 'CredWrite', 'The stub received bad data')


In [2]:
# Create the resource group through the utilities helper, then create the
# workspace in it. .result() is called on the create operation so the cell
# presumably blocks until the workspace exists (TODO confirm poller semantics);
# the value is bound to `_` so the cell displays no output.
utilities.create_resource_group(cfg)
_ = client.workspaces.create(cfg.resource_group, cfg.workspace, cfg.location).result()

Keyring cache token has failed: (1783, 'CredWrite', 'The stub received bad data')


In [3]:

# Cluster identity and size. `cluster_name` is reused by the create, get,
# wait and delete cells further down.
cluster_name = 'nc6dsvm'
nodes_count = 2

# Manual scaling with a fixed target node count.
manual_scale = models.ScaleSettings(
    manual=models.ManualScaleSettings(target_node_count=nodes_count)
)

# Data Science VM image for the nodes. To run jobs in a docker image instead,
# drop `virtual_machine_configuration` from the ClusterCreateParameters call
# below.
dsvm_configuration = models.VirtualMachineConfiguration(
    image_reference=models.ImageReference(
        publisher="microsoft-ads",
        offer="linux-data-science-vm-ubuntu",
        sku="linuxdsvmubuntu",
        version="latest",
    )
)

# Admin account on the nodes; empty password / ssh key values from the
# configuration are passed as None.
admin_account = models.UserAccountSettings(
    admin_user_name=cfg.admin,
    admin_user_password=cfg.admin_password or None,
    admin_user_ssh_public_key=cfg.admin_ssh_key or None,
)

parameters = models.ClusterCreateParameters(
    #location=cfg.location,
    vm_size='STANDARD_NC6',
    scale_settings=manual_scale,
    virtual_machine_configuration=dsvm_configuration,
    user_account_settings=admin_account,
)

In [4]:
# Submit the cluster creation request with the parameters built in the
# previous cell and call .result() on the returned operation; the value is
# bound to `_` so the cell displays no output.
_ = client.clusters.create(cfg.resource_group, cfg.workspace, cluster_name, parameters).result()

In [5]:
# Fetch the cluster object and print its status through the utilities helper.
# `cluster` is kept because its `id` is needed when the job parameters are built.
cluster = client.clusters.get(cfg.resource_group, cfg.workspace, cluster_name)
utilities.print_cluster_status(cluster)

Cluster state: steady Target: 2; Allocated: 2; Idle: 2; Unusable: 0; Running: 0; Preparing: 0; Leaving: 0


In [6]:
# Backend selector read by the job-definition, job-naming and log-streaming
# cells below. Those cells check for 'tensorflow' (and 'cntk' when picking the
# stdout file name).
backend = 'tensorflow'

In [7]:
# Build the job definition for the selected backend.
#
# Only the TensorFlow toolkit is configured in this notebook. Any other
# backend value is rejected explicitly: without that check `parameters` would
# still hold the ClusterCreateParameters object built for the cluster and
# would be submitted as a job definition by the next cell.
if backend == 'tensorflow':
    # Working directory of the job on the mounted file share: the 'datasets'
    # share is mounted at 'afs' and the training code lives in its 'maskRCNN'
    # directory.
    job_root = '$AZ_BATCHAI_JOB_MOUNT_ROOT/afs/maskRCNN'

    # Arguments passed to balloon.py on the master; dataset, saved model and
    # logs all live under job_root. The trailing space is kept as-is.
    train_args = ('train --dataset {0}/datasets/microscopy/ --weights imagenet'
                  ' --save_model {0}/ --epochs 10 --logs {0}/ ').format(job_root)

    parameters = models.JobCreateParameters(
        #location=cfg.location,
        cluster=models.ResourceId(id=cluster.id),
        node_count=1,
        # Install the Python packages before the training script starts.
        job_preparation=models.JobPreparation(
            command_line='python -m pip install keras==2.1.6 scikit-image scikit-learn opencv-contrib-python-headless pillow numpy'),
        # Uncomment container_settings only when the cluster is created
        # without the DSVM virtual_machine_configuration (docker image instead).
        #container_settings=models.ContainerSettings(
        #    image_source_registry=models.ImageSourceRegistry(image='tensorflow/tensorflow:1.8.0-gpu-py3')),
        mount_volumes=models.MountVolumes(
            azure_file_shares=[
                models.AzureFileShareReference(
                    account_name=cfg.storage_account_name,
                    credentials=models.AzureStorageCredentialsInfo(
                        account_key=cfg.storage_account_key),
                    # 'datasets' is the name of the Azure file share.
                    azure_file_url='https://{0}.file.core.windows.net/{1}'.format(
                        cfg.storage_account_name, 'datasets'),
                    relative_mount_path='afs')
            ]
        ),
        # stdout/stderr of the job are written under the share's maskRCNN directory.
        std_out_err_path_prefix=job_root,
        tensor_flow_settings=models.TensorFlowSettings(
            python_script_file_path=job_root + '/balloon.py',
            master_command_line_args=train_args))
else:
    raise ValueError(
        "Unsupported backend {!r}: this notebook only defines job parameters "
        "for 'tensorflow'".format(backend))

In [8]:
# Create the experiment that will hold the job and wait for the operation.
experiment_name = 'microscopy_maskRCNN'
experiment_poller = client.experiments.create(
    cfg.resource_group, cfg.workspace, experiment_name)
experiment = experiment_poller.result()

# Job name embeds the backend and the current UTC timestamp.
job_name_format = 'keras_{}_%m_%d_%Y_%H%M%S'.format(backend)
job_name = datetime.utcnow().strftime(job_name_format)

# Submit the job with the parameters built in the previous cell and wait for
# the create operation to return the job object.
job_poller = client.jobs.create(
    cfg.resource_group, cfg.workspace, experiment_name, job_name, parameters)
job = job_poller.result()

print('Created Job {0} in Experiment {1}'.format(job.name, experiment.name))

Created Job keras_tensorflow_09_09_2018_191056 in Experiment microscopy_maskrcnn


In [9]:
# Name of the stdout file to stream, per backend.
stdout_file_by_backend = {
    'tensorflow': 'stdout-wk-0.txt',
    'cntk': 'stdout.txt',
}

# Fail with a clear message for an unknown backend instead of hitting a
# NameError (or silently reusing a stale `read_file` left in the kernel).
if backend not in stdout_file_by_backend:
    raise ValueError(
        "Unsupported backend {!r}; expected one of {}".format(
            backend, sorted(stdout_file_by_backend)))
read_file = stdout_file_by_backend[backend]

# `utilities` is already imported in the first cell, so it is not re-imported
# here. The helper is given the job coordinates, the 'stdouterr' output
# directory id and the file to read.
utilities.wait_for_job_completion(client, cfg.resource_group, cfg.workspace,
                                  experiment_name, job_name, cluster_name, 'stdouterr', read_file)

Cluster state: steady Target: 2; Allocated: 2; Idle: 2; Unusable: 0; Running: 0; Preparing: 0; Leaving: 0
Job state: running ExitCode: None
Waiting for job output to become available...
args command subset is  train
Weights:  imagenet
Logs:  /mnt/batch/tasks/shared/LS_root/jobs/zbatchai/microscopy_maskrcnn/keras_tensorflow_09_09_2018_191056/mounts/afs/maskRCNN/

Configurations:
BACKBONE                       resnet101
BACKBONE_SHAPES                [[16 16]
 [ 8  8]
 [ 4  4]
 [ 2  2]
 [ 1  1]]
BACKBONE_STRIDES               [4, 8, 16, 32, 64]
BATCH_SIZE                     5
BBOX_STD_DEV                   [0.1 0.1 0.2 0.2]
DETECTION_MAX_INSTANCES        100
DETECTION_MIN_CONFIDENCE       0.9
DETECTION_NMS_THRESHOLD        0.3
GPU_COUNT                      1
IMAGES_PER_GPU                 5
IMAGE_MAX_DIM                  64
IMAGE_MIN_DIM                  64
IMAGE_PADDING                  True
IMAGE_SHAPE                    [64 64  3]
LEARNING_MOMENTUM              0.9
LEARNING_RATE    

 2/10 [=====>........................] - ETA: 13s - loss: 2.2452 - rpn_class_loss: 0.1943 - rpn_bbox_loss: 0.6640 - mrcnn_class_loss: 0.0984 - mrcnn_bbox_loss: 0.6416 - mrcnn_mask_loss: 0.6468
Epoch 4/10

 1/10 [==>...........................] - ETA: 15s - loss: 1.9223 - rpn_class_loss: 0.1309 - rpn_bbox_loss: 0.5025 - mrcnn_class_loss: 0.1116 - mrcnn_bbox_loss: 0.5436 - mrcnn_mask_loss: 0.6338
 2/10 [=====>........................] - ETA: 13s - loss: 1.9474 - rpn_class_loss: 0.1309 - rpn_bbox_loss: 0.5209 - mrcnn_class_loss: 0.1183 - mrcnn_bbox_loss: 0.5461 - mrcnn_mask_loss: 0.6312
Epoch 5/10

 1/10 [==>...........................] - ETA: 15s - loss: 1.5612 - rpn_class_loss: 0.1312 - rpn_bbox_loss: 0.2976 - mrcnn_class_loss: 0.0810 - mrcnn_bbox_loss: 0.4400 - mrcnn_mask_loss: 0.6113
 2/10 [=====>........................] - ETA: 13s - loss: 1.4794 - rpn_class_loss: 0.1224 - rpn_bbox_loss: 0.2945 - mrcnn_class_loss: 0.0826 - mrcnn_bbox_loss: 0.3621 - mrcnn_mask_loss: 0.6177
Epoch 6/10


Epoch 7/10

 1/10 [==>...........................] - ETA: 15s - loss: 1.3636 - rpn_class_loss: 0.1003 - rpn_bbox_loss: 0.3432 - mrcnn_class_loss: 0.0616 - mrcnn_bbox_loss: 0.2464 - mrcnn_mask_loss: 0.6121
 2/10 [=====>........................] - ETA: 13s - loss: 1.3409 - rpn_class_loss: 0.1050 - rpn_bbox_loss: 0.2631 - mrcnn_class_loss: 0.0721 - mrcnn_bbox_loss: 0.3003 - mrcnn_mask_loss: 0.6004
Epoch 8/10

 1/10 [==>...........................] - ETA: 15s - loss: 1.1589 - rpn_class_loss: 0.0866 - rpn_bbox_loss: 0.1712 - mrcnn_class_loss: 0.0662 - mrcnn_bbox_loss: 0.2192 - mrcnn_mask_loss: 0.6157
 2/10 [=====>........................] - ETA: 13s - loss: 1.1684 - rpn_class_loss: 0.0791 - rpn_bbox_loss: 0.2364 - mrcnn_class_loss: 0.0563 - mrcnn_bbox_loss: 0.1972 - mrcnn_mask_loss: 0.5995
Epoch 9/10

 1/10 [==>...........................] - ETA: 16s - loss: 1.3764 - rpn_class_loss: 0.0796 - rpn_bbox_loss: 0.2881 - mrcnn_class_loss: 0.0752 - mrcnn_bbox_loss: 0.3269 - mrcnn_mask_loss: 0.6067

Job state: succeeded ExitCode: 0


In [10]:
# List what the job left in the 'maskRCNN' directory of the 'datasets' file
# share. FileService is already imported in the first cell.
file_service = FileService(cfg.storage_account_name, cfg.storage_account_key)

# Pass the share and the directory as separate arguments rather than a single
# slash-joined string in the share_name position.
generator = file_service.list_directories_and_files('datasets', 'maskRCNN')
for file_or_dir in generator:
    print(file_or_dir.name, file_or_dir.metadata)

balloon.py None
coco.py None
config.py None
demo.ipynb None
inspect_balloon_model.ipynb None
inspect_sarcomas_data.ipynb None
inspect_weights.ipynb None
microscopy.h5 None
model.py None
parallel_model.py None
Sarcomas_model.h5 None
shapes.py None
train_shapes.ipynb None
utils.py None
visualize.py None
.ipynb_checkpoints None
60be94cf-bd71-4d05-b7ce-d05fb5968d66 None
datasets None
logs None
samples None
sarcomas20180909T1758 None
sarcomas20180909T1814 None
sarcomas20180909T1911 None
__pycache__ None


In [11]:
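# Optional: download a trained model file from the file share to the local machine.
# NOTE(review): the share/directory/file names below ('bloodcell/ChestCTscan/', ...) look
# copied from a different project; this notebook uses the 'datasets' share - adjust before uncommenting.
#file_service.get_file_to_path('bloodcell/ChestCTscan/',None ,'ChestCTscan_epoch200.h5','model200epoch.h5')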

In [12]:
# Clean up: request deletion of the job. Unlike the create calls above,
# .result() is not called here, so the cell does not wait on the returned value.
_ = client.jobs.delete(cfg.resource_group, cfg.workspace, experiment_name, job_name)

In [13]:
# Clean up: request deletion of the cluster. .result() is not called, so the
# cell does not wait on the returned value.
_ = client.clusters.delete(cfg.resource_group, cfg.workspace, cluster_name)