In [1]:
import azureml.core

# Record which Azure ML SDK version this notebook was executed with.
print("SDK version: {}".format(azureml.core.VERSION))

SDK version: 1.0.6


In [None]:
from azureml.core.workspace import Workspace

# Build the workspace handle from the local Azure ML configuration.
ws = Workspace.from_config()

# One summary line per attribute; the subscription id is not printed.
workspace_summary = [
    'Workspace name: ' + ws.name,
    'Azure region: ' + ws.location,
    'Resource group: ' + ws.resource_group,
]
print('\n'.join(workspace_summary))

In [3]:
import os

from azureml.core import Datastore

# Register an existing Azure file share as a datastore of the workspace.
# Only need to do it once.
#
# The storage account key is a secret: it is read from the
# AZURE_STORAGE_ACCOUNT_KEY environment variable so that the real key never
# has to be typed into (and saved with) the notebook. When the variable is not
# set, the original placeholder value is used and must be replaced by hand.
storage_account_key = os.environ.get('AZURE_STORAGE_ACCOUNT_KEY',
                                     'your_storage_acc_key')

ds2 = Datastore.register_azure_file_share(workspace=ws,
                                         datastore_name='choose_a_datastore_name',
                                         file_share_name='your_fileshare_name',
                                         account_name='your_storage_acc_name',
                                         account_key=storage_account_key,
                                         # the file share must already exist; it is not created here
                                         create_if_not_exists=False)

In [4]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# Name of the cluster to reuse, or to create when it cannot be found.
cluster_name = "gpucluster"

try:
    # Look the cluster up by name in the workspace.
    compute_target = ComputeTarget(workspace=ws, name=cluster_name)
except ComputeTargetException:
    # The lookup failed, so provision a new cluster under that name.
    print('Creating a new compute target...')
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_NC6',
        max_nodes=4,
    )
    compute_target = ComputeTarget.create(ws, cluster_name, compute_config)

    # Block until provisioning has finished, echoing progress to the cell output.
    compute_target.wait_for_completion(show_output=True)
else:
    print('Found existing compute target')

# The 'status' property gives a detailed status for the current cluster.
print(compute_target.status.serialize())

Found existing compute target
{'allocationState': 'Steady', 'allocationStateTransitionTime': '2019-01-09T14:14:16.649000+00:00', 'creationTime': '2019-01-08T10:23:55.033355+00:00', 'currentNodeCount': 0, 'errors': None, 'modifiedTime': '2019-01-08T10:25:35.793472+00:00', 'nodeStateCounts': {'idleNodeCount': 0, 'leavingNodeCount': 0, 'preemptedNodeCount': 0, 'preparingNodeCount': 0, 'runningNodeCount': 0, 'unusableNodeCount': 0}, 'provisioningState': 'Succeeded', 'provisioningStateTransitionTime': None, 'scaleSettings': {'minNodeCount': 0, 'maxNodeCount': 4, 'nodeIdleTimeBeforeScaleDown': 'PT120S'}, 'targetNodeCount': 0, 'vmPriority': 'Dedicated', 'vmSize': 'STANDARD_NC6'}


In [5]:
import os

# Local directory that is later handed to the estimator as its source directory.
project_folder = './dist-keras-ctscan-exp'

# Create the directory; an already existing directory is not an error.
os.makedirs(name=project_folder, exist_ok=True)

In [20]:
import shutil

# Stage the training script inside the project folder.
# The call is the last expression of the cell, so the destination path it
# returns is shown as the cell output.
shutil.copy(src='dist_keras_ctscan.py', dst=project_folder)

'./dist-keras-ctscan-exp/dist_keras_ctscan.py'

In [7]:
from azureml.core import Experiment

# Experiment in the workspace that the training run is submitted to.
experiment_name = 'dist-keras-tf-exp'
experiment = Experiment(workspace=ws, name=experiment_name)

In [21]:
# Print the staged copy of the training script. There is no leading `%` or `!`,
# so this line relies on IPython's automagic `cat` rather than plain Python.
cat ./dist-keras-ctscan-exp/dist_keras_ctscan.py

from __future__ import print_function
import argparse
import pydicom
from matplotlib import pyplot, cm
import os
import sys
import numpy as np
import pandas as pd
import scipy
import keras
from keras.models import Sequential
from keras.layers import AveragePooling2D , Convolution2D , Flatten ,Dense, MaxPooling2D, Conv2D
from keras.preprocessing import utils
from keras.preprocessing.image import ImageDataGenerator
from keras import backend as K
import math
import tensorflow as tf
import horovod.keras as hvd


# Horovod: initialize Horovod.
hvd.init()

# Horovod: pin GPU to be used to process local rank (one GPU per process)
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
config.gpu_options.visible_device_list = str(hvd.local_rank())
K.set_session(tf.Session(config=config))


# Horovod: adjust number of epochs based on number of GPUs.
epochs = 1


def get_data(dicom_dir):
    #resize the image to desir

In [22]:
# Import only the class this cell uses; a wildcard import would hide where
# names come from and could silently shadow names defined in earlier cells.
from azureml.train.estimator import Estimator

# Command-line arguments handed to the entry script.
script_params = {
    # reference to the registered file-share datastore
    '--data': ds2.path(),
    '--epoch': 1,
    # NOTE(review): '/outputs' is an absolute path, while Azure ML normally
    # collects run artifacts from the relative './outputs' folder; confirm
    # this is what the training script expects.
    '--save_model': '/outputs'
}

# Distributed training job: 2 nodes with 1 process per node, using the MPI
# backend, on the GPU compute target. The listed pip packages are requested
# for the run environment.
estimator = Estimator(source_directory=project_folder,
                      compute_target=compute_target,
                      entry_script='dist_keras_ctscan.py',
                      script_params=script_params,
                      node_count=2,
                      process_count_per_node=1,
                      distributed_backend='mpi',
                      pip_packages=['pydicom', 'tensorflow-gpu', 'keras', 'horovod', 'scikit-image',
                                    'scikit-learn', 'scipy', 'argparse',
                                    'opencv-contrib-python-headless', 'pillow', 'numpy', 'pandas',
                                    'matplotlib'],
                      # optional alternative kept from the original: a custom Docker base image
                      #custom_docker_base_image='zecharpy/tfgpupy3:pydicom',
                      use_gpu=True)

In [23]:
# Submit the estimator to the experiment and keep the returned run handle
# for monitoring in the following cells.
run = experiment.submit(estimator)
# Printing the handle shows the run's experiment, id, type and current status.
print(run)

Run(Experiment: dist-keras-tf-exp,
Id: dist-keras-tf-exp_1547628647461,
Type: azureml.scriptrun,
Status: Queued)


In [24]:
from azureml.widgets import RunDetails

# Build the monitoring widget for the submitted run, then render it.
run_widget = RunDetails(run)
run_widget.show()

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

In [25]:
# Wait for the run to finish while streaming its log files into the cell
# output; the value returned by the call is displayed as the cell result.
run.wait_for_completion(show_output=True)

RunId: dist-keras-tf-exp_1547628647461

Streaming azureml-logs/60_control_log_rank_0.txt

This is an MPI job. Rank:0
Streaming log file azureml-logs/60_control_log_rank_0.txt
Streaming log file azureml-logs/80_driver_log_rank_0.txt

Streaming azureml-logs/80_driver_log_rank_1.txt

Using TensorFlow backend.

Streaming azureml-logs/80_driver_log_rank_0.txt

Using TensorFlow backend.
2019-01-16 09:00:31.498467: I tensorflow/core/platform/cpu_feature_guard.cc:141] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA
2019-01-16 09:00:31.620676: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1432] Found device 0 with properties: 
name: Tesla K80 major: 3 minor: 7 memoryClockRate(GHz): 0.8235
pciBusID: 724f:00:00.0
totalMemory: 11.17GiB freeMemory: 11.10GiB
2019-01-16 09:00:31.620718: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1511] Adding visible gpu devices: 0
2019-01-16 09:00:31.903127: I tensorflow/core/common_runtime/gpu/gpu_device.cc:9

{'runId': 'dist-keras-tf-exp_1547628647461',
 'target': 'gpucluster',
 'status': 'Finalizing',
 'startTimeUtc': '2019-01-16T08:58:06.105893Z',
 'properties': {'azureml.runsource': 'experiment',
  'ContentSnapshotId': '69689924-e351-4896-87b5-4b7d99a58877'},
 'runDefinition': {'Script': 'dist_keras_ctscan.py',
  'Arguments': ['--data',
   '$AZUREML_DATAREFERENCE_ctscands',
   '--epoch',
   '1',
   '--save_model',
   '/outputs'],
  'SourceDirectoryDataStore': None,
  'Framework': 0,
  'Communicator': 5,
  'Target': 'gpucluster',
  'DataReferences': {'ctscands': {'DataStoreName': 'ctscands',
    'Mode': 'Mount',
    'PathOnDataStore': None,
    'PathOnCompute': None,
    'Overwrite': False}},
  'JobName': None,
  'AutoPrepareEnvironment': True,
  'MaxRunDurationSeconds': None,
  'NodeCount': 2,
  'Environment': {'Python': {'InterpreterPath': 'python',
    'UserManagedDependencies': False,
    'CondaDependencies': {'name': 'project_environment',
     'dependencies': ['python=3.6.2',
      