In [1]:
import os
import numpy as np
import pandas as pd 

In [2]:
import azureml.core
from azureml.core import Workspace, Datastore
# Show the installed Azure ML SDK version so the environment used for this
# notebook is recorded alongside its outputs.
print(azureml.core.VERSION)

1.0.33


In [3]:
# Location of the workspace config.json. The default is the original local
# path; set AML_CONFIG_PATH to run the notebook from another machine without
# editing this cell.
config_path = os.environ.get(
    "AML_CONFIG_PATH",
    r'C:\Users\casocha\Downloads\histopathologic-cancer-detection\config.json',
)
ws = Workspace.from_config(config_path)

# Default datastore of the workspace; later cells reference the 'Cancer' path on it.
ds = ws.get_default_datastore()

In [4]:
from azureml.core import Experiment

# Experiment handle in workspace `ws`; the run submitted later in this
# notebook is created through it.
experiment_name = 'Cancer'
exp = Experiment(ws, experiment_name)

In [None]:
from azureml.core.compute import AmlCompute
from azureml.core.compute import ComputeTarget
import os

# Cluster settings. Each value can be overridden through an environment variable.
compute_name = os.environ.get("AML_COMPUTE_CLUSTER_NAME", "carter")
# Environment variables are always strings, so convert explicitly: the
# provisioning configuration needs integer node counts.
compute_min_nodes = int(os.environ.get("AML_COMPUTE_CLUSTER_MIN_NODES", 0))
compute_max_nodes = int(os.environ.get("AML_COMPUTE_CLUSTER_MAX_NODES", 6))

# This example uses CPU VM. For using GPU VM, set SKU to STANDARD_NC6
vm_size = os.environ.get("AML_COMPUTE_CLUSTER_SKU", "Standard_DS5_v2")


if compute_name in ws.compute_targets:
    # Reuse a compute target that already exists under this name.
    compute_target = ws.compute_targets[compute_name]
    if isinstance(compute_target, AmlCompute):
        print('found compute target. just use it. ' + compute_name)
    else:
        # Previously this case passed silently; surface it so a wrong target
        # type is noticed before a run is submitted to it.
        print('warning: compute target ' + compute_name + ' exists but is not an AmlCompute cluster')
else:
    print('creating a new compute target...')
    provisioning_config = AmlCompute.provisioning_configuration(vm_size=vm_size,
                                                                min_nodes=compute_min_nodes,
                                                                max_nodes=compute_max_nodes)

    # create the cluster
    compute_target = ComputeTarget.create(ws, compute_name, provisioning_config)

    # can poll for a minimum number of nodes and for a specific timeout.
    # if no min node count is provided it will use the scale settings for the cluster
    compute_target.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)

    # For a more detailed view of current AmlCompute status, use get_status()
    print(compute_target.get_status().serialize())

In [5]:
# Look up the existing cluster by name. Honour the same AML_COMPUTE_CLUSTER_NAME
# override (default "carter") as the provisioning cell so the two cannot diverge.
compute_target = ws.compute_targets[os.environ.get("AML_COMPUTE_CLUSTER_NAME", "carter")]

In [6]:
import os

# Staging directory (under the current working directory) that receives the
# training script; it is created on first use and reused afterwards.
working_dir = os.getcwd()
script_folder = os.path.join(working_dir, "test")
os.makedirs(script_folder, exist_ok=True)

In [7]:
%%writefile $script_folder/train.py

import argparse
import os
import numpy as np
import pandas as pd
from keras_preprocessing.image import ImageDataGenerator
from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten, Dropout
from keras.layers import Conv2D, MaxPooling2D
from keras import regularizers, optimizers
from azureml.core import Run

# Command-line interface of the training script: the mount point of the data
# files (from the datastore) and a regularization rate.
# NOTE(review): `--regularization` is parsed into args.reg, but nothing in the
# visible script reads it — confirm whether it should feed the model.
parser = argparse.ArgumentParser()
parser.add_argument(
    '--data-folder',
    dest='data_folder',
    type=str,
    help='data folder mounting point',
)
parser.add_argument(
    '--regularization',
    dest='reg',
    type=float,
    default=0.01,
    help='regularization rate',
)
args = parser.parse_args()

data_folder = args.data_folder
print('Data folder:', data_folder)

def append_ext(fn):
    """Return the image file name for id `fn` by appending the .tif extension."""
    return fn + ".tif"


# Training labels: the `id` column is turned into file names and `label` is
# cast to str before the frame is handed to flow_from_dataframe.
df = pd.read_csv(os.path.join(data_folder, 'train_labels.csv'))
df["id"] = df["id"].apply(append_ext)
df["label"] = df["label"].astype(str)

# Training and validation images live in the same folder; the split between
# them is made by the generator's validation_split/subset settings.
train_path = os.path.join(data_folder, 'train')
valid_path = os.path.join(data_folder, 'train')

# Test ids come from the sample submission file; its label column is dropped.
testdf = pd.read_csv(os.path.join(data_folder, 'sample_submission.csv'), dtype=str)
# Keep bare file names here (same convention as the training frame) and let
# `directory` supply the folder, instead of pre-joining the folder into each id.
testdf["id"] = testdf["id"].apply(append_ext)
testdf = testdf.drop(['label'], axis=1)

test_path = os.path.join(data_folder, 'test')

test_datagen = ImageDataGenerator(rescale=1./255)

# Unlabelled, unshuffled generator so outputs keep the order of testdf.
test_generator = test_datagen.flow_from_dataframe(
    dataframe=testdf,
    # Was the string literal "test_path", i.e. a folder literally named
    # test_path, rather than the variable holding the real test folder.
    directory=test_path,
    x_col="id",
    y_col=None,
    batch_size=32,
    seed=42,
    shuffle=False,
    class_mode=None,
    target_size=(96, 96))

# Only rescaling is active. Augmentation options tried earlier and currently
# disabled: horizontal_flip, vertical_flip, brightness_range=[0.5, 1.5],
# fill_mode='reflect', rotation_range=15, shear_range=0.2, zoom_range=0.2.
# validation_split reserves 15% of the rows for the 'validation' subset.
train_datagen = ImageDataGenerator(
    rescale=1./255,
    validation_split=0.15
)

# `has_ext=False` was removed from both calls: df["id"] already carries the
# ".tif" extension, so the file names must be matched with their extension.
train_generator = train_datagen.flow_from_dataframe(
    dataframe=df,
    directory=train_path,
    x_col='id',
    y_col='label',
    subset='training',
    target_size=(96, 96),
    batch_size=64,
    class_mode='binary'
)

validation_generator = train_datagen.flow_from_dataframe(
    dataframe=df,
    directory=valid_path,
    x_col='id',
    y_col='label',
    subset='validation',  # This is the trick to properly separate train and validation dataset
    target_size=(96, 96),
    batch_size=64,
    shuffle=False,
    class_mode='binary'
)



# Small CNN for 96x96 RGB inputs: two stacks of three 3x3 convolutions, each
# followed by 2x2 max pooling, then a dense head with a 2-way softmax.
# Dropout layers are currently not used.
# TODO: FIX THIS / Design a deeper smaller architecture after converting
model = Sequential()

# Stack 1: 32 filters.
model.add(Conv2D(32, (3, 3), padding='same', input_shape=(96, 96, 3)))
model.add(Activation('relu'))
for _ in range(2):
    model.add(Conv2D(32, (3, 3), padding='same'))
    model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))

# Stack 2: 64 filters.
for _ in range(3):
    model.add(Conv2D(64, (3, 3), padding='same'))
    model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))

# Classifier head.
model.add(Flatten())
model.add(Dense(128))
model.add(Activation('relu'))
model.add(Dense(2, activation='softmax'))

# The 2-unit softmax is paired with sparse_categorical_crossentropy, which
# takes class indices as targets.
model.compile(optimizer=optimizers.RMSprop(lr=0.01, decay=1e-6),
              loss="sparse_categorical_crossentropy",
              metrics=["accuracy"])

# summary() prints the table itself and returns None, so call it once and do
# not wrap it in print().
model.summary()

# Number of full batches per pass over each subset.
STEP_SIZE_TRAIN = train_generator.n // train_generator.batch_size
STEP_SIZE_VALID = validation_generator.n // validation_generator.batch_size

model.fit_generator(generator=train_generator,
                    steps_per_epoch=STEP_SIZE_TRAIN,
                    validation_data=validation_generator,
                    validation_steps=STEP_SIZE_VALID,
                    epochs=10
)

# Evaluate on the validation subset. The original referenced an undefined
# `valid_generator` and used the test step count for the validation data.
# The model is compiled with one metric (accuracy), so two values come back.
val_loss, val_acc = model.evaluate_generator(generator=validation_generator,
                                             steps=STEP_SIZE_VALID)
print('Validation loss:', val_loss)
print('Validation accuracy:', val_acc)

# Record the metrics on the Azure ML run so they appear in the experiment record.
run = Run.get_context()
run.log('val_loss', float(val_loss))
run.log('val_accuracy', float(val_acc))

os.makedirs('outputs', exist_ok=True)
# note file saved in the outputs folder is automatically uploaded into experiment record
# `joblib` was never imported in this script; use Keras' own HDF5 serialization
# instead. NOTE: the artifact is now outputs/cancer_model.h5 (was cancer_model.pkl).
model.save('outputs/cancer_model.h5')

Overwriting C:\Users\casocha\CNN\test/train.py


In [8]:
from azureml.train.estimator import Estimator

# Arguments forwarded to train.py: the 'Cancer' path of the datastore mounted
# on the compute node, and the regularization rate.
script_params = {
    '--data-folder': ds.path('Cancer').as_mount(),
    '--regularization': 0.8
}

# train.py contains no distributed-training code, so requesting 4 nodes with
# the MPI backend would only start the same training once per node. Run it as
# a plain single-node job instead.
est = Estimator(source_directory=script_folder,
                script_params=script_params,
                compute_target=compute_target,
                node_count=1,
                entry_script='train.py',
                conda_packages=['keras','pandas','Pillow'])



In [9]:
# Submit the estimator to the experiment; ending the cell with the run handle
# renders its summary (id, status, portal link) as the cell output.
run = exp.submit(est)
run

Experiment,Id,Type,Status,Details Page,Docs Page
Cancer,Cancer_1557503669_1775c16b,azureml.scriptrun,Running,Link to Azure Portal,Link to Documentation


In [11]:
from azureml.widgets import RunDetails

# Show the details widget for the submitted run inside the notebook.
run_widget = RunDetails(run)
run_widget.show()

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…