In [9]:
#---------------------------------------
# Create a workspace
#---------------------------------------
import os

import azureml.core
from azureml.core import Workspace

# Print the SDK version so the environment used for this run is recorded
# in the notebook output.
print(azureml.core.VERSION)

# The subscription can be overridden through the environment so the notebook
# does not have to be edited to run against another subscription; the
# original value stays as the default.
subscription_id = os.environ.get('AZURE_SUBSCRIPTION_ID',
                                 '87531c67-f272-45f2-a6fe-c13b28bc4f46')

# exist_ok=True makes this cell safe to re-run (Restart & Run All): when the
# workspace already exists it is returned instead of the call failing.
ws = Workspace.create(name='testworkspace',
            subscription_id=subscription_id,
            resource_group='rgAMLSLearnworkspace',
            create_resource_group = True,
            location='North Europe',
            exist_ok=True
            )

print('AMLS Workspace created')


1.0.74
Deploying KeyVault with name testworkkeyvaulta95ee7b8.
Deploying StorageAccount with name testworkstorage820a0dde7.
Deployed KeyVault with name testworkkeyvaulta95ee7b8. Took 42.28 seconds.
Deployed StorageAccount with name testworkstorage820a0dde7. Took 41.4 seconds.
Deploying AppInsights with name testworkinsightsfc94c563.
Deployed AppInsights with name testworkinsightsfc94c563. Took 76.51 seconds.
Deploying Workspace with name testworkspace.
Deployed Workspace with name testworkspace. Took 27.39 seconds.
AMLS Workspace created


In [10]:
#---------------------------------------
# (1) Create a remote compute target
#---------------------------------------
from azureml.core.compute import AmlCompute
from azureml.core.compute import ComputeTarget
import os

# Step 1: name the cluster and set the minimal and maximal number of nodes.
# Environment variables are always strings, so the node counts are converted
# to int explicitly; otherwise an overridden value would reach the SDK as str.
compute_name = os.environ.get("AML_COMPUTE_CLUSTER_NAME", "cpucluster")
min_nodes = int(os.environ.get("AML_COMPUTE_CLUSTER_MIN_NODES", 0))
max_nodes = int(os.environ.get("AML_COMPUTE_CLUSTER_MAX_NODES", 3))

# Step 2: choose the VM size (SKU) of the cluster nodes
vm_size = os.environ.get("AML_COMPUTE_CLUSTER_SKU", "STANDARD_D2_V2")

provisioning_config = AmlCompute.provisioning_configuration(
    vm_size = vm_size, min_nodes = min_nodes, max_nodes = max_nodes)

# Reuse the cluster when it already exists in the workspace so that the cell
# can be re-run; only create it when it is missing.
if compute_name in ws.compute_targets:
    compute_target = ws.compute_targets[compute_name]
    print('Found existing compute target:', compute_name)
else:
    # create the cluster
    compute_target = ComputeTarget.create(ws, compute_name, provisioning_config)
    print('Compute target created')

Compute target created


In [11]:
#---------------------------------------
# (2) Retrieve the MNIST data
#---------------------------------------
import os
import urllib.request

# create a folder for the dataset
os.makedirs('./data', exist_ok = True)

# (source URL, local file) pairs -- the train and test sets, and their images
# and labels, are published as four separate archives
mnist_downloads = [
    ('http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz', './data/train-images.gz'),
    ('http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz', './data/train-labels.gz'),
    ('http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz', './data/test-images.gz'),
    ('http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz', './data/test-labels.gz'),
]

# fetch every archive into the data directory, in the order listed above
for source_url, local_path in mnist_downloads:
    urllib.request.urlretrieve(source_url, filename=local_path)

print('Done')

Done


In [12]:
#-----------------------------------------
# (3) Load data and create a modeling script
#-----------------------------------------
# The workspace's default datastore receives the local MNIST archives.
ds = ws.get_default_datastore()

# Everything under ./data is uploaded beneath the 'mnist' path of the
# datastore, replacing files that are already there.
upload_options = dict(
    src_dir='./data',
    target_path='mnist',
    overwrite=True,
    show_progress=True,
)
ds.upload(**upload_options)

print('Done')

Uploading an estimated of 4 files
Uploading ./data/test-images.gz
Uploading ./data/test-labels.gz
Uploading ./data/train-images.gz
Uploading ./data/train-labels.gz
Uploaded ./data/test-labels.gz, 1 files out of an estimated total of 4
Uploaded ./data/train-labels.gz, 2 files out of an estimated total of 4
Uploaded ./data/test-images.gz, 3 files out of an estimated total of 4
Uploaded ./data/train-images.gz, 4 files out of an estimated total of 4
Uploaded 4 files
Done


In [13]:
import os

# Directory that holds the training script; later cells write train.py into
# it and pass it to the estimator as the source directory.
folder_training_script = './trial_model_mnist'

# exist_ok keeps the cell re-runnable when the directory is already there
os.makedirs(name=folder_training_script, exist_ok=True)

print('Done')

Done


In [15]:
%%writefile $folder_training_script/train.py

# Let's prepare our model training script
import argparse
import os
import numpy as np
import glob

from sklearn.linear_model import LogisticRegression

# The copy of joblib vendored as sklearn.externals.joblib was deprecated in
# scikit-learn 0.21 and removed in 0.23. Prefer the standalone package and
# fall back to the vendored one so that older scikit-learn images still work.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

from azureml.core import Run
# from utils import load_data

import gzip
import struct

# load compressed MNIST gz files and return numpy arrays
def load_data(filename, label=False):
    """Read a gzip-compressed MNIST IDX file into a 2-D uint8 numpy array.

    Args:
        filename: path of the ``.gz`` archive to read.
        label: True when the archive is a label file, False (default) when
            it is an image file.

    Returns:
        An array of dtype uint8. For images the shape is
        ``(n_items, n_rows * n_cols)`` (one flattened image per row); for
        labels it is ``(n_items, 1)``. The array is built with
        ``np.frombuffer`` on the bytes read from the file, so it is read-only.

    Raises:
        ValueError: if the magic number in the header does not match the
            requested kind of file, or if the archive holds fewer data bytes
            than its header announces.
    """
    # IDX magic numbers: 0x00000801 for label files, 0x00000803 for image files
    expected_magic = 2049 if label else 2051

    with gzip.open(filename) as gz:
        # header fields are stored as big-endian unsigned 32-bit integers
        magic, n_items = struct.unpack('>II', gz.read(8))
        if magic != expected_magic:
            # without this check an image file read with label=True (or the
            # reverse) would silently return header bytes as data
            raise ValueError(
                'unexpected magic number {} in {!r} (expected {})'.format(
                    magic, filename, expected_magic))

        if label:
            item_size = 1
        else:
            n_rows, n_cols = struct.unpack('>II', gz.read(8))
            item_size = n_rows * n_cols

        n_bytes = n_items * item_size
        payload = gz.read(n_bytes)

    if len(payload) != n_bytes:
        raise ValueError(
            'truncated file {!r}: expected {} data bytes, found {}'.format(
                filename, n_bytes, len(payload)))

    return np.frombuffer(payload, dtype=np.uint8).reshape(n_items, item_size)


# let user feed in 2 parameters, the dataset to mount or download, and the regularization rate of the logistic regression model
parser = argparse.ArgumentParser()
# --data-folder is required: without it the path join below would fail with
# an opaque TypeError instead of a clear usage message
parser.add_argument('--data-folder', type=str, dest='data_folder', required=True, help='data folder mounting point')
parser.add_argument('--regularization', type=float, dest='reg', default=0.01, help='regularization rate')
args = parser.parse_args()

# the archives are expected in the 'mnist' sub-folder of the data folder
data_folder = os.path.join(args.data_folder, 'mnist')
print('Data folder:', data_folder)

# load the train and test set into numpy arrays;
# dividing by 255.0 scales the uint8 pixel values into [0, 1]
X_train = load_data(os.path.join(data_folder, 'train-images.gz'), False) / 255.0
X_test = load_data(os.path.join(data_folder, 'test-images.gz'), False) / 255.0

# print the feature set dimensions
print(X_train.shape, X_test.shape, sep = '\n')

# labels come back as a column of shape (n, 1); flatten them to 1-D
y_train = load_data(os.path.join(data_folder, 'train-labels.gz'), True).reshape(-1)
y_test = load_data(os.path.join(data_folder, 'test-labels.gz'), True).reshape(-1)

# print the response variable dimensions
print( y_train.shape, y_test.shape, sep = '\n')

# get hold of the current run
run = Run.get_context()

print('Train a logistic regression model with regularization rate of', args.reg)
# scikit-learn's C is the inverse of the regularization strength
clf = LogisticRegression(C=1.0/args.reg, solver="liblinear", multi_class="auto", random_state=42)
clf.fit(X_train, y_train)

print('Predict the test set')
y_hat = clf.predict(X_test)

# calculate accuracy on the prediction
acc = np.average(y_hat == y_test)
print('Accuracy is', acc)

# log plain Python floats; the np.float alias was deprecated in NumPy 1.20
# and removed in 1.24, where np.float(...) raises AttributeError
run.log('regularization rate', float(args.reg))
run.log('accuracy', float(acc))

os.makedirs('outputs', exist_ok=True)
# note file saved in the outputs folder is automatically uploaded into experiment record
joblib.dump(value=clf, filename='outputs/sklearn_mnist_model.pkl')

Writing ./trial_model_mnist/train.py


In [18]:
from azureml.train.sklearn import SKLearn

# Command-line arguments handed to train.py: the default datastore as a
# mount, and the regularization rate.
script_params = {
    '--data-folder': ds.as_mount(),
    '--regularization': 0.5
}

# Settings of the estimator that runs train.py from the script folder on the
# remote compute target, with scikit-learn requested as a conda package.
estimator_settings = dict(
    source_directory=folder_training_script,
    entry_script='train.py',
    script_params=script_params,
    compute_target=compute_target,
    conda_packages=['scikit-learn'],
)
est = SKLearn(**estimator_settings)



In [19]:
#-----------------------------------------------------------------
# (4) Submit the model, monitor the run, and retrieve the results
#-----------------------------------------------------------------
from azureml.core import Experiment

# The experiment is the named entry in the workspace that the run is
# submitted to.
experiment_name = "amls-learn-experimentnew5"
experiment = Experiment(workspace=ws, name=experiment_name)

print('Experiment created')

Experiment created


In [20]:
# Submit the estimator as a new run of the experiment.
run = experiment.submit(est)
# Leaving `run` as the last expression displays its summary as the cell output.
run

Experiment,Id,Type,Status,Details Page,Docs Page
amls-learn-experimentnew5,amls-learn-experimentnew5_1573818382_022f2bcb,azureml.scriptrun,Starting,Link to Azure Machine Learning studio,Link to Documentation
