Import all the necessary Python packages

In [None]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt

import azureml.core
from azureml.core import Workspace

# Show which Azure ML SDK version is installed in this kernel.
sdk_version = azureml.core.VERSION
print("Azure ML SDK Version: ", sdk_version)

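We create an Azure Machine Learning workspace object. A workspace is the top-level container for the resources used in this notebook, such as experiments, compute targets, environments and registered models. The workspace details are read from a configuration file (`config.json`) stored on the compute.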

In [None]:
# Build the workspace handle from the config.json file in the current folder,
# then show its name, location and resource group, one value per line.
ws = Workspace.from_config()
for workspace_detail in (ws.name, ws.location, ws.resource_group):
    print(workspace_detail)

We create an experiment to track different runs of the same model or service. A single workspace can have multiple experiments.

In [None]:
from azureml.core import Experiment

# Every run submitted from this notebook is tracked under this experiment name.
experiment_name = 'diabetes-sklearn'
exp = Experiment(
    workspace=ws,
    name=experiment_name,
)

We run our experiments on an Azure Machine Learning compute cluster of virtual machines. The code below checks whether a compute cluster with the given name already exists in the workspace: if it does, it is reused for execution; otherwise a new compute cluster is created.

In [None]:
from azureml.core.compute import AmlCompute
from azureml.core.compute import ComputeTarget
import os

# choose a name for your cluster
compute_name = os.environ.get("AML_COMPUTE_CLUSTER_NAME", "cpu-cluster-agx")

# Environment variables are always strings, so convert the node counts to int;
# otherwise an overridden value would reach the provisioning call as text.
compute_min_nodes = int(os.environ.get("AML_COMPUTE_CLUSTER_MIN_NODES", 0))
compute_max_nodes = int(os.environ.get("AML_COMPUTE_CLUSTER_MAX_NODES", 4))

# This code uses CPU VM. For using GPU VM, set SKU to STANDARD_NC6
vm_size = os.environ.get("AML_COMPUTE_CLUSTER_SKU", "STANDARD_D2_V2")

# Read the workspace's compute targets once and reuse the result for both the
# membership test and the lookup.
existing_targets = ws.compute_targets

if compute_name in existing_targets:
    # A target with this name already exists: reuse it instead of creating one.
    compute_target = existing_targets[compute_name]
    if compute_target and isinstance(compute_target, AmlCompute):
        print("found compute target: " + compute_name)
    else:
        # The name is taken by something that is not an AmlCompute cluster.
        # It is still used (as before), but say so rather than staying silent.
        print("warning: compute target '" + compute_name + "' exists but is not an AmlCompute cluster")
else:
    print("creating new compute target...")
    provisioning_config = AmlCompute.provisioning_configuration(vm_size=vm_size,
                                                                min_nodes=compute_min_nodes,
                                                                max_nodes=compute_max_nodes)

    # create the cluster
    compute_target = ComputeTarget.create(ws, compute_name, provisioning_config)

    # can poll for a minimum number of nodes and for a specific timeout.
    # if no min node count is provided it will use the scale settings for the cluster
    compute_target.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)

    # For a more detailed view of current AmlCompute status, use get_status()
    print(compute_target.get_status().serialize())

In [None]:
import os

# Folder (under the current working directory) that holds the training script
# submitted with the run; it is created if it does not exist yet.
working_dir = os.getcwd()
script_folder = os.path.join(working_dir, "diabetes-sklearn")
os.makedirs(script_folder, exist_ok=True)

In [None]:
%%writefile $script_folder/train.py

# Training script: fits a linear regression on diabetes.csv, thresholds its
# predictions at 0.5 to obtain class labels, logs the accuracy to the current
# Azure ML run and saves the fitted model under outputs/.
import argparse
import os
import numpy as np
import glob
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import joblib

from azureml.core import Run

# No options are defined yet; parsing still rejects unexpected arguments.
parser = argparse.ArgumentParser()
args = parser.parse_args()

# Load the dataset. The path is relative, so diabetes.csv must be in the
# working directory of the run - presumably the submitted script folder; confirm.
data = pd.read_csv("diabetes.csv")
print(data.shape)

# The first 8 columns are the features; everything from column 8 on is the target.
feature_names = data.iloc[:, 0:8].columns
target_name = data.iloc[:1, 8:].columns
data_features = data[feature_names]
data_target = data[target_name]
print(feature_names)
print(target_name)

# Features are converted to float64. Casting them to uint8 would silently drop
# fractional parts and wrap any value above 255, corrupting the training data.
# The target keeps its small integer dtype (assumed to hold 0/1 labels, given
# the 0.5 threshold used below - confirm against the dataset).
np.random.seed(123)
X_train, X_test, y_train, y_test = train_test_split(
    data_features.to_numpy(dtype=np.float64),
    data_target.to_numpy(dtype=np.uint8),
    train_size=0.70,
    test_size=0.30,
    random_state=1,
)
# Flatten the (n, 1) target arrays to 1-D so they compare element-wise with predictions.
y_train = y_train.reshape(-1)
y_test = y_test.reshape(-1)

print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, sep='\n')

# get hold of the current run
run = Run.get_context()

print('Train a linear regression model')
clf = LinearRegression()
clf.fit(X_train, y_train)

print(clf.coef_)
print(clf.intercept_)

print('Predict the test set with size = ', X_test.shape)
y_hat = clf.predict(X_test)
print(y_hat)
# Turn the continuous regression output into class labels: 1 above 0.5, else 0.
y_hat = (y_hat > 0.5).astype(int)
print(y_hat)

# calculate accuracy on the prediction
acc = np.average(y_hat == y_test)
print('Accuracy is', acc)

# np.float was removed from NumPy; the builtin float is the direct replacement.
run.log('accuracy', float(acc))

os.makedirs('outputs', exist_ok=True)
# note file saved in the outputs folder is automatically uploaded into experiment record
joblib.dump(value=clf, filename='outputs/diabetes-sklearn-model.pkl')

In [None]:
from azureml.core.environment import Environment
from azureml.core.conda_dependencies import CondaDependencies

# Packages to install for the training run, split by installer.
pip_requirements = ['azureml-dataset-runtime[pandas,fuse]', 'azureml-defaults']
conda_requirements = ['scikit-learn==0.22.1']

cd = CondaDependencies.create(
    pip_packages=pip_requirements,
    conda_packages=conda_requirements,
)

# Attach the dependency set to a named environment.
env = Environment('diabetes-env')
env.python.conda_dependencies = cd

# Register environment to re-use later
env.register(workspace=ws)

A ScriptRunConfig object is created to specify the configuration of the training job: the training script and its source folder, the script arguments, the environment, and the compute target to use.

In [None]:
from azureml.core import ScriptRunConfig

# train.py defines no command-line options, so no arguments are passed to it.
args = []

# Bundle the script, its folder, the compute target and the environment
# into one run configuration.
src = ScriptRunConfig(
    source_directory=script_folder,
    script='train.py',
    arguments=args,
    compute_target=compute_target,
    environment=env,
)

Once everything is configured, we submit the job to the cluster.

In [None]:
# Submit the configured training job to the experiment; `run` is the handle
# used by the following cells to monitor it and collect its results.
run = exp.submit(config=src)
# Bare expression so the notebook displays the run object.
run

In [None]:
from azureml.widgets import RunDetails

# Display the notebook widget for the submitted run.
run_details_widget = RunDetails(run)
run_details_widget.show()

In [None]:
# Wait for the submitted run to complete before reading its results.
# specify show_output to True for a verbose log
run.wait_for_completion(show_output=True)

In [None]:
# Metrics logged by the training script (train.py logs 'accuracy').
logged_metrics = run.get_metrics()
print(logged_metrics)

In [None]:
# Names of the files recorded for the run.
run_file_names = run.get_file_names()
print(run_file_names)

In [None]:
# Register the model file that the training run saved under outputs/,
# then show the registered model's name, id and version, one per line.
model = run.register_model(
    model_name='diabetes_sklearn',
    model_path='outputs/diabetes-sklearn-model.pkl',
)
for model_detail in (model.name, model.id, model.version):
    print(model_detail)

![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/NotebookVM/tutorials/img-classification-part1-training.png)