
### Install dependencies

In [None]:
!pip install azureml-sdk
!pip install azureml-train-automl
!pip install pandas
!pip install seaborn

### Declare constants

In [49]:
# Imported here as well as in the main import cell: this cell sits above the
# import cell, so on a fresh kernel run top-to-bottom `os` would otherwise be
# undefined when the working directory is created.
import os

# Placeholders: replace each '{...}' value with your own Azure resource names.
subscription_id = '{subscription-id}'
resource_group  = '{resource-group}'
workspace_name  = '{workspace-name}'
experiment_name = '{experiment-name}'
project_folder = './contoso-aml'
cluster_name = '{aml-compute-cluster-name}'
aks_cluster_name = '{aks-cluster-name}'
aks_service_name ='{aks_service_name}'

# Working directory (exist_ok makes the cell safe to re-run)
os.makedirs(project_folder, exist_ok=True)

### Import open source Python libraries

In [1]:
import logging
import os
import random
import re
import lightgbm
import pandas as pd
import numpy as np
import json
import csv
from shutil import copy2
from matplotlib import pyplot as plt
from matplotlib.pyplot import imshow
from sklearn import datasets
import seaborn as sns
# color_codes is a boolean flag; pass a real bool instead of the string 'True'.
sns.set(color_codes=True)

### Import Azure Machine Learning Python SDK

In [2]:
import azureml.core
from azureml.core import Workspace
from azureml.core.experiment import Experiment
from azureml.core.workspace import Workspace
from azureml.core.compute import AksCompute, ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.core.conda_dependencies import CondaDependencies 
from azureml.core.datastore import Datastore
from azureml.core.dataset import Dataset
from azureml.core.webservice import Webservice, AksWebservice, AciWebservice
from azureml.core.image import ContainerImage, Image
from azureml.core.model import Model
from azureml.core.runconfig import DataReferenceConfiguration, RunConfiguration
from azureml.train.automl import AutoMLConfig
from azureml.train.automl.run import AutoMLRun
from azureml.widgets import RunDetails
from azureml.core.model import InferenceConfig
from azureml.data.data_reference import DataReference
from azureml.pipeline.core import Pipeline, PipelineData, PublishedPipeline, PipelineRun, Schedule, TrainingOutput
from azureml.pipeline.core.graph import PipelineParameter
from azureml.pipeline.steps import PythonScriptStep
from azureml.train.automl import AutoMLConfig, AutoMLStep
from azureml.train.automl.automlexplainer import retrieve_model_explanation


### Connect to Azure Workspace

In [42]:
# Connect to the workspace named in the constants cell, write its details to a
# config file, then reload the workspace from that config.
try:
    ws = Workspace(subscription_id=subscription_id,
                   resource_group=resource_group,
                   workspace_name=workspace_name)
    ws.write_config()
    ws = Workspace.from_config()
    print('Library configuration succeeded')
except Exception as err:
    # Catch Exception instead of using a bare `except:` so kernel interrupts
    # are not swallowed, and report the underlying cause rather than hiding it
    # behind a generic message.
    print('Workspace not found')
    print('Reason: {}'.format(err))
Library configuration succeeded


### Create an Azure AutoML Workspace Experiment

In [43]:
# Experiment handle under which the AutoML runs are submitted.
experiment = Experiment(ws, experiment_name)

# Collect the key facts about this session so they can be shown as one table.
output = {
    'SDK version': azureml.core.VERSION,
    'Subscription ID': ws.subscription_id,
    'Workspace': ws.name,
    'Experiment': experiment.name,
    'Resource Group': ws.resource_group,
    'Location': ws.location,
    'Project Directory': project_folder,
}

# None means "never truncate cell contents". The old spelling (-1) has been
# deprecated since pandas 1.0 and is rejected by newer releases.
pd.set_option('display.max_colwidth', None)
pd.DataFrame(data=output, index=['']).T


Unnamed: 0,Unnamed: 1
SDK version,1.0.43
Subscription ID,2a779d6f-0806-4359-a6e8-f1fd57bb5dd7
Workspace,contoso-workspace
Experiment,contoso-aml
Resource Group,devintersection-2018-aml-demo
Location,westus2
Project Directory,./contoso-aml


### Get DataStore and list DataSets (Azure Storage, Azure Data Lake, Azure SQL and more)

In [44]:
# Default datastore of the workspace; reused by later cells.
ds = ws.get_default_datastore()

# Print the name of every dataset registered in the workspace, one per line.
registered_names = [registered.name for registered in Dataset.list(ws)]
for registered_name in registered_names:
    print(registered_name)

vehicle_testing
vehicle_training_simple
vehicle_training
vehicle_data


In [11]:
# Look up the registered datasets used for training and for scoring.
registered_datasets = ws.datasets
train_data = registered_datasets['vehicle_training']
test_data = registered_datasets['vehicle_data']

# Materialise both as pandas DataFrames for local exploration.
train_df, test_df = train_data.to_pandas_dataframe(), test_data.to_pandas_dataframe()

In [None]:
# Preview the first five rows of the training data.
train_df.head(n=5)

### Explore the training data

In [None]:
# Distribution of the Survival_In_Days column of the training data.
col_list = ['Survival_In_Days']
df = train_df.loc[:, col_list]
sns.distplot(df)

In [None]:
# Pairwise scatter plots of Survival_In_Days against the two sigma columns.
col_list = [
    'Survival_In_Days',
    'Trip_Length_Sigma',
    'Trips_Per_Day_Sigma',
]
df = train_df.loc[:, col_list]
sns.pairplot(df)

### Create get_data.py so Automated Machine Learning can load the training dataset

In [None]:
%%writefile $project_folder/get_data.py
import pandas as pd
import numpy as np
from azureml.core import Workspace, Datastore, Dataset, Run

def get_data():
    """Load the 'vehicle_training' dataset and split it for AutoML.

    The workspace is resolved from the context of the current run. Column 0
    of the dataset is returned as the label vector ``y`` and columns 1-73 as
    the feature matrix ``X``.

    Returns:
        dict: ``{"X": <2-D feature array>, "y": <1-D label array>}``.
    """
    # Resolve the workspace from the run this script executes in.
    workspace = Run.get_context().experiment.workspace

    # Get dataset by name and materialise it as a DataFrame.
    train_data = Dataset.get(workspace=workspace, name='vehicle_training').to_pandas_dataframe()

    features = train_data.iloc[:, 1:74]
    labels = train_data.iloc[:, 0].values

    return {"X": features.values, "y": labels.flatten()}

### Create AML Compute Cluster

In [45]:
# Reuse the AML compute cluster if it already exists; otherwise provision it
# and block until provisioning has finished.
try:
    compute_target = ComputeTarget(workspace=ws, name=cluster_name)
except ComputeTargetException:
    print('Creating a new compute target...')
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_D12_V2',
        max_nodes=4,
    )

    # create the cluster
    compute_target = ComputeTarget.create(ws, cluster_name, compute_config)

    compute_target.wait_for_completion(show_output=True)
else:
    print('Found existing compute target.')

Found existing compute target.


<img src='../mlsolutions/workplacesafety/resources/compute.png'/>

### Setup Automated Machine Learning Experiment

<img align="left" width="850" src="https://raw.githubusercontent.com/chrislauren/amlallhands/master/automl%20overview.png">

In [46]:
# Run configuration: execute on the AML compute cluster inside a Docker container.
run_config = RunConfiguration(framework="python")
run_config.target = compute_target
run_config.environment.docker.enabled = True

# AutoML settings are gathered in one dict so they are easy to scan and tweak.
automl_settings = {
    'iterations': 25,
    'iteration_timeout_minutes': 5,
    'max_cores_per_iteration': 10,
    'preprocess': True,
    'primary_metric': 'normalized_root_mean_squared_error',
    'n_cross_validations': 2,
    'debug_log': 'automl.log',
    'verbosity': logging.DEBUG,
    'data_script': project_folder + "/get_data.py",
    'run_configuration': run_config,
    # 'compute_target': compute_target,
    # 'blacklist_models': "",
    'path': project_folder,
}

automl_config = AutoMLConfig(task='regression', **automl_settings)

### Run our Experiment on AML Compute

In [47]:
# Submit the AutoML configuration to the experiment; progress is viewed with
# the RunDetails widget rather than streamed here (show_output=False).
remote_run = experiment.submit(
    automl_config,
    show_output=False,
)

### Display Automated ML Run Details

In [52]:
# Render the run-monitoring widget for the submitted run.
run_details_widget = RunDetails(remote_run)
run_details_widget.show()

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

### Show best run and model

In [None]:
# get_output() yields a (run, fitted model) pair; unpack it into the two names
# used by the following cells.
automl_output = remote_run.get_output()
best_run, fitted_model = automl_output
print(best_run)

### Publish best model to Azure Machine Learning Model Registry

In [None]:
# Register the best run's model file so it can be deployed.
model = best_run.register_model(
    model_name='battery_failure_predictor',
    model_path='outputs/model.pkl',
    tags={'area': "auto", 'type': "regression"},
)
print("Model name: {}\nModel version: {}".format(model.name, model.version))

### Create Scoring File

In [18]:
%%writefile score.py
import pickle
import json
import azureml.train.automl
import numpy as np

from azureml.core.model import Model
from inference_schema.schema_decorators import input_schema, output_schema
from inference_schema.parameter_types.numpy_parameter_type import NumpyParameterType
# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in
# 0.23. The conda environment for this service installs scikit-learn unpinned,
# so prefer the standalone joblib package and fall back to the vendored copy
# only on old scikit-learn releases.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Example request payload (a single row mixing string and numeric fields) and
# example response. They are handed to NumpyParameterType in the decorators on
# run() below to describe the service's input and output.
input_sample = np.array([['Centre_Val_de_Loire', 'MidWest', 18.12363, 6.041211, 4.022051, 1.0055129999999999, 250, 'M10', 'Y2013', 0.8890518, 'True', -2.081553, -7.993295, 10.80828, -12.12702, -0.6662728, -10.65882, 0.6769765999999999, -14.394279999999998, -2.4150240000000003, -7.103573, 7.1126830000000005, -6.97204, 1.0697889999999999, -17.84025, 7.2463169999999995, -20.07207, -0.3555982, -10.730080000000001, -1.621019, -9.052511, -3.151685, -12.38087, -2.213263, -8.923146000000001, 11.35322, -14.51881, 4.450673, -9.823507000000001, -2.996298, -9.121293, -0.5239272, -24.48196, -2.7362900000000003, -10.16089, -1.014979, -7.749569999999999, -1.9338509999999998, -22.129839999999998, -0.3227415, -6.803357000000001, 0.8542936, -14.50496, -0.44740240000000003, -13.992920000000002, -1.34541, -14.18944, -1.526483, -11.79697, -0.9755153, -17.62779, 3.219577, -8.458607, -0.3016018, -9.000634, 3.9828300000000003, -12.40082, 3.7529589999999997, -16.78719, 3.178833, -9.794724, 2.012089, -16.29766]])
output_sample = np.array([0])

def init():
    """Load the registered model into the module-level ``model`` global.

    The model is looked up by its registered *name*
    ('battery_failure_predictor') and deserialized with joblib.
    """
    global model
    # Resolve the path of the registered model, then deserialize the model
    # file back into a model object.
    model = joblib.load(Model.get_model_path(model_name='battery_failure_predictor'))

@input_schema('data', NumpyParameterType(input_sample))
@output_schema(NumpyParameterType(output_sample))
def run(data):
    """Score ``data`` with the model loaded by ``init()``.

    Returns:
        dict: ``{"result": [...]}`` with the predictions as a list, or
        ``{"error": "<message>"}`` if ``model.predict`` raised.
    """
    try:
        predictions = model.predict(data)
        print(predictions)
    except Exception as exc:
        # Report the failure to the caller instead of raising.
        message = str(exc)
        print('Exception Ocurred')
        print(exc)
        return {"error": message}
    else:
        return {"result": predictions.tolist()}

Overwriting score.py


### Create Environment Dependency File

In [19]:
# Describe the packages the scoring environment needs.
myenv = CondaDependencies.create(
    conda_packages=['numpy', 'scikit-learn'],
    pip_packages=['azureml-sdk[notebooks,automl]', 'inference-schema'],
)

# Serialize once, show the result, and write the same text to myenv.yml.
conda_spec = myenv.serialize_to_string()
print(conda_spec)

with open("myenv.yml", "w") as env_file:
    env_file.write(conda_spec)

# Conda environment specification. The dependencies defined in this file will
# be automatically provisioned for runs with userManagedDependencies=False.

# Details about the Conda environment file format:
# https://conda.io/docs/user-guide/tasks/manage-environments.html#create-env-file-manually

name: project_environment
dependencies:
  # The python interpreter version.
  # Currently Azure ML only supports 3.5.2 and later.
- python=3.6.2

- pip:
  - azureml-sdk[notebooks,automl]
  - inference-schema
- numpy
- scikit-learn
channels:
- conda-forge



In [None]:
# Confirm which registered model (name and version) is about to be deployed.
print('{} {}'.format(model.name, model.version))

### Create/connect to the Kubernetes compute cluster

In [None]:
# Connect to the AKS cluster if it already exists; otherwise create it.
# This mirrors the get-or-create pattern used for the AML compute cluster, so
# the cell can be re-run without failing because the name is already taken.
try:
    aks_target = ComputeTarget(workspace=ws, name=aks_cluster_name)
    print('Found existing AKS cluster.')
except ComputeTargetException:
    print('Creating a new AKS cluster...')
    # Use the default configuration (can also provide parameters to customize)
    prov_config = AksCompute.provisioning_configuration(location='eastus2')

    # Create the cluster
    aks_target = ComputeTarget.create(workspace=ws,
                                      name=aks_cluster_name,
                                      provisioning_configuration=prov_config)

    # Block until provisioning has finished, showing progress output.
    aks_target.wait_for_completion(True)

### Deploy the model to Kubernetes

In [None]:
# How to run the model: entry script plus the conda environment file written earlier.
inference_config = InferenceConfig(
    runtime="python",
    entry_script="score.py",
    conda_file="myenv.yml",
)

# Resources requested for the service.
deployment_config = AksWebservice.deploy_configuration(cpu_cores=1, memory_gb=1)

# Deploy the registered model to the AKS cluster and wait for the result.
service = Model.deploy(
    ws,
    aks_service_name,
    [model],
    inference_config,
    deployment_config,
    aks_target,
)

service.wait_for_deployment(show_output=True)
print(service.state)

### Connect to the deployed webservice

In [33]:
# Look up the deployed web service by name in the workspace.
aks_service = Webservice(workspace=ws, name=aks_service_name)

In [34]:
import json
import csv
import requests

# prepare the test data: drop the columns that are not sent to the model
test__df = test_df.drop(columns=["Car_ID", "Battery_Age"])
sample = test__df.values.tolist()

headers = {'Content-Type':'application/json'}

# When authentication is enabled on the service, send its first key as a bearer token.
if aks_service.auth_enabled:
    headers['Authorization'] = 'Bearer '+aks_service.get_keys()[0]

test_sample = json.dumps({'data': sample})
response = requests.post(aks_service.scoring_uri, data=test_sample, headers=headers)

# Fail with a clear HTTP error (bad key, service down, ...) instead of a
# confusing JSON or KeyError failure further down the notebook.
response.raise_for_status()
print(response.json())

{'result': [1552.1351300157253, 1401.3791786358465, 1465.7011829586686, 1727.9514716303725, 1738.022517834776, 1121.8909903529561, 2230.1036098625264, 1289.8034144180742, 1750.9422334858118, 1906.4357235894115]}


In [35]:
# Pull the list of predictions out of the service's JSON response.
response_payload = response.json()
results = response_payload['result']

### Take the prediction and calculate which batteries need testing

In [36]:
result_columns = ['Car_ID','Predicted_Days_Remaining', 'Index']

# A battery is flagged when fewer than this many days are predicted to remain.
testing_threshold_days = 31

# Predictions come back in the same order as the rows that were sent, so pair
# them with the test rows by position. Collect the flagged rows in a list and
# build the DataFrame once, instead of growing it row by row.
flagged_rows = []
for i, (car_ID, battery_age, predicted_days) in enumerate(
        zip(test_df['Car_ID'], test_df['Battery_Age'], results)):
    days_remaining = predicted_days - battery_age
    if days_remaining < testing_threshold_days:
        flagged_rows.append([car_ID, days_remaining, i])

result_df = pd.DataFrame(flagged_rows, columns=result_columns)

# Guard against an empty prediction list to avoid a ZeroDivisionError.
percentage_needing_test = result_df.shape[0] / len(results) * 100 if results else 0.0
print('Percentage of cars that need batteries testing: {}%'.format(percentage_needing_test))

Percentage of cars that need batteries testing: 10.0%


### Create models tailored for each model and year of car using Azure Machine Learning Pipelines

In [None]:
# Reference to the 'data' folder on the default datastore; it is downloaded to
# the compute target and passed to each AutoML pipeline step as its input.
input_data = DataReference(
    datastore=ds,
    data_reference_name='training_data',
    path_on_datastore='data',
    mode='download',
    path_on_compute='/tmp/azureml_runs',
    overwrite=True,
)

In [None]:
# iterate over models and years of cars
car_models = ['ContosoXL','ContosoML']
car_years = ['2013','2014','2015','2016','2017']

ds = ws.get_default_datastore()

# Build, validate and publish one pipeline per (car model, year) category.
# Each pipeline has two steps: AutoML training, then registration of the best
# model. The scripts get_data_c_<model>_<year>.py and register.py are expected
# to already exist in project_folder; they are not created by the cells shown
# in this notebook.
for car_model in car_models:
    for car_year in car_years:
        category = '{}_{}'.format(car_model, car_year)

        automl_config = AutoMLConfig('regression',
                                     iterations = 10,
                                     iteration_timeout_minutes = 5,
                                     max_cores_per_iteration = 10,
                                     preprocess = True,
                                     primary_metric = 'normalized_root_mean_squared_error',
                                     n_cross_validations = 2,
                                     debug_log = 'automl.log',
                                     verbosity = logging.DEBUG,
                                     data_script = '{}/get_data_c_{}.py'.format(project_folder, category),
                                     run_configuration = run_config,
                                     compute_target = compute_target,
                                     path = project_folder)

        # These are the two outputs from AutoML. They must be defined BEFORE
        # the AutoML step that produces them; previously they were created
        # after the step, which raised a NameError on the first iteration and
        # wired each later step to the previous category's outputs.
        metrics_data = PipelineData(name='metrics_data_category_{}'.format(category),
                                    datastore=ds,
                                    pipeline_output_name='metrics_output_category_{}'.format(category),
                                    training_output=TrainingOutput(type='Metrics'))

        model_data = PipelineData(name='model_data_category_{}'.format(category),
                                  datastore=ds,
                                  pipeline_output_name='best_model_output_category_{}'.format(category),
                                  training_output=TrainingOutput(type='Model'))

        # AutoML action
        automl_step = AutoMLStep(name='automl_module__category_{}'.format(category),
                                 automl_config=automl_config,
                                 inputs=[input_data],
                                 outputs=[metrics_data, model_data],
                                 allow_reuse=False)

        # register the model afterwards; consuming model_data as an input makes
        # this step depend on the AutoML step that produces it
        register_step = PythonScriptStep(name='register_category_{}'.format(category),
                                         script_name='register.py',
                                         compute_target=compute_target,
                                         source_directory=project_folder,
                                         arguments=['--model_name', '{}_battery_failure_predictor'.format(category),
                                                    '--model_path', model_data],
                                         inputs=[model_data],
                                         allow_reuse=False)

        # Each category gets its own pipeline, so there is nothing to chain
        # across iterations (the old `current` / run_after logic never fired
        # because it was reset at the end of every iteration).
        steps = [automl_step, register_step]

        pipeline = Pipeline(description='Generate recommendation models',
                            workspace=ws,
                            steps=steps)

        pipeline.validate()

        # Once published, we can invoke on demand via the SDK or via a REST endpoint
        published_pipeline = pipeline.publish(name='contoso-{}'.format(category))

        # Submit a run for the newly created pipeline
        #published_pipeline.submit(ws, published_pipeline.name)