# Automated ML

* Importing Workspace and Experiment class

In [1]:
from azureml.core import Workspace, Experiment

## Setting up the workspace

* Setting experiment name and associating it to the workspace

In [2]:
# Connect to the Azure ML workspace via Workspace.from_config().
ws = Workspace.from_config()

# Echo the workspace identity so the reader can confirm which workspace is in use.
print('WORKSPACE DETAILS:\nWorkspace name: ' + ws.name, 
      'Azure region: ' + ws.location, 
      'Subscription id: ' + ws.subscription_id, 
      'Resource group: ' + ws.resource_group, sep = '\n')

# Name of the experiment that the AutoML run is submitted to further down.
experiment_name = 'heart-disease-automl'

experiment=Experiment(ws, experiment_name)
print("Experiment created")

# NOTE(review): start_logging() opens an interactive run, but `run` is not
# referenced again in the visible cells -- confirm whether it should be
# completed (run.complete()) or dropped.
run = experiment.start_logging()

WORKSPACE DETAILS:
Workspace name: quick-starts-ws-137106
Azure region: southcentralus
Subscription id: 48a74bb7-9950-4cc1-9caa-5d50f995cc55
Resource group: aml-quickstarts-137106
Experiment created


## Compute Cluster
* Setting up the Compute cluster VM `cpu-cluster` to run the experiment

In [3]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
# Compute cluster used to run the experiment.
# A new cluster is provisioned with VM size STANDARD_D2_V2 and at most 4 nodes.
cpu_cluster_name = "cpu-cluster"

# Look the cluster up by name first; only provision when the lookup raises
# ComputeTargetException.
try:
    cpu_cluster = ComputeTarget(workspace=ws, name=cpu_cluster_name)
except ComputeTargetException:
    print("Creating new Compute Target... " + cpu_cluster_name)
    provisioning_compute_config = AmlCompute.provisioning_configuration(
        vm_size="STANDARD_D2_V2",
        max_nodes=4,
    )
    cpu_cluster = ComputeTarget.create(
        ws,
        cpu_cluster_name,
        provisioning_compute_config,
    )
else:
    print("Existing compute target found... Using it")

# Wait for the cluster in both cases, printing provisioning output to the cell.
cpu_cluster.wait_for_completion(show_output=True)

Existing compute target found... Using it
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


## Dataset Preview
* Loading the dataset from the given URL and reviewing the DataFrame

In [4]:
from azureml.data.dataset_factory import TabularDatasetFactory
import pandas as pd

# Location of the heart-disease CSV (raw file served from GitHub).
url_path = "https://raw.githubusercontent.com/bharati-21/AZMLND-Capstone-Project/master/files/heart.csv"
# Tabular dataset built from the CSV; this is what clean_data() receives later.
ds = TabularDatasetFactory.from_delimited_files(path=url_path)

# Separate pandas read of the same URL, used for the preview in the next cell.
df = pd.read_csv(url_path)

In [5]:
# Materialise the TabularDataset as a pandas DataFrame and preview that same
# frame, so the rows shown come from the dataset that clean_data() is given.
# Previously `ds_dataframe` was built but never used, and the preview came
# from a second, independent download of the CSV.
ds_dataframe = ds.to_pandas_dataframe()
ds_dataframe.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


### Cleaning and Splitting the Data

In [6]:
from train import clean_data

# Clean the dataset with clean_data (imported from the local `train` module).
# It returns two objects: `x`, previewed below, and `y`, which the split cell
# later attaches as the 'target' column.
x, y = clean_data(ds)
x.head()

Shape of dataset before split: (302, 21)


Unnamed: 0,age,resting_BP,cholesterol,max_heart_rate,st_depression,sex_1,chest_pain_type_1,chest_pain_type_2,chest_pain_type_3,fasting_blood_sugar_1,rest_ECG_1,rest_ECG_2,exercise_induced_angina_1,st_slope_1,st_slope_2,num_major_vessels_1.0,num_major_vessels_2.0,num_major_vessels_3.0,thalassemia_2.0,thalassemia_3.0
0,63,145,233,150,2.3,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0
1,37,130,250,187,3.5,1,0,1,0,0,1,0,0,0,0,0,0,0,1,0
2,41,130,204,172,1.4,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0
3,56,120,236,178,0.8,1,1,0,0,0,1,0,0,0,1,0,0,0,1,0
4,57,120,354,163,0.6,0,0,0,0,0,1,0,1,0,1,0,0,0,1,0


In [None]:
# x['target'] = y
# x

In [7]:
import os
# Local staging folder for the train/test CSV files written further down.
path = "./Data"

# exist_ok=True means an already-present folder is not an error, so only a
# genuine OSError reaches the failure message.
try:
    os.makedirs(path, exist_ok=True)
except OSError:
    print("Directory 'Data' cannot be created...")
else:
    print("Dicrectory 'Data' created...")

Dicrectory 'Data' created...


In [8]:
# split data into test and train sets

from sklearn.model_selection import train_test_split
import pandas as pd

# Split the cleaned features/labels into train (80%) and test (20%) sets.
# A fixed random_state makes the split reproducible across kernel restarts,
# so the CSVs written below (and everything trained or evaluated on them)
# stay the same from one run to the next.
RANDOM_SEED = 42
train_x, test_x, train_y, test_y = train_test_split(
    x, y, test_size=0.2, random_state=RANDOM_SEED
)
print(test_x.shape, train_x.shape, test_y.shape, train_y.shape)

# Re-attach the label as a 'target' column so each CSV carries features and label.
train_df = train_x.copy(deep=True)
train_df['target'] = train_y
print(train_df.shape)

test_df = test_x.copy(deep=True)
test_df['target'] = test_y
print(test_df.shape)

# Save the train and test data locally; index=False keeps the pandas index
# out of the files.
train_df.to_csv("./Data/train_data.csv", index = False)
test_df.to_csv("./Data/test_data.csv", index = False)

(61, 20) (241, 20) (61,) (241,)
(241, 21)
(61, 21)


In [9]:
# Upload the local "Data" folder to the workspace's default datastore under
# "Data", overwriting files that are already there.
# NOTE: the spelling of `deafult_store` is kept because a later cell references it.
deafult_store = ws.get_default_datastore()
deafult_store.upload(src_dir="Data", target_path="Data", overwrite=True)

Uploading an estimated of 2 files
Uploading Data/test_data.csv
Uploaded Data/test_data.csv, 1 files out of an estimated total of 2
Uploading Data/train_data.csv
Uploaded Data/train_data.csv, 2 files out of an estimated total of 2
Uploaded 2 files


$AZUREML_DATAREFERENCE_1f6bee0e8ad74980a463b1c2d1f82de7

In [10]:
from azureml.core import Dataset

# Build one tabular dataset per uploaded CSV, each addressed as a
# (datastore, relative path) pair.
train_data, test_data = (
    TabularDatasetFactory.from_delimited_files(path=[(deafult_store, csv_path)])
    for csv_path in ('Data/train_data.csv', 'Data/test_data.csv')
)

In [11]:
# Read the uploaded training CSV back as a DataFrame to check what was stored.
train_data.to_pandas_dataframe()

Unnamed: 0,age,resting_BP,cholesterol,max_heart_rate,st_depression,sex_1,chest_pain_type_1,chest_pain_type_2,chest_pain_type_3,fasting_blood_sugar_1,...,rest_ECG_2,exercise_induced_angina_1,st_slope_1,st_slope_2,num_major_vessels_1.0,num_major_vessels_2.0,num_major_vessels_3.0,thalassemia_2.0,thalassemia_3.0,target
0,63,145,233,150,2.3,1,0,0,1,1,...,0,0,0,0,0,0,0,0,0,1
1,44,120,220,170,0.0,1,1,0,0,0,...,0,0,0,1,0,0,0,1,0,1
2,53,128,216,115,0.0,0,0,1,0,0,...,0,0,0,1,0,0,0,1,0,1
3,35,138,183,182,1.4,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,1
4,61,130,330,169,0.0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
236,55,132,353,132,1.2,1,0,0,0,0,...,0,1,1,0,1,0,0,0,1,0
237,60,102,318,160,0.0,0,0,1,0,0,...,0,0,0,1,1,0,0,1,0,1
238,51,140,298,122,4.2,1,0,0,0,0,...,0,1,1,0,0,0,1,0,1,0
239,53,140,203,155,3.1,1,0,0,0,1,...,0,1,0,0,0,0,0,0,1,0


In [12]:
# Show the dataset object next to the shape of its materialised DataFrame.
train_data, train_data.to_pandas_dataframe().shape

({
   "source": [
     "('workspaceblobstore', 'Data/train_data.csv')"
   ],
   "definition": [
     "GetDatastoreFiles",
     "ParseDelimited",
     "DropColumns",
     "SetColumnTypes"
   ]
 },
 (241, 21))

## AutoML Configuration

1. The settings created for the AutoML run were:
   * `Experiment Timeout (experiment_timeout_minutes)`: Maximum amount of time (in minutes) that all iterations combined can take before the experiment terminates.
   * `Primary Metric (primary_metric)`: The primary metric which is used to evaluate every run. Accuracy is the primary metric to be evaluated.
   * `Cross Validations (n_cross_validations)`: Specifies the number of cross validations that need to be performed on each model by splitting the dataset into n subsets. In the code below this value is passed directly to `AutoMLConfig` rather than through the settings dictionary.

1. The AutoMLConfig object was defined as follows:
   * `Task to be performed (task)`: The type of task that needs to be run, such as classification, regression, or forecasting. In this project, classification is the task to be performed.
   * `Training Data (training_data)`: The TabularDataset that contains the training data.
   * `Label Column (label_column_name)`: Name of the column that needs to be predicted. In this case the target column is used as the label column to perform classification.
   * `Compute Target (compute_target)`: The cluster used to run the experiment on.

In [14]:
from azureml.train.automl import AutoMLConfig

# Run-level AutoML settings: the overall time budget (in minutes) and the
# metric used to evaluate every run.
automl_settings = dict(
    experiment_timeout_minutes=30,
    primary_metric='accuracy',
)

# AutoML configuration: 5-fold cross-validated classification of the
# "target" column on the compute cluster, with ONNX-compatible models enabled.
automl_config = AutoMLConfig(
    task='classification',
    compute_target=cpu_cluster,
    training_data=train_data,
    label_column_name="target",
    n_cross_validations=5,
    enable_onnx_compatible_models=True,
    **automl_settings
)

print("AutoML config created.")

AutoML config created.


In [15]:
# Submit the AutoML configuration to the experiment; show_output=True prints
# the run's progress below the cell.
remote_run = experiment.submit(automl_config, show_output = True)

Running on remote.
No run_configuration provided, running on cpu-cluster with default configuration
Running on remote compute: cpu-cluster
Parent Run ID: AutoML_c3359735-5208-4441-a6bb-864e97b28ed6

Current status: FeaturesGeneration. Generating features for the dataset.
Current status: ModelSelection. Beginning model selection.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       PASSED
DESCRIPTION:  Your inputs were analyzed, and all classes are balanced in your training data.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData

****************************************************************************************************

TYPE:         Missing feature values imputation
STATUS:       PASSED
DESCRIPTION:  No feature missing values were detected in the training data.
              Learn more about missing value imputation: 

## Run Details

* Various algorithms were chosen by AutoML to train on the given dataset, and the best model obtained was a `Voting Ensemble` algorithm.
* The differences in accuracy between the models arise because each algorithm learns from the data in a different way.

In [16]:
from azureml.widgets import RunDetails

# Show the run-details widget for the AutoML run, then wait for the run to
# complete while printing its output.
RunDetails(remote_run).show()
remote_run.wait_for_completion(show_output=True)

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…



****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       PASSED
DESCRIPTION:  Your inputs were analyzed, and all classes are balanced in your training data.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData

****************************************************************************************************

TYPE:         Missing feature values imputation
STATUS:       PASSED
DESCRIPTION:  No feature missing values were detected in the training data.
              Learn more about missing value imputation: https://aka.ms/AutomatedMLFeaturization

****************************************************************************************************

TYPE:         High cardinality feature detection
STATUS:       PASSED
DESCRIPTION:  Your inputs were analyzed, and no high cardinality features were detected.
              Learn more abo

{'runId': 'AutoML_c3359735-5208-4441-a6bb-864e97b28ed6',
 'target': 'cpu-cluster',
 'status': 'Completed',
 'startTimeUtc': '2021-02-03T16:28:06.856215Z',
 'endTimeUtc': '2021-02-03T17:07:41.290549Z',
 'properties': {'num_iterations': '1000',
  'training_type': 'TrainFull',
  'acquisition_function': 'EI',
  'primary_metric': 'accuracy',
  'train_split': '0',
  'acquisition_parameter': '0',
  'num_cross_validation': '5',
  'target': 'cpu-cluster',
  'DataPrepJsonString': '{\\"training_data\\": \\"{\\\\\\"blocks\\\\\\": [{\\\\\\"id\\\\\\": \\\\\\"177ebe48-aeea-4071-b37a-c577385ba661\\\\\\", \\\\\\"type\\\\\\": \\\\\\"Microsoft.DPrep.GetDatastoreFilesBlock\\\\\\", \\\\\\"arguments\\\\\\": {\\\\\\"datastores\\\\\\": [{\\\\\\"datastoreName\\\\\\": \\\\\\"workspaceblobstore\\\\\\", \\\\\\"path\\\\\\": \\\\\\"Data/train_data.csv\\\\\\", \\\\\\"resourceGroup\\\\\\": \\\\\\"aml-quickstarts-137106\\\\\\", \\\\\\"subscription\\\\\\": \\\\\\"48a74bb7-9950-4cc1-9caa-5d50f995cc55\\\\\\", \\\\\\"work

## Best Model

* The best model was a `Voting Ensemble` algorithm with an accuracy of `0.8423`.

In [17]:
# get_output() hands back a pair: the best run and the model fitted in it.
best_run, fitted_model = remote_run.get_output()
print(best_run)

Package:azureml-automl-runtime, training version:1.21.0, current version:1.20.0
Package:azureml-core, training version:1.21.0.post1, current version:1.20.0
Package:azureml-dataprep, training version:2.8.2, current version:2.7.3
Package:azureml-dataprep-native, training version:28.0.0, current version:27.0.0
Package:azureml-dataprep-rslex, training version:1.6.0, current version:1.5.0
Package:azureml-dataset-runtime, training version:1.21.0, current version:1.20.0
Package:azureml-defaults, training version:1.21.0, current version:1.20.0
Package:azureml-interpret, training version:1.21.0, current version:1.20.0
Package:azureml-pipeline-core, training version:1.21.0, current version:1.20.0
Package:azureml-telemetry, training version:1.21.0, current version:1.20.0
Package:azureml-train-automl-client, training version:1.21.0, current version:1.20.0
Package:azureml-train-automl-runtime, training version:1.21.0, current version:1.20.0


Run(Experiment: heart-disease-automl,
Id: AutoML_c3359735-5208-4441-a6bb-864e97b28ed6_26,
Type: azureml.scriptrun,
Status: Completed)


In [18]:
# Print the fitted model to inspect its pipeline steps.
print(fitted_model)

Pipeline(memory=None,
         steps=[('datatransformer',
                 DataTransformer(enable_dnn=None, enable_feature_sweeping=None,
                                 feature_sweeping_config=None,
                                 feature_sweeping_timeout=None,
                                 featurization_config=None, force_text_dnn=None,
                                 is_cross_validation=None,
                                 is_onnx_compatible=None, logger=None,
                                 observer=None, task=None, working_dir=None)),
                ('prefittedsoftvotingclassifier',...
                                                                                                    min_weight_fraction_leaf=0.0,
                                                                                                    n_estimators=25,
                                                                                                    n_jobs=1,
                                   

In [19]:
# Tags attached to the best run; in the recorded output they list the
# ensembled algorithms and their weights.
best_run.get_tags()

{'_aml_system_azureml.automlComponent': 'AutoML',
 '_aml_system_ComputeTargetStatus': '{"AllocationState":"steady","PreparingNodeCount":0,"RunningNodeCount":0,"CurrentNodeCount":1}',
 'ensembled_iterations': '[17, 22, 15, 0, 6, 20, 10, 8, 14, 4]',
 'ensembled_algorithms': "['XGBoostClassifier', 'LightGBM', 'LightGBM', 'LightGBM', 'GradientBoosting', 'ExtremeRandomTrees', 'LightGBM', 'XGBoostClassifier', 'LightGBM', 'RandomForest']",
 'ensemble_weights': '[0.2, 0.13333333333333333, 0.2, 0.06666666666666667, 0.06666666666666667, 0.06666666666666667, 0.06666666666666667, 0.06666666666666667, 0.06666666666666667, 0.06666666666666667]',
 'best_individual_pipeline_score': '0.8296768707482993',
 'best_individual_iteration': '17',
 '_aml_system_automl_is_child_run_end_telemetry_event_logged': 'True'}

In [20]:
import joblib
from azureml.core.model import Model

# Save the fitted AutoML model locally, then register a model from the
# AutoML run in the workspace.
description = "AutoML model trained on the Kaggle Heart Disease UCI Dataset"

# `os` is imported in an earlier cell; exist_ok=True makes this safe to re-run.
os.makedirs('outputs', exist_ok=True)
joblib.dump(fitted_model, filename="outputs/automl-heart-disease.pkl")
# NOTE(review): register_model is called on the parent run without a model
# path -- presumably it registers the run's best model rather than the .pkl
# written above; confirm against the AutoMLRun.register_model documentation.
automl_model = remote_run.register_model(model_name='automl-heart-disease', description=description)

In [21]:
# Display the registered model object (name, id, version, tags, properties).
automl_model

Model(workspace=Workspace.create(name='quick-starts-ws-137106', subscription_id='48a74bb7-9950-4cc1-9caa-5d50f995cc55', resource_group='aml-quickstarts-137106'), name=automl-heart-disease, id=automl-heart-disease:1, version=1, tags={}, properties={})

## ONNX Format
* Obtaining an ONNX format model from `get_output()` method and then saving it locally.
* The ONNX model is then tested by predicting on the test dataset.

In [22]:
# Ask get_output() for the ONNX form of the model; `best_run` is re-bound by this call.
best_run, onnx_mdl = remote_run.get_output(return_onnx_model=True)

In [23]:
from azureml.automl.runtime.onnx_convert import OnnxConverter
# Save the ONNX model to a local file next to the notebook.
onnx_fl_path = "./heart-disease-automl-onnx.onnx"
OnnxConverter.save_onnx_model(onnx_mdl, onnx_fl_path)

In [25]:
import sys
import json
from azureml.automl.core.onnx_convert import OnnxConvertConstants
from azureml.train.automl import constants

# True when the running interpreter is older than the Python version that
# OnnxConvertConstants marks as ONNX-incompatible; False otherwise.
python_version_compatible = bool(
    sys.version_info < OnnxConvertConstants.OnnxIncompatiblePythonVersion
)

import onnxruntime
from azureml.automl.runtime.onnx_convert import OnnxInferenceHelper

def get_onnx_res(run):
    """Download the ONNX resource file of ``run`` and return its parsed JSON.

    The file named by ``constants.MODEL_RESOURCE_PATH_ONNX`` is fetched with
    ``run.download_file`` into ``onnx_resource.json`` in the current working
    directory, then loaded with ``json.load``.

    Args:
        run: Run object exposing ``download_file(name=..., output_file_path=...)``.

    Returns:
        The deserialized JSON content of the downloaded resource file.
    """
    local_copy = 'onnx_resource.json'
    run.download_file(
        name=constants.MODEL_RESOURCE_PATH_ONNX,
        output_file_path=local_copy,
    )
    with open(local_copy) as handle:
        return json.load(handle)

# Run ONNX inference on the held-out features, unless the interpreter version
# was flagged as incompatible above.
if not python_version_compatible:
    print('Please use Python version 3.6 or 3.7 to run the inference helper.')
else:
    # The inference helper is built from the serialized model bytes plus the
    # resource JSON downloaded from the best run.
    mdl_bytes = onnx_mdl.SerializeToString()
    onnx_res = get_onnx_res(best_run)
    onnxrt_helper = OnnxInferenceHelper(mdl_bytes, onnx_res)

    # predict() returns two results: the predictions and their probabilities.
    pred_onnx, pred_prob_onnx = onnxrt_helper.predict(test_x)

    print(pred_onnx)
    print(pred_prob_onnx)


[0 1 1 1 0 1 1 1 0 1 1 0 0 0 1 0 0 0 1 1 0 1 0 0 1 1 1 0 1 0 0 0 1 1 1 0 0
 0 0 0 1 1 1 0 1 0 0 1 1 1 1 0 1 1 0 0 0 1 1 1 0]
[[0.78050053 0.19162548]
 [0.19687101 0.82180035]
 [0.25387788 0.75719845]
 [0.3790614  0.6172721 ]
 [0.6737729  0.30987364]
 [0.41441244 0.5839116 ]
 [0.23246741 0.78646594]
 [0.18493311 0.8387934 ]
 [0.6018187  0.38367268]
 [0.20872505 0.8045925 ]
 [0.34728676 0.6617884 ]
 [0.5496333  0.4387091 ]
 [0.751331   0.2210846 ]
 [0.76135063 0.21060821]
 [0.278318   0.7369645 ]
 [0.5905689  0.40360814]
 [0.5814624  0.40357417]
 [0.51427877 0.4674326 ]
 [0.21519828 0.7970789 ]
 [0.32076442 0.68453705]
 [0.7169296  0.2551699 ]
 [0.4567275  0.5409703 ]
 [0.5704504  0.42460567]
 [0.8485978  0.12171577]
 [0.15149194 0.873     ]
 [0.1384215  0.89017516]
 [0.358358   0.64802927]
 [0.7488961  0.2252743 ]
 [0.25180966 0.7629959 ]
 [0.8431312  0.12689696]
 [0.59042335 0.39216498]
 [0.78999126 0.18484798]
 [0.12338189 0.9064109 ]
 [0.27094847 0.7359301 ]
 [0.3535733  0.6476958 ]


## Model Deployment

Remember you have to deploy only one of the two models you trained. Perform the steps in the rest of this notebook only if you wish to deploy this model.

TODO: In the cell below, register the model, create an inference config and deploy the model as a web service.

TODO: In the cell below, send a request to the web service you deployed to test it.

TODO: In the cell below, print the logs of the web service and delete the service