# Automated ML

In [1]:
from azureml.core import Workspace, Experiment, Dataset
from azureml.core.compute import AmlCompute
from azureml.core.compute import ComputeTarget
from azureml.core.compute_target import ComputeTargetException
from azureml.train.automl import AutoMLConfig
from azureml.widgets import RunDetails
from azureml.core.model import InferenceConfig 
from azureml.core.webservice import AciWebservice, Webservice
from azureml.core.model import Model

import pandas as pd

In [2]:
# Load the workspace from the saved configuration and echo its identity.
ws = Workspace.from_config()
print(ws.name, ws.resource_group, ws.location, ws.subscription_id, sep = '\n')

# Every run submitted from this notebook is grouped under this experiment.
experiment_name = 'capstone-heart-failure-prediction'
experiment = Experiment(ws, experiment_name)

# NOTE: no interactive run is opened with experiment.start_logging() here.
# The AutoML run is created by experiment.submit() further down, and an
# interactive run that is never completed would be left in the "Running" state.

ws_udacity_capstone_v2
UdacityMLAzureCapstoneV2
eastus2
ca1598e0-85dc-47d5-b06d-41b5342b4989


## Create New Cluster / Use Existing Cluster

In [3]:
# Name of the AmlCompute cluster used for training
cpu_cluster_name = "CapstoneV1"

# Reuse the cluster if the workspace already has one with this name;
# otherwise provision a new one.
try:
    compute_target = ComputeTarget(workspace=ws, name=cpu_cluster_name)
except ComputeTargetException:
    print('Creating a new compute cluster...')
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='Standard_DS3_v2',
        min_nodes=1,
        max_nodes=4,
    )
    compute_target = ComputeTarget.create(ws, cpu_cluster_name, compute_config)
else:
    print('Found existing cluster, use it.')

# Wait for the cluster operation to finish. No minimum node count or timeout
# is passed, so the cluster's own scale settings apply.
compute_target.wait_for_completion(show_output=True)

# Print the detailed status of the cluster.
cluster_status = compute_target.get_status()
print(cluster_status.serialize())

Found existing cluster, use it.

Running
{'errors': [], 'creationTime': '2021-04-25T18:53:25.316093+00:00', 'createdBy': {'userObjectId': '2f5770ca-7bf7-4ecc-bd4b-14652b1bbf0c', 'userTenantId': '3814e260-63cb-43a8-82ce-e862c309e004', 'userName': 'Abhi Ojha'}, 'modifiedTime': '2021-04-25T18:55:27.349216+00:00', 'state': 'Running', 'vmSize': 'STANDARD_DS12_V2'}


## Dataset

### Overview

The dataset used in this project is taken from [Kaggle](https://www.kaggle.com/andrewmvd/heart-failure-clinical-data). It consists of 12 distinct features and 1 target, as summarized below:
- **Input features** - Age, Anaemia, Creatinine_phosphokinase, Diabetes, Ejection_fraction, High_blood_pressure, Platelets, Serum_creatinine, Serum_sodium, Sex, Smoking, Time
- **Target** - DEATH_EVENT

We will use this dataset for creating a model to predict mortality caused by heart failure.

In [4]:
local_data_path = 'data/heart_failure_clinical_records_dataset.csv'

# Upload the contents of the local 'data' folder to the workspace's default
# datastore, under the same 'data' prefix.
datastore = ws.get_default_datastore()
datastore.upload(src_dir='data', target_path='data')

# Build a tabular dataset from the uploaded CSV and register it in the workspace.
# The datastore path equals the local path because the upload target mirrors
# the local folder name.
heart_failure_ds = Dataset.Tabular.from_delimited_files(
    path=[(datastore, local_data_path)]
).register(workspace=ws, name='heart_failure_ds', create_new_version=True)

# Read the same file locally for a quick preview.
df = pd.read_csv(local_data_path)
df.head()

Uploading an estimated of 1 files
Target already exists. Skipping upload for data/heart_failure_clinical_records_dataset.csv
Uploaded 0 files


Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.0,1.9,130,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.0,1.3,129,1,1,7,1
3,50.0,1,111,0,20,0,210000.0,1.9,137,1,0,7,1
4,65.0,1,160,1,20,0,327000.0,2.7,116,0,0,8,1


In [5]:
# Descriptive statistics (count, mean, std, quartiles, min/max) of the locally loaded data
df.describe()

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
count,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0
mean,60.833893,0.431438,581.839465,0.41806,38.083612,0.351171,263358.029264,1.39388,136.625418,0.648829,0.32107,130.26087,0.32107
std,11.894809,0.496107,970.287881,0.494067,11.834841,0.478136,97804.236869,1.03451,4.412477,0.478136,0.46767,77.614208,0.46767
min,40.0,0.0,23.0,0.0,14.0,0.0,25100.0,0.5,113.0,0.0,0.0,4.0,0.0
25%,51.0,0.0,116.5,0.0,30.0,0.0,212500.0,0.9,134.0,0.0,0.0,73.0,0.0
50%,60.0,0.0,250.0,0.0,38.0,0.0,262000.0,1.1,137.0,1.0,0.0,115.0,0.0
75%,70.0,1.0,582.0,1.0,45.0,1.0,303500.0,1.4,140.0,1.0,1.0,203.0,1.0
max,95.0,1.0,7861.0,1.0,80.0,1.0,850000.0,9.4,148.0,1.0,1.0,285.0,1.0


## AutoML Configuration

I used the following automl settings and configuration parameters:
```python
automl_settings = { "n_cross_validations": 2,
                    "primary_metric": 'accuracy',
                    "enable_early_stopping": True,
                    "max_concurrent_iterations": 4,
                    "experiment_timeout_minutes": 15,
                }

automl_config = AutoMLConfig(compute_target = compute_target,
                            task='classification',
                            training_data=heart_failure_ds,
                            label_column_name='DEATH_EVENT',
                            path = 'automl_runs',
                            featurization= 'auto',
                            debug_log = "automl_errors.log",
                            enable_onnx_compatible_models=True,
                            **automl_settings)
```
**AutoML Settings**
- *n_cross_validations*: I chose 2 cross validations, which means that the metrics are calculated as an average of 2 folds.
- *primary_metric*: I chose `accuracy` as it is the default metric for classification tasks.
- *enable_early_stopping*: I set this value to `True` so that the model can stop training once it stops improving.
- *max_concurrent_iterations*: This value is set at 4, which means that there can be at max 4 iterations in parallel.
- *experiment_timeout_minutes*: To save costs, I set this value to 15 minutes. After this time, the AutoML experiment stops automatically.

**AutoML Config**
- *compute_target*: This defines the Azure Compute target that I set up for running this experiment.
- *task*: Since this is a classification problem, this value is set as `classification`.
- *training_data*: The training data used for this experiment. It contains both the training features and the target label.
- *label_column_name*: Target label column name, which is `DEATH_EVENT`.
- *path*: path to AzureML project folder.
- *featurization*: Setting this value to `auto` means that featurization will be done automatically.
- *debug_log*: path of the log file.
- *enable_onnx_compatible_models*: Setting this value to `True` enables `onnx_compatible_models`.

In [6]:
# AutoML run settings.
# max_concurrent_iterations is 4 so that it does not exceed the max_nodes=4
# requested for the compute cluster in this notebook; each concurrent
# iteration needs its own node, so a higher value cannot be honoured.
automl_settings = {
    "n_cross_validations": 2,            # metrics are averaged over 2 folds
    "primary_metric": 'accuracy',
    "enable_early_stopping": True,       # stop when the score stops improving
    "max_concurrent_iterations": 4,
    "experiment_timeout_minutes": 15,    # hard cap on total experiment time
}

# Full AutoML configuration: classification on the registered dataset,
# with DEATH_EVENT as the label column.
automl_config = AutoMLConfig(compute_target = compute_target,
                            task='classification',
                            training_data=heart_failure_ds,
                            label_column_name='DEATH_EVENT',
                            path = 'automl_runs',
                            featurization= 'auto',
                            debug_log = "automl_errors.log",
                            enable_onnx_compatible_models=True,
                            **automl_settings)

## Run Details

In [7]:
# Submit the experiment without blocking, so the widget is rendered while the
# run is still in progress rather than only after it has finished.
automl_run = experiment.submit(automl_config, show_output=False)
RunDetails(automl_run).show()

# Stream the run log and block until the AutoML run completes, as the original
# blocking submit did.
automl_run.wait_for_completion(show_output=True)

Submitting remote run.
No run_configuration provided, running on CapstoneV1 with default configuration
Running on remote compute: CapstoneV1


Experiment,Id,Type,Status,Details Page,Docs Page
capstone-heart-failure-prediction,AutoML_a9c4d05b-5ba1-44fb-8126-8fc51c6ae086,automl,NotStarted,Link to Azure Machine Learning studio,Link to Documentation



Current status: FeaturesGeneration. Generating features for the dataset.
Current status: ModelSelection. Beginning model selection.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       PASSED
DESCRIPTION:  Your inputs were analyzed, and all classes are balanced in your training data.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData

****************************************************************************************************

TYPE:         Missing feature values imputation
STATUS:       PASSED
DESCRIPTION:  No feature missing values were detected in the training data.
              Learn more about missing value imputation: https://aka.ms/AutomatedMLFeaturization

****************************************************************************************************

TYPE:         High cardinality feature detection
STATUS

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

In [18]:
# Render the run-details widget again for the same AutoML run
RunDetails(automl_run).show()

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

In [8]:
# Block until the AutoML run has finished, then report its final state.
automl_run.wait_for_completion()
final_status = automl_run.get_status()
print(f"Run Status: {final_status}")

Run Status: Completed


## Best Model



In [9]:
# Retrieve the best child run together with its fitted pipeline.
best_automl_run, fit_model = automl_run.get_output()

# Show the metrics and properties recorded on the best run.
best_metrics = best_automl_run.get_metrics()
print("Best run metrics :", best_metrics)
best_properties = best_automl_run.get_properties()
print("Best run properties :", best_properties)

# Download the conda environment file of the best run as environment.yml.
best_automl_run.download_file(
    'outputs/conda_env_v_1_0_0.yml',
    'environment.yml',
)

Package:azureml-automl-runtime, training version:1.27.0.post1, current version:1.26.0
Package:azureml-core, training version:1.27.0, current version:1.26.0
Package:azureml-dataprep, training version:2.14.2, current version:2.13.2
Package:azureml-dataprep-native, training version:33.0.0, current version:32.0.0
Package:azureml-dataprep-rslex, training version:1.12.1, current version:1.11.2
Package:azureml-dataset-runtime, training version:1.27.0, current version:1.26.0
Package:azureml-defaults, training version:1.27.0, current version:1.26.0
Package:azureml-interpret, training version:1.27.0, current version:1.26.0
Package:azureml-mlflow, training version:1.27.0, current version:1.26.0
Package:azureml-pipeline-core, training version:1.27.0, current version:1.26.0
Package:azureml-telemetry, training version:1.27.0, current version:1.26.0
Package:azureml-train-automl-client, training version:1.27.0, current version:1.26.0
Package:azureml-train-automl-runtime, training version:1.27.0.post1,

Best run metrics : {'average_precision_score_macro': 0.8831363343714849, 'recall_score_micro': 0.8528859060402685, 'average_precision_score_weighted': 0.9056837576775196, 'f1_score_micro': 0.8528859060402684, 'AUC_macro': 0.8995812987086556, 'weighted_accuracy': 0.8882023284622077, 'f1_score_macro': 0.8206866617657156, 'AUC_micro': 0.9108387650205947, 'accuracy': 0.8528859060402685, 'precision_score_micro': 0.8528859060402685, 'precision_score_macro': 0.8505196192696194, 'recall_score_weighted': 0.8528859060402685, 'f1_score_weighted': 0.8476131033593202, 'precision_score_weighted': 0.8549056474224259, 'balanced_accuracy': 0.8074887317471754, 'log_loss': 0.3858780948223615, 'average_precision_score_micro': 0.9096561021149501, 'norm_macro_recall': 0.6149774634943508, 'recall_score_macro': 0.8074887317471754, 'AUC_weighted': 0.8995812987086556, 'matthews_correlation': 0.6555720612004814, 'accuracy_table': 'aml://artifactId/ExperimentRun/dcid.AutoML_a9c4d05b-5ba1-44fb-8126-8fc51c6ae086_48

In [10]:
# Print the fitted pipeline returned by automl_run.get_output()
print(fit_model)

Pipeline(memory=None,
         steps=[('datatransformer',
                 DataTransformer(enable_dnn=None, enable_feature_sweeping=None,
                                 feature_sweeping_config=None,
                                 feature_sweeping_timeout=None,
                                 featurization_config=None, force_text_dnn=None,
                                 is_cross_validation=None,
                                 is_onnx_compatible=None, logger=None,
                                 observer=None, task=None, working_dir=None)),
                ('prefittedsoftvotingclassifier',...
                                                                                               random_state=0,
                                                                                               reg_alpha=0.9375,
                                                                                               reg_lambda=0.7291666666666667,
                                          

In [11]:
# Download the best run's pickled model to a local file, then show the run.
remote_model_artifact = 'outputs/model.pkl'
local_model_file = 'best_automl_model.pkl'

best_automl_run.download_file(remote_model_artifact, local_model_file)
print(best_automl_run)

Run(Experiment: capstone-heart-failure-prediction,
Id: AutoML_a9c4d05b-5ba1-44fb-8126-8fc51c6ae086_48,
Type: azureml.scriptrun,
Status: Completed)


## Model Deployment

In [12]:
# Register the model produced by the AutoML run and show its id.
model = automl_run.register_model(model_name = 'best_automl_model.pkl')
print(automl_run.model_id)

# Reuse the environment and the generated scoring script of the best run so
# that inference runs with the same dependencies as training.
environment = best_automl_run.get_environment()
entry_script='inference/scoring.py'
best_automl_run.download_file('outputs/scoring_file_v_1_0_0.py', entry_script)

inference_config = InferenceConfig(entry_script = entry_script, environment = environment)

# Deploy model using ACI WebService, with key authentication and
# Application Insights enabled.
deployment_config = AciWebservice.deploy_configuration(cpu_cores = 1,
                                                    memory_gb = 1,
                                                    auth_enabled= True,
                                                    enable_app_insights= True)

# overwrite=True lets this cell be re-run when a service named "aciservice"
# already exists, instead of failing on the name conflict.
service = Model.deploy(workspace=ws,
                       name="aciservice",
                       models=[model],
                       inference_config=inference_config,
                       deployment_config=deployment_config,
                       overwrite=True)
service.wait_for_deployment(show_output = True)

best_automl_model.pkl
Tips: You can try get_logs(): https://aka.ms/debugimage#dockerlog or local deployment: https://aka.ms/debugimage#debug-locally to debug if deployment takes longer than 10 minutes.
Running
2021-04-25 20:35:25+00:00 Creating Container Registry if not exists.
2021-04-25 20:35:25+00:00 Registering the environment.
2021-04-25 20:35:25+00:00 Generating deployment configuration.
2021-04-25 20:35:27+00:00 Submitting deployment to compute..
2021-04-25 20:35:33+00:00 Checking the status of deployment aciservice..
2021-04-25 20:40:34+00:00 Checking the status of inference endpoint aciservice.
Succeeded
ACI service creation operation finished, operation "Succeeded"


In [16]:
# Execute the companion endpoint.py script from the notebook.
# NOTE(review): presumably it sends a sample request to the deployed service — confirm in endpoint.py
%run endpoint.py

{"result": [1]}


In [17]:
# Fetch the logs of the deployed web service and print them.
service_logs = service.get_logs()
print(service_logs)

2021-04-25T20:40:29,687130600+00:00 - gunicorn/run 
2021-04-25T20:40:29,688205700+00:00 - rsyslog/run 
2021-04-25T20:40:29,689218700+00:00 - iot-server/run 
2021-04-25T20:40:29,712592900+00:00 - nginx/run 
/usr/sbin/nginx: /azureml-envs/azureml_38f1e90f927390c3641ac84304d53445/lib/libcrypto.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_38f1e90f927390c3641ac84304d53445/lib/libcrypto.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_38f1e90f927390c3641ac84304d53445/lib/libssl.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_38f1e90f927390c3641ac84304d53445/lib/libssl.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_38f1e90f927390c3641ac84304d53445/lib/libssl.so.1.0.0: no version information available (required by /usr/sbin/nginx)
rsyslogd

In [19]:
# Delete the deployed web service now that it is no longer needed
service.delete()