# Automated ML

In [112]:
# dependencies needed to complete the project.
!pip install opendatasets
import opendatasets
import pandas as pd
import azureml.core
import logging

from azureml.core.workspace import Workspace
from azureml.core.experiment import Experiment
from azureml.core.dataset import Dataset
from azureml.core.datastore import Datastore
from azureml.core.compute import AmlCompute
from azureml.core.compute import ComputeTarget
from azureml.core.compute_target import ComputeTargetException
from azureml.train.automl import AutoMLConfig
from azureml.core import Environment
from azureml.core.webservice import AciWebservice
from azureml.core.model import Model, InferenceConfig
from azureml.core.resource_configuration import ResourceConfiguration
from azureml.widgets import RunDetails

# Check core SDK version number
# (the recorded output below was produced with SDK 1.57.0; printing the version
# makes it easy to spot a mismatch when the notebook is re-run elsewhere)
print("SDK version:", azureml.core.VERSION)

SDK version: 1.57.0


## Dataset

### Overview
In this project I use the Heart Failure Prediction dataset from Kaggle. According to the dataset description on Kaggle, its goal is to support early detection and management of mortality caused by heart failure.
Based on this dataset, a properly trained machine learning model can predict heart failure based on features like age, ejection fraction, tobacco use, unhealthy diet and obesity, anaemia, physical inactivity, harmful use of alcohol etc.

Dataset features:	

- age: Age of patient
- anaemia: Decrease of red blood cells or hemoglobin
- creatinine_phosphokinase: Level of the CPK enzyme in the blood
- diabetes:	Whether the patient has diabetes or not
- ejection_fraction: Percentage of blood leaving the heart at each contraction
- high_blood_pressure: Whether the patient has hypertension or not
- platelets: Platelets in the blood
- serum_creatinine: Level of creatinine in the blood
- serum_sodium: Level of sodium in the blood
- sex: Female or Male (stored as a binary 0/1 value in the data)
- smoking: Whether the patient smokes or not
- time:	Follow-up period
- DEATH_EVENT: Whether the patient died during the follow-up period


The dataset is downloaded from Kaggle with the opendatasets package (using a Kaggle account I created) and then registered as a dataset in the workspace datastore.


In [105]:
# connect to the workspace described by the local configuration
ws = Workspace.from_config()

# name of the experiment under which the AutoML run is tracked
experiment_name = 'awnanocapstoneexperiment01'
experiment = Experiment(workspace=ws, name=experiment_name)

key = "heart_failure_dataset"
description_text = "Heart Failure Prediction dataset"

# Look the dataset up among the datasets already registered in the workspace.
registered_datasets = ws.datasets
found = key in registered_datasets

if found:
    # Reuse the dataset registered under this name.
    dataset = registered_datasets[key]
else:
    # Not registered yet: download the CSV from Kaggle (prompts for Kaggle
    # credentials), then register it in the default blob datastore.
    opendatasets.download('https://www.kaggle.com/datasets/andrewmvd/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv', force = True)
    raw_df = pd.read_csv('heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv')
    datastore = Datastore.get(ws, 'workspaceblobstore')
    dataset = Dataset.Tabular.register_pandas_dataframe(
        dataframe=raw_df,
        target=datastore,
        name=key,
        description=description_text,
    )

# Both paths end with the registered dataset materialised as a pandas frame.
df = dataset.to_pandas_dataframe()
df.head()

Please provide your Kaggle credentials to download this dataset. Learn more: http://bit.ly/kaggle-creds
Your Kaggle username:Your Kaggle Key:Dataset URL: https://www.kaggle.com/datasets/andrewmvd/heart-failure-clinical-data
Validating arguments.
Arguments validated.
Validating arguments.
Arguments validated.
'overwrite' is set to True. Any file already present in the target will be overwritten.
Uploading files from '/tmp/tmpjnvz8q91' to 'managed-dataset/ce46882a-8ed9-4870-9dc0-c98d6c72a4f7/'
Copying 1 files with concurrency set to 1
Copied /tmp/tmpjnvz8q91/dataframe.parquet, file 1 out of 1. Destination path: https://awnanocapstone1431179060.blob.core.windows.net/azureml-blobstore-6d35d5e0-0299-482d-86ed-70c2d92c6902/managed-dataset/ce46882a-8ed9-4870-9dc0-c98d6c72a4f7/dataframe.parquet
Files copied=1, skipped=0, failed=0
Successfully uploaded file to datastore.
Creating and registering a new dataset.
Successfully created and registered a new dataset.
{'infer_column_types': 'False', 'a

  0%|          | 0.00/3.97k [00:00<?, ?B/s]


Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.0,1.9,130,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.0,1.3,129,1,1,7,1
3,50.0,1,111,0,20,0,210000.0,1.9,137,1,0,7,1
4,65.0,1,160,1,20,0,327000.0,2.7,116,0,0,8,1


In [106]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column of the dataset.
df.describe()

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
count,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0
mean,60.83,0.43,581.84,0.42,38.08,0.35,263358.03,1.39,136.63,0.65,0.32,130.26,0.32
std,11.89,0.5,970.29,0.49,11.83,0.48,97804.24,1.03,4.41,0.48,0.47,77.61,0.47
min,40.0,0.0,23.0,0.0,14.0,0.0,25100.0,0.5,113.0,0.0,0.0,4.0,0.0
25%,51.0,0.0,116.5,0.0,30.0,0.0,212500.0,0.9,134.0,0.0,0.0,73.0,0.0
50%,60.0,0.0,250.0,0.0,38.0,0.0,262000.0,1.1,137.0,1.0,0.0,115.0,0.0
75%,70.0,1.0,582.0,1.0,45.0,1.0,303500.0,1.4,140.0,1.0,1.0,203.0,1.0
max,95.0,1.0,7861.0,1.0,80.0,1.0,850000.0,9.4,148.0,1.0,1.0,285.0,1.0


In [107]:
# attach to the compute cluster if it exists, otherwise provision it
cluster_name = "awnanocapstonecomputecluster1"

try:
    # Looking the cluster up by name raises ComputeTargetException when it is missing.
    compute_target = ComputeTarget(workspace=ws, name=cluster_name)
except ComputeTargetException:
    print('Creating a new compute cluster...')
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='Standard_D4s_v3',
        min_nodes=1,
        max_nodes=4,
    )
    compute_target = ComputeTarget.create(ws, cluster_name, compute_config)
else:
    print('Found existing cluster, use it.')

# Block until the cluster has finished provisioning (or is confirmed ready).
compute_target.wait_for_completion(show_output=True)

Creating a new compute cluster...
InProgress..
SucceededProvisioning operation finished, operation "Succeeded"
Succeeded............
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


## AutoML Configuration
Overview of the AutoML settings and configuration:

Settings:
- n_cross_validations: this parameter sets how many cross validations to perform. I set that value to 2
- primary_metric: the metric AutoML optimizes during model selection. I chose 'accuracy'
- enable_early_stopping: it enables early termination of the experiment when the score is not improving
- max_concurrent_iterations: defines max number of iterations executed in parallel. I set it to 4.
- experiment_timeout_minutes: defines how long the experiment should continue, I set it to 30 minutes
- verbosity: verbosity level. I set it to INFO


Configuration:
- compute_target: Azure ML compute cluster which will be used for the experiment
- task: type of experiment; for our project set to 'classification'
- training_data: dataset to be used during the experiment
- label_column_name: target column name for which the prediction is done
- path: project path
- debug_log: log file name
- featurization: defines if the featurization step should be performed automatically 'auto' or not 'off'. I set it to 'auto' 


In [113]:
# Tunable AutoML options, kept separate so they can be unpacked into AutoMLConfig.
automl_settings = dict(
    n_cross_validations=2,
    primary_metric='accuracy',
    enable_early_stopping=True,
    max_concurrent_iterations=4,
    experiment_timeout_minutes=30,
    verbosity=logging.INFO,
)

# Classification experiment on the registered dataset, predicting DEATH_EVENT,
# executed on the remote compute cluster with automatic featurization.
automl_config = AutoMLConfig(
    task='classification',
    compute_target=compute_target,
    training_data=dataset,
    label_column_name='DEATH_EVENT',
    featurization='auto',
    path='.',
    debug_log="automl_errors.log",
    **automl_settings,
)

In [114]:
# submit the experiment
# show_output=True streams the run status and data guardrail results into the cell output
run = experiment.submit(automl_config, show_output = True)

Submitting remote run.
No run_configuration provided, running on awnanocapstonecomputecluster1 with default configuration
Running on remote compute: awnanocapstonecomputecluster1


Experiment,Id,Type,Status,Details Page,Docs Page
awnanocapstoneexperiment01,AutoML_4bc9b5ed-5d12-459e-9504-a71761adfaea,automl,NotStarted,Link to Azure Machine Learning studio,Link to Documentation



Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.
Current status: ModelSelection. Beginning model selection.

********************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       PASSED
DESCRIPTION:  Your inputs were analyzed, and all classes are balanced in your training data.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData

********************************************************************************************

TYPE:         Missing feature values imputation
STATUS:       PASSED
DESCRIPTION:  No feature missing values were detected in the training data.
              Learn more about missing value imputation: https://aka.ms/AutomatedMLFeaturization

**********************************************************************************

## Run Details

AutoML trained multiple models with varying algorithms and hyperparameters to find the best model for the defined metric. There were 34 iterations in the defined time frame. The best score was 0.8562 for StandardScalerWrapper XGBoostClassifier.
I use RunDetails widget to show details and also show children runs.

In [116]:
# use RunDetails widget to monitor the AutoML run from inside the notebook

RunDetails(run).show()
# block until the run finishes; the value returned here is what the cell displays as its output
run.wait_for_completion()



_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

{'runId': 'AutoML_4bc9b5ed-5d12-459e-9504-a71761adfaea',
 'target': 'awnanocapstonecomputecluster1',
 'status': 'Completed',
 'startTimeUtc': '2024-12-23T15:39:52.144219Z',
 'endTimeUtc': '2024-12-23T16:01:12.350631Z',
 'services': {},
   'message': 'No scores improved over last 10 iterations, so experiment stopped early. This early stopping behavior can be disabled by setting enable_early_stopping = False in AutoMLConfig for notebook/python SDK runs.'}],
 'properties': {'num_iterations': '1000',
  'training_type': 'TrainFull',
  'acquisition_function': 'EI',
  'primary_metric': 'accuracy',
  'train_split': '0',
  'acquisition_parameter': '0',
  'num_cross_validation': '2',
  'target': 'awnanocapstonecomputecluster1',
  'DataPrepJsonString': '{\\"training_data\\": {\\"datasetId\\": \\"f577bd8d-662f-43c4-9cf5-31bcb0861760\\"}, \\"datasets\\": 0}',
  'EnableSubsampling': None,
  'runTemplate': 'AutoML',
  'azureml.runsource': 'automl',
  'display_task_type': 'classification',
  'dependen

In [117]:
# list the child runs of the parent AutoML run

separator = '---------------------------------------'
for child_run in run.get_children():
    # one separator line followed by the run's printable summary
    print(separator, child_run, sep='\n')

---------------------------------------
Run(Experiment: awnanocapstoneexperiment01,
Id: AutoML_4bc9b5ed-5d12-459e-9504-a71761adfaea_37,
Type: azureml.scriptrun,
Status: Completed)
---------------------------------------
Run(Experiment: awnanocapstoneexperiment01,
Id: AutoML_4bc9b5ed-5d12-459e-9504-a71761adfaea_38,
Type: azureml.scriptrun,
Status: Completed)
---------------------------------------
Run(Experiment: awnanocapstoneexperiment01,
Id: AutoML_4bc9b5ed-5d12-459e-9504-a71761adfaea_35,
Type: azureml.scriptrun,
Status: Canceled)
---------------------------------------
Run(Experiment: awnanocapstoneexperiment01,
Id: AutoML_4bc9b5ed-5d12-459e-9504-a71761adfaea_34,
Type: azureml.scriptrun,
Status: Canceled)
---------------------------------------
Run(Experiment: awnanocapstoneexperiment01,
Id: AutoML_4bc9b5ed-5d12-459e-9504-a71761adfaea_36,
Type: azureml.scriptrun,
Status: Canceled)
---------------------------------------
Run(Experiment: awnanocapstoneexperiment01,
Id: AutoML_4bc9b5ed

## Best Model

In [118]:
# fetch the best child run and its fitted model from the AutoML run
automl_run, automl_fitted_model = run.get_output()

divider = "---------------------------------------------"

print("run id: ", automl_run.id)
print(divider)
print("run metrics: ", automl_run.get_metrics())
print(divider)
print("run properties: ", automl_run.get_properties())
print(divider)
print("model: ", automl_fitted_model)

run id:  AutoML_4bc9b5ed-5d12-459e-9504-a71761adfaea_37
---------------------------------------------
run metrics:  {'average_precision_score_macro': 0.8834378661897636, 'matthews_correlation': 0.6734009201370673, 'recall_score_micro': 0.8595525727069351, 'weighted_accuracy': 0.8899103099883103, 'recall_score_macro': 0.8205609819238417, 'log_loss': 0.4064329718549208, 'AUC_macro': 0.9045119627796346, 'AUC_micro': 0.9112335280192585, 'precision_score_weighted': 0.8615235871996887, 'AUC_weighted': 0.9045119627796347, 'precision_score_micro': 0.8595525727069351, 'f1_score_macro': 0.8313595532775758, 'norm_macro_recall': 0.6411219638476836, 'average_precision_score_weighted': 0.9075851405618547, 'f1_score_micro': 0.8595525727069351, 'recall_score_weighted': 0.8595525727069351, 'balanced_accuracy': 0.8205609819238417, 'precision_score_macro': 0.854596843362692, 'f1_score_weighted': 0.8558180048057005, 'average_precision_score_micro': 0.9110739838283723, 'accuracy': 0.8595525727069351, 'accu

In [119]:
# save the best run's artifacts locally: conda environment, pickled model, scoring script
artifacts_to_download = (
    ('outputs/conda_env_v_1_0_0.yml', './automl-outputs/env.yml'),
    ('outputs/model.pkl', './automl-outputs/model.pkl'),
    ('outputs/scoring_file_v_2_0_0.py', './automl-outputs/scoring.py'),
)
for remote_name, local_path in artifacts_to_download:
    automl_run.download_file(remote_name, local_path)

## Model Deployment

In [120]:
# register the pickled model produced by the best run in the workspace model registry
model = automl_run.register_model(
    model_name='awnanocapstone-automl-model',
    description='Best model from the output of AutoML for Heart Failure Prediction dataset',
    model_path='outputs/model.pkl',
)

In [121]:
# deploy the registered model as an ACI web service (this creates the endpoint)
deployment_name = 'awnanocapstone-automl-model-dep'

# Environment and scoring script come from the artifacts downloaded from the best run.
env = Environment.from_conda_specification(name='automl-env', file_path='./automl-outputs/env.yml')
inference_config = InferenceConfig(entry_script='./automl-outputs/scoring.py', environment=env)

# Container sizing for the service, with Application Insights enabled.
deployment_config = AciWebservice.deploy_configuration(
    cpu_cores=1,
    memory_gb=1,
    enable_app_insights=True,
)

service = Model.deploy(
    workspace=ws,
    name=deployment_name,
    models=[model],
    inference_config=inference_config,
    deployment_config=deployment_config,
    overwrite=True,
)
service.wait_for_deployment(show_output=True)

Tips: You can try get_logs(): https://aka.ms/debugimage#dockerlog or local deployment: https://aka.ms/debugimage#debug-locally to debug if deployment takes longer than 10 minutes.
Running
2024-12-23 16:25:22+00:00 Creating Container Registry if not exists.
2024-12-23 16:25:24+00:00 Use the existing image.
2024-12-23 16:25:24+00:00 Generating deployment configuration.
2024-12-23 16:25:26+00:00 Submitting deployment to compute.
2024-12-23 16:25:31+00:00 Checking the status of deployment awnanocapstone-automl-model-dep..
2024-12-23 16:27:28+00:00 Checking the status of inference endpoint awnanocapstone-automl-model-dep.
Succeeded
ACI service creation operation finished, operation "Succeeded"


In [123]:
# test the endpoint
import requests
import json

rest_endpoint = service.scoring_uri
headers = {'Content-type': 'application/json'}

# One patient record containing every feature column except the DEATH_EVENT label.
sample_patient = {
    "age": 55,
    "anaemia": 0,
    "creatinine_phosphokinase": 582,
    "diabetes": 0,
    "ejection_fraction": 20,
    "high_blood_pressure": 0,
    "platelets": 265000,
    "serum_creatinine": 1.9,
    "serum_sodium": 130,
    "sex": 0,
    "smoking": 0,
    "time": 4
}

# Without a timeout, requests waits indefinitely if the endpoint never answers;
# bound the call so the cell cannot hang forever.
response = requests.post(rest_endpoint,
                         headers=headers,
                         json={'Inputs': {'data': [sample_patient]}},
                         timeout=60)

# Print the raw body first so an error payload stays visible for debugging ...
print(response.content)

# ... then fail the cell on a 4xx/5xx answer instead of silently treating it as a prediction.
response.raise_for_status()

b'{"Results": [1]}'


In [124]:
# print the web service's container logs, one log line per output line
logs = service.get_logs()
print(*logs.split('\n'), sep='\n')

/bin/bash: /azureml-envs/azureml_3997f53a14290c7447e4e58537db5370/lib/libtinfo.so.6: no version information available (required by /bin/bash)
/bin/bash: /azureml-envs/azureml_3997f53a14290c7447e4e58537db5370/lib/libtinfo.so.6: no version information available (required by /bin/bash)
/bin/bash: /azureml-envs/azureml_3997f53a14290c7447e4e58537db5370/lib/libtinfo.so.6: no version information available (required by /bin/bash)
2024-12-23T16:27:10,436798294+00:00 - rsyslog/run 
2024-12-23T16:27:10,440774513+00:00 - gunicorn/run 
bash: /azureml-envs/azureml_3997f53a14290c7447e4e58537db5370/lib/libtinfo.so.6: no version information available (required by bash)
2024-12-23T16:27:10,444242038+00:00 | gunicorn/run | 
2024-12-23T16:27:10,445205859+00:00 | gunicorn/run | ###############################################
2024-12-23T16:27:10,446392111+00:00 | gunicorn/run | AzureML Container Runtime Information
2024-12-23T16:27:10,448433657+00:00 | gunicorn/run | ########################################

In [125]:
# delete the service and the cluster
# (clean-up step: removes the deployed web service first, then the compute cluster,
# matching the "deleted the webservice and shut down all the computes" checklist item)
service.delete()
compute_target.delete()

Running
2024-12-23 16:43:20+00:00 Check and wait for operation (ea626ade-b9d7-420f-a6b7-8dd0e6ce13cf) to finish.
2024-12-23 16:43:23+00:00 Deleting service entity.
Succeeded


**Submission Checklist**
- I have registered the model.
- I have deployed the model with the best accuracy as a webservice.
- I have tested the webservice by sending a request to the model endpoint.
- I have deleted the webservice and shut down all the computes that I have used.
- I have taken a screenshot showing the model endpoint as active.
- The project includes a file containing the environment details.
