# Heat Attack Dataset
This dataset was downloaded from https://www.kaggle.com/rashikrahmanpritom/heart-attack-analysis-prediction-dataset.<br>
The notebook was created taking as baseline the tutorials from https://microsoftlearning.github.io/mslearn-dp100/.
## Step 1: Connect to a workspace

In [None]:
from azureml.core import Workspace, Dataset
ws = Workspace.from_config()
print(ws.name, "loaded")

Check available compute resources. Mostly use CI (local compute) for this stage.

In [None]:
print("Compute Resources:")
for compute_name in ws.compute_targets:
    compute = ws.compute_targets[compute_name]
    print("\t", compute.name, ':', compute.type)

## Step 2: Load and register dataset
**Data Description**<br>

*age*: Age of the person<br>
*sex*: Gender of the person<br>
*cp*: chest pain type<br>
*trtbps*: resting blood pressure (mm Hg)<br>
*chol*: cholesterol (mg/dl)<br>
*fbs*: fasting blood sugar > 120 mg/dl<br>
*restecg*: resting electrocardiographic results<br>
*thalachh*: maximum heart rate achieved<br>
*exng*: exercise induced angina (1 = yes, 0 = no)<br>
*oldpeak*: previous peak<br>
*slp*: slope<br>
*caa*: number of major vessels (0-3)<br>
*thall*: Thal rate <br>
*output*: had heart attack (target)



In [3]:
# Load default datastore
default_ds = ws.get_default_datastore()

# Upload datasets to the datastore
default_ds.upload_files(['./data/heart.csv', './data/o2Saturation.csv'],
                        target_path='heart-data/',
                        overwrite=True,
                        show_progress=True)

Uploading an estimated of 2 files
Uploading ./data/heart.csv
Uploaded ./data/heart.csv, 1 files out of an estimated total of 2
Uploading ./data/o2Saturation.csv
Uploaded ./data/o2Saturation.csv, 2 files out of an estimated total of 2
Uploaded 2 files


$AZUREML_DATAREFERENCE_70b8907bdabe4730baf582bf9c872eeb

In [4]:
# Create tabular dataset with heart data
heart_tab = Dataset.Tabular.from_delimited_files(path=(default_ds, 'heart-data/heart.csv'))
heart_tab.to_pandas_dataframe()


Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


In [5]:
# Create tabular dataset with O2 saturation data
o2_tab = Dataset.Tabular.from_delimited_files(path=(default_ds, 'heart-data/o2Saturation.csv'))
o2_tab.to_pandas_dataframe()


Unnamed: 0,98.6
0,98.6
1,98.6
2,98.6
3,98.1
4,97.5
...,...
3580,98.6
3581,98.6
3582,98.6
3583,98.6


In [6]:
# Register heart dataset
heart_tab = heart_tab.register(workspace=ws,
                            name='heart',
                            description='heart attack data',
                            tags={'format':'CSV'},
                            create_new_version=True)

# Register o2 dataset
o2_tab = o2_tab.register(workspace=ws,
                            name='o2',
                            description='o2 saturation data',
                            tags={'format':'CSV'},
                            create_new_version=True)

In [7]:
print("Datasets:")
for dataset_name in list(ws.datasets.keys()):
    dataset = Dataset.get_by_name(ws, dataset_name)
    print("\t", dataset.name, 'version', dataset.version)

Datasets:
	 o2 version 1
	 heart version 2


## Step 3: Check data

In [8]:
# Check for Null values
heart_tab.to_pandas_dataframe().isnull().sum()

age         0
sex         0
cp          0
trtbps      0
chol        0
fbs         0
restecg     0
thalachh    0
exng        0
oldpeak     0
slp         0
caa         0
thall       0
output      0
dtype: int64

In [9]:
# Look inside
heart_tab.to_pandas_dataframe().describe()

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0
mean,54.366337,0.683168,0.966997,131.623762,246.264026,0.148515,0.528053,149.646865,0.326733,1.039604,1.39934,0.729373,2.313531,0.544554
std,9.082101,0.466011,1.032052,17.538143,51.830751,0.356198,0.52586,22.905161,0.469794,1.161075,0.616226,1.022606,0.612277,0.498835
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,47.5,0.0,0.0,120.0,211.0,0.0,0.0,133.5,0.0,0.0,1.0,0.0,2.0,0.0
50%,55.0,1.0,1.0,130.0,240.0,0.0,1.0,153.0,0.0,0.8,1.0,0.0,2.0,1.0
75%,61.0,1.0,2.0,140.0,274.5,0.0,1.0,166.0,1.0,1.6,2.0,1.0,3.0,1.0
max,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,2.0,4.0,3.0,1.0


## Step 4: Train a model from script

Create an experiment folder.

In [10]:
import os

# Create a folder for the experiment files
experiment_folder = 'heart_training_1'
os.makedirs(experiment_folder, exist_ok=True)

Create an environment file.

In [11]:
%%writefile $experiment_folder/environment.yml
name: experiment_env
dependencies:
- python=3.6.2
- scikit-learn
- ipykernel
- matplotlib
- pandas
- pip
- pip:
  - azureml-defaults
  - pyarrow

Overwriting heart_training_1/environment.yml


Creating experiment script, using a random forest classifier.

In [15]:
%%writefile $experiment_folder/heart_training.py
# Import libraries
import os
import argparse
from azureml.core import Run, Dataset
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

# Get script arguments (mostly the training dataset ID at this point)
parser = argparse.ArgumentParser()
parser.add_argument("--ds", type=str, dest='ds_id')
args = parser.parse_args()

# Get experiment run context
run = Run.get_context()

# Get training dataset
print("Loading Data...")
ws = run.experiment.workspace
heart = run.input_datasets['heart_dataset'].to_pandas_dataframe()

# Separate features and labels
y = heart['output'].values
X = heart.drop(['output'], axis=1).values
print(X)

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

# Train a random forest model
print('Training a Random Forest Classifier model with default hyperparameters.')
model = RandomForestClassifier().fit(X_train, y_train)

# Calculate accuracy
y_hat = model.predict(X_test)
model_accuracy = np.average(y_hat == y_test)
print('Accuracy: ', model_accuracy)
run.log('Accuracy', np.float(model_accuracy))

# Calculate AUC
y_scores = model.predict_proba(X_test)
auc = roc_auc_score(y_test, y_scores[:,1])
print('AUC: ', str(auc))
run.log('AUC', np.float(auc))

os.makedirs('outputs', exist_ok=True)
joblib.dump(value=model, filename='outputs/heart_model1.pkl')

run.complete()

Overwriting heart_training_1/heart_training.py


Running the experiment script.

In [13]:
from azureml.core import Experiment, ScriptRunConfig, Environment
from azureml.widgets import RunDetails

# Create python environment for the experiment (from a .yml file)
env = Environment.from_conda_specification("experiment_env", experiment_folder + "/environment.yml")

# Get training dataset
heart_ds = ws.datasets.get("heart")

# Get a script config
script_config = ScriptRunConfig(source_directory=experiment_folder,
                                script='heart_training.py',
                                arguments=['--ds', heart_ds.as_named_input('heart_dataset')],
                                environment=env)

# Submit the experiment
experiment_name = 'train-heart'
experiment = Experiment(workspace=ws, name=experiment_name)
run = experiment.submit(config=script_config)
RunDetails(run).show()
run.wait_for_completion()

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

{'runId': 'train-heart_1635994987_e1effee5',
 'target': 'local',
 'status': 'Finalizing',
 'startTimeUtc': '2021-11-04T03:03:09.418237Z',
 'services': {},
 'properties': {'_azureml.ComputeTargetType': 'local',
  'ContentSnapshotId': 'eb7801e7-c618-47ed-aee2-dfd11025b580'},
 'inputDatasets': [{'dataset': {'id': 'd27bcc02-cca7-433b-81f5-7bb4c8fac898'}, 'consumptionDetails': {'type': 'RunInput', 'inputName': 'heart_dataset', 'mechanism': 'Direct'}}],
 'outputDatasets': [],
 'runDefinition': {'script': 'heart_training.py',
  'command': '',
  'useAbsolutePath': False,
  'arguments': ['--ds', 'DatasetConsumptionConfig:heart_dataset'],
  'sourceDirectoryDataStore': None,
  'framework': 'Python',
  'communicator': 'None',
  'target': 'local',
  'dataReferences': {},
  'data': {'heart_dataset': {'dataLocation': {'dataset': {'id': 'd27bcc02-cca7-433b-81f5-7bb4c8fac898',
      'name': 'heart',
      'version': '2'},
     'dataPath': None,
     'uri': None},
    'mechanism': 'Direct',
    'environ

Check metrics.

In [14]:
metrics = run.get_metrics()
for key in metrics.keys():
    print(key, metrics.get(key))
    

Accuracy 0.8289473684210527
AUC 0.9270613107822411
