# WS 12 AutoML with AutoGluon Hands on Module

We start by pip installing the `autogluon` and `ucimlrepo` packages

In [None]:
!pip install autogluon
!pip install ucimlrepo

Now we import packages and load in heart disease data from the [UCI Machine Learning Repository](https://archive.ics.uci.edu/dataset/45/heart+disease) and a Stroke Prediction dataset from [Kaggle](https://www.kaggle.com/datasets/fedesoriano/stroke-prediction-dataset/data)

In [None]:

import pandas as pd
import numpy as np
from ucimlrepo import fetch_ucirepo

In [None]:
# Fetch the heart disease dataset from the UCI repository (dataset id 45)
heart_disease = fetch_ucirepo(id=45)

# Pull the predictor and target tables (pandas DataFrames) off the fetched object
X, y = heart_disease.data.features, heart_disease.data.targets



In [None]:

# Variable information. Left as the cell's last expression so the notebook
# renders it with its rich display instead of the plain-text print() output.
heart_disease.variables

In [None]:
# Finalize the heart disease dataset in a single DataFrame with predictors and labels.
# The targets table `y` is treated as a one-column DataFrame: its single column is taken
# as a Series and binarized (1 = heart disease, 0 = no heart disease). Working on the
# Series avoids DataFrame.map, which is only available in pandas >= 2.1, and avoids
# assigning a whole DataFrame as a column.
heart_disease_df = X.assign(
    binary_label=y.iloc[:, 0].gt(0).astype(int)
)

In [None]:
# Read the stroke dataset CSV straight from the GitHub repository
stroke_csv_url = (
    'https://github.com/btwooton/arch_workshop_automl_ws14'
    '/raw/refs/heads/main/data/healthcare-dataset-stroke-data.csv'
)
stroke_df = pd.read_csv(stroke_csv_url)

In [None]:
# Preview the first rows only rather than rendering the whole frame
stroke_df.head()

Now we split the two datasets into 80%/20% training/test set splits

In [None]:
# Split the heart disease dataset into training (80%) and test (20%) sets using
# DataFrame.sample(). random_state is fixed so the split, and every metric computed
# from it later, is the same on each Restart & Run All.
hd_train = heart_disease_df.sample(frac=0.8, random_state=42)
# The test set is every row whose index label was not drawn into the training sample
hd_test = heart_disease_df.drop(hd_train.index)

In [None]:
# Count of each binary_label value in the heart disease training split
hd_train['binary_label'].value_counts()

In [None]:
# Count of each binary_label value in the heart disease test split
hd_test['binary_label'].value_counts()

In [None]:
# Index labels of the rows held out in the heart disease test split
hd_test.index.to_numpy()

In [None]:
# 80/20 split of the stroke dataset. random_state is fixed so the split is reproducible
# across re-runs. The 'id' column is dropped from both splits so it is not used as a feature.
stroke_train = stroke_df.sample(frac=0.8, random_state=42).drop(columns=['id'])
# Rows not drawn into the training sample (matched by index label) form the test set
stroke_test = stroke_df.drop(stroke_train.index).drop(columns=['id'])

In [None]:
# Relative frequency (normalize=True) of each 'stroke' value in the training split
stroke_train['stroke'].value_counts(normalize=True)

In [None]:
# Relative frequency (normalize=True) of each 'stroke' value in the test split
stroke_test['stroke'].value_counts(normalize=True)

Now we use AutoGluon's `TabularPredictor` class to fit a weighted ensemble of classifiers on each of the two datasets; the ensemble automatically favors the models that perform best on validation data

In [None]:
from autogluon.tabular import TabularPredictor

In [None]:
# Fitting a tabular predictor on the Heart Disease Dataset
# Build the predictor for the 'binary_label' target with ROC AUC as the evaluation
# metric, then fit it on the heart disease training split
predictor_hd = TabularPredictor(
    label='binary_label',
    eval_metric='roc_auc',
)
predictor_hd = predictor_hd.fit(hd_train)

In [None]:
# Build the predictor for the 'stroke' target with ROC AUC as the evaluation
# metric, then fit it on the stroke training split
predictor_stroke = TabularPredictor(
    label='stroke',
    eval_metric='roc_auc',
)
predictor_stroke = predictor_stroke.fit(stroke_train)

Now we evaluate the models on the test datasets, and also show a leaderboard with a performance breakdown across all models trained during construction of the ensemble

In [None]:
# Score the fitted heart disease predictor on the held-out test split
predictor_hd.evaluate(hd_test)

In [None]:
# Score the fitted stroke predictor on the held-out test split (before any threshold change)
predictor_stroke.evaluate(stroke_test)

In [None]:
# Ask the predictor for a decision threshold calibrated for the F1 metric.
# No data is passed here, so the predictor chooses what to calibrate on
# (presumably its internal validation data — confirm against the AutoGluon docs).
threshold = predictor_stroke.calibrate_decision_threshold(metric='f1')

In [None]:
# Install the calibrated threshold on the predictor
# (presumably used by later predict/evaluate calls — confirm against the AutoGluon docs)
predictor_stroke.set_decision_threshold(threshold)

In [None]:
# Re-score on the same test split now that the decision threshold has been set,
# for comparison with the earlier evaluate() output
predictor_stroke.evaluate(stroke_test)

### Now We Compare the Performance of the Off-the-shelf AutoGluon tabular Predictor with Commonly Used Scikit-learn models

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import auc, roc_curve, f1_score
from sklearn.preprocessing import OrdinalEncoder

In [None]:
# Baseline scikit-learn random forests, one per dataset. random_state is fixed so
# the baseline metrics compared against AutoGluon are reproducible across re-runs.
clf_stroke = RandomForestClassifier(random_state=42)
clf_hd = RandomForestClassifier(random_state=42)

In [None]:
# Fit the heart disease forest: every column except the label is a feature
hd_train_features = hd_train.drop(columns=['binary_label'])
hd_train_labels = hd_train['binary_label']
clf_hd.fit(hd_train_features, hd_train_labels)

In [None]:
# Class probabilities for the heart disease test split (label column removed first)
hd_test_features = hd_test.drop(columns=['binary_label'])
preds = clf_hd.predict_proba(hd_test_features)

In [None]:
# ROC curve points from the second predict_proba column (presumably the probability
# of label 1 — confirm via clf_hd.classes_); the returned thresholds are discarded
fpr, tpr, _ = roc_curve(hd_test['binary_label'], preds[:, 1])

In [None]:
# Area under the ROC curve computed from the fpr/tpr points above
print(f"AUC: {auc(fpr, tpr)}")

In [None]:
# Preview the first rows of the stroke training split rather than rendering the whole frame
stroke_train.head()

In [None]:
# For the stroke dataset, we need to convert the text-valued columns to numerical ordinal values.
categorical_cols = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
numeric_cols = ['age', 'hypertension', 'heart_disease', 'avg_glucose_level', 'bmi']

# With a random split, a rare category can end up only in the test split. The default
# handle_unknown='error' would then make encoder.transform() raise on the test data,
# so unseen categories are mapped to -1 instead.
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
stroke_ordinal = encoder.fit_transform(stroke_train[categorical_cols])
stroke_ordinal_df = pd.DataFrame(data=stroke_ordinal, columns=categorical_cols)

# Reassemble numeric features, encoded categoricals and the label side by side.
# Every piece is re-indexed from 0 so the column-wise concat aligns rows by position.
stroke_train_ordinal = pd.concat([
    stroke_train[numeric_cols].reset_index(drop=True),
    stroke_ordinal_df.reset_index(drop=True),
    stroke_train[['stroke']].reset_index(drop=True)
], axis=1)

In [None]:
# Preview the first rows of the encoded training frame rather than rendering the whole frame
stroke_train_ordinal.head()

In [None]:
# Fit the stroke forest on the encoded training frame: all columns but 'stroke' are features
# NOTE(review): if any feature (e.g. bmi) contains missing values, older scikit-learn
# releases reject NaN in RandomForestClassifier.fit — confirm against the installed version
stroke_train_features = stroke_train_ordinal.drop(columns=['stroke'])
stroke_train_labels = stroke_train_ordinal['stroke']
clf_stroke.fit(stroke_train_features, stroke_train_labels)

In [None]:
# Encode the test split with the encoder already fitted on the training split,
# then rebuild a frame with the same column layout as the training frame.
categorical_cols = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
numeric_cols = ['age', 'hypertension', 'heart_disease', 'avg_glucose_level', 'bmi']

stroke_test_ordinal = encoder.transform(stroke_test[categorical_cols])
stroke_test_ordinal_df = pd.DataFrame(data=stroke_test_ordinal, columns=categorical_cols)

# Each piece is re-indexed from 0 so the column-wise concat lines rows up by position
stroke_test_pieces = [
    stroke_test[numeric_cols].reset_index(drop=True),
    stroke_test_ordinal_df.reset_index(drop=True),
    stroke_test[['stroke']].reset_index(drop=True),
]
stroke_test_ordinal_final = pd.concat(stroke_test_pieces, axis=1)

In [None]:
# Preview the first rows of the encoded test frame rather than rendering the whole frame
stroke_test_ordinal_final.head()

In [None]:
# Class probabilities for the encoded stroke test frame (label column removed first)
stroke_test_features = stroke_test_ordinal_final.drop(columns=['stroke'])
preds = clf_stroke.predict_proba(stroke_test_features)

In [None]:
# ROC curve points from the second predict_proba column (presumably the probability
# of stroke == 1 — confirm via clf_stroke.classes_); the returned thresholds are discarded
fpr, tpr, _ = roc_curve(stroke_test_ordinal_final['stroke'], preds[:, 1])

In [None]:
# Report ROC AUC from the curve points computed above, then F1 from the forest's
# hard class predictions on the encoded test frame
print(f"AUC: {auc(fpr, tpr)}")
stroke_test_labels = stroke_test_ordinal_final['stroke']
stroke_test_pred = clf_stroke.predict(stroke_test_ordinal_final.drop(columns=['stroke']))
print(f"F1: {f1_score(stroke_test_labels, stroke_test_pred)}")