In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score
)

In [2]:
url = "http://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data"

column_names = [
    "age", "sex", "cp", "trestbps", "chol",
    "fbs", "restecg", "thalach", "exang",
    "oldpeak", "slope", "ca", "thal", "target"
]

df = pd.read_csv(url, header=None, names=column_names)

In [3]:
df.replace("?", np.nan, inplace=True)
df = df.apply(pd.to_numeric)
df["target"] = df["target"].apply(lambda x: 0 if x == 0 else 1)

In [4]:
X = df.drop("target", axis=1)
y = df["target"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

In [5]:
pipelines = {
    "LogisticRegression": Pipeline([
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
        ("model", LogisticRegression(max_iter=1000))
    ]),

    "KNN": Pipeline([
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
        ("model", KNeighborsClassifier())
    ]),

    "DecisionTree": Pipeline([
        ("imputer", SimpleImputer(strategy="mean")),
        ("model", DecisionTreeClassifier())
    ]),

    "RandomForest": Pipeline([
        ("imputer", SimpleImputer(strategy="mean")),
        ("model", RandomForestClassifier())
    ]),

    "SVM": Pipeline([
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
        ("model", SVC(probability=True))
    ])
}

In [6]:
param_grids = {
    "LogisticRegression": {
        "model__C": [0.01, 0.1, 1, 10]
    },

    "KNN": {
        "model__n_neighbors": [3, 5, 7, 9],
        "model__weights": ["uniform", "distance"]
    },

    "DecisionTree": {
        "model__max_depth": [None, 5, 10, 20],
        "model__min_samples_split": [2, 5, 10]
    },

    "RandomForest": {
        "model__n_estimators": [100, 200],
        "model__max_depth": [None, 10, 20]
    },

    "SVM": {
        "model__C": [0.1, 1, 10],
        "model__gamma": ["scale", "auto"]
    }
}

In [7]:
results = {}

for name, pipeline in pipelines.items():
    print(f"\nTraining {name}...")

    grid = GridSearchCV(
        pipeline,
        param_grids[name],
        cv=5,
        scoring="recall",   # medical priority
        n_jobs=-1
    )

    grid.fit(X_train, y_train)

    best_model = grid.best_estimator_
    y_pred = best_model.predict(X_test)
    y_proba = best_model.predict_proba(X_test)[:, 1]

    results[name] = {
        "best_params": grid.best_params_,
        "accuracy": accuracy_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred),
        "recall": recall_score(y_test, y_pred),
        "f1": f1_score(y_test, y_pred),
        "roc_auc": roc_auc_score(y_test, y_proba)
    }


Training LogisticRegression...

Training KNN...

Training DecisionTree...

Training RandomForest...

Training SVM...


In [8]:
results_df = pd.DataFrame(results).T
print(results_df.sort_values(by="recall", ascending=False))

                                                          best_params  \
KNN                 {'model__n_neighbors': 3, 'model__weights': 'u...   
RandomForest        {'model__max_depth': 20, 'model__n_estimators'...   
SVM                         {'model__C': 10, 'model__gamma': 'scale'}   
LogisticRegression                                   {'model__C': 10}   
DecisionTree        {'model__max_depth': 10, 'model__min_samples_s...   

                    accuracy precision    recall        f1   roc_auc  
KNN                 0.852459  0.771429  0.964286  0.857143  0.893939  
RandomForest        0.901639   0.84375  0.964286       0.9  0.945346  
SVM                 0.819672   0.72973  0.964286  0.830769  0.902597  
LogisticRegression  0.852459  0.787879  0.928571  0.852459  0.949134  
DecisionTree        0.704918  0.647059  0.785714  0.709677  0.711039  
