In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder

# Loading the CSV data into a Pandas DataFrame from a URL
df = pd.read_csv('https://raw.githubusercontent.com/kb22/Heart-Disease-Prediction/master/dataset.csv')

# Display the first few rows of the DataFrame to inspect the data
df.head()

# Display concise summary of the DataFrame, including data types and non-null values
df.info()

# Display unique values for categorical features to understand their distribution
df["sex"].unique()
df["cp"].unique()
df["fbs"].unique()
df["restecg"].unique()
df["exang"].unique()
df["slope"].unique()
df["thal"].unique()
df["ca"].unique()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trestbps  303 non-null    int64  
 4   chol      303 non-null    int64  
 5   fbs       303 non-null    int64  
 6   restecg   303 non-null    int64  
 7   thalach   303 non-null    int64  
 8   exang     303 non-null    int64  
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    int64  
 11  ca        303 non-null    int64  
 12  thal      303 non-null    int64  
 13  target    303 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 33.3 KB


array([0, 2, 1, 3, 4])

In [2]:
import pandas as pd
import numpy as np
from sklearn.utils import resample
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import recall_score, accuracy_score
from scipy import stats

# ==========================================
# 1. SETUP & DATA LOADING
# ==========================================

# Basic Preprocessing
X = df.drop('target', axis=1)
y = df['target']

# Configuration
N_BOOTSTRAPS = 1000   # Number of bootstrap iterations (higher = more precise, e.g., 200 or 500)
RECALL_THRESHOLD = 0.90
RANDOM_SEED = 42

print(f"Starting Bootstrap Analysis with {N_BOOTSTRAPS} iterations...")

# ==========================================
# 2. BOOTSTRAP FUNCTION
# ==========================================
def get_bootstrap_metrics(model, X, y, seeds):
    """
    Runs bootstrap evaluation using a fixed list of seeds.
    Returns arrays of Recall and Accuracy scores.
    """
    recalls = []
    accuracies = []

    for seed in seeds:
        # Bootstrap Resample (Create a training set with replacement)
        # The 'Out-of-Bag' (OOB) samples are used as the Test set
        train_idx = resample(np.arange(len(y)), replace=True, n_samples=len(y), random_state=seed)
        test_idx = np.setdiff1d(np.arange(len(y)), train_idx)

        # Edge case: if test set is empty (rare), skip
        if len(test_idx) == 0: continue

        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        recalls.append(recall_score(y_test, y_pred))
        accuracies.append(accuracy_score(y_test, y_pred))

    return np.array(recalls), np.array(accuracies)

# Generate fixed seeds so Baseline and Tuned models see the SAME data splits (Paired Testing)
bootstrap_seeds = np.random.RandomState(RANDOM_SEED).randint(0, 10000, size=N_BOOTSTRAPS)

# ==========================================
# 3. DEFINE MODELS & GRIDS
# ==========================================
models = {
    'Logistic Regression': LogisticRegression(max_iter=2000),
    'SVC': SVC(probability=True),
    'KNN': KNeighborsClassifier(),
    'Random Forest': RandomForestClassifier()
}

# Simplified Grids for demonstration (Expand these for better results)
param_grids = {
    'Logistic Regression': [{'C': x, 'class_weight': w} for x in [0.01, 0.1, 1, 10] for w in [None, 'balanced']],
    'SVC': [{'C': x, 'kernel': k, 'class_weight': 'balanced'} for x in [0.1, 1, 10] for k in ['rbf', 'linear']],
    'KNN': [{'n_neighbors': n} for n in [3, 5, 7, 9, 11]]
}

results = {}

# ==========================================
# 4. EXECUTION LOOP
# ==========================================
for name, base_model in models.items():
    print(f"\nAnalyzing {name}...")

    # Needs scaling? (Tree models don't, others do)
    if name == 'Random Forest':
        pipeline_base = base_model
    else:
        pipeline_base = make_pipeline(StandardScaler(), base_model)

    # --- A. BASELINE EVALUATION ---
    # We must re-evaluate baseline with bootstrap to compare apples-to-apples
    base_rec, base_acc = get_bootstrap_metrics(pipeline_base, X, y, bootstrap_seeds)

    # --- B. TUNING SEARCH (Find config with Recall >= 90%) ---
    best_tuned_acc = -1
    best_tuned_scores = None
    best_params = None

    # Iterate through grid
    grid = param_grids.get(name, [])
    for params in grid:
        # Update model params
        if name == 'Random Forest':
            model_tune = base_model.set_params(**params)
            pipe_tune = model_tune
        else:
            # Update the classifier step in pipeline
            model_tune = base_model.set_params(**params)
            pipe_tune = make_pipeline(StandardScaler(), model_tune)

        # Run Quick Bootstrap (fewer iterations for speed, e.g., 20) or Full
        # For accuracy, we'll use the full set here.
        rec_scores, acc_scores = get_bootstrap_metrics(pipe_tune, X, y, bootstrap_seeds)

        mean_rec = np.mean(rec_scores)

        # CRITERION: Average Recall >= 90%
        if mean_rec >= RECALL_THRESHOLD:
            mean_acc = np.mean(acc_scores)
            if mean_acc > best_tuned_acc:
                best_tuned_acc = mean_acc
                best_tuned_scores = (rec_scores, acc_scores)
                best_params = params

    # --- C. STORE RESULTS ---
    results[name] = {
        'baseline_acc': base_acc,
        'baseline_rec': base_rec,
        'tuned_acc': best_tuned_scores[1] if best_tuned_scores else None,
        'tuned_rec': best_tuned_scores[0] if best_tuned_scores else None,
        'best_params': best_params
    }

# ==========================================
# 5. STATISTICAL COMPARISON OUTPUT
# ==========================================
print("\n" + "="*60)
print(f"{'MODEL COMPARISON (BOOTSTRAP N=' + str(N_BOOTSTRAPS) + ')':^60}")
print("="*60)

for name in models.keys():
    res = results[name]

    # Baseline Metrics
    base_acc_mean = np.mean(res['baseline_acc'])
    base_ci = np.percentile(res['baseline_acc'], [2.5, 97.5])

    print(f"\n>>> {name}")
    print(f"   Baseline Accuracy: {base_acc_mean:.2%} (95% CI: {base_ci[0]:.2%} - {base_ci[1]:.2%})")

    if res['tuned_acc'] is not None:
        tuned_acc_mean = np.mean(res['tuned_acc'])
        tuned_rec_mean = np.mean(res['tuned_rec'])
        tuned_ci = np.percentile(res['tuned_acc'], [2.5, 97.5])

        print(f"   Tuned Accuracy:    {tuned_acc_mean:.2%} (95% CI: {tuned_ci[0]:.2%} - {tuned_ci[1]:.2%})")
        print(f"   (Constraint Met: Recall = {tuned_rec_mean:.2%})")
        print(f"   Best Params: {res['best_params']}")

        # --- STATISTICAL TEST ---
        # Paired t-test (Difference between Tuned and Baseline on SAME folds)
        diff = res['tuned_acc'] - res['baseline_acc']
        t_stat, p_val = stats.ttest_1samp(diff, 0)

        significance = "SIGNIFICANT" if p_val < 0.05 else "Not Significant"
        print(f"   Improvement:       {np.mean(diff):.2%} (p-value: {p_val:.4f}) -> {significance}")
    else:
        print(f"   [!] No configuration met the {RECALL_THRESHOLD*100}% Recall constraint.")

Starting Bootstrap Analysis with 1000 iterations...

Analyzing Logistic Regression...

Analyzing SVC...

Analyzing KNN...

Analyzing Random Forest...

            MODEL COMPARISON (BOOTSTRAP N=1000)             

>>> Logistic Regression
   Baseline Accuracy: 81.65% (95% CI: 75.00% - 87.50%)
   Tuned Accuracy:    81.98% (95% CI: 75.65% - 87.74%)
   (Constraint Met: Recall = 91.19%)
   Best Params: {'C': 0.01, 'class_weight': None}
   Improvement:       0.33% (p-value: 0.0000) -> SIGNIFICANT

>>> SVC
   Baseline Accuracy: 81.29% (95% CI: 75.22% - 87.28%)
   [!] No configuration met the 90.0% Recall constraint.

>>> KNN
   Baseline Accuracy: 79.24% (95% CI: 72.48% - 85.71%)
   [!] No configuration met the 90.0% Recall constraint.

>>> Random Forest
   Baseline Accuracy: 81.43% (95% CI: 75.23% - 87.39%)
   [!] No configuration met the 90.0% Recall constraint.
