# Hyperparameter Tuning - Heart Disease Dataset

## Steps to Complete:
1. Use GridSearchCV & RandomizedSearchCV to optimize model hyperparameters
2. Compare optimized models with baseline performance

## Deliverable:
- Best performing model with optimized hyperparameters


In [14]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from scipy.stats import randint, uniform
import warnings
warnings.filterwarnings('ignore')

print("Libraries loaded!")


Libraries loaded!


In [15]:
# Load and prepare data (same as notebook 01)
print("Loading and preparing data...")

column_names = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
                'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target']

df = pd.read_csv('../data/Heart_Disease.csv', names=column_names, na_values='?')
df_clean = df.dropna()

categorical_columns = ['cp', 'restecg', 'slope', 'ca', 'thal']
numerical_columns = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']

# One-hot encoding
df_encoded = pd.get_dummies(df_clean, columns=categorical_columns, prefix=categorical_columns)

# Scale numerical features
scaler = StandardScaler()
df_encoded[numerical_columns] = scaler.fit_transform(df_encoded[numerical_columns])

# Create target
df_encoded['target_binary'] = (df_encoded['target'] > 0).astype(int)

# Features / target
feature_columns = [col for col in df_encoded.columns if col not in ['target', 'target_binary']]
X = df_encoded[feature_columns]
y = df_encoded['target_binary']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print("Data ready for tuning!")


Loading and preparing data...
Data ready for tuning!


In [16]:
# Step 1: Baseline models (for comparison)
print("Training baseline models...")

baseline_models = {
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42, n_estimators=100),
    'SVM': SVC(random_state=42, probability=True)
}

baseline_results = {}
for name, model in baseline_models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]
    baseline_results[name] = {
        'f1': f1_score(y_test, y_pred),
        'auc': roc_auc_score(y_test, y_proba)
    }

print("Baseline results (F1, AUC):")
print(pd.DataFrame(baseline_results).T.round(4))


Training baseline models...
Baseline results (F1, AUC):
                         f1     auc
Logistic Regression  0.7843  0.9375
Decision Tree        0.7143  0.7321
Random Forest        0.7451  0.9213
SVM                  0.8077  0.9509


In [17]:
# Step 2: Hyperparameter Tuning (RandomizedSearchCV + GridSearchCV)
print("Tuning hyperparameters...")

# Define parameter grids
param_distributions = {
    'Logistic Regression': {
        'C': uniform(0.001, 10.0),
        'penalty': ['l2'],
        'solver': ['lbfgs']
    },
    'Decision Tree': {
        'max_depth': randint(2, 20),
        'min_samples_split': randint(2, 20),
        'min_samples_leaf': randint(1, 10)
    },
    'Random Forest': {
        'n_estimators': randint(50, 300),
        'max_depth': randint(2, 20),
        'min_samples_split': randint(2, 20),
        'min_samples_leaf': randint(1, 10)
    },
    'SVM': {
        'C': uniform(0.1, 10.0),
        'gamma': ['scale', 'auto'],
        'kernel': ['rbf', 'linear']
    }
}

param_grids = {
    'Logistic Regression': {
        'C': [0.01, 0.1, 1, 10],
        'solver': ['lbfgs'],
        'penalty': ['l2']
    },
    'Decision Tree': {
        'max_depth': [3, 5, 7, 10, None],
        'min_samples_split': [2, 5, 10, 15],
        'min_samples_leaf': [1, 2, 4, 6]
    },
    'Random Forest': {
        'n_estimators': [100, 150, 200, 300],
        'max_depth': [None, 5, 10, 15],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    },
    'SVM': {
        'C': [0.1, 1, 10],
        'gamma': ['scale', 'auto'],
        'kernel': ['rbf', 'linear']
    }
}

# Mapping from name to estimator
estimators = {
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'SVM': SVC(random_state=42, probability=True)
}

# Run tuning
best_models = {}
for name, estimator in estimators.items():
    print(f"\n{name} - RandomizedSearchCV...")
    rs = RandomizedSearchCV(
        estimator=estimator,
        param_distributions=param_distributions[name],
        n_iter=25,
        scoring='f1',
        cv=5,
        random_state=42,
        n_jobs=-1
    )
    rs.fit(X_train, y_train)

    print(f"Best params (Randomized): {rs.best_params_}")

    print(f"{name} - GridSearchCV around best...")
    # Build a small grid around the best params when applicable
    grid = param_grids[name]
    gs = GridSearchCV(
        estimator=estimator,
        param_grid=grid,
        scoring='f1',
        cv=5,
        n_jobs=-1
    )
    gs.fit(X_train, y_train)
    print(f"Best params (Grid): {gs.best_params_}")

    # Select the best of the two by CV score
    best_cv = max(rs.best_score_, gs.best_score_)
    best_est = rs.best_estimator_ if rs.best_score_ >= gs.best_score_ else gs.best_estimator_
    best_models[name] = best_est

print("\nHyperparameter tuning complete!")


Tuning hyperparameters...

Logistic Regression - RandomizedSearchCV...
Best params (Randomized): {'C': np.float64(0.20684494295802447), 'penalty': 'l2', 'solver': 'lbfgs'}
Logistic Regression - GridSearchCV around best...
Best params (Grid): {'C': 0.1, 'penalty': 'l2', 'solver': 'lbfgs'}

Decision Tree - RandomizedSearchCV...
Best params (Randomized): {'max_depth': 12, 'min_samples_leaf': 8, 'min_samples_split': 8}
Decision Tree - GridSearchCV around best...
Best params (Grid): {'max_depth': 7, 'min_samples_leaf': 4, 'min_samples_split': 15}

Random Forest - RandomizedSearchCV...
Best params (Randomized): {'max_depth': 8, 'min_samples_leaf': 8, 'min_samples_split': 16, 'n_estimators': 84}
Random Forest - GridSearchCV around best...
Best params (Grid): {'max_depth': 5, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 100}

SVM - RandomizedSearchCV...
Best params (Randomized): {'C': np.float64(1.6601864044243653), 'gamma': 'scale', 'kernel': 'rbf'}
SVM - GridSearchCV around

In [18]:
# Step 3: Compare optimized models with baseline
print("Comparing optimized models with baseline...")

comparison = []
for name in baseline_models.keys():
    # Baseline
    base_f1 = baseline_results[name]['f1']
    base_auc = baseline_results[name]['auc']
    
    # Optimized
    model_opt = best_models[name]
    y_pred_opt = model_opt.predict(X_test)
    y_proba_opt = model_opt.predict_proba(X_test)[:, 1]
    opt_f1 = f1_score(y_test, y_pred_opt)
    opt_auc = roc_auc_score(y_test, y_proba_opt)
    
    comparison.append({
        'Model': name,
        'Baseline F1': base_f1,
        'Optimized F1': opt_f1,
        'Baseline AUC': base_auc,
        'Optimized AUC': opt_auc
    })

comparison_df = pd.DataFrame(comparison)
print(comparison_df.round(4))

# Identify best model by F1 (then AUC as tie-breaker)
best_row = comparison_df.sort_values(['Optimized F1', 'Optimized AUC'], ascending=False).iloc[0]
best_model_name = best_row['Model']
best_model = best_models[best_model_name]

print(f"\nBest performing model after tuning: {best_model_name}")
print(f"Optimized F1: {best_row['Optimized F1']:.4f}")
print(f"Optimized AUC: {best_row['Optimized AUC']:.4f}")


Comparing optimized models with baseline...
                 Model  Baseline F1  Optimized F1  Baseline AUC  Optimized AUC
0  Logistic Regression       0.7843        0.8077        0.9375         0.9397
1        Decision Tree       0.7143        0.7200        0.7321         0.8599
2        Random Forest       0.7451        0.8077        0.9213         0.9342
3                  SVM       0.8077        0.7843        0.9509         0.9420

Best performing model after tuning: Logistic Regression
Optimized F1: 0.8077
Optimized AUC: 0.9397


## Deliverables Completed

### ✅ Step 1: Use GridSearchCV & RandomizedSearchCV to optimize model hyperparameters
- Tuned hyperparameters for Logistic Regression, Decision Tree, Random Forest, and SVM
- Used RandomizedSearchCV for broad exploration
- Used GridSearchCV for fine-tuning near the best parameters

### ✅ Step 2: Compare optimized models with baseline performance
- Computed baseline (F1, AUC) for all models
- Compared with optimized model performance on test set
- Identified best performing model based on F1 (AUC tie-breaker)

### ✅ Deliverable: Best performing model with optimized hyperparameters
- Best model printed with optimized F1 and AUC
- Ready for downstream evaluation, saving, and deployment
