# 06 – Hyperparameter Tuning

Use GridSearchCV / RandomizedSearchCV to optimize models and compare to baseline.


In [2]:
# --- Imports ---
import os, pandas as pd, numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from scipy.stats import randint

# --- 1. Load and clean .data file ---
DATA_PATH = os.path.join("..", "data", "heart_disease.data")
df = pd.read_csv(DATA_PATH, header=None)

# Assign column names based on UCI dataset
df.columns = [
    "age","sex","cp","trestbps","chol","fbs","restecg",
    "thalach","exang","oldpeak","slope","ca","thal","num"
]

# Replace '?' with NaN and convert to numeric
df = df.replace('?', np.nan)
for col in df.columns:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Drop rows where target ('num') is missing and create binary target
df = df.dropna(subset=['num'])
df['target'] = (df['num'] > 0).astype(int)
df = df.drop(columns=['num'])

print("Shape after cleaning:", df.shape)
print("Target distribution:\n", df['target'].value_counts())

# --- 2. Split features and target ---
X = df.drop(columns=['target'])
y = df['target']

num_features = X.select_dtypes(include=np.number).columns.tolist()
cat_features = [c for c in X.columns if c not in num_features]

# --- 3. Preprocessor ---
preprocessor = ColumnTransformer([
    ("num", Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler())
    ]), num_features),
    ("cat", Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore"))
    ]), cat_features)
])

# --- 4. Train/test split ---
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# --- 5. Baseline RandomForest ---
rf_base = Pipeline([("prep", preprocessor),
                    ("clf", RandomForestClassifier(random_state=42))])
rf_base.fit(X_train, y_train)
base_auc = roc_auc_score(y_test, rf_base.predict_proba(X_test)[:, 1])
print(f"\nBaseline RF AUC: {base_auc:.4f}")

# --- 6. RandomizedSearchCV for RandomForest ---
rf = Pipeline([("prep", preprocessor),
               ("clf", RandomForestClassifier(random_state=42))])

param_dist = {
    "clf__n_estimators": randint(200, 800),
    "clf__max_depth": randint(3, 20),
    "clf__min_samples_split": randint(2, 20),
    "clf__min_samples_leaf": randint(1, 10)
}

rs = RandomizedSearchCV(
    rf, param_dist, n_iter=30, scoring="roc_auc",
    cv=5, random_state=42, n_jobs=-1
)
rs.fit(X_train, y_train)
best_rs_auc = roc_auc_score(y_test, rs.predict_proba(X_test)[:, 1])
print(f"Best RF (RandomizedSearch) AUC: {best_rs_auc:.4f}")
print("Best RF params:", rs.best_params_)

# --- 7. GridSearchCV for SVC ---
svc = Pipeline([("prep", preprocessor),
                ("clf", SVC(probability=True, random_state=42))])

param_grid = {
    "clf__C": [0.1, 1, 3, 10],
    "clf__gamma": ["scale", 0.01, 0.001],
    "clf__kernel": ["rbf"]
}

gs = GridSearchCV(
    svc, param_grid, scoring="roc_auc", cv=5, n_jobs=-1
)
gs.fit(X_train, y_train)
best_gs_auc = roc_auc_score(y_test, gs.predict_proba(X_test)[:, 1])
print(f"Best SVC (GridSearch) AUC: {best_gs_auc:.4f}")
print("Best SVC params:", gs.best_params_)


Shape after cleaning: (303, 14)
Target distribution:
 target
0    164
1    139
Name: count, dtype: int64

Baseline RF AUC: 0.9513
Best RF (RandomizedSearch) AUC: 0.9621
Best RF params: {'clf__max_depth': 3, 'clf__min_samples_leaf': 8, 'clf__min_samples_split': 12, 'clf__n_estimators': 698}
Best SVC (GridSearch) AUC: 0.9600
Best SVC params: {'clf__C': 1, 'clf__gamma': 0.001, 'clf__kernel': 'rbf'}
