In [4]:
# ============================================================
# 06_hyperparameter_tuning.ipynb
# Hyperparameter tuning for best model (RandomForest)
# ============================================================

# ---- Cell 1: Imports ----
import os

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# ---- Cell 2: Load cleaned dataset ----
# Read the cleaned CSV, then show its dimensions, column names and first rows.
cleaned_csv = '../results/cleaned_heart.csv'
df = pd.read_csv(cleaned_csv)

print("Shape:", df.shape)
print("Columns:", list(df.columns))
display(df.head())

# ---- Cell 3: Separate features and target ----
# `y` is the label column; `X` is every other column of the loaded frame.
target_col = 'num'
y = df[target_col]
X = df.drop(columns=target_col)

# Explicit numeric and categorical features
# (these lists decide which preprocessing branch each column goes through).
numeric_features = [
    'age',
    'trestbps',
    'chol',
    'thalach',
    'oldpeak',
]
categorical_features = [
    'sex',
    'cp',
    'fbs',
    'restecg',
    'exang',
    'slope',
    'ca',
    'thal',
]

# ---- Cell 4: Preprocessing pipeline ----
# Numeric branch: median imputation followed by standard scaling.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(numeric_steps)

# Categorical branch: most-frequent imputation followed by one-hot encoding,
# with the encoder configured to ignore categories it did not see during fit.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(categorical_steps)

# Route each explicit feature list to its matching branch.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# ---- Cell 5: Train/test split ----
# Hold out 20% of the rows, stratified on the target, with a fixed seed
# so the split is repeatable.
split_options = dict(test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# ---- Cell 6: Define parameter grid for RandomForest ----
# Candidate values drawn from by the randomized search. Each key carries the
# 'classifier__' prefix so the setting is addressed to the pipeline step
# named 'classifier'.
tree_counts = list(range(100, 501, 100))        # 100, 200, ..., 500
depth_limits = [None] + list(range(5, 21, 5))   # None, 5, 10, 15, 20

param_dist = dict([
    ('classifier__n_estimators', tree_counts),
    ('classifier__max_depth', depth_limits),
    ('classifier__min_samples_split', [2, 5, 10]),
    ('classifier__min_samples_leaf', [1, 2, 4]),
    ('classifier__max_features', ['sqrt', 'log2', None]),
    ('classifier__bootstrap', [True, False]),
])

# ---- Cell 7: Build pipeline ----
# Chain the preprocessing step with a seeded random forest; the step names
# ('preprocessor', 'classifier') are what the search-space keys refer to.
forest = RandomForestClassifier(random_state=42)
rf_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', forest),
])

# ---- Cell 8: Randomized search ----
# Try 30 sampled configurations with 5-fold cross-validation, scored by
# accuracy, using all available cores and a fixed seed.
search_settings = dict(
    param_distributions=param_dist,
    n_iter=30,
    cv=5,
    scoring='accuracy',
    random_state=42,
    n_jobs=-1,
    verbose=2,
)
search = RandomizedSearchCV(rf_pipeline, **search_settings)
search.fit(X_train, y_train)

# Report the winning configuration and its cross-validated score.
print("\nBest parameters found:")
print(search.best_params_)
print(f"Best CV score: {search.best_score_:.4f}")

# ---- Cell 9: Evaluate tuned model ----
# Score the best estimator from the search on the held-out test split.
best_model = search.best_estimator_
y_pred = best_model.predict(X_test)

report_text = classification_report(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

print("\nClassification Report (Tuned RandomForest):")
print(report_text)
print("Confusion Matrix:\n", cm)

# ---- Cell 10: Save best model ----
# Persist the tuned pipeline (preprocessing + classifier) with joblib.
# The output directory is created first: joblib.dump does not create missing
# parent folders, so on a fresh checkout without ../model this final step
# would otherwise raise FileNotFoundError after the whole search has run.
model_path = '../model/final_model.pkl'
os.makedirs(os.path.dirname(model_path), exist_ok=True)
joblib.dump(best_model, model_path)
print(f"Tuned model saved to {model_path}")


Shape: (303, 14)
Columns: ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'num']


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
0,63,1,1,145,233,1,2,150,0,2.3,3,0.0,6.0,0
1,67,1,4,160,286,0,2,108,1,1.5,2,3.0,3.0,1
2,67,1,4,120,229,0,2,129,1,2.6,2,2.0,7.0,1
3,37,1,3,130,250,0,0,187,0,3.5,3,0.0,3.0,0
4,41,0,2,130,204,0,2,172,0,1.4,1,0.0,3.0,0


Fitting 5 folds for each of 30 candidates, totalling 150 fits

Best parameters found:
{'classifier__n_estimators': 400, 'classifier__min_samples_split': 2, 'classifier__min_samples_leaf': 4, 'classifier__max_features': 'sqrt', 'classifier__max_depth': 10, 'classifier__bootstrap': True}
Best CV score: 0.8138

Classification Report (Tuned RandomForest):
              precision    recall  f1-score   support

           0       0.90      0.82      0.86        33
           1       0.81      0.89      0.85        28

    accuracy                           0.85        61
   macro avg       0.85      0.86      0.85        61
weighted avg       0.86      0.85      0.85        61

Confusion Matrix:
 [[27  6]
 [ 3 25]]
Tuned model saved to ../model/final_model.pkl
