In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, classification_report
import warnings
# NOTE(review): with no category/module argument this filter hides *every*
# warning raised after this cell runs (deprecations, data-conversion notices,
# etc.). Consider narrowing it to the specific warning that was noisy.
warnings.filterwarnings('ignore')

In [3]:
def train_baseline_model(X_train, X_test, y_train, y_test):
    """Fit a default-parameter Random Forest and report its test-set scores.

    Parameters
    ----------
    X_train, y_train
        Features and labels the forest is fitted on.
    X_test, y_test
        Held-out features and labels used only for scoring.

    Returns
    -------
    tuple
        ``(fitted_model, accuracy, f1)`` where both metrics are computed on
        ``X_test`` / ``y_test``. ``f1_score`` is called with its default
        arguments.
    """
    # Only the seed is pinned; every other hyperparameter stays at the
    # library default so this model serves as the untuned reference point.
    forest = RandomForestClassifier(random_state=42)
    forest.fit(X_train, y_train)

    predictions = forest.predict(X_test)
    acc = accuracy_score(y_test, predictions)
    f1 = f1_score(y_test, predictions)

    # Header and scalar metrics are emitted together; the per-class report is
    # built and printed afterwards.
    summary_lines = [
        "Baseline Model Performance:",
        f"Accuracy: {acc:.4f}",
        f"F1-Score: {f1:.4f}",
        "\nClassification Report:",
    ]
    print("\n".join(summary_lines))
    print(classification_report(y_test, predictions))

    return forest, acc, f1

In [4]:
def perform_grid_search(X_train, y_train, param_grid=None, scoring='f1', cv=5,
                        n_jobs=-1, verbose=1, random_state=42):
    """Tune a Random Forest with an exhaustive, cross-validated grid search.

    Parameters
    ----------
    X_train, y_train
        Training features and labels; the search is fitted on these only.
    param_grid : dict, optional
        Hyperparameter grid handed to ``GridSearchCV``. When omitted, the
        notebook's default grid is used (3 * 4 * 3 * 3 * 2 = 216 candidates,
        i.e. 1080 fits with the default ``cv=5``).
    scoring : str or callable, default 'f1'
        Metric used to rank candidates.
    cv : int or CV splitter, default 5
        Cross-validation strategy forwarded to ``GridSearchCV``.
    n_jobs : int, default -1
        Parallelism forwarded to ``GridSearchCV``.
    verbose : int, default 1
        Verbosity forwarded to ``GridSearchCV``.
    random_state : int, default 42
        Seed for the Random Forest being tuned.

    Returns
    -------
    The fitted ``GridSearchCV`` object.
    """
    # Build the default grid inside the call so no mutable default is shared
    # between invocations.
    if param_grid is None:
        param_grid = {
            'n_estimators': [50, 100, 200],
            'max_depth': [None, 10, 20, 30],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4],
            'criterion': ['gini', 'entropy']
        }

    rf = RandomForestClassifier(random_state=random_state)

    grid_search = GridSearchCV(
        estimator=rf,
        param_grid=param_grid,
        scoring=scoring,
        cv=cv,
        n_jobs=n_jobs,
        verbose=verbose
    )

    grid_search.fit(X_train, y_train)

    # Keep the original wording for the default metric; otherwise name the
    # metric that was actually optimised instead of mislabelling it as F1.
    if scoring == 'f1':
        score_label = "F1-Score"
    elif isinstance(scoring, str):
        score_label = f"Score ({scoring})"
    else:
        score_label = "Score"

    print("\nGrid Search Results:")
    print(f"Best Parameters: {grid_search.best_params_}")
    print(f"Best Cross-Validation {score_label}: {grid_search.best_score_:.4f}")

    return grid_search

In [5]:
def evaluate_tuned_model(grid_search, X_test, y_test):
    """Score a fitted search object on held-out data and print a report.

    Parameters
    ----------
    grid_search
        Fitted object exposing ``predict``; in this notebook it is the
        ``GridSearchCV`` instance returned by ``perform_grid_search``.
    X_test, y_test
        Held-out features and their true labels.

    Returns
    -------
    tuple
        ``(accuracy, f1)`` computed on the test set. ``f1_score`` is called
        with its default arguments.
    """
    predictions = grid_search.predict(X_test)

    accuracy = accuracy_score(y_test, predictions)
    f1 = f1_score(y_test, predictions)

    # Header plus both scalar metrics go out as one block of text; the
    # per-class report is computed and printed afterwards.
    header = (
        "\nTuned Model Performance:\n"
        f"Accuracy: {accuracy:.4f}\n"
        f"F1-Score: {f1:.4f}\n"
        "\nClassification Report:"
    )
    print(header)
    print(classification_report(y_test, predictions))

    return accuracy, f1

In [6]:
# Load the heart-disease table (Cleveland data, already processed per the file name).
df = pd.read_csv("../dataset/full_processed_heart_disease_cleveland.csv")

# Separate the label column from the predictor columns.
y = df["target"]
X = df.drop(columns=["target"])

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
# NOTE(review): the split is not stratified on `y` - consider `stratify=y` if
# class balance between train and test matters.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [10]:
# Fit the untuned Random Forest and keep its test-set metrics for the later comparison
baseline_rf, baseline_accuracy, baseline_f1 = train_baseline_model(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)


Baseline Model Performance:
Accuracy: 0.8689
F1-Score: 0.8710

Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.90      0.87        29
           1       0.90      0.84      0.87        32

    accuracy                           0.87        61
   macro avg       0.87      0.87      0.87        61
weighted avg       0.87      0.87      0.87        61



In [11]:
# Search the hyperparameter grid using the training split only
grid_search = perform_grid_search(X_train=X_train, y_train=y_train)

# Score the fitted search object on the held-out test split
tuned_accuracy, tuned_f1 = evaluate_tuned_model(
    grid_search=grid_search,
    X_test=X_test,
    y_test=y_test,
)

Fitting 5 folds for each of 216 candidates, totalling 1080 fits

Grid Search Results:
Best Parameters: {'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 50}
Best Cross-Validation F1-Score: 0.8024

Tuned Model Performance:
Accuracy: 0.9016
F1-Score: 0.9032

Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.93      0.90        29
           1       0.93      0.88      0.90        32

    accuracy                           0.90        61
   macro avg       0.90      0.90      0.90        61
weighted avg       0.90      0.90      0.90        61



In [8]:
# Report how far tuning moved each test-set metric (tuned minus baseline, scaled by 100)
print(
    "\nModel Improvement Summary:",
    f"Accuracy Improvement: {(tuned_accuracy - baseline_accuracy) * 100:.2f}%",
    f"F1-Score Improvement: {(tuned_f1 - baseline_f1) * 100:.2f}%",
    sep="\n",
)


Model Improvement Summary:
Accuracy Improvement: 3.28%
F1-Score Improvement: 3.23%


In [9]:
# Feature importance analysis
# Label each importance with the column the model was actually fitted on.
# A hand-typed name list silently mislabels features if the CSV column order
# differs, and raises if the number of features changes; taking the names from
# X_train keeps labels and values aligned by construction.
feature_importance = pd.DataFrame(
    {'importance': grid_search.best_estimator_.feature_importances_},
    index=list(X_train.columns),
).sort_values('importance', ascending=False)

print("\nFeature Importance:")
print(feature_importance)


Feature Importance:
          importance
ca          0.122644
cp          0.120732
oldpeak     0.112389
thal        0.105864
age         0.092402
thalach     0.092356
chol        0.086921
trestbps    0.075393
slope       0.062131
exang       0.055185
sex         0.043103
restecg     0.022041
fbs         0.008839
