In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [2]:
# Load the feature-selected dataset. If the CSV is missing, fall back to a
# synthetic placeholder frame so the remaining cells can still execute.
try:
    df = pd.read_csv('../data/heart_disease_selected_features.csv')
    print("Dataset with selected features loaded successfully.")
except FileNotFoundError:
    print("Error: 'heart_disease_selected_features.csv' not found.")
    print("Please run the feature selection script first to generate this file.")
    # As a fallback, create a dummy dataframe
    print("Creating a dummy dataframe for demonstration purposes.")
    # Use a dedicated, seeded generator so the placeholder data (and every
    # metric computed from it) is identical on each Restart & Run All,
    # instead of depending on the global NumPy random state.
    fallback_rng = np.random.default_rng(42)
    n_rows = 303
    # Eight uniform [0, 1) feature columns named feat1..feat8.
    data = {f'feat{i}': fallback_rng.random(n_rows) for i in range(1, 9)}
    # Binary label drawn from {0, 1} (upper bound is exclusive).
    data['target'] = fallback_rng.integers(0, 2, n_rows)
    df = pd.DataFrame(data)

Dataset with selected features loaded successfully.


In [3]:
# Separate the label column from the predictor columns.
y = df['target']
X = df.drop(columns='target')

In [4]:
# Collapse the label to binary: any value above 0 becomes 1, everything else 0.
y = y.gt(0).astype(int)

In [5]:
# Hold out 20% of the rows for testing, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [6]:
print("\n--- Evaluating Baseline Random Forest Model ---")

# Reference model: a random forest with library-default hyperparameters.
# fit() returns the estimator itself, so construction and training chain.
baseline_rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Score the reference model on the held-out split.
y_pred_baseline = baseline_rf.predict(X_test)
baseline_accuracy = accuracy_score(y_test, y_pred_baseline)
baseline_report = classification_report(y_test, y_pred_baseline)

print(
    f"Baseline Model Accuracy: {baseline_accuracy:.4f}",
    "Baseline Model Classification Report:",
    baseline_report,
    sep="\n",
)


--- Evaluating Baseline Random Forest Model ---
Baseline Model Accuracy: 0.8525
Baseline Model Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.76      0.85        33
           1       0.77      0.96      0.86        28

    accuracy                           0.85        61
   macro avg       0.87      0.86      0.85        61
weighted avg       0.87      0.85      0.85        61



In [7]:
print("\n--- Starting Hyperparameter Tuning for Random Forest ---")

# Search space for the random forest: 3 * 4 * 3 * 3 * 2 = 216 combinations.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=['sqrt', 'log2'],
)


--- Starting Hyperparameter Tuning for Random Forest ---


In [8]:
# Exhaustive search over param_grid, scored by accuracy with 5-fold
# cross-validation on the training split only.
search_options = {
    'cv': 5,
    'n_jobs': -1,       # use all available cores
    'verbose': 2,
    'scoring': 'accuracy',
}
rf_for_search = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(
    estimator=rf_for_search,
    param_grid=param_grid,
    **search_options,
)

# Run the search; kept as the final expression so the cell displays the fitted object.
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 216 candidates, totalling 1080 fits


0,1,2
,estimator,RandomForestC...ndom_state=42)
,param_grid,"{'max_depth': [None, 10, ...], 'max_features': ['sqrt', 'log2'], 'min_samples_leaf': [1, 2, ...], 'min_samples_split': [2, 5, ...], ...}"
,scoring,'accuracy'
,n_jobs,-1
,refit,True
,cv,5
,verbose,2
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,n_estimators,200
,criterion,'gini'
,max_depth,
,min_samples_split,10
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'log2'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [9]:
print("\n--- Evaluating Best Model Found by GridSearchCV ---")

# Winning hyperparameter combination from the search.
best_params = grid_search.best_params_
print(f"Best Hyperparameters Found: {best_params}")

# Estimator that the search exposes for the winning combination.
best_rf_model = grid_search.best_estimator_

# Score the tuned model on the same held-out split used for the baseline.
y_pred_tuned = best_rf_model.predict(X_test)
tuned_accuracy = accuracy_score(y_test, y_pred_tuned)
tuned_report = classification_report(y_test, y_pred_tuned)

print(
    f"\nTuned Model Accuracy: {tuned_accuracy:.4f}",
    "Tuned Model Classification Report:",
    tuned_report,
    sep="\n",
)


--- Evaluating Best Model Found by GridSearchCV ---
Best Hyperparameters Found: {'max_depth': None, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 200}

Tuned Model Accuracy: 0.8689
Tuned Model Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.82      0.87        33
           1       0.81      0.93      0.87        28

    accuracy                           0.87        61
   macro avg       0.87      0.87      0.87        61
weighted avg       0.88      0.87      0.87        61



In [10]:
print(
    "\n--- Performance Comparison ---",
    f"Baseline Accuracy: {baseline_accuracy:.4f}",
    f"Tuned Accuracy:    {tuned_accuracy:.4f}",
    sep="\n",
)

# Relative change of the tuned accuracy with respect to the baseline, in percent.
accuracy_delta = tuned_accuracy - baseline_accuracy
improvement = accuracy_delta / baseline_accuracy * 100
print(f"Improvement: {improvement:.2f}%")


--- Performance Comparison ---
Baseline Accuracy: 0.8525
Tuned Accuracy:    0.8689
Improvement: 1.92%
