# Hyperparameter Tuning

In this step, we use Grid Search and Randomized Search with cross-validation to explore hyperparameter combinations for Logistic Regression, Decision Tree, Random Forest, and SVM models. By systematically testing candidate configurations and comparing them against untuned baselines, we aim to identify the configuration that best improves both the accuracy and the stability of the model.

In [1]:
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Read the selected-features CSV, then separate the "num" target column
# from the remaining predictor columns.
df = pd.read_csv("../Data/heart_disease_features_selected.csv")
y = df["num"]
X = df.drop(columns="num")

In [3]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both partitions, and the fixed seed makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [4]:
# Standardize the features. The scaler is fitted on the training rows only
# and the same fitted transform is then applied to both partitions, so the
# test rows never influence the fit.
# NOTE: X_train / X_test are overwritten here, so re-running this cell on its
# own would rescale data that has already been scaled.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

### Baseline Models

In [5]:
# Untuned reference models: each one is fitted on the training partition and
# scored on the held-out test partition.
baseline_models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, solver="liblinear"),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "SVM": SVC(probability=True, random_state=42),
}

# Map each model name to its test-set accuracy and F1 score.
baseline_results = {}
for name, model in baseline_models.items():
    # fit() returns the fitted estimator, so the prediction can be chained.
    y_pred = model.fit(X_train, y_train).predict(X_test)
    baseline_results[name] = dict(
        Accuracy=accuracy_score(y_test, y_pred),
        F1=f1_score(y_test, y_pred),
    )

print("Baseline Results")
for label, metrics in baseline_results.items():
    print(f"{label}: {metrics}")

Baseline Results
Logistic Regression: {'Accuracy': 0.8852459016393442, 'F1': 0.8771929824561403}
Decision Tree: {'Accuracy': 0.7868852459016393, 'F1': 0.7796610169491526}
Random Forest: {'Accuracy': 0.9016393442622951, 'F1': 0.9}
SVM: {'Accuracy': 0.8852459016393442, 'F1': 0.8771929824561403}


### Hyperparameter Tuning

##### Logistic Regression

In [6]:
# Exhaustive search over C and solver for logistic regression
# (4 x 2 = 8 candidates), scored by F1 with 5-fold cross-validation.
param_grid_lr = dict(
    C=[0.01, 0.1, 1, 10],
    solver=["liblinear", "lbfgs"],
)
grid_lr = GridSearchCV(
    estimator=LogisticRegression(max_iter=1000),
    param_grid=param_grid_lr,
    scoring="f1",
    cv=5,
)
grid_lr.fit(X_train, y_train)

##### Decision Tree

In [7]:
# Randomized search for the decision tree: 20 of the 4 x 3 x 3 = 36 possible
# combinations are sampled and scored by F1 with 5-fold cross-validation.
# Both the tree and the sampler are seeded so the search is repeatable.
param_dist_dt = dict(
    max_depth=[None, 5, 10, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
rand_dt = RandomizedSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_distributions=param_dist_dt,
    n_iter=20,
    scoring="f1",
    cv=5,
    random_state=42,
)
rand_dt.fit(X_train, y_train)

##### Random Forest

In [8]:
# Randomized search for the random forest: 20 of the 3 x 4 x 3 x 3 = 108
# possible combinations are sampled and scored by F1 with 5-fold
# cross-validation. Both the forest and the sampler are seeded.
param_dist_rf = dict(
    n_estimators=[100, 200, 500],
    max_depth=[None, 5, 10, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
rand_rf = RandomizedSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_distributions=param_dist_rf,
    n_iter=20,
    scoring="f1",
    cv=5,
    random_state=42,
)
rand_rf.fit(X_train, y_train)

##### SVM

In [9]:
# Exhaustive search over C, kernel and gamma for the SVM
# (3 x 2 x 2 = 12 candidates), scored by F1 with 5-fold cross-validation.
# NOTE: gamma is a coefficient of the rbf kernel; the linear kernel does not
# use it, so each linear candidate is evaluated once per gamma value with the
# same effective settings.
param_grid_svm = {
    "C": [0.1, 1, 10],
    "kernel": ["linear", "rbf"],
    "gamma": ["scale", "auto"]
}
# probability=True makes SVC run an internal, shuffled cross-validation to
# calibrate probabilities. Seeding it (as the baseline SVC already does) keeps
# the probability estimates of the tuned estimator reproducible across runs.
grid_svm = GridSearchCV(
    SVC(probability=True, random_state=42),
    param_grid_svm,
    cv=5,
    scoring="f1",
)
grid_svm.fit(X_train, y_train)

### Optimized Models Evaluation

In [10]:
# Collect the best estimator found by each search. With the searches' default
# refit behaviour these are already fitted on the training partition, so they
# are only scored here, not retrained.
optimized_models = {
    "Logistic Regression": grid_lr.best_estimator_,
    "Decision Tree": rand_dt.best_estimator_,
    "Random Forest": rand_rf.best_estimator_,
    "SVM": grid_svm.best_estimator_,
}

# Map each model name to its test-set accuracy and F1 score, using the same
# metrics as the baseline run so the two tables are directly comparable.
optimized_results = {}
for name, model in optimized_models.items():
    y_pred = model.predict(X_test)
    optimized_results[name] = dict(
        Accuracy=accuracy_score(y_test, y_pred),
        F1=f1_score(y_test, y_pred),
    )

print("\nOptimized Results")
for label, metrics in optimized_results.items():
    print(f"{label}: {metrics}")


Optimized Results
Logistic Regression: {'Accuracy': 0.8852459016393442, 'F1': 0.8771929824561403}
Decision Tree: {'Accuracy': 0.7540983606557377, 'F1': 0.7540983606557377}
Random Forest: {'Accuracy': 0.9016393442622951, 'F1': 0.9}
SVM: {'Accuracy': 0.8688524590163934, 'F1': 0.8620689655172413}


### Best Hyperparameters

In [11]:
# Report the winning parameter combination from each search, one per line.
print("\nBest Hyperparameters Found:")
for label, search in (
    ("Logistic Regression", grid_lr),
    ("Decision Tree", rand_dt),
    ("Random Forest", rand_rf),
    ("SVM", grid_svm),
):
    print(f"{label}:", search.best_params_)


Best Hyperparameters Found:
Logistic Regression: {'C': 1, 'solver': 'liblinear'}
Decision Tree: {'min_samples_split': 10, 'min_samples_leaf': 2, 'max_depth': None}
Random Forest: {'n_estimators': 200, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_depth': 10}
SVM: {'C': 1, 'gamma': 'scale', 'kernel': 'linear'}


##### Observations

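Decision Tree performed the worst → lowest accuracy and F1 both before and after tuning (tuning actually lowered its test accuracy from 0.79 to 0.75), so it is not a good candidate.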

Logistic Regression is decent but weaker than Random Forest.

SVM has fairly stable performance (accuracy 0.885 before tuning, 0.869 after), but lower than Random Forest.

Random Forest achieved the highest accuracy (90.16%) and the highest F1 score (0.90) on the test set, both before and after tuning, which indicates a good balance between precision and recall (i.e., between false positives and false negatives).

### Save the Final Model

In [14]:
# Build the final Random Forest from the hyperparameters selected by the
# randomized search. Reading them from rand_rf.best_params_ (rather than
# retyping the values) keeps this cell in sync if the search space, data or
# seed change and a different combination wins.
best_rf = RandomForestClassifier(**rand_rf.best_params_, random_state=42)

best_rf.fit(X_train, y_train)

In [15]:
# Persist the final model. The forest was trained on features standardized by
# `scaler`, so the fitted scaler is saved next to it; without it, new raw
# inputs cannot be transformed the same way before calling predict().
MODELS_DIR = Path("../Models")
MODELS_DIR.mkdir(parents=True, exist_ok=True)  # dump fails if the folder is missing

joblib.dump(best_rf, MODELS_DIR / "final_model.pkl")
joblib.dump(scaler, MODELS_DIR / "scaler.pkl")
print("Model saved as final_model.pkl")

Model saved as final_model.pkl


-------------