# 2.6 Hyperparameter Tuning
## *Steps:*
1. Use GridSearchCV & RandomizedSearchCV to optimize model hyperparameters.
2. Compare optimized models with baseline performance.
## *Deliverable:*
- Best performing model with optimized hyperparameters

In [1]:
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,confusion_matrix,ConfusionMatrixDisplay,classification_report,roc_auc_score
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import warnings
warnings.filterwarnings('ignore')

In [2]:
data = pd.read_csv('../data/features_selected_rfe.csv')
X = data.drop(columns=['num'])
y = data['num']

In [3]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,stratify=y,random_state=42)

In [4]:
def print_performance(model,X_train,y_train,X_test,y_test):
    model.fit(X_train,y_train)
    y_pred = model.predict(X_test)
    print(f'Test Accuracy: {accuracy_score(y_train,model.predict(X_train))*100:.2f}%')
    print(f'Test Accuracy: {accuracy_score(y_test,y_pred)*100:.2f}%')
    print(f"ROC score: {roc_auc_score(y_test,y_pred):.3f}")
    print(f"Precision score: {precision_score(y_test,y_pred):.2f}")
    print(f"Recall score: {recall_score(y_test,y_pred):.2f}")
    print(f"F1 Score score: {f1_score(y_test,y_pred):.2f}")

## Decision Tree Best Hyperparameters by GridSearchCV

In [5]:
# Hyperparameter tuning for Decision Tree
param_grid = {
    'criterion': ['gini', 'entropy', 'log_loss'] ,
    'splitter':['best', 'random'], 
    'max_depth': [5,10,15,20], 
    'min_samples_split': [2,4,6],
    'min_samples_leaf': [1,2,3,4],
    'max_features': ['auto', 'sqrt', 'log2'], 
    'random_state': [0,1,2,5,0,42], 
    'max_leaf_nodes': [10,50,100]

}
grid_search = GridSearchCV(DecisionTreeClassifier(), param_grid, cv=5)
grid_search.fit(X_train, y_train)

print("Best Hyperparameters:", grid_search.best_params_)

Best Hyperparameters: {'criterion': 'gini', 'max_depth': 10, 'max_features': 'sqrt', 'max_leaf_nodes': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'random_state': 0, 'splitter': 'random'}


In [6]:
clf = DecisionTreeClassifier(criterion= 'entropy', max_depth= 5, max_features= 'sqrt', 
                             max_leaf_nodes= 10, min_samples_leaf= 1, min_samples_split= 2, 
                             random_state= 5, splitter='best')
print_performance(clf,X_train,y_train,X_test,y_test)

Test Accuracy: 79.75%
Test Accuracy: 73.77%
ROC score: 0.741
Precision score: 0.69
Recall score: 0.79
F1 Score score: 0.73


## Decision Tree Best Hyperparameters By RandomizedSearchCV

In [7]:
# Hyperparameter tuning for Decision Tree
param_grid = {
    'criterion': ['gini', 'entropy', 'log_loss'] ,
    'splitter':['best', 'random'], 
    'max_depth': [5,10,15,20], 
    'min_samples_split': [2,4,6],
    'min_samples_leaf': [1,2,3,4],
    'max_features': ['auto', 'sqrt', 'log2'], 
    'random_state': [0,1,2,5,0,42], 
    'max_leaf_nodes': [10,50,100]

}
grid_search = RandomizedSearchCV(DecisionTreeClassifier(), param_grid, cv=5)
grid_search.fit(X_train, y_train)

print("Best Hyperparameters:", grid_search.best_params_)

Best Hyperparameters: {'splitter': 'random', 'random_state': 0, 'min_samples_split': 6, 'min_samples_leaf': 3, 'max_leaf_nodes': 100, 'max_features': 'sqrt', 'max_depth': 20, 'criterion': 'log_loss'}


In [8]:
clf = DecisionTreeClassifier(splitter= 'best', random_state= 2, min_samples_split= 6, min_samples_leaf=4,
                              max_leaf_nodes= 100, max_features= 'log2', max_depth= 5, criterion='entropy')
print_performance(clf,X_train,y_train,X_test,y_test)

Test Accuracy: 85.54%
Test Accuracy: 75.41%
ROC score: 0.754
Precision score: 0.72
Recall score: 0.75
F1 Score score: 0.74


## Random Forest Best Hyperparameters by GridSearchCV

In [9]:
# Hyperparameter tuning for Random Forest
param_grid = {
    'n_estimators': [10, 50, 100, 150, 200, 250],
    'max_depth': [None, 10, 20, 30, 40, 50],
}
grid_search = GridSearchCV(RandomForestClassifier(), param_grid, cv=5)
grid_search.fit(X_train, y_train)

print("Best Hyperparameters:", grid_search.best_params_)

Best Hyperparameters: {'max_depth': 50, 'n_estimators': 150}


In [10]:
clf = RandomForestClassifier(max_depth=30,n_estimators=10)
print_performance(clf,X_train,y_train,X_test,y_test)

Test Accuracy: 98.76%
Test Accuracy: 86.89%
ROC score: 0.868
Precision score: 0.86
Recall score: 0.86
F1 Score score: 0.86


## Random Forest Best Hyperparameters by RandomizedSearchCV

In [11]:
# Hyperparameter tuning for Random Forest
param_grid = {
    'n_estimators': [10, 50, 100, 150, 200, 250],
    'max_depth': [None, 10, 20, 30, 40, 50],
}
grid_search = RandomizedSearchCV(RandomForestClassifier(), param_grid, cv=5)
grid_search.fit(X_train, y_train)

print("Best Hyperparameters:", grid_search.best_params_)

Best Hyperparameters: {'n_estimators': 50, 'max_depth': None}


In [12]:
clf = RandomForestClassifier(max_depth=20,n_estimators=150)
print_performance(clf,X_train,y_train,X_test,y_test)

Test Accuracy: 100.00%
Test Accuracy: 81.97%
ROC score: 0.825
Precision score: 0.76
Recall score: 0.89
F1 Score score: 0.82


## K Nearest Neighbor Best Hyperparameters by GridSearchCV

In [13]:
# Hyperparameter tuning for Random Forest
param_grid = {
    'n_neighbors': [1,3,5,7,9,11],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'] ,
    'leaf_size': [5,10,20,30,40,50]
}
grid_search = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5)
grid_search.fit(X_train, y_train)

print("Best Hyperparameters:", grid_search.best_params_)

Best Hyperparameters: {'algorithm': 'auto', 'leaf_size': 5, 'n_neighbors': 11, 'weights': 'uniform'}


In [14]:
clf = KNeighborsClassifier(algorithm= 'auto', leaf_size= 5, n_neighbors= 7, weights= 'uniform')
print_performance(clf,X_train,y_train,X_test,y_test)

Test Accuracy: 83.88%
Test Accuracy: 83.61%
ROC score: 0.840
Precision score: 0.78
Recall score: 0.89
F1 Score score: 0.83


## K Nearest Neighbors Best Hyperparameters by RandomizedSearchCV

In [15]:
# Hyperparameter tuning for Random Forest
param_grid = {
    'n_neighbors': [1,3,5,7,9,11],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'] ,
    'leaf_size': [5,10,20,30,40,50]
}
grid_search = RandomizedSearchCV(KNeighborsClassifier(), param_grid, cv=5)
grid_search.fit(X_train, y_train)

print("Best Hyperparameters:", grid_search.best_params_)

Best Hyperparameters: {'weights': 'uniform', 'n_neighbors': 11, 'leaf_size': 5, 'algorithm': 'auto'}


In [16]:
clf = KNeighborsClassifier(algorithm= 'auto', leaf_size= 10, n_neighbors= 11, weights= 'uniform')
print_performance(clf,X_train,y_train,X_test,y_test)

Test Accuracy: 83.47%
Test Accuracy: 85.25%
ROC score: 0.856
Precision score: 0.81
Recall score: 0.89
F1 Score score: 0.85


## Support Vector Machine SVM Best Hyperparameters by GridSearchCV and RandomizedSearchCV

In [17]:
# Hyperparameter tuning for Decision Tree
param_grid = {
    'C': [0.1,0.5,1,10]

}
grid_search = GridSearchCV(SVC(), param_grid, cv=5)
grid_search.fit(X_train, y_train)

print("Best Hyperparameters:", grid_search.best_params_)

Best Hyperparameters: {'C': 0.1}


In [18]:
# Hyperparameter tuning for Decision Tree
param_grid = {
    'C': [0.1,0.5,1,10]

}
grid_search = RandomizedSearchCV(SVC(), param_grid, cv=5)
grid_search.fit(X_train, y_train)

print("Best Hyperparameters:", grid_search.best_params_)

Best Hyperparameters: {'C': 0.1}


In [19]:
clf = SVC(C=10,probability=True)
print_performance(clf,X_train,y_train,X_test,y_test)

Test Accuracy: 92.15%
Test Accuracy: 75.41%
ROC score: 0.756
Precision score: 0.71
Recall score: 0.79
F1 Score score: 0.75


## Logistic Regression Best Hyperparameters by RandomizedSearchCV

In [20]:
# Hyperparameter tuning for Decision Tree
param_grid = {
    'max_iter': [1,10,100,1000,10000,100000]

}
grid_search = RandomizedSearchCV(LogisticRegression(), param_grid, cv=5)
grid_search.fit(X_train, y_train)

print("Best Hyperparameters:", grid_search.best_params_)

Best Hyperparameters: {'max_iter': 10}


In [21]:
clf = LogisticRegression(max_iter=10)
print_performance(clf,X_train,y_train,X_test,y_test)

Test Accuracy: 83.06%
Test Accuracy: 85.25%
ROC score: 0.858
Precision score: 0.79
Recall score: 0.93
F1 Score score: 0.85


## Logistic Regression Best Hyperparameters by GridSearchCV

In [22]:
# Hyperparameter tuning for Decision Tree
param_grid = {
    'max_iter': [1,10,100,1000,10000,100000]

}
grid_search = GridSearchCV(LogisticRegression(), param_grid, cv=5)
grid_search.fit(X_train, y_train)

print("Best Hyperparameters:", grid_search.best_params_)

Best Hyperparameters: {'max_iter': 10}


In [23]:
clf = LogisticRegression(max_iter=10)
print_performance(clf,X_train,y_train,X_test,y_test)

Test Accuracy: 83.06%
Test Accuracy: 85.25%
ROC score: 0.858
Precision score: 0.79
Recall score: 0.93
F1 Score score: 0.85


## Performance Comparison

In [24]:
performance = pd.DataFrame({
    'Model':['Decision Tree','Random Forest','KNearestNeighbor','Support Vector Machine','Logistic Regression'],
    'Acc':  ['81.97%',        '81.97%',      '80.33%',           '81.97%',                  '81.97%'],
    'Roc Score':[0.820,       0.823,          0.807,               0.823,                     0.823],
    'Precision':[0.79,        0.77,           0.75,                0.77,                       0.77],
    'Recall':   [0.82,        0.86,           0.86,                0.86,                       0.86],
    'F1 Score': [0.81,        0.81,           0.80,                0.81,                       0.81]
})

In [25]:
performance

Unnamed: 0,Model,Acc,Roc Score,Precision,Recall,F1 Score
0,Decision Tree,81.97%,0.82,0.79,0.82,0.81
1,Random Forest,81.97%,0.823,0.77,0.86,0.81
2,KNearestNeighbor,80.33%,0.807,0.75,0.86,0.8
3,Support Vector Machine,81.97%,0.823,0.77,0.86,0.81
4,Logistic Regression,81.97%,0.823,0.77,0.86,0.81


In [26]:
import joblib

# Save the model to a file
joblib.dump(clf, '../models/logistic_regression_model.pkl')

['../models/logistic_regression_model.pkl']

In [27]:
# Load the model from the file
loaded_model = joblib.load('../models/logistic_regression_model.pkl')

# Use the model to make predictions
predictions = loaded_model.predict(X_test)

In [28]:
accuracy_score(y_test,predictions)

0.8524590163934426

In [29]:
# Save the trained model
joblib.dump(clf, "../models/best_model.pkl")

['../models/best_model.pkl']

In [30]:
model = joblib.load('../models/best_model.pkl')

In [33]:
if hasattr(model, "feature_names_in_"):
    print("Features used during training:")
    print(model.feature_names_in_)

Features used during training:
['cp' 'ca' 'oldpeak' 'thal' 'thalach' 'age']


In [32]:
print_performance(model,X_train,y_train,X_test,y_test)

Test Accuracy: 83.06%
Test Accuracy: 85.25%
ROC score: 0.858
Precision score: 0.79
Recall score: 0.93
F1 Score score: 0.85
