In [80]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from scipy.stats import randint
from sklearn.metrics import accuracy_score, classification_report

In [82]:
# Read the final, feature-selected dataset from disk and preview its first rows.
# NOTE(review): the preview shows an 'Unnamed: 0' column, which looks like a
# row index that was written to the CSV rather than a real feature - confirm.
df_final = pd.read_csv(filepath_or_buffer='final_dataset.csv')
df_final.head(n=5)

Unnamed: 0,oldpeak,thalch,ca,age,chol,trestbps,thal_reversable defect,exang_True,cp_atypical angina,sex_Male,thal_normal,cp_non-anginal,target
0,1.069475,0.029124,-0.718306,0.940446,-0.262867,0.74976,0,0,0,1,0,0,0
1,0.380309,-1.790447,2.487269,1.384143,0.747722,1.596354,0,1,0,1,1,0,1
2,1.327912,-0.880662,1.418744,1.384143,-0.339138,-0.661231,1,1,0,1,0,0,1
3,2.103224,1.632079,-0.718306,-1.943588,0.061285,-0.096835,0,0,0,1,1,1,0
4,0.294163,0.982232,-0.718306,-1.499891,-0.81583,-0.096835,0,0,1,0,1,0,0


# Hyperparameter Tuning

In [85]:
# Separate the feature matrix (X) from the label vector (y). The 80% / 20%
# train/test split is done in the following cell.
#
# The CSV contains an 'Unnamed: 0' column, which appears to be the row index
# written out when the file was saved. It is a row counter, not a measurement,
# so it is excluded together with 'target' to keep the models from fitting on it.
index_like_cols = [col for col in df_final.columns if str(col).startswith('Unnamed:')]

X = df_final.drop(columns=['target', *index_like_cols])
y = df_final['target']

In [87]:
# Hold out 20% of the rows for testing and keep 80% for training.
# The fixed random_state makes the partition reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### Tuning Logistic Regression with GridSearchCV

In [90]:
# Hyperparameter tuning for Logistic Regression with an exhaustive grid search.
# The grid covers 6 values of C and 2 solvers (12 candidates), each scored by
# accuracy with 5-fold cross-validation on the training set only.
lr_param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'solver': ['liblinear', 'lbfgs'],
}

lr_grid_search = GridSearchCV(
    estimator=LogisticRegression(max_iter=1000, random_state=42),
    param_grid=lr_param_grid,
    cv=5,
    scoring='accuracy',
)
lr_grid_search.fit(X_train, y_train)

In [92]:
# Report the best Logistic Regression hyperparameters found by the grid search
# and the mean cross-validated accuracy they achieved (4 decimal places).
print(f"Best parameters for Logistic Regression: {lr_grid_search.best_params_}")
print(f"Best cross-validation score: {lr_grid_search.best_score_:.4f}")

Best parameters for Logistic Regression: {'C': 1, 'solver': 'liblinear'}
Best cross-validation score: 0.8160


### Tuning Random Forest with RandomizedSearchCV

In [95]:
# Hyperparameter tuning for Random Forest with a randomized search.
# Each parameter is drawn from a scipy integer distribution; randint(low, high)
# samples from low up to, but not including, high.
rf_param_dist = {
    'n_estimators': randint(50, 200),
    'max_depth': randint(3, 15),
    'min_samples_split': randint(2, 10),
    'min_samples_leaf': randint(1, 10),
}

# 50 sampled candidates, each scored by accuracy with 5-fold cross-validation.
# random_state is fixed on both the forest and the sampler for reproducibility.
rf_random_search = RandomizedSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_distributions=rf_param_dist,
    n_iter=50,
    cv=5,
    scoring='accuracy',
    random_state=42,
)
rf_random_search.fit(X_train, y_train)


In [96]:
# Report the best Random Forest hyperparameters found by the randomized search
# and the mean cross-validated accuracy they achieved (4 decimal places).
print(f"Best parameters for Random Forest: {rf_random_search.best_params_}")
print(f"Best cross-validation score: {rf_random_search.best_score_:.4f}")

Best parameters for Random Forest: {'max_depth': 14, 'min_samples_leaf': 7, 'min_samples_split': 2, 'n_estimators': 121}
Best cross-validation score: 0.8162


* As a result of GridSearchCV and RandomizedSearchCV on the two most effective models: for Logistic Regression, the best parameters are C=1 and solver='liblinear'.
* For Random Forest, the best combination is n_estimators=121, max_depth=14, min_samples_split=2, and min_samples_leaf=7.
  2

#### Comparing optimized models with baseline performance.

In [101]:
# Split the data into training (80%) and testing (20%) sets.
# NOTE: this repeats the earlier train_test_split call with the same arguments
# (test_size=0.2, random_state=42), so as long as X and y are unchanged it
# should reproduce the partition that was used for hyperparameter tuning.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [103]:
# Train the final Logistic Regression model with the best hyperparameters found
# by the grid search. The values are read from lr_grid_search.best_params_
# rather than retyped from the printed output, so the final model cannot drift
# from the search result if the data or the grid changes.
# max_iter=1000 matches the estimator that was tuned, so the final model is
# configured the same way as the cross-validated candidates.
lr_final_model = LogisticRegression(
    max_iter=1000,
    random_state=42,
    **lr_grid_search.best_params_,
)
lr_final_model.fit(X_train, y_train)

In [105]:
# Train the final Random Forest model with the best hyperparameters found by
# the randomized search. The values are read from rf_random_search.best_params_
# rather than retyped from the printed output, so the final model always
# matches whatever the search selected.
# random_state=42 matches the estimator that was tuned.
rf_final_model = RandomForestClassifier(
    random_state=42,
    **rf_random_search.best_params_,
)
rf_final_model.fit(X_train, y_train)

In [107]:
# Evaluate the final models on the held-out test set, starting with
# Logistic Regression: predict the test labels, then print per-class
# precision, recall and F1 alongside overall accuracy.
lr_predictions = lr_final_model.predict(X_test)
print("--- Final Logistic Regression Performance ---")
print(classification_report(y_true=y_test, y_pred=lr_predictions))


--- Final Logistic Regression Performance ---
              precision    recall  f1-score   support

           0       0.89      0.91      0.90        35
           1       0.88      0.84      0.86        25

    accuracy                           0.88        60
   macro avg       0.88      0.88      0.88        60
weighted avg       0.88      0.88      0.88        60



In [109]:
# Evaluate the final Random Forest model on the same held-out test set:
# predict the test labels, then print per-class precision, recall and F1
# alongside overall accuracy.
rf_predictions = rf_final_model.predict(X_test)
print("\n--- Final Random Forest Performance ---")
print(classification_report(y_true=y_test, y_pred=rf_predictions))


--- Final Random Forest Performance ---
              precision    recall  f1-score   support

           0       0.92      0.94      0.93        35
           1       0.92      0.88      0.90        25

    accuracy                           0.92        60
   macro avg       0.92      0.91      0.91        60
weighted avg       0.92      0.92      0.92        60



* Conclusion: hyperparameter tuning did not significantly improve the Logistic Regression model's performance (test accuracy 0.88). The default parameters were already very effective for this dataset.
* Hyperparameter tuning had a significant positive impact on the Random Forest model (test accuracy 0.92), leading to a noticeable increase in all key metrics. The optimized model is now more accurate and reliable.

### Random Forest is the best-performing model with optimized hyperparameters