In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

import warnings
warnings.filterwarnings("ignore")

In [3]:
%%capture
# Execute the previous notebook inside this kernel; %%capture hides whatever it prints.
# NOTE(review): the cells below use lrmodel, dtmodel, rfmodel, X_train, y_train, X_test,
# y_test, accuracy_calc and accuracy_score without defining them here, so they are
# presumably created by that notebook; confirm before running on a fresh kernel.
%run "./04_supervised_learning.ipynb"

In [5]:
# 1 - Logistic Regression model
#
# Tune `lrmodel` with an exhaustive grid search and with a randomized search
# over the same space, printing the best parameters and accuracies for each.

# Search space shared by both strategies: 2 * 3 * 1 * 2 = 12 combinations.
param_grid = {
    'penalty': ['l1', 'l2'],
    'C': [0.1, 1, 10],
    'solver': ['liblinear'],   # liblinear accepts both the l1 and the l2 penalty
    'max_iter': [100, 200]
}

# Grid Search: evaluates every combination in param_grid.
lr_grid_search = GridSearchCV(
    estimator=lrmodel,
    param_grid=param_grid,
    cv=3,                   # fewer folds = faster
    scoring='accuracy',
    n_jobs=-1
)
lr_grid_search.fit(X_train, y_train)
print("🔍 GridSearch Best params:", lr_grid_search.best_params_)
accuracy_calc(lr_grid_search)
print("\n\n")

# Random Search: samples n_iter=10 of the 12 combinations.
lr_random_search = RandomizedSearchCV(
    estimator=lrmodel,
    param_distributions=param_grid,
    cv=3,
    n_iter=10,
    scoring='accuracy',
    n_jobs=-1,
    random_state=42         # fixed seed so the sampled candidates are the same on every run
)
lr_random_search.fit(X_train, y_train)
print("🎯 RandomSearch Best params:", lr_random_search.best_params_)
accuracy_calc(lr_random_search)


🔍 GridSearch Best params: {'C': 1, 'max_iter': 100, 'penalty': 'l1', 'solver': 'liblinear'}
accuracy of training data = 61.18%
accuracy of testing data = 63.33%



🎯 RandomSearch Best params: {'solver': 'liblinear', 'penalty': 'l1', 'max_iter': 200, 'C': 1}
accuracy of training data = 61.18%
accuracy of testing data = 63.33%


In [7]:
# 2 - Decision Tree model
#
# Tune `dtmodel` with an exhaustive grid search and with a randomized search
# over the same space, printing the best parameters and accuracies for each.

# Search space shared by both strategies: 2 * 3 * 2 * 2 * 2 = 48 combinations.
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [5, 10, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'max_features': ['sqrt', None]
}

# Grid Search: evaluates every combination in param_grid.
dt_grid_search = GridSearchCV(
    estimator=dtmodel,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1
)
dt_grid_search.fit(X_train, y_train)

print("🔍 Best parameters from GridSearch:", dt_grid_search.best_params_)
accuracy_calc(dt_grid_search)
print('\n\n')

# Random Search: samples n_iter=10 of the 48 combinations.
dt_random_search = RandomizedSearchCV(
    estimator=dtmodel,
    param_distributions=param_grid,
    cv=3,
    scoring='accuracy',
    n_iter=10,
    n_jobs=-1,              # parallelise like the grid search above
    random_state=42         # fixed seed so the sampled candidates are the same on every run
)
dt_random_search.fit(X_train, y_train)

print("Best parameters from RandomSearch:", dt_random_search.best_params_)
accuracy_calc(dt_random_search)

🔍 Best parameters from GridSearch: {'criterion': 'gini', 'max_depth': 5, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5}
accuracy of training data = 73.84%
accuracy of testing data = 63.33%



Best parameters from RandomSearch: {'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 5, 'criterion': 'gini'}
accuracy of training data = 70.89%
accuracy of testing data = 55.00%


In [15]:
# 3 - Random Forest model
#
# Tune `rfmodel` with an exhaustive grid search and with a randomized search
# over the same space, printing the best parameters and accuracies for each.

# Search space shared by both strategies: 3 * 3 * 2 * 2 * 2 * 1 = 72 combinations.
param_grid = {
    'n_estimators': [100, 300, 500],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'max_features': ['sqrt', None],
    'bootstrap': [True]
}

# Grid Search: evaluates every combination in param_grid.
rf_grid_search = GridSearchCV(
    estimator=rfmodel,
    param_grid=param_grid,
    cv=3,
    scoring='accuracy',
    n_jobs=-1
)
rf_grid_search.fit(X_train, y_train)

print("Best parameters from GridSearch:", rf_grid_search.best_params_)
accuracy_calc(rf_grid_search)

print("\n\n")

# Random Search: samples n_iter=10 of the 72 combinations.
rf_random_search = RandomizedSearchCV(
    estimator=rfmodel,
    param_distributions=param_grid,
    cv=3,
    scoring='accuracy',
    n_iter=10,
    n_jobs=-1,              # forests of up to 500 trees are slow to fit serially
    random_state=42         # fixed seed so the sampled candidates are the same on every run
)
rf_random_search.fit(X_train, y_train)

print("Best parameters from RandomSearch:", rf_random_search.best_params_)
accuracy_calc(rf_random_search)

Best parameters from GridSearch: {'bootstrap': True, 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 100}
accuracy of training data = 91.14%
accuracy of testing data = 61.67%



Best parameters from RandomSearch: {'n_estimators': 300, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': None, 'max_depth': 20, 'bootstrap': True}
accuracy of training data = 93.25%
accuracy of testing data = 56.67%


In [9]:
# Compare optimized models with baseline performance.
# 1 - Logistic Regression model
#
# Prints the accuracies of the baseline, grid-search and random-search models,
# then records the one with the highest test accuracy in `lr_model` /
# `highest_acc_lr` for the final cross-family comparison.
print("Baseline model performance:")
accuracy_calc(lrmodel)

print("\n\n")

print("Optimized model using grid search performance:")
accuracy_calc(lr_grid_search)

print("\n\n")

print("Optimized model using random search performance:")
accuracy_calc(lr_random_search)

print("\n\n")

test_acc_basline    =   accuracy_score(y_test, lrmodel.predict(X_test))
test_acc_grid       =   accuracy_score(y_test, lr_grid_search.predict(X_test))
test_acc_random     =   accuracy_score(y_test, lr_random_search.predict(X_test))

# Compare all three scores at once. The previous if/elif chain returned the
# baseline as soon as it beat the grid search, without ever looking at the
# random-search score, so it could report a model that was not the best.
lr_candidates = [
    ("Baseline Logistic Regression Model", test_acc_basline),
    ("Optimized Logistic Regression Model using grid search", test_acc_grid),
    ("Optimized Logistic RegressionModel using random search", test_acc_random),
]
# max() keeps the first maximal item it sees; scanning in reverse makes the
# later-listed model win ties.
lr_model, highest_acc_lr = max(reversed(lr_candidates), key=lambda candidate: candidate[1])

print(f"Model with highest testing accuracy is: '{lr_model}' with testing accuracy = {highest_acc_lr*100:.2f}%")

Baseline model performance:
accuracy of training data = 63.71%
accuracy of testing data = 58.33%



Optimized model using grid search performance:
accuracy of training data = 61.18%
accuracy of testing data = 63.33%



Optimized model using random search performance:
accuracy of training data = 61.18%
accuracy of testing data = 63.33%



Model with highest testing accuracy is: 'Optimized Logistic RegressionModel using random search' with testing accuracy = 63.33%


In [11]:
# 2 - Decision Tree model
#
# Prints the accuracies of the baseline, grid-search and random-search models,
# then records the one with the highest test accuracy in `dt_model` /
# `highest_acc_dt` for the final cross-family comparison.
print("Baseline model performance:")
accuracy_calc(dtmodel)

print("\n\n")

print("Optimized model using grid search performance:")
accuracy_calc(dt_grid_search)

print("\n\n")

print("Optimized model using random search performance:")
accuracy_calc(dt_random_search)

print("\n\n")

test_acc_basline    =   accuracy_score(y_test, dtmodel.predict(X_test))
test_acc_grid       =   accuracy_score(y_test, dt_grid_search.predict(X_test))
test_acc_random     =   accuracy_score(y_test, dt_random_search.predict(X_test))

# Compare all three scores at once. The previous if/elif chain returned the
# baseline as soon as it beat the grid search, without ever looking at the
# random-search score, so it could report a model that was not the best.
dt_candidates = [
    ("Baseline Decision Tree Model", test_acc_basline),
    ("Optimized Decision Tree Model using grid search", test_acc_grid),
    ("Optimized Decision Tree Model using random search", test_acc_random),
]
# max() keeps the first maximal item it sees; scanning in reverse makes the
# later-listed model win ties.
dt_model, highest_acc_dt = max(reversed(dt_candidates), key=lambda candidate: candidate[1])

print(f"Model with highest testing accuracy is: '{dt_model}' with testing accuracy = {highest_acc_dt*100:.2f}%")

Baseline model performance:
accuracy of training data = 100.00%
accuracy of testing data = 48.33%



Optimized model using grid search performance:
accuracy of training data = 73.84%
accuracy of testing data = 63.33%



Optimized model using random search performance:
accuracy of training data = 70.89%
accuracy of testing data = 55.00%



Model with highest testing accuracy is: 'Optimized Decision Tree Model using grid search' with testing accuracy = 63.33%


In [17]:
# 3 - Random Forest model
#
# Prints the accuracies of the baseline, grid-search and random-search models,
# then records the one with the highest test accuracy in `rf_model` /
# `highest_acc_rf` for the final cross-family comparison.
print("Baseline model performance:")
accuracy_calc(rfmodel)

print("\n\n")

print("Optimized model using grid search performance:")
accuracy_calc(rf_grid_search)

print("\n\n")

print("Optimized model using random search performance:")
accuracy_calc(rf_random_search)

print("\n\n")

test_acc_basline    =   accuracy_score(y_test, rfmodel.predict(X_test))
test_acc_grid       =   accuracy_score(y_test, rf_grid_search.predict(X_test))
test_acc_random     =   accuracy_score(y_test, rf_random_search.predict(X_test))

# Compare all three scores at once. The previous if/elif chain returned the
# baseline as soon as it beat the grid search, without ever looking at the
# random-search score, so it could report a model that was not the best.
rf_candidates = [
    ("Baseline Random Forest Model", test_acc_basline),
    ("Optimized Random Forest Model using grid search", test_acc_grid),
    ("Optimized Random Forest Model using random search", test_acc_random),
]
# max() keeps the first maximal item it sees; scanning in reverse makes the
# later-listed model win ties.
rf_model, highest_acc_rf = max(reversed(rf_candidates), key=lambda candidate: candidate[1])

print(f"Model with highest testing accuracy is: '{rf_model}' with testing accuracy = {highest_acc_rf*100:.2f}%")

Baseline model performance:
accuracy of training data = 100.00%
accuracy of testing data = 58.33%



Optimized model using grid search performance:
accuracy of training data = 91.14%
accuracy of testing data = 61.67%



Optimized model using random search performance:
accuracy of training data = 93.25%
accuracy of testing data = 56.67%



Model with highest testing accuracy is: 'Optimized Random Forest Model using grid search' with testing accuracy = 61.67%


In [21]:
# Report the overall winner among the best model of each family
# (logistic regression, decision tree, random forest) by test accuracy.

# Listed in increasing tie-break priority: when scores are equal the entry
# that appears later in this list is the one reported.
finalists = [
    (lr_model, highest_acc_lr),
    (dt_model, highest_acc_dt),
    (rf_model, highest_acc_rf),
]

# max() returns the first maximal element, so iterate from the end of the list.
best_name, best_acc = max(reversed(finalists), key=lambda entry: entry[1])

print(f"model with highest testing accuracy is '{best_name}' with testing accuracy = {best_acc*100:.2f}%")

model with highest testing accuracy is 'Optimized Decision Tree Model using grid search' with testing accuracy = 63.33%
