In [1]:
#1. Importing necessary libraries.
import numpy as np
import pandas as pd
import matplotlib.colors as mcolors
import matplotlib.pyplot as plt

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Classification algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

from xgboost import XGBClassifier
from catboost import CatBoostClassifier

In [2]:
# 2. Loading and splitting the data.
iris = load_iris()
X = iris.data
y = iris.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=345)

In [3]:
# 3. Data standardization.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [4]:
# 4. List of algorithms.
models = [
    ('Logistic Regression', LogisticRegression(max_iter=10000)),
    ('SVC', SVC()),
    ('Decision Tree', DecisionTreeClassifier()),
    ('Random Forest', RandomForestClassifier()),
    ('k-NN', KNeighborsClassifier()),
    ('Gradient Boosting', GradientBoostingClassifier()),
    ('Neural Network', MLPClassifier(max_iter=10000)),
    ('XGBoost', XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')),
    ('CatBoost', CatBoostClassifier(verbose=0))
]

In [5]:
# 5. Cross-validation parameters.
cv = KFold(n_splits=5, shuffle=True, random_state=345)

In [6]:
# 6. Functions to generate model metrics.
def evaluate_model(name, model, X_train, y_train, cv):
    accuracy_scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='accuracy')
    f1_scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='f1_macro')
    precision_scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='precision_macro')
    recall_scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='recall_macro')

    return {
        'Model': name,
        'Accuracy': accuracy_scores.mean(),
        'F1 Score': f1_scores.mean(),
        'Precision': precision_scores.mean(),
        'Recall': recall_scores.mean()
    }

def evaluate_models(models, X_train, y_train, cv):
    return [evaluate_model(name, model, X_train, y_train, cv) for name, model in models]


In [7]:
# 7. Generating metrics for default hyperparameters.
default_results = evaluate_models(models, X_train, y_train, cv)
df_default = pd.DataFrame(default_results)

In [8]:
# 8. Defining hyperparameter grids for each algorithm.
param_grids = {
    'Logistic Regression': {
        'C': [0.01, 0.1, 1.0, 2.0, 5.0, 10.0, 20.0, 50.0, 100.0]
    },
    'SVC': {
        'C': [0.01, 0.1, 1.0, 2.0, 5.0, 10.0, 20.0, 50.0, 100.0],
        'gamma': [0.001, 0.01, 0.1, 0.2, 0.5, 1],
        'kernel': ['rbf', 'linear']
    },
    'Decision Tree': {
        'max_depth': [None, 2, 5, 10, 20, 30, 40, 50, 100],
        'min_samples_split': [2, 4, 5, 6, 8, 10, 12, 15],
        'min_samples_leaf': [1, 2, 4, 5, 8, 10]
    },
    'Random Forest': {
        'n_estimators': [10, 25, 40, 50, 80, 100],
        'max_depth': [None, 5, 10, 20, 25, 30, 40],
        'min_samples_split': [2, 5, 8, 10],
        'min_samples_leaf': [1, 2, 4, 5, 8, 10, 15]
    },
    'k-NN': {
        'n_neighbors': [3, 5, 7, 9, 11],
        'weights': ['uniform', 'distance'],
        'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
    },
    'Gradient Boosting': {
        'n_estimators': [10, 20, 25, 30, 40, 50, 80, 100, 150, 200],
        'learning_rate': [0.001, 0.01, 0.1, 0.2, 0.5, 1],
        'max_depth': [3, 5, 7, 9, 11]
    },
    'Neural Network': {
        'hidden_layer_sizes': [(50,), (100,)],
        'activation': ['relu', 'tanh'],
        'solver': ['adam', 'lbfgs'],
        'alpha': [0.0001, 0.01],
        'learning_rate': ['constant', 'adaptive']
    },
    'XGBoost': {
        'n_estimators': [50, 75, 100, 200],
        'learning_rate': [0.01, 0.1, 0.2, 0.3],
        'max_depth': [3, 5, 7, 9]
    },
    'CatBoost': {
        'iterations': [100, 200, 500],
        'learning_rate': [0.01, 0.1, 0.2, 0.3],
        'depth': [3, 5, 7, 9]
    }
}

In [9]:
# 9. Generating best hyperparameter sets based on specified metric using grid search.
best_models = []
for name, model in models:
    print(f"Tuning {name}...")
    grid_search = GridSearchCV(estimator=model, param_grid=param_grids[name], cv=cv, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train, y_train)
    best_models.append((name, grid_search.best_estimator_, grid_search.best_params_))
    print(f"Best parameters for {name}: {grid_search.best_params_}")

Tuning Logistic Regression...
Best parameters for Logistic Regression: {'C': 20.0}
Tuning SVC...
Best parameters for SVC: {'C': 2.0, 'gamma': 0.2, 'kernel': 'rbf'}
Tuning Decision Tree...
Best parameters for Decision Tree: {'max_depth': None, 'min_samples_leaf': 5, 'min_samples_split': 2}
Tuning Random Forest...
Best parameters for Random Forest: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 10}
Tuning k-NN...
Best parameters for k-NN: {'algorithm': 'auto', 'n_neighbors': 3, 'weights': 'distance'}
Tuning Gradient Boosting...
Best parameters for Gradient Boosting: {'learning_rate': 0.001, 'max_depth': 3, 'n_estimators': 200}
Tuning Neural Network...
Best parameters for Neural Network: {'activation': 'tanh', 'alpha': 0.01, 'hidden_layer_sizes': (50,), 'learning_rate': 'constant', 'solver': 'adam'}
Tuning XGBoost...
Best parameters for XGBoost: {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 100}
Tuning CatBoost...
Best parameters for CatBoost:

In [11]:
# 10. Generating additional metrics using the best hyperparameter sets.
tuned_results = evaluate_models([(name, model) for name, model, params in best_models], X_train, y_train, cv)
df_tuned = pd.DataFrame(tuned_results)

In [12]:
# 11. Formatting tables.
def styled_df(df):
    styled_df = df.style.background_gradient(subset=['Accuracy', 'F1 Score', 'Precision', 'Recall'], cmap='RdYlGn')
    return styled_df

In [13]:
# 12.1. Displaying the generated table for default hyperparameters.
styled_df(df_default)

Unnamed: 0,Model,Accuracy,F1 Score,Precision,Recall
0,Logistic Regression,0.95,0.950337,0.954545,0.951353
1,SVC,0.95,0.952199,0.954074,0.954558
2,Decision Tree,0.916667,0.937703,0.943771,0.921652
3,Random Forest,0.925,0.937703,0.943771,0.935969
4,k-NN,0.933333,0.93224,0.931878,0.939744
5,Gradient Boosting,0.925,0.93147,0.939731,0.935969
6,Neural Network,0.941667,0.943962,0.954545,0.951353
7,XGBoost,0.916667,0.917736,0.921044,0.927635
8,CatBoost,0.933333,0.937703,0.943771,0.941097


In [14]:
# 12.2. Displaying the generated table for tuned hyperparameters.
styled_df(df_tuned)

Unnamed: 0,Model,Accuracy,F1 Score,Precision,Recall
0,Logistic Regression,0.966667,0.968172,0.970948,0.970798
1,SVC,0.958333,0.958574,0.96,0.959687
2,Decision Tree,0.95,0.952199,0.954074,0.954558
3,Random Forest,0.941667,0.920141,0.943771,0.925356
4,k-NN,0.95,0.950337,0.954545,0.951353
5,Gradient Boosting,0.933333,0.937703,0.943771,0.941097
6,Neural Network,0.95,0.950337,0.954545,0.959687
7,XGBoost,0.941667,0.93955,0.939466,0.949003
8,CatBoost,0.95,0.950337,0.954545,0.951353
