In [None]:
# Install required packages
!pip install ucimlrepo scikit-learn numpy pandas matplotlib seaborn xgboost optuna tqdm optuna ucimlrepo

# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import xgboost as xgb
import optuna
from tqdm.notebook import tqdm
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')
# Import tqdm for progress bar display
from tqdm import tqdm as tqdm_cli

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Collecting optuna
  Downloading optuna-4.3.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.15.2-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Downloading optuna-4.3.0-py3-none-any.whl (386 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m386.6/386.6 kB[0m [31m27.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.15.2-py3-none-any.whl (231 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m231.9/231.9 kB[0m [31m20.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, alembic, ucimlrepo, optuna
Successfully installed alembic-1.15.2 colorlog-6.9.0 optuna-4.3.0 ucimlrepo-0.0.7


In [None]:
# Build every experiment table from the two raw files:
# (feature setup A, B, C) x (target: binary, 5-level, regression) x (subject: mat, por)

# Raw semicolon-separated files, one per subject (math and Portuguese)
df_mat = pd.read_csv("/content/student-mat.csv", sep=';')
df_por = pd.read_csv("/content/student-por.csv", sep=';')

# Band edges and band labels for the 5-level target: the lowest G3 band gets label 5, the highest label 1
bins = [-1, 9, 11, 13, 15, 20]
labels = [5, 4, 3, 2, 1]

# Add both classification targets, derived from the final grade G3, to each subject frame
for _subject_df in (df_mat, df_por):
    # 'Pass' when G3 >= 10, otherwise 'Fail'
    _subject_df['binary'] = np.where(_subject_df['G3'] >= 10, 'Pass', 'Fail')
    # Performance band of G3 according to bins/labels
    _subject_df['5level'] = pd.cut(_subject_df['G3'], bins=bins, labels=labels, include_lowest=True)

# Columns to remove so that exactly one target column remains in each table
_drop_for_binary = ['5level', 'G3']          # 'binary' stays as the target
_drop_for_5level = ['binary', 'G3']          # '5level' stays as the target
_drop_for_regression = ['binary', '5level']  # 'G3' stays as the target

# Extra feature columns removed by the reduced setups (setup A removes nothing extra)
_setup_B_extra = ['G2']
_setup_C_extra = ['G2', 'G1']

# Setup A: all features are kept
df_A_Binary_mat = df_mat.drop(columns=_drop_for_binary)
df_A_5Level_mat = df_mat.drop(columns=_drop_for_5level)
df_A_Regression_mat = df_mat.drop(columns=_drop_for_regression)
df_A_Binary_por = df_por.drop(columns=_drop_for_binary)
df_A_5Level_por = df_por.drop(columns=_drop_for_5level)
df_A_Regression_por = df_por.drop(columns=_drop_for_regression)

# Setup B: G2 is removed
df_B_Binary_mat = df_mat.drop(columns=_setup_B_extra + _drop_for_binary)
df_B_5Level_mat = df_mat.drop(columns=_setup_B_extra + _drop_for_5level)
df_B_Regression_mat = df_mat.drop(columns=_setup_B_extra + _drop_for_regression)
df_B_Binary_por = df_por.drop(columns=_setup_B_extra + _drop_for_binary)
df_B_5Level_por = df_por.drop(columns=_setup_B_extra + _drop_for_5level)
df_B_Regression_por = df_por.drop(columns=_setup_B_extra + _drop_for_regression)

# Setup C: both G1 and G2 are removed
df_C_Binary_mat = df_mat.drop(columns=_setup_C_extra + _drop_for_binary)
df_C_5Level_mat = df_mat.drop(columns=_setup_C_extra + _drop_for_5level)
df_C_Regression_mat = df_mat.drop(columns=_setup_C_extra + _drop_for_regression)
df_C_Binary_por = df_por.drop(columns=_setup_C_extra + _drop_for_binary)
df_C_5Level_por = df_por.drop(columns=_setup_C_extra + _drop_for_5level)
df_C_Regression_por = df_por.drop(columns=_setup_C_extra + _drop_for_regression)

# Define a custom callback for Optuna to show progress using tqdm
class TqdmCallback:
    """Optuna callback that shows study progress on a tqdm progress bar.

    After each trial the bar advances by one and its postfix shows the best
    objective value found so far. The bar is closed once the expected number
    of trials has been reported.

    Args:
        n_trials: Total number of trials the bar should expect.
        metric_name: Postfix label for the best value. When omitted, the label
            is "Best RMSE" for minimizing studies (the original behavior) and
            "Best score" for maximizing studies, so an accuracy-maximizing
            study is no longer labelled as RMSE.
    """

    def __init__(self, n_trials, metric_name=None):
        self.metric_name = metric_name
        # Initialize a progress bar with total number of trials
        self.pbar = tqdm(total=n_trials, desc="Optimization")

    def __call__(self, study, trial):
        # One trial has just finished
        self.pbar.update(1)

        label = self.metric_name
        if label is None:
            # Pick a label that matches the optimization direction of the study
            direction = getattr(study, "direction", None)
            maximizing = getattr(direction, "name", "") == "MAXIMIZE"
            label = "Best score" if maximizing else "Best RMSE"

        try:
            best = f"{study.best_value:.4f}"
        except ValueError:
            # Optuna raises ValueError while no trial has completed yet
            # (e.g. the first trials failed or were pruned)
            best = "n/a"
        self.pbar.set_postfix({label: best})

        # Release the bar once every expected trial has been reported
        total = self.pbar.total
        if total is not None and self.pbar.n >= total:
            self.pbar.close()

In [None]:
# We apply XGBoost to a regression task and use Optuna for hyperparameter tuning.

# Optuna is a powerful hyperparameter optimization framework that automates the search for optimal parameters.
# It uses advanced algorithms like Tree-structured Parzen Estimators (TPE) to efficiently explore the hyperparameter space,
# optimizing performance metrics such as RMSE faster and more effectively than traditional grid or random search.

# Key hyperparameters include:
# - n_estimators: Number of boosting rounds (trees) in the model.
# - max_depth: Maximum depth of each tree, helping to control overfitting by limiting tree complexity.
# - learning_rate: Controls the contribution of each tree; smaller values lead to slower but more precise learning.
# - subsample: Fraction of training data used for each tree, introducing randomness to reduce overfitting.
# - colsample_bytree: Fraction of features sampled for each tree, also helping to prevent overfitting.
# - min_child_weight: Minimum sum of instance weights in a child node, acting as a regularization mechanism.
# - reg_alpha: L1 regularization term, encouraging sparsity and simpler models.
# - reg_lambda: L2 regularization term, penalizing large weights to improve model generalization.

# Define objective function for Optuna.
def objective1(trial, X, y, preprocessor, kf):
    """Optuna objective for regression: mean cross-validated RMSE of an XGBoost pipeline.

    Args:
        trial: Optuna trial used to sample the XGBoost hyperparameters.
        X: Feature table; rows are selected positionally with ``.iloc``.
        y: Target values aligned with ``X``; also selected with ``.iloc``.
        preprocessor: Transformer placed in front of the regressor in the pipeline.
        kf: Splitter whose ``split(X)`` yields (train, validation) index pairs.

    Returns:
        The mean of the per-fold RMSE values.
    """
    # Hyperparameters sampled by Optuna for this trial
    tuned = dict(
        n_estimators=trial.suggest_int('n_estimators', 100, 1000, step=10),  # boosting rounds
        max_depth=trial.suggest_int('max_depth', 3, 10),  # depth limit per tree
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.3, log=True),  # shrinkage
        subsample=trial.suggest_float('subsample', 0.5, 1.0),  # row fraction per tree
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.5, 1.0),  # column fraction per tree
        min_child_weight=trial.suggest_int('min_child_weight', 1, 10),  # min instance weight in a child
        reg_alpha=trial.suggest_float('reg_alpha', 0.1, 10, log=True),  # L1 penalty
        reg_lambda=trial.suggest_float('reg_lambda', 0.1, 10, log=True),  # L2 penalty
    )
    # Settings that stay the same for every trial
    fixed = dict(random_state=42, objective='reg:squarederror', eval_metric='rmse')

    # Preprocessing followed by the XGBoost regressor
    model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', xgb.XGBRegressor(**tuned, **fixed)),
    ])

    def rmse_on_fold(train_rows, val_rows):
        # Fit on the training part of the fold and score on the held-out part
        X_tr, X_va = X.iloc[train_rows].copy(), X.iloc[val_rows].copy()
        y_tr, y_va = y.iloc[train_rows].copy(), y.iloc[val_rows].copy()
        model.fit(X_tr, y_tr)
        return np.sqrt(mean_squared_error(y_va, model.predict(X_va)))

    # Average the RMSE over all folds produced by the splitter
    return np.mean([rmse_on_fold(train_rows, val_rows) for train_rows, val_rows in kf.split(X)])

# Setup 10-fold cross-validation
# Shuffled 10-fold splitter with a fixed seed so every trial sees the same folds
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# Define datasets
datasets = {
    'A_Regression_mat': df_A_Regression_mat,
    'B_Regression_mat': df_B_Regression_mat,
    'C_Regression_mat': df_C_Regression_mat,
    'A_Regression_por': df_A_Regression_por,
    'B_Regression_por': df_B_Regression_por,
    'C_Regression_por': df_C_Regression_por
}

# Number of hyperparameter combinations to try per dataset (same for every dataset)
n_trials = 50

# Iterate over datasets
for name, df in datasets.items():
    print(f"\nProcessing dataset: {name}")

    # Create preprocessor
    # Identify categorical features (strings or categories)
    categorical_features = df.select_dtypes(include=['object', 'category']).columns.tolist()
    # Identify numerical features (ints and floats), excluding the target column
    numerical_features = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
    numerical_features = [col for col in numerical_features if col != 'G3']

    preprocessor = ColumnTransformer(
        transformers=[
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features), # One-hot encode categorical columns
            ('num', 'passthrough', numerical_features) # Pass numerical columns through unchanged (no scaling applied)
        ])

    # Extract features and target
    y = df['G3']
    X = df.drop(columns=['G3'])

    # Run Optuna optimization for this dataset
    print(f"Running {n_trials} optimization trials for {name}...")
    # Seed the sampler so the search is reproducible across kernel restarts;
    # without a seed every run explores different hyperparameters and reports different "best" values.
    sampler = optuna.samplers.TPESampler(seed=42)
    study = optuna.create_study(direction='minimize', sampler=sampler) # Create an Optuna study to minimize RMSE
    progress = TqdmCallback(n_trials)
    try:
        # Start optimization with progress bar callback
        study.optimize(lambda trial: objective1(trial, X, y, preprocessor, kf), n_trials=n_trials, callbacks=[progress])
    finally:
        # Always release the progress bar, even if a trial raises
        progress.pbar.close()

    # Print best parameters
    print(f"\nBest hyperparameters for {name}: {study.best_params}")
    print(f"Best RMSE for {name}: {study.best_value:.4f}")

[I 2025-04-13 18:41:34,823] A new study created in memory with name: no-name-8fab7a82-6e37-4941-b4e4-4dd935af9625



Processing dataset: A_Regression_mat
Running 50 optimization trials for A_Regression_mat...


Optimization:   0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-04-13 18:41:38,215] Trial 0 finished with value: 2.1748328171383458 and parameters: {'n_estimators': 140, 'max_depth': 4, 'learning_rate': 0.010694731500909256, 'subsample': 0.883973990563776, 'colsample_bytree': 0.8236087091148208, 'min_child_weight': 2, 'reg_alpha': 0.37715223903666795, 'reg_lambda': 5.75573304005879}. Best is trial 0 with value: 2.1748328171383458.
[I 2025-04-13 18:41:49,604] Trial 1 finished with value: 1.612965505330438 and parameters: {'n_estimators': 890, 'max_depth': 8, 'learning_rate': 0.08778216033324442, 'subsample': 0.8643717153633388, 'colsample_bytree': 0.7230757697415937, 'min_child_weight': 5, 'reg_alpha': 9.863890233657319, 'reg_lambda': 0.7904756356561569}. Best is trial 1 with value: 1.612965505330438.
[I 2025-04-13 18:42:07,303] Trial 2 finished with value: 1.5023182974196967 and parameters: {'n_estimators': 870, 'max_depth': 9, 'learning_rate': 0.011998026081626895, 'subsample': 0.6982444497952853, 'colsample_bytree': 0.81564062405018, 'min


Best hyperparameters for A_Regression_mat: {'n_estimators': 590, 'max_depth': 8, 'learning_rate': 0.020323102976392896, 'subsample': 0.6322962287929168, 'colsample_bytree': 0.9699206572160829, 'min_child_weight': 8, 'reg_alpha': 0.34801861588336613, 'reg_lambda': 7.87546831623349}
Best RMSE for A_Regression_mat: 1.4834

Processing dataset: B_Regression_mat
Running 50 optimization trials for B_Regression_mat...


Optimization:   0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-04-13 18:47:07,966] Trial 0 finished with value: 2.537279445856306 and parameters: {'n_estimators': 660, 'max_depth': 9, 'learning_rate': 0.09224147088537446, 'subsample': 0.5386643060771249, 'colsample_bytree': 0.70775338558245, 'min_child_weight': 1, 'reg_alpha': 0.1483974164082109, 'reg_lambda': 0.25078227956996263}. Best is trial 0 with value: 2.537279445856306.
[I 2025-04-13 18:47:09,275] Trial 1 finished with value: 2.359882310607384 and parameters: {'n_estimators': 330, 'max_depth': 4, 'learning_rate': 0.068001495354658, 'subsample': 0.8273108591110399, 'colsample_bytree': 0.9133489679632738, 'min_child_weight': 8, 'reg_alpha': 3.4221525921360616, 'reg_lambda': 4.080842893429633}. Best is trial 1 with value: 2.359882310607384.
[I 2025-04-13 18:47:14,504] Trial 2 finished with value: 2.371831086680041 and parameters: {'n_estimators': 710, 'max_depth': 5, 'learning_rate': 0.05290759445945368, 'subsample': 0.7963377937675732, 'colsample_bytree': 0.9945979940357579, 'min_chi


Best hyperparameters for B_Regression_mat: {'n_estimators': 500, 'max_depth': 8, 'learning_rate': 0.011213672596707117, 'subsample': 0.7265547654036166, 'colsample_bytree': 0.9450303109908904, 'min_child_weight': 10, 'reg_alpha': 7.414675309015117, 'reg_lambda': 0.1449402101851111}
Best RMSE for B_Regression_mat: 2.2613

Processing dataset: C_Regression_mat
Running 50 optimization trials for C_Regression_mat...


Optimization:   0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-04-13 18:49:38,220] Trial 0 finished with value: 3.8584491810023644 and parameters: {'n_estimators': 320, 'max_depth': 6, 'learning_rate': 0.019211721328198626, 'subsample': 0.9991458278261036, 'colsample_bytree': 0.5617597337059981, 'min_child_weight': 7, 'reg_alpha': 0.21760329323841845, 'reg_lambda': 3.560564764793572}. Best is trial 0 with value: 3.8584491810023644.
[I 2025-04-13 18:49:41,058] Trial 1 finished with value: 3.980318602117856 and parameters: {'n_estimators': 910, 'max_depth': 10, 'learning_rate': 0.11110829689865072, 'subsample': 0.954254197329135, 'colsample_bytree': 0.8797821919620459, 'min_child_weight': 1, 'reg_alpha': 0.7965224728296927, 'reg_lambda': 0.187485139902057}. Best is trial 0 with value: 3.8584491810023644.
[I 2025-04-13 18:49:44,342] Trial 2 finished with value: 3.9124649887032703 and parameters: {'n_estimators': 200, 'max_depth': 8, 'learning_rate': 0.04872779129640574, 'subsample': 0.983474652449327, 'colsample_bytree': 0.7151299265924325, '


Best hyperparameters for C_Regression_mat: {'n_estimators': 470, 'max_depth': 4, 'learning_rate': 0.013903028268773293, 'subsample': 0.8016832536865943, 'colsample_bytree': 0.9336880867331868, 'min_child_weight': 4, 'reg_alpha': 0.20695499237140552, 'reg_lambda': 0.7220252076277969}
Best RMSE for C_Regression_mat: 3.7904

Processing dataset: A_Regression_por
Running 50 optimization trials for A_Regression_por...


Optimization:   0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-04-13 18:52:20,604] Trial 0 finished with value: 1.2405106486001127 and parameters: {'n_estimators': 620, 'max_depth': 3, 'learning_rate': 0.021802415961367685, 'subsample': 0.7741819291090364, 'colsample_bytree': 0.5468088816036525, 'min_child_weight': 2, 'reg_alpha': 3.7979520506737297, 'reg_lambda': 1.9737491001916214}. Best is trial 0 with value: 1.2405106486001127.
[I 2025-04-13 18:52:23,896] Trial 1 finished with value: 1.3544526201482017 and parameters: {'n_estimators': 610, 'max_depth': 5, 'learning_rate': 0.10162010047742788, 'subsample': 0.6873426828067141, 'colsample_bytree': 0.8019777643926855, 'min_child_weight': 1, 'reg_alpha': 0.353268082642765, 'reg_lambda': 7.9338848429450985}. Best is trial 0 with value: 1.2405106486001127.
[I 2025-04-13 18:52:31,391] Trial 2 finished with value: 1.2381507241765806 and parameters: {'n_estimators': 720, 'max_depth': 7, 'learning_rate': 0.028735744984336474, 'subsample': 0.5215202800520435, 'colsample_bytree': 0.8973395583887511


Best hyperparameters for A_Regression_por: {'n_estimators': 590, 'max_depth': 8, 'learning_rate': 0.014655188360257176, 'subsample': 0.5959527283579, 'colsample_bytree': 0.8798592209572552, 'min_child_weight': 2, 'reg_alpha': 1.631287367964372, 'reg_lambda': 1.3797922413883006}
Best RMSE for A_Regression_por: 1.2045

Processing dataset: B_Regression_por
Running 50 optimization trials for B_Regression_por...


Optimization:   0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-04-13 18:56:13,763] Trial 0 finished with value: 1.7571296987453446 and parameters: {'n_estimators': 710, 'max_depth': 6, 'learning_rate': 0.1185070951105935, 'subsample': 0.8329005919742598, 'colsample_bytree': 0.6082872644985519, 'min_child_weight': 7, 'reg_alpha': 7.559342075407398, 'reg_lambda': 0.44350094711253474}. Best is trial 0 with value: 1.7571296987453446.
[I 2025-04-13 18:56:16,714] Trial 1 finished with value: 1.6956322821102248 and parameters: {'n_estimators': 210, 'max_depth': 5, 'learning_rate': 0.058165108258417364, 'subsample': 0.6961993306047065, 'colsample_bytree': 0.8175752965547611, 'min_child_weight': 5, 'reg_alpha': 0.6390566735404462, 'reg_lambda': 1.9434873324170279}. Best is trial 1 with value: 1.6956322821102248.
[I 2025-04-13 18:56:17,788] Trial 2 finished with value: 1.7147813417711677 and parameters: {'n_estimators': 220, 'max_depth': 4, 'learning_rate': 0.05174627334561117, 'subsample': 0.6531545517760027, 'colsample_bytree': 0.6134634453685023,


Best hyperparameters for B_Regression_por: {'n_estimators': 450, 'max_depth': 5, 'learning_rate': 0.022589556612863205, 'subsample': 0.9935492273274007, 'colsample_bytree': 0.9984409196890032, 'min_child_weight': 7, 'reg_alpha': 7.019451582626208, 'reg_lambda': 0.4010200769396677}
Best RMSE for B_Regression_por: 1.6192

Processing dataset: C_Regression_por
Running 50 optimization trials for C_Regression_por...


Optimization:   0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-04-13 18:58:58,912] Trial 0 finished with value: 2.6728968723260893 and parameters: {'n_estimators': 540, 'max_depth': 6, 'learning_rate': 0.04387214616209587, 'subsample': 0.628976938970589, 'colsample_bytree': 0.689911401053223, 'min_child_weight': 6, 'reg_alpha': 9.514221564172168, 'reg_lambda': 0.9422495490426983}. Best is trial 0 with value: 2.6728968723260893.
[I 2025-04-13 18:59:02,061] Trial 1 finished with value: 2.807015225391491 and parameters: {'n_estimators': 580, 'max_depth': 9, 'learning_rate': 0.21530768015074012, 'subsample': 0.6185266382275003, 'colsample_bytree': 0.7970277473451768, 'min_child_weight': 2, 'reg_alpha': 3.057433294823277, 'reg_lambda': 7.701853886169776}. Best is trial 0 with value: 2.6728968723260893.
[I 2025-04-13 18:59:05,523] Trial 2 finished with value: 2.652303475663972 and parameters: {'n_estimators': 470, 'max_depth': 10, 'learning_rate': 0.016520616093748417, 'subsample': 0.7940097089548986, 'colsample_bytree': 0.5121908518530458, 'min


Best hyperparameters for C_Regression_por: {'n_estimators': 560, 'max_depth': 4, 'learning_rate': 0.012485201480383722, 'subsample': 0.6454247050453034, 'colsample_bytree': 0.5504584512456258, 'min_child_weight': 10, 'reg_alpha': 0.18917146580553376, 'reg_lambda': 0.3834010499173182}
Best RMSE for C_Regression_por: 2.6317


In [None]:
# NEXT, WE APPLY XGBOOST TO THE BINARY AND 5-LEVEL CLASSIFICATION DATASETS

def objective2(trial, X, y, preprocessor, kf):
    """Optuna objective for classification: mean cross-validated accuracy of an XGBoost pipeline.

    The XGBoost objective and evaluation metric are chosen from the number of
    distinct labels in ``y``: two labels use the binary logistic objective with
    ``logloss``; any other count uses the multi-class softmax objective with
    ``mlogloss``.

    Args:
        trial: Optuna trial used to sample the XGBoost hyperparameters.
        X: Feature table; rows are selected positionally with ``.iloc``.
        y: Class labels aligned with ``X``; also selected with ``.iloc``.
        preprocessor: Transformer placed in front of the classifier in the pipeline.
        kf: Splitter whose ``split(X)`` yields (train, validation) index pairs.

    Returns:
        The mean of the per-fold accuracy values (to be maximized).
    """
    # Suggest values for XGBoost hyperparameters
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000, step=10),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2, log=True),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.01, 1.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.01, 1.0, log=True),
        'gamma': trial.suggest_float('gamma', 0, 0.5), # Minimum loss reduction required to make a split
        'random_state': 42,
    }

    # Set objective and metric based on number of classes (computed once)
    n_classes = len(np.unique(y))
    if n_classes == 2:  # Binary classification
        params['objective'] = 'binary:logistic'
        params['eval_metric'] = 'logloss'
    else:  # Multi-class (5-level) classification
        params['objective'] = 'multi:softmax'
        params['num_class'] = n_classes
        # 'logloss' is the binary metric; the multi-class counterpart is 'mlogloss'
        params['eval_metric'] = 'mlogloss'

    # Create XGBoost classifier
    xgb_model = xgb.XGBClassifier(**params)

    # Create pipeline
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', xgb_model)
    ])

    # Cross-validate
    scores = []
    for train_idx, val_idx in kf.split(X):
        X_fold_train, X_fold_val = X.iloc[train_idx].copy(), X.iloc[val_idx].copy()
        y_fold_train, y_fold_val = y.iloc[train_idx].copy(), y.iloc[val_idx].copy()

        # Fit the pipeline on the training data for the current fold
        pipeline.fit(X_fold_train, y_fold_train)
        y_pred = pipeline.predict(X_fold_val)
        # Accuracy: fraction of validation rows classified correctly
        scores.append(accuracy_score(y_fold_val, y_pred))

    # Return positive accuracy to maximize
    return np.mean(scores)

# Setup 10-fold cross-validation
# Shuffled 10-fold splitter with a fixed seed so every trial sees the same folds
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# Classification datasets, each paired with the name of its target column
datasets = {
    # Binary classification datasets
    'A_Binary_mat': (df_A_Binary_mat, 'binary'),
    'B_Binary_mat': (df_B_Binary_mat, 'binary'),
    'C_Binary_mat': (df_C_Binary_mat, 'binary'),
    'A_Binary_por': (df_A_Binary_por, 'binary'),
    'B_Binary_por': (df_B_Binary_por, 'binary'),
    'C_Binary_por': (df_C_Binary_por, 'binary'),
    # 5-level classification datasets
    'A_5Level_mat': (df_A_5Level_mat, '5level'),
    'B_5Level_mat': (df_B_5Level_mat, '5level'),
    'C_5Level_mat': (df_C_5Level_mat, '5level'),
    'A_5Level_por': (df_A_5Level_por, '5level'),
    'B_5Level_por': (df_B_5Level_por, '5level'),
    'C_5Level_por': (df_C_5Level_por, '5level')
}

# Number of hyperparameter combinations to try per dataset (same for every dataset)
n_trials = 50

# Iterate over classification datasets
for name, (df, target) in datasets.items():
    print(f"\nProcessing dataset: {name}")

    # Extract features and target
    y = df[target]
    X = df.drop(columns=[target])

    # Encode the labels as integers starting at 0. The mapping is chosen from the
    # target column itself rather than from a hard-coded list of dataset names.
    if target == 'binary':
        y = y.map({'Fail': 0, 'Pass': 1}) # Convert labels to 0/1
    elif target == '5level':
        y = y.map({1: 0, 2: 1, 3: 2, 4: 3, 5: 4}) # Convert labels to 0–4
    # The 5-level target is categorical (it comes from pd.cut); hand the classifier plain integers
    y = y.astype(int)

    # Create preprocessor (X no longer contains the target column)
    categorical_features = X.select_dtypes(include=['object', 'category']).columns.tolist()
    numerical_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

    preprocessor = ColumnTransformer(
        transformers=[
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
            ('num', 'passthrough', numerical_features)
        ])

    # Run optimization
    print(f"Running {n_trials} optimization trials for {name}...")
    # Seed the sampler so the search is reproducible across kernel restarts;
    # without a seed every run explores different hyperparameters and reports different "best" values.
    sampler = optuna.samplers.TPESampler(seed=42)
    study = optuna.create_study(direction='maximize', sampler=sampler)  # Directly maximize accuracy
    progress = TqdmCallback(n_trials)
    try:
        study.optimize(lambda trial: objective2(trial, X, y, preprocessor, kf), n_trials=n_trials, callbacks=[progress])
    finally:
        # Always release the progress bar, even if a trial raises
        progress.pbar.close()

    # Print best parameters and accuracy
    print(f"\nBest hyperparameters for {name}: {study.best_params}")
    print(f"Best accuracy for {name}: {study.best_value:.4f}")

[I 2025-04-13 19:01:39,356] A new study created in memory with name: no-name-0733d6af-d4a3-4ccf-9fc5-9a513a5b2f20



Processing dataset: A_Binary_mat
Running 50 optimization trials for A_Binary_mat...


Optimization:   0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-04-13 19:01:42,736] Trial 0 finished with value: 0.9189102564102564 and parameters: {'n_estimators': 640, 'max_depth': 9, 'learning_rate': 0.014489652194794292, 'subsample': 0.6995885041357708, 'colsample_bytree': 0.823272397901309, 'min_child_weight': 7, 'reg_alpha': 0.16028658895153275, 'reg_lambda': 0.052242338005195225, 'gamma': 0.19399940956972755}. Best is trial 0 with value: 0.9189102564102564.
[I 2025-04-13 19:01:43,363] Trial 1 finished with value: 0.9164102564102565 and parameters: {'n_estimators': 160, 'max_depth': 3, 'learning_rate': 0.13692560624849262, 'subsample': 0.8237398582688432, 'colsample_bytree': 0.5184072135305667, 'min_child_weight': 10, 'reg_alpha': 0.4847651839172266, 'reg_lambda': 0.16483855599205963, 'gamma': 0.19058121234332287}. Best is trial 0 with value: 0.9189102564102564.
[I 2025-04-13 19:01:44,657] Trial 2 finished with value: 0.923974358974359 and parameters: {'n_estimators': 700, 'max_depth': 4, 'learning_rate': 0.0713988174464494, 'subsampl


Best hyperparameters for A_Binary_mat: {'n_estimators': 650, 'max_depth': 8, 'learning_rate': 0.013583747962663978, 'subsample': 0.7641069145331185, 'colsample_bytree': 0.8198301903465339, 'min_child_weight': 2, 'reg_alpha': 0.07603301413967815, 'reg_lambda': 0.06659226316411504, 'gamma': 0.16447150751520284}
Best accuracy for A_Binary_mat: 0.9266

Processing dataset: B_Binary_mat
Running 50 optimization trials for B_Binary_mat...


Optimization:   0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-04-13 19:03:06,111] Trial 0 finished with value: 0.8482051282051282 and parameters: {'n_estimators': 730, 'max_depth': 10, 'learning_rate': 0.04779692863412705, 'subsample': 0.9408832008620982, 'colsample_bytree': 0.5083020056410718, 'min_child_weight': 9, 'reg_alpha': 0.29380965332865766, 'reg_lambda': 0.048481384455111506, 'gamma': 0.21049138885659435}. Best is trial 0 with value: 0.8482051282051282.
[I 2025-04-13 19:03:06,799] Trial 1 finished with value: 0.8403846153846153 and parameters: {'n_estimators': 180, 'max_depth': 5, 'learning_rate': 0.03495809658810584, 'subsample': 0.5708400569391392, 'colsample_bytree': 0.6631607133239472, 'min_child_weight': 10, 'reg_alpha': 0.014961441912879585, 'reg_lambda': 0.01394685493721555, 'gamma': 0.206938239520852}. Best is trial 0 with value: 0.8482051282051282.
[I 2025-04-13 19:03:10,891] Trial 2 finished with value: 0.8150641025641026 and parameters: {'n_estimators': 820, 'max_depth': 4, 'learning_rate': 0.01742557829958734, 'subsa


Best hyperparameters for B_Binary_mat: {'n_estimators': 200, 'max_depth': 8, 'learning_rate': 0.12129386358403305, 'subsample': 0.7823687853124639, 'colsample_bytree': 0.9240424969893589, 'min_child_weight': 9, 'reg_alpha': 0.2740525612944526, 'reg_lambda': 0.34709305191319734, 'gamma': 0.0175516363104192}
Best accuracy for B_Binary_mat: 0.8558

Processing dataset: C_Binary_mat
Running 50 optimization trials for C_Binary_mat...


Optimization:   0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-04-13 19:04:53,476] Trial 0 finished with value: 0.6887820512820514 and parameters: {'n_estimators': 180, 'max_depth': 8, 'learning_rate': 0.03403657266742079, 'subsample': 0.9988212748254008, 'colsample_bytree': 0.7164487562280818, 'min_child_weight': 7, 'reg_alpha': 0.8155938193136147, 'reg_lambda': 0.019495896335114848, 'gamma': 0.2675120007887425}. Best is trial 0 with value: 0.6887820512820514.
[I 2025-04-13 19:04:56,591] Trial 1 finished with value: 0.6987179487179487 and parameters: {'n_estimators': 200, 'max_depth': 5, 'learning_rate': 0.03999906243547439, 'subsample': 0.9493769124409837, 'colsample_bytree': 0.8650004457105311, 'min_child_weight': 2, 'reg_alpha': 0.2162401084600122, 'reg_lambda': 0.07282455050110645, 'gamma': 0.48650593028502387}. Best is trial 1 with value: 0.6987179487179487.
[I 2025-04-13 19:04:58,556] Trial 2 finished with value: 0.7064102564102563 and parameters: {'n_estimators': 740, 'max_depth': 4, 'learning_rate': 0.012739277637308782, 'subsampl


Best hyperparameters for C_Binary_mat: {'n_estimators': 420, 'max_depth': 9, 'learning_rate': 0.01388084080576899, 'subsample': 0.7575947909295877, 'colsample_bytree': 0.8704319368962993, 'min_child_weight': 10, 'reg_alpha': 0.018760524223901218, 'reg_lambda': 0.040829229360886586, 'gamma': 0.363263545873292}
Best accuracy for C_Binary_mat: 0.7245

Processing dataset: A_Binary_por
Running 50 optimization trials for A_Binary_por...


Optimization:   0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-04-13 19:06:04,027] Trial 0 finished with value: 0.9291586538461539 and parameters: {'n_estimators': 430, 'max_depth': 9, 'learning_rate': 0.044641031400439395, 'subsample': 0.9294532939642464, 'colsample_bytree': 0.9049083800947748, 'min_child_weight': 5, 'reg_alpha': 0.042423132796952305, 'reg_lambda': 0.26768859841411263, 'gamma': 0.3734277491005534}. Best is trial 0 with value: 0.9291586538461539.
[I 2025-04-13 19:06:05,708] Trial 1 finished with value: 0.9353365384615385 and parameters: {'n_estimators': 640, 'max_depth': 10, 'learning_rate': 0.05688477069318591, 'subsample': 0.5852281917122892, 'colsample_bytree': 0.7654508514096168, 'min_child_weight': 2, 'reg_alpha': 0.8499695992714715, 'reg_lambda': 0.029732774242929267, 'gamma': 0.4484376279800279}. Best is trial 1 with value: 0.9353365384615385.
[I 2025-04-13 19:06:08,822] Trial 2 finished with value: 0.9384134615384616 and parameters: {'n_estimators': 500, 'max_depth': 3, 'learning_rate': 0.10352094450449523, 'subsam


Best hyperparameters for A_Binary_por: {'n_estimators': 910, 'max_depth': 4, 'learning_rate': 0.027671088361991048, 'subsample': 0.6241324930044161, 'colsample_bytree': 0.6595009034756953, 'min_child_weight': 3, 'reg_alpha': 0.02733820112353735, 'reg_lambda': 0.14778126831169425, 'gamma': 0.44787112003830276}
Best accuracy for A_Binary_por: 0.9400

Processing dataset: B_Binary_por
Running 50 optimization trials for B_Binary_por...


Optimization:   0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-04-13 19:07:48,334] Trial 0 finished with value: 0.9060817307692307 and parameters: {'n_estimators': 690, 'max_depth': 3, 'learning_rate': 0.17690531051780262, 'subsample': 0.8883875235680835, 'colsample_bytree': 0.7748285637863132, 'min_child_weight': 7, 'reg_alpha': 0.07998886994073715, 'reg_lambda': 0.010381984314453582, 'gamma': 0.4304427458354961}. Best is trial 0 with value: 0.9060817307692307.
[I 2025-04-13 19:07:51,101] Trial 1 finished with value: 0.9014663461538461 and parameters: {'n_estimators': 890, 'max_depth': 4, 'learning_rate': 0.020849394677639733, 'subsample': 0.8320018212261131, 'colsample_bytree': 0.75414200516303, 'min_child_weight': 4, 'reg_alpha': 0.01941244626286256, 'reg_lambda': 0.0659787772617666, 'gamma': 0.29711155894649754}. Best is trial 0 with value: 0.9060817307692307.
[I 2025-04-13 19:07:52,992] Trial 2 finished with value: 0.9030048076923076 and parameters: {'n_estimators': 720, 'max_depth': 5, 'learning_rate': 0.10061907326800819, 'subsample


Best hyperparameters for B_Binary_por: {'n_estimators': 320, 'max_depth': 8, 'learning_rate': 0.020683765660951346, 'subsample': 0.6680298461819386, 'colsample_bytree': 0.7160095847671739, 'min_child_weight': 10, 'reg_alpha': 0.07085181288217547, 'reg_lambda': 0.7455817903852436, 'gamma': 0.31024210049631856}
Best accuracy for B_Binary_por: 0.9199

Processing dataset: C_Binary_por
Running 50 optimization trials for C_Binary_por...


Optimization:   0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-04-13 19:08:59,886] Trial 0 finished with value: 0.8598798076923077 and parameters: {'n_estimators': 400, 'max_depth': 4, 'learning_rate': 0.10476090372207848, 'subsample': 0.6084479842303943, 'colsample_bytree': 0.6942803478342663, 'min_child_weight': 1, 'reg_alpha': 0.11394177256780068, 'reg_lambda': 0.2987568459646315, 'gamma': 0.11851922659153946}. Best is trial 0 with value: 0.8598798076923077.
[I 2025-04-13 19:09:03,713] Trial 1 finished with value: 0.8491346153846153 and parameters: {'n_estimators': 720, 'max_depth': 7, 'learning_rate': 0.03372214024817388, 'subsample': 0.7812614012848347, 'colsample_bytree': 0.9286431215335155, 'min_child_weight': 10, 'reg_alpha': 0.15895255048021542, 'reg_lambda': 0.2868952610149331, 'gamma': 0.057672756468328645}. Best is trial 0 with value: 0.8598798076923077.
[I 2025-04-13 19:09:05,263] Trial 2 finished with value: 0.8413942307692308 and parameters: {'n_estimators': 490, 'max_depth': 8, 'learning_rate': 0.13556006057272288, 'subsamp


Best hyperparameters for C_Binary_por: {'n_estimators': 370, 'max_depth': 3, 'learning_rate': 0.013743726968807442, 'subsample': 0.5818728814978958, 'colsample_bytree': 0.7119247191198648, 'min_child_weight': 1, 'reg_alpha': 0.11929175002840363, 'reg_lambda': 0.03833574970645264, 'gamma': 0.49023039924428713}
Best accuracy for C_Binary_por: 0.8753

Processing dataset: A_5Level_mat
Running 50 optimization trials for A_5Level_mat...


Optimization:   0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-04-13 19:10:37,622] Trial 0 finished with value: 0.7518589743589743 and parameters: {'n_estimators': 820, 'max_depth': 3, 'learning_rate': 0.034253494860435725, 'subsample': 0.8293839453325887, 'colsample_bytree': 0.6382017893844929, 'min_child_weight': 6, 'reg_alpha': 0.41845628122168177, 'reg_lambda': 0.08201711446801725, 'gamma': 0.39694456127496}. Best is trial 0 with value: 0.7518589743589743.
[I 2025-04-13 19:10:43,443] Trial 1 finished with value: 0.7090384615384615 and parameters: {'n_estimators': 510, 'max_depth': 3, 'learning_rate': 0.09217512888546592, 'subsample': 0.5117748678334217, 'colsample_bytree': 0.922722925303745, 'min_child_weight': 5, 'reg_alpha': 0.051480532610763526, 'reg_lambda': 0.23127824622832904, 'gamma': 0.1520135834670261}. Best is trial 0 with value: 0.7518589743589743.
[I 2025-04-13 19:10:50,699] Trial 2 finished with value: 0.7193589743589743 and parameters: {'n_estimators': 930, 'max_depth': 5, 'learning_rate': 0.0974603293739713, 'subsample':


Best hyperparameters for A_5Level_mat: {'n_estimators': 170, 'max_depth': 9, 'learning_rate': 0.011345418425074904, 'subsample': 0.9978636644427255, 'colsample_bytree': 0.9358482561306183, 'min_child_weight': 6, 'reg_alpha': 0.5547745807801112, 'reg_lambda': 0.5230353302698785, 'gamma': 0.1637961164042258}
Best accuracy for A_5Level_mat: 0.7821

Processing dataset: B_5Level_mat
Running 50 optimization trials for B_5Level_mat...


Optimization:   0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-04-13 19:13:54,577] Trial 0 finished with value: 0.5743589743589743 and parameters: {'n_estimators': 140, 'max_depth': 9, 'learning_rate': 0.014676122488399519, 'subsample': 0.6578395203054066, 'colsample_bytree': 0.8196632774627217, 'min_child_weight': 4, 'reg_alpha': 0.2229318963624058, 'reg_lambda': 0.9409829045772788, 'gamma': 0.38799864682233026}. Best is trial 0 with value: 0.5743589743589743.
[I 2025-04-13 19:13:56,807] Trial 1 finished with value: 0.5717948717948718 and parameters: {'n_estimators': 220, 'max_depth': 10, 'learning_rate': 0.04346975584609283, 'subsample': 0.5352813781152793, 'colsample_bytree': 0.6228503689457685, 'min_child_weight': 9, 'reg_alpha': 0.3722792617801196, 'reg_lambda': 0.36707864680758157, 'gamma': 0.46238448625836487}. Best is trial 0 with value: 0.5743589743589743.
[I 2025-04-13 19:14:01,189] Trial 2 finished with value: 0.5696794871794871 and parameters: {'n_estimators': 550, 'max_depth': 9, 'learning_rate': 0.05904157120440104, 'subsampl


Best hyperparameters for B_5Level_mat: {'n_estimators': 970, 'max_depth': 6, 'learning_rate': 0.010509302146657353, 'subsample': 0.6538707038208171, 'colsample_bytree': 0.9620772906320718, 'min_child_weight': 1, 'reg_alpha': 0.9968301992170565, 'reg_lambda': 0.9353409158710099, 'gamma': 0.21130716262829444}
Best accuracy for B_5Level_mat: 0.5819

Processing dataset: C_5Level_mat
Running 50 optimization trials for C_5Level_mat...


Optimization:   0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-04-13 19:18:59,370] Trial 0 finished with value: 0.32166666666666666 and parameters: {'n_estimators': 300, 'max_depth': 6, 'learning_rate': 0.05766547156968393, 'subsample': 0.6123993966388653, 'colsample_bytree': 0.9941133808141875, 'min_child_weight': 8, 'reg_alpha': 0.4211343910755292, 'reg_lambda': 0.6635996500793344, 'gamma': 0.04056614996507285}. Best is trial 0 with value: 0.32166666666666666.
[I 2025-04-13 19:19:01,680] Trial 1 finished with value: 0.29358974358974355 and parameters: {'n_estimators': 170, 'max_depth': 4, 'learning_rate': 0.08020529503109242, 'subsample': 0.5318412967220959, 'colsample_bytree': 0.8089386705921908, 'min_child_weight': 2, 'reg_alpha': 0.5999153632586923, 'reg_lambda': 0.023087323688684856, 'gamma': 0.3574015830706042}. Best is trial 0 with value: 0.32166666666666666.
[I 2025-04-13 19:19:11,883] Trial 2 finished with value: 0.29634615384615387 and parameters: {'n_estimators': 890, 'max_depth': 9, 'learning_rate': 0.07584718203112094, 'subsa


Best hyperparameters for C_5Level_mat: {'n_estimators': 170, 'max_depth': 4, 'learning_rate': 0.013605564797512975, 'subsample': 0.6850123931725215, 'colsample_bytree': 0.6114346266695785, 'min_child_weight': 6, 'reg_alpha': 0.02823288163981518, 'reg_lambda': 0.010111128675312095, 'gamma': 0.27840721738003993}
Best accuracy for C_5Level_mat: 0.3490

Processing dataset: A_5Level_por
Running 50 optimization trials for A_5Level_por...


Optimization:   0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-04-13 19:23:10,783] Trial 0 finished with value: 0.7241346153846154 and parameters: {'n_estimators': 600, 'max_depth': 3, 'learning_rate': 0.10502558132505696, 'subsample': 0.5034518449125913, 'colsample_bytree': 0.6913383620895007, 'min_child_weight': 6, 'reg_alpha': 0.22066163511787834, 'reg_lambda': 0.5391769896189659, 'gamma': 0.43838787775858196}. Best is trial 0 with value: 0.7241346153846154.
[I 2025-04-13 19:23:21,854] Trial 1 finished with value: 0.7226201923076924 and parameters: {'n_estimators': 960, 'max_depth': 10, 'learning_rate': 0.04080436835539726, 'subsample': 0.6447658581914781, 'colsample_bytree': 0.5019759510338281, 'min_child_weight': 1, 'reg_alpha': 0.010432078263188815, 'reg_lambda': 0.330528110729213, 'gamma': 0.2774979830376101}. Best is trial 0 with value: 0.7241346153846154.
[I 2025-04-13 19:23:30,670] Trial 2 finished with value: 0.7318990384615385 and parameters: {'n_estimators': 910, 'max_depth': 5, 'learning_rate': 0.19964820372529737, 'subsample


Best hyperparameters for A_5Level_por: {'n_estimators': 370, 'max_depth': 8, 'learning_rate': 0.010058441338521502, 'subsample': 0.7486150677029069, 'colsample_bytree': 0.6894214941015352, 'min_child_weight': 7, 'reg_alpha': 0.07644259112025795, 'reg_lambda': 0.013914421646943013, 'gamma': 0.4406052486230606}
Best accuracy for A_5Level_por: 0.7642

Processing dataset: B_5Level_por
Running 50 optimization trials for B_5Level_por...


Optimization:   0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-04-13 19:30:42,114] Trial 0 finished with value: 0.5731971153846154 and parameters: {'n_estimators': 770, 'max_depth': 9, 'learning_rate': 0.01938735210317355, 'subsample': 0.6583375791648087, 'colsample_bytree': 0.8574603779039479, 'min_child_weight': 10, 'reg_alpha': 0.6088287502276422, 'reg_lambda': 0.01421329495930986, 'gamma': 0.23161871751344676}. Best is trial 0 with value: 0.5731971153846154.
[I 2025-04-13 19:30:48,744] Trial 1 finished with value: 0.5515384615384615 and parameters: {'n_estimators': 400, 'max_depth': 6, 'learning_rate': 0.051727466510944946, 'subsample': 0.9063386784649958, 'colsample_bytree': 0.9872945460018033, 'min_child_weight': 8, 'reg_alpha': 0.0821432251424606, 'reg_lambda': 0.08308757089425342, 'gamma': 0.2666473574192939}. Best is trial 0 with value: 0.5731971153846154.
[I 2025-04-13 19:30:55,965] Trial 2 finished with value: 0.5701442307692307 and parameters: {'n_estimators': 730, 'max_depth': 4, 'learning_rate': 0.03598926811706168, 'subsampl


Best hyperparameters for B_5Level_por: {'n_estimators': 100, 'max_depth': 8, 'learning_rate': 0.014298312939470724, 'subsample': 0.6259514004174477, 'colsample_bytree': 0.998488155630813, 'min_child_weight': 9, 'reg_alpha': 0.019183204583051974, 'reg_lambda': 0.10976598628247725, 'gamma': 0.2827295097921428}
Best accuracy for B_5Level_por: 0.6148

Processing dataset: C_5Level_por
Running 50 optimization trials for C_5Level_por...


Optimization:   0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-04-13 19:35:19,124] Trial 0 finished with value: 0.3635817307692308 and parameters: {'n_estimators': 330, 'max_depth': 5, 'learning_rate': 0.014138881504178817, 'subsample': 0.5344318714492101, 'colsample_bytree': 0.690204668719754, 'min_child_weight': 2, 'reg_alpha': 0.08049486254622261, 'reg_lambda': 0.2986072493575224, 'gamma': 0.24683795352242172}. Best is trial 0 with value: 0.3635817307692308.
[I 2025-04-13 19:35:28,413] Trial 1 finished with value: 0.35122596153846153 and parameters: {'n_estimators': 590, 'max_depth': 6, 'learning_rate': 0.013456655981117263, 'subsample': 0.6082261345987537, 'colsample_bytree': 0.7312895891332724, 'min_child_weight': 6, 'reg_alpha': 0.06356686282725492, 'reg_lambda': 0.16558263071962276, 'gamma': 0.42503728122473383}. Best is trial 0 with value: 0.3635817307692308.
[I 2025-04-13 19:35:41,159] Trial 2 finished with value: 0.3496394230769231 and parameters: {'n_estimators': 920, 'max_depth': 5, 'learning_rate': 0.047031934166765, 'subsampl


Best hyperparameters for C_5Level_por: {'n_estimators': 210, 'max_depth': 9, 'learning_rate': 0.034323849912553384, 'subsample': 0.9052886125379356, 'colsample_bytree': 0.7440507651725391, 'min_child_weight': 1, 'reg_alpha': 0.5930506763729393, 'reg_lambda': 0.0259495815726823, 'gamma': 0.4804965794008174}
Best accuracy for C_5Level_por: 0.3790


In [None]:
# WE TRIED TO TUNE THE HYPERPARAMETERS OF RANDOM FOREST FOR THE REGRESSION DATASETS.

# Define objective function for Optuna
def objective(trial, X, y, preprocessor, kf):
    """Optuna objective: mean cross-validated RMSE of a Random Forest regressor.

    Args:
        trial: Optuna trial used to suggest the hyperparameters.
        X: Feature table; rows are selected positionally through ``.iloc``.
        y: Target values; rows are selected positionally through ``.iloc``.
        preprocessor: Transformer placed in front of the regressor in the pipeline.
        kf: Splitter whose ``split(X)`` yields ``(train_idx, val_idx)`` pairs.

    Returns:
        The mean of the per-fold RMSE values (lower is better).
    """
    # Search space for the Random Forest hyperparameters
    rf_params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000, step=5),
        'max_depth': trial.suggest_int('max_depth', 6, 30, step=3),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
    }

    # Preprocessing + model in a single estimator, re-fitted on every fold
    model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', RandomForestRegressor(random_state=42, **rf_params))
    ])

    def rmse_on_fold(train_idx, val_idx):
        """Fit on the training rows and return the RMSE on the validation rows."""
        X_fit, X_holdout = X.iloc[train_idx].copy(), X.iloc[val_idx].copy()
        y_fit, y_holdout = y.iloc[train_idx].copy(), y.iloc[val_idx].copy()

        model.fit(X_fit, y_fit)
        predictions = model.predict(X_holdout)
        return np.sqrt(mean_squared_error(y_holdout, predictions))

    fold_errors = [rmse_on_fold(train_idx, val_idx) for train_idx, val_idx in kf.split(X)]
    return np.mean(fold_errors)

# Setup 10-fold cross-validation (shuffled with a fixed seed, so every trial is
# scored on the same folds)
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# Regression datasets to tune on, keyed by the name used in the printed report
datasets = {
    'A_Regression_mat': df_A_Regression_mat,
    'B_Regression_mat': df_B_Regression_mat,
    'C_Regression_mat': df_C_Regression_mat,
    'A_Regression_por': df_A_Regression_por,
    'B_Regression_por': df_B_Regression_por,
    'C_Regression_por': df_C_Regression_por
}

for name, df in datasets.items():
    print(f"\nProcessing dataset: {name}")

    # Extract features and target
    y = df['G3']
    X = df.drop(columns=['G3'])

    # Build the column lists from the feature frame only, so the target can
    # never end up in either list.
    categorical_features = X.select_dtypes(include=['object', 'category']).columns.tolist()
    numerical_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

    # One-hot encode the categorical columns and pass the numeric ones through.
    # Columns that are in neither list are dropped, because ColumnTransformer's
    # default is remainder='drop'.
    preprocessor = ColumnTransformer(
        transformers=[
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
            ('num', 'passthrough', numerical_features)
        ])

    # Run optimization
    n_trials = 50
    print(f"Running {n_trials} optimization trials for {name}...")
    # Seed the sampler: the folds and the forest are already seeded with 42, but
    # an unseeded sampler proposes different hyperparameters on every run, so the
    # reported best parameters could not be reproduced after a kernel restart.
    study = optuna.create_study(
        direction='minimize',
        sampler=optuna.samplers.TPESampler(seed=42)
    )
    # TqdmCallback is not defined in this cell; it must already exist in the kernel.
    study.optimize(
        lambda trial: objective(trial, X, y, preprocessor, kf),
        n_trials=n_trials,
        callbacks=[TqdmCallback(n_trials)]
    )

    # Print best parameters
    print(f"\nBest hyperparameters for {name}: {study.best_params}")
    print(f"Best RMSE for {name}: {study.best_value:.4f}")

[I 2025-04-21 05:15:56,005] A new study created in memory with name: no-name-54182969-5938-476b-b253-6f12195f2e5f



Processing dataset: A_Regression_mat
Running 50 optimization trials for A_Regression_mat...


Optimization:   0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-04-21 05:16:02,496] Trial 0 finished with value: 1.53726514820533 and parameters: {'n_estimators': 230, 'max_depth': 15, 'min_samples_split': 3, 'min_samples_leaf': 7}. Best is trial 0 with value: 1.53726514820533.
[I 2025-04-21 05:16:26,463] Trial 1 finished with value: 1.5306647008057872 and parameters: {'n_estimators': 820, 'max_depth': 30, 'min_samples_split': 7, 'min_samples_leaf': 4}. Best is trial 1 with value: 1.5306647008057872.
[I 2025-04-21 05:16:41,197] Trial 2 finished with value: 1.5286762416331403 and parameters: {'n_estimators': 440, 'max_depth': 24, 'min_samples_split': 5, 'min_samples_leaf': 2}. Best is trial 2 with value: 1.5286762416331403.
[I 2025-04-21 05:16:45,655] Trial 3 finished with value: 1.5475572774938713 and parameters: {'n_estimators': 190, 'max_depth': 30, 'min_samples_split': 7, 'min_samples_leaf': 8}. Best is trial 2 with value: 1.5286762416331403.
[I 2025-04-21 05:16:51,719] Trial 4 finished with value: 1.5310533538541145 and parameters: {'n_


Best hyperparameters for A_Regression_mat: {'n_estimators': 150, 'max_depth': 30, 'min_samples_split': 6, 'min_samples_leaf': 3}
Best RMSE for A_Regression_mat: 1.5166

Processing dataset: B_Regression_mat
Running 50 optimization trials for B_Regression_mat...


Optimization:   0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-04-21 05:26:08,307] Trial 0 finished with value: 2.27429085168864 and parameters: {'n_estimators': 465, 'max_depth': 27, 'min_samples_split': 5, 'min_samples_leaf': 2}. Best is trial 0 with value: 2.27429085168864.
[I 2025-04-21 05:26:16,628] Trial 1 finished with value: 2.275330786367008 and parameters: {'n_estimators': 290, 'max_depth': 12, 'min_samples_split': 5, 'min_samples_leaf': 5}. Best is trial 0 with value: 2.27429085168864.
[I 2025-04-21 05:26:28,404] Trial 2 finished with value: 2.2706161906572064 and parameters: {'n_estimators': 400, 'max_depth': 30, 'min_samples_split': 8, 'min_samples_leaf': 4}. Best is trial 2 with value: 2.2706161906572064.
[I 2025-04-21 05:26:45,167] Trial 3 finished with value: 2.28024104743892 and parameters: {'n_estimators': 710, 'max_depth': 24, 'min_samples_split': 5, 'min_samples_leaf': 8}. Best is trial 2 with value: 2.2706161906572064.
[I 2025-04-21 05:26:58,550] Trial 4 finished with value: 2.2782409279440823 and parameters: {'n_estim


Best hyperparameters for B_Regression_mat: {'n_estimators': 670, 'max_depth': 6, 'min_samples_split': 7, 'min_samples_leaf': 3}
Best RMSE for B_Regression_mat: 2.2636

Processing dataset: C_Regression_mat
Running 50 optimization trials for C_Regression_mat...


Optimization:   0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-04-21 05:36:50,543] Trial 0 finished with value: 3.8212352599777866 and parameters: {'n_estimators': 805, 'max_depth': 24, 'min_samples_split': 3, 'min_samples_leaf': 5}. Best is trial 0 with value: 3.8212352599777866.
[I 2025-04-21 05:36:58,041] Trial 1 finished with value: 3.8348535302238647 and parameters: {'n_estimators': 335, 'max_depth': 30, 'min_samples_split': 10, 'min_samples_leaf': 9}. Best is trial 0 with value: 3.8212352599777866.
[I 2025-04-21 05:37:19,499] Trial 2 finished with value: 3.794329982797207 and parameters: {'n_estimators': 670, 'max_depth': 21, 'min_samples_split': 6, 'min_samples_leaf': 3}. Best is trial 2 with value: 3.794329982797207.
[I 2025-04-21 05:37:52,955] Trial 3 finished with value: 3.7932449336121317 and parameters: {'n_estimators': 895, 'max_depth': 27, 'min_samples_split': 10, 'min_samples_leaf': 2}. Best is trial 3 with value: 3.7932449336121317.
[I 2025-04-21 05:38:03,415] Trial 4 finished with value: 3.7987055642697327 and parameters: 


Best hyperparameters for C_Regression_mat: {'n_estimators': 900, 'max_depth': 15, 'min_samples_split': 3, 'min_samples_leaf': 1}
Best RMSE for C_Regression_mat: 3.7853

Processing dataset: A_Regression_por
Running 50 optimization trials for A_Regression_por...


Optimization:   0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-04-21 05:58:00,560] Trial 0 finished with value: 1.22289960529209 and parameters: {'n_estimators': 890, 'max_depth': 21, 'min_samples_split': 8, 'min_samples_leaf': 7}. Best is trial 0 with value: 1.22289960529209.
[I 2025-04-21 05:58:29,382] Trial 1 finished with value: 1.2502396292916402 and parameters: {'n_estimators': 865, 'max_depth': 21, 'min_samples_split': 8, 'min_samples_leaf': 9}. Best is trial 0 with value: 1.22289960529209.
[I 2025-04-21 05:58:47,834] Trial 2 finished with value: 1.2377314179604884 and parameters: {'n_estimators': 550, 'max_depth': 27, 'min_samples_split': 8, 'min_samples_leaf': 8}. Best is trial 0 with value: 1.22289960529209.
[I 2025-04-21 05:59:00,255] Trial 3 finished with value: 1.236114246027076 and parameters: {'n_estimators': 360, 'max_depth': 24, 'min_samples_split': 10, 'min_samples_leaf': 8}. Best is trial 0 with value: 1.22289960529209.
[I 2025-04-21 05:59:11,524] Trial 4 finished with value: 1.2287046077439059 and parameters: {'n_estima


Best hyperparameters for A_Regression_por: {'n_estimators': 380, 'max_depth': 6, 'min_samples_split': 2, 'min_samples_leaf': 3}
Best RMSE for A_Regression_por: 1.1994

Processing dataset: B_Regression_por
Running 50 optimization trials for B_Regression_por...


Optimization:   0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-04-21 06:15:20,080] Trial 0 finished with value: 1.682612223024158 and parameters: {'n_estimators': 520, 'max_depth': 21, 'min_samples_split': 2, 'min_samples_leaf': 8}. Best is trial 0 with value: 1.682612223024158.
[I 2025-04-21 06:15:25,883] Trial 1 finished with value: 1.6835710819007086 and parameters: {'n_estimators': 160, 'max_depth': 6, 'min_samples_split': 3, 'min_samples_leaf': 4}. Best is trial 0 with value: 1.682612223024158.
[I 2025-04-21 06:15:46,048] Trial 2 finished with value: 1.693707329382848 and parameters: {'n_estimators': 430, 'max_depth': 15, 'min_samples_split': 7, 'min_samples_leaf': 2}. Best is trial 0 with value: 1.682612223024158.
[I 2025-04-21 06:15:58,779] Trial 3 finished with value: 1.6909411798801852 and parameters: {'n_estimators': 365, 'max_depth': 6, 'min_samples_split': 10, 'min_samples_leaf': 2}. Best is trial 0 with value: 1.682612223024158.
[I 2025-04-21 06:16:18,806] Trial 4 finished with value: 1.6811680378006042 and parameters: {'n_est


Best hyperparameters for B_Regression_por: {'n_estimators': 115, 'max_depth': 12, 'min_samples_split': 4, 'min_samples_leaf': 7}
Best RMSE for B_Regression_por: 1.6731

Processing dataset: C_Regression_por
Running 50 optimization trials for C_Regression_por...


Optimization:   0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-04-21 06:32:49,435] Trial 0 finished with value: 2.6843752145267734 and parameters: {'n_estimators': 355, 'max_depth': 15, 'min_samples_split': 5, 'min_samples_leaf': 6}. Best is trial 0 with value: 2.6843752145267734.
[I 2025-04-21 06:33:14,217] Trial 1 finished with value: 2.6914370653494415 and parameters: {'n_estimators': 600, 'max_depth': 15, 'min_samples_split': 6, 'min_samples_leaf': 4}. Best is trial 0 with value: 2.6843752145267734.
[I 2025-04-21 06:33:45,292] Trial 2 finished with value: 2.689632134768263 and parameters: {'n_estimators': 745, 'max_depth': 18, 'min_samples_split': 5, 'min_samples_leaf': 4}. Best is trial 0 with value: 2.6843752145267734.
[I 2025-04-21 06:34:14,562] Trial 3 finished with value: 2.6876488022817586 and parameters: {'n_estimators': 755, 'max_depth': 12, 'min_samples_split': 10, 'min_samples_leaf': 4}. Best is trial 0 with value: 2.6843752145267734.
[I 2025-04-21 06:34:42,829] Trial 4 finished with value: 2.6849626498569714 and parameters: 


Best hyperparameters for C_Regression_por: {'n_estimators': 680, 'max_depth': 24, 'min_samples_split': 5, 'min_samples_leaf': 6}
Best RMSE for C_Regression_por: 2.6807


In [None]:
# WE TRIED TO TUNE THE HYPERPARAMETERS OF RANDOM FOREST FOR THE CLASSIFICATION DATASETS.

# Define objective function for Optuna
def objective3(trial, X, y, preprocessor, kf):
    """Optuna objective: mean cross-validated accuracy of a Random Forest classifier.

    Args:
        trial: Optuna trial used to suggest the hyperparameters.
        X: Feature table; rows are selected positionally through ``.iloc``.
        y: Class labels; rows are selected positionally through ``.iloc``.
        preprocessor: Transformer placed in front of the classifier in the pipeline.
        kf: Splitter whose ``split(X)`` yields ``(train_idx, val_idx)`` pairs.

    Returns:
        The mean of the per-fold accuracy values (higher is better).
    """
    # Search space for the Random Forest hyperparameters
    rf_params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 6, 30),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
    }

    # Preprocessing + model in a single estimator, re-fitted on every fold
    model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(random_state=42, **rf_params))
    ])

    def accuracy_on_fold(train_idx, val_idx):
        """Fit on the training rows and return the accuracy on the validation rows."""
        X_fit, X_holdout = X.iloc[train_idx].copy(), X.iloc[val_idx].copy()
        y_fit, y_holdout = y.iloc[train_idx].copy(), y.iloc[val_idx].copy()

        model.fit(X_fit, y_fit)
        predictions = model.predict(X_holdout)
        return accuracy_score(y_holdout, predictions)

    fold_accuracies = [accuracy_on_fold(train_idx, val_idx) for train_idx, val_idx in kf.split(X)]

    # Plain (positive) accuracy: the study is expected to maximize this value
    return np.mean(fold_accuracies)

# Setup 10-fold cross-validation (shuffled with a fixed seed, so every trial is
# scored on the same folds)
# NOTE(review): plain KFold does not stratify by class label; confirm that is
# acceptable for the 5-level targets.
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# Classification datasets, keyed by report name -> (frame, target column name)
datasets = {
    # Binary classification datasets
    'A_Binary_mat': (df_A_Binary_mat, 'binary'),
    'B_Binary_mat': (df_B_Binary_mat, 'binary'),
    'C_Binary_mat': (df_C_Binary_mat, 'binary'),
    'A_Binary_por': (df_A_Binary_por, 'binary'),
    'B_Binary_por': (df_B_Binary_por, 'binary'),
    'C_Binary_por': (df_C_Binary_por, 'binary'),
    # 5-level classification datasets
    'A_5Level_mat': (df_A_5Level_mat, '5level'),
    'B_5Level_mat': (df_B_5Level_mat, '5level'),
    'C_5Level_mat': (df_C_5Level_mat, '5level'),
    'A_5Level_por': (df_A_5Level_por, '5level'),
    'B_5Level_por': (df_B_5Level_por, '5level'),
    'C_5Level_por': (df_C_5Level_por, '5level')
}

# Iterate over classification datasets
for name, (df, target) in datasets.items():
    print(f"\nProcessing dataset: {name}")

    # Skip (rather than crash on) a frame that lacks its target column
    if target not in df.columns:
        print(f"Error: '{target}' not found in {name}. Skipping...")
        continue

    # Extract features and target
    y = df[target]
    X = df.drop(columns=[target])

    # Build the column lists from the feature frame; the target was already
    # dropped from X, so no extra filtering is needed.
    categorical_features = X.select_dtypes(include=['object', 'category']).columns.tolist()
    numerical_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

    # One-hot encode the categorical columns and pass the numeric ones through.
    # Columns that are in neither list are dropped, because ColumnTransformer's
    # default is remainder='drop'.
    preprocessor = ColumnTransformer(
        transformers=[
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
            ('num', 'passthrough', numerical_features)
        ])

    # Run optimization
    n_trials = 50
    print(f"Running {n_trials} optimization trials for {name}...")
    # Seed the sampler: the folds and the forest are already seeded with 42, but
    # an unseeded sampler proposes different hyperparameters on every run, so the
    # reported best parameters could not be reproduced after a kernel restart.
    study = optuna.create_study(
        direction='maximize',  # Directly maximize accuracy
        sampler=optuna.samplers.TPESampler(seed=42)
    )
    # TqdmCallback is not defined in this cell; it must already exist in the kernel.
    study.optimize(
        lambda trial: objective3(trial, X, y, preprocessor, kf),
        n_trials=n_trials,
        callbacks=[TqdmCallback(n_trials)]
    )

    # Print best parameters and accuracy
    print(f"\nBest hyperparameters for {name}: {study.best_params}")
    print(f"Best accuracy for {name}: {study.best_value:.4f}")




[I 2025-04-21 07:39:38,775] A new study created in memory with name: no-name-98c46d52-b9bb-4f49-a888-3813757d6a8e



Processing dataset: A_Binary_mat
Running 50 optimization trials for A_Binary_mat...


Optimization:   0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-04-21 07:39:46,191] Trial 0 finished with value: 0.9087820512820514 and parameters: {'n_estimators': 118, 'max_depth': 19, 'min_samples_split': 10, 'min_samples_leaf': 1}. Best is trial 0 with value: 0.9087820512820514.
[I 2025-04-21 07:40:06,731] Trial 1 finished with value: 0.9062179487179488 and parameters: {'n_estimators': 670, 'max_depth': 16, 'min_samples_split': 7, 'min_samples_leaf': 6}. Best is trial 0 with value: 0.9087820512820514.
[I 2025-04-21 07:40:22,257] Trial 2 finished with value: 0.9137820512820513 and parameters: {'n_estimators': 806, 'max_depth': 7, 'min_samples_split': 2, 'min_samples_leaf': 1}. Best is trial 2 with value: 0.9137820512820513.
[I 2025-04-21 07:40:31,839] Trial 3 finished with value: 0.9137820512820513 and parameters: {'n_estimators': 488, 'max_depth': 23, 'min_samples_split': 5, 'min_samples_leaf': 2}. Best is trial 2 with value: 0.9137820512820513.
[I 2025-04-21 07:40:50,878] Trial 4 finished with value: 0.9036538461538463 and parameters: 


Best hyperparameters for A_Binary_mat: {'n_estimators': 786, 'max_depth': 12, 'min_samples_split': 6, 'min_samples_leaf': 1}
Best accuracy for A_Binary_mat: 0.9163

Processing dataset: B_Binary_mat
Running 50 optimization trials for B_Binary_mat...


Optimization:   0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-04-21 07:49:06,034] Trial 0 finished with value: 0.8076923076923077 and parameters: {'n_estimators': 797, 'max_depth': 24, 'min_samples_split': 9, 'min_samples_leaf': 9}. Best is trial 0 with value: 0.8076923076923077.
[I 2025-04-21 07:49:16,516] Trial 1 finished with value: 0.8102564102564103 and parameters: {'n_estimators': 549, 'max_depth': 16, 'min_samples_split': 5, 'min_samples_leaf': 9}. Best is trial 1 with value: 0.8102564102564103.
[I 2025-04-21 07:49:33,251] Trial 2 finished with value: 0.827948717948718 and parameters: {'n_estimators': 871, 'max_depth': 25, 'min_samples_split': 3, 'min_samples_leaf': 3}. Best is trial 2 with value: 0.827948717948718.
[I 2025-04-21 07:49:39,918] Trial 3 finished with value: 0.8251923076923078 and parameters: {'n_estimators': 298, 'max_depth': 25, 'min_samples_split': 4, 'min_samples_leaf': 1}. Best is trial 2 with value: 0.827948717948718.
[I 2025-04-21 07:49:44,856] Trial 4 finished with value: 0.812628205128205 and parameters: {'n_


Best hyperparameters for B_Binary_mat: {'n_estimators': 439, 'max_depth': 24, 'min_samples_split': 10, 'min_samples_leaf': 1}
Best accuracy for B_Binary_mat: 0.8456

Processing dataset: C_Binary_mat
Running 50 optimization trials for C_Binary_mat...


Optimization:   0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-04-21 07:57:17,052] Trial 0 finished with value: 0.7068589743589744 and parameters: {'n_estimators': 769, 'max_depth': 24, 'min_samples_split': 2, 'min_samples_leaf': 8}. Best is trial 0 with value: 0.7068589743589744.
[I 2025-04-21 07:57:27,109] Trial 1 finished with value: 0.7017948717948718 and parameters: {'n_estimators': 496, 'max_depth': 26, 'min_samples_split': 6, 'min_samples_leaf': 4}. Best is trial 0 with value: 0.7068589743589744.
[I 2025-04-21 07:57:30,028] Trial 2 finished with value: 0.6915384615384615 and parameters: {'n_estimators': 147, 'max_depth': 22, 'min_samples_split': 9, 'min_samples_leaf': 1}. Best is trial 0 with value: 0.7068589743589744.
[I 2025-04-21 07:57:41,313] Trial 3 finished with value: 0.7017948717948718 and parameters: {'n_estimators': 594, 'max_depth': 28, 'min_samples_split': 8, 'min_samples_leaf': 9}. Best is trial 0 with value: 0.7068589743589744.
[I 2025-04-21 07:57:44,050] Trial 4 finished with value: 0.6967948717948719 and parameters: 


Best hyperparameters for C_Binary_mat: {'n_estimators': 221, 'max_depth': 24, 'min_samples_split': 8, 'min_samples_leaf': 9}
Best accuracy for C_Binary_mat: 0.7120

Processing dataset: A_Binary_por
Running 50 optimization trials for A_Binary_por...


Optimization:   0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-04-21 08:04:29,881] Trial 0 finished with value: 0.9276442307692309 and parameters: {'n_estimators': 201, 'max_depth': 27, 'min_samples_split': 2, 'min_samples_leaf': 9}. Best is trial 0 with value: 0.9276442307692309.
[I 2025-04-21 08:04:48,423] Trial 1 finished with value: 0.9245673076923078 and parameters: {'n_estimators': 896, 'max_depth': 23, 'min_samples_split': 3, 'min_samples_leaf': 4}. Best is trial 0 with value: 0.9276442307692309.
[I 2025-04-21 08:04:56,046] Trial 2 finished with value: 0.9214663461538463 and parameters: {'n_estimators': 373, 'max_depth': 18, 'min_samples_split': 4, 'min_samples_leaf': 2}. Best is trial 0 with value: 0.9276442307692309.
[I 2025-04-21 08:05:09,709] Trial 3 finished with value: 0.9214903846153847 and parameters: {'n_estimators': 665, 'max_depth': 24, 'min_samples_split': 7, 'min_samples_leaf': 3}. Best is trial 0 with value: 0.9276442307692309.
[I 2025-04-21 08:05:17,585] Trial 4 finished with value: 0.9276442307692309 and parameters: 


Best hyperparameters for A_Binary_por: {'n_estimators': 171, 'max_depth': 30, 'min_samples_split': 9, 'min_samples_leaf': 5}
Best accuracy for A_Binary_por: 0.9338

Processing dataset: B_Binary_por
Running 50 optimization trials for B_Binary_por...


Optimization:   0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-04-21 08:10:12,279] Trial 0 finished with value: 0.8999038461538461 and parameters: {'n_estimators': 328, 'max_depth': 25, 'min_samples_split': 7, 'min_samples_leaf': 3}. Best is trial 0 with value: 0.8999038461538461.
[I 2025-04-21 08:10:22,363] Trial 1 finished with value: 0.8952644230769231 and parameters: {'n_estimators': 478, 'max_depth': 27, 'min_samples_split': 8, 'min_samples_leaf': 5}. Best is trial 0 with value: 0.8999038461538461.
[I 2025-04-21 08:10:37,398] Trial 2 finished with value: 0.8860336538461538 and parameters: {'n_estimators': 762, 'max_depth': 14, 'min_samples_split': 2, 'min_samples_leaf': 8}. Best is trial 0 with value: 0.8999038461538461.
[I 2025-04-21 08:10:41,357] Trial 3 finished with value: 0.8737259615384616 and parameters: {'n_estimators': 173, 'max_depth': 17, 'min_samples_split': 6, 'min_samples_leaf': 9}. Best is trial 0 with value: 0.8999038461538461.
[I 2025-04-21 08:10:44,694] Trial 4 finished with value: 0.8875480769230769 and parameters: 


Best hyperparameters for B_Binary_por: {'n_estimators': 297, 'max_depth': 26, 'min_samples_split': 3, 'min_samples_leaf': 2}
Best accuracy for B_Binary_por: 0.9045

Processing dataset: C_Binary_por
Running 50 optimization trials for C_Binary_por...


Optimization:   0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-04-21 08:19:11,279] Trial 0 finished with value: 0.8429326923076923 and parameters: {'n_estimators': 681, 'max_depth': 11, 'min_samples_split': 8, 'min_samples_leaf': 6}. Best is trial 0 with value: 0.8429326923076923.
[I 2025-04-21 08:19:14,437] Trial 1 finished with value: 0.8506730769230769 and parameters: {'n_estimators': 147, 'max_depth': 23, 'min_samples_split': 4, 'min_samples_leaf': 2}. Best is trial 1 with value: 0.8506730769230769.
[I 2025-04-21 08:19:19,106] Trial 2 finished with value: 0.8460096153846154 and parameters: {'n_estimators': 204, 'max_depth': 23, 'min_samples_split': 3, 'min_samples_leaf': 9}. Best is trial 1 with value: 0.8506730769230769.
[I 2025-04-21 08:19:28,546] Trial 3 finished with value: 0.8444711538461538 and parameters: {'n_estimators': 482, 'max_depth': 20, 'min_samples_split': 2, 'min_samples_leaf': 6}. Best is trial 1 with value: 0.8506730769230769.
[I 2025-04-21 08:19:46,466] Trial 4 finished with value: 0.8444471153846154 and parameters: 


Best hyperparameters for C_Binary_por: {'n_estimators': 611, 'max_depth': 14, 'min_samples_split': 7, 'min_samples_leaf': 1}
Best accuracy for C_Binary_por: 0.8584

Processing dataset: A_5Level_mat
Running 50 optimization trials for A_5Level_mat...


Optimization:   0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-04-21 08:26:19,509] Trial 0 finished with value: 0.6529487179487179 and parameters: {'n_estimators': 371, 'max_depth': 27, 'min_samples_split': 10, 'min_samples_leaf': 8}. Best is trial 0 with value: 0.6529487179487179.
[I 2025-04-21 08:26:28,492] Trial 1 finished with value: 0.6809615384615385 and parameters: {'n_estimators': 414, 'max_depth': 14, 'min_samples_split': 4, 'min_samples_leaf': 2}. Best is trial 1 with value: 0.6809615384615385.
[I 2025-04-21 08:26:33,168] Trial 2 finished with value: 0.6756410256410257 and parameters: {'n_estimators': 229, 'max_depth': 22, 'min_samples_split': 6, 'min_samples_leaf': 2}. Best is trial 1 with value: 0.6809615384615385.
[I 2025-04-21 08:26:35,795] Trial 3 finished with value: 0.6377564102564103 and parameters: {'n_estimators': 139, 'max_depth': 16, 'min_samples_split': 5, 'min_samples_leaf': 9}. Best is trial 1 with value: 0.6809615384615385.
[I 2025-04-21 08:26:54,503] Trial 4 finished with value: 0.6478846153846154 and parameters:


Best hyperparameters for A_5Level_mat: {'n_estimators': 835, 'max_depth': 25, 'min_samples_split': 7, 'min_samples_leaf': 2}
Best accuracy for A_5Level_mat: 0.7012

Processing dataset: B_5Level_mat
Running 50 optimization trials for B_5Level_mat...


Optimization:   0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-04-21 08:38:07,785] Trial 0 finished with value: 0.47570512820512817 and parameters: {'n_estimators': 568, 'max_depth': 22, 'min_samples_split': 2, 'min_samples_leaf': 8}. Best is trial 0 with value: 0.47570512820512817.
[I 2025-04-21 08:38:17,967] Trial 1 finished with value: 0.49602564102564106 and parameters: {'n_estimators': 494, 'max_depth': 10, 'min_samples_split': 10, 'min_samples_leaf': 1}. Best is trial 1 with value: 0.49602564102564106.
[I 2025-04-21 08:38:27,836] Trial 2 finished with value: 0.4832692307692309 and parameters: {'n_estimators': 501, 'max_depth': 7, 'min_samples_split': 4, 'min_samples_leaf': 4}. Best is trial 1 with value: 0.49602564102564106.
[I 2025-04-21 08:38:35,355] Trial 3 finished with value: 0.486025641025641 and parameters: {'n_estimators': 389, 'max_depth': 17, 'min_samples_split': 7, 'min_samples_leaf': 4}. Best is trial 1 with value: 0.49602564102564106.
[I 2025-04-21 08:38:48,379] Trial 4 finished with value: 0.48589743589743595 and parame


Best hyperparameters for B_5Level_mat: {'n_estimators': 451, 'max_depth': 20, 'min_samples_split': 5, 'min_samples_leaf': 1}
Best accuracy for B_5Level_mat: 0.5112

Processing dataset: C_5Level_mat
Running 50 optimization trials for C_5Level_mat...


Optimization:   0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-04-21 08:45:36,583] Trial 0 finished with value: 0.32897435897435895 and parameters: {'n_estimators': 730, 'max_depth': 21, 'min_samples_split': 2, 'min_samples_leaf': 7}. Best is trial 0 with value: 0.32897435897435895.
[I 2025-04-21 08:45:51,853] Trial 1 finished with value: 0.33416666666666667 and parameters: {'n_estimators': 800, 'max_depth': 7, 'min_samples_split': 7, 'min_samples_leaf': 8}. Best is trial 1 with value: 0.33416666666666667.
[I 2025-04-21 08:45:58,396] Trial 2 finished with value: 0.33147435897435895 and parameters: {'n_estimators': 351, 'max_depth': 30, 'min_samples_split': 5, 'min_samples_leaf': 5}. Best is trial 1 with value: 0.33416666666666667.
[I 2025-04-21 08:46:06,077] Trial 3 finished with value: 0.3241025641025641 and parameters: {'n_estimators': 378, 'max_depth': 23, 'min_samples_split': 5, 'min_samples_leaf': 9}. Best is trial 1 with value: 0.33416666666666667.
[I 2025-04-21 08:46:09,397] Trial 4 finished with value: 0.33134615384615385 and param


Best hyperparameters for C_5Level_mat: {'n_estimators': 217, 'max_depth': 9, 'min_samples_split': 7, 'min_samples_leaf': 4}
Best accuracy for C_5Level_mat: 0.3519

Processing dataset: A_5Level_por
Running 50 optimization trials for A_5Level_por...


Optimization:   0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-04-21 08:51:44,469] Trial 0 finished with value: 0.7287980769230769 and parameters: {'n_estimators': 693, 'max_depth': 26, 'min_samples_split': 9, 'min_samples_leaf': 5}. Best is trial 0 with value: 0.7287980769230769.
[I 2025-04-21 08:51:54,950] Trial 1 finished with value: 0.7242307692307692 and parameters: {'n_estimators': 511, 'max_depth': 14, 'min_samples_split': 2, 'min_samples_leaf': 9}. Best is trial 0 with value: 0.7287980769230769.
[I 2025-04-21 08:52:08,199] Trial 2 finished with value: 0.7441826923076923 and parameters: {'n_estimators': 519, 'max_depth': 19, 'min_samples_split': 3, 'min_samples_leaf': 1}. Best is trial 2 with value: 0.7441826923076923.
[I 2025-04-21 08:52:15,314] Trial 3 finished with value: 0.7242307692307692 and parameters: {'n_estimators': 353, 'max_depth': 13, 'min_samples_split': 9, 'min_samples_leaf': 9}. Best is trial 2 with value: 0.7441826923076923.
[I 2025-04-21 08:52:33,518] Trial 4 finished with value: 0.733389423076923 and parameters: {


Best hyperparameters for A_5Level_por: {'n_estimators': 337, 'max_depth': 12, 'min_samples_split': 2, 'min_samples_leaf': 1}
Best accuracy for A_5Level_por: 0.7457

Processing dataset: B_5Level_por
Running 50 optimization trials for B_5Level_por...


Optimization:   0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-04-21 09:01:24,583] Trial 0 finished with value: 0.5548076923076923 and parameters: {'n_estimators': 585, 'max_depth': 27, 'min_samples_split': 6, 'min_samples_leaf': 5}. Best is trial 0 with value: 0.5548076923076923.
[I 2025-04-21 09:01:35,362] Trial 1 finished with value: 0.5533173076923077 and parameters: {'n_estimators': 506, 'max_depth': 12, 'min_samples_split': 3, 'min_samples_leaf': 8}. Best is trial 0 with value: 0.5548076923076923.
[I 2025-04-21 09:01:47,258] Trial 2 finished with value: 0.5593990384615384 and parameters: {'n_estimators': 559, 'max_depth': 26, 'min_samples_split': 9, 'min_samples_leaf': 6}. Best is trial 2 with value: 0.5593990384615384.
[I 2025-04-21 09:02:07,882] Trial 3 finished with value: 0.5610096153846154 and parameters: {'n_estimators': 975, 'max_depth': 10, 'min_samples_split': 7, 'min_samples_leaf': 7}. Best is trial 3 with value: 0.5610096153846154.
[I 2025-04-21 09:02:29,121] Trial 4 finished with value: 0.5563701923076924 and parameters: 


Best hyperparameters for B_5Level_por: {'n_estimators': 766, 'max_depth': 15, 'min_samples_split': 10, 'min_samples_leaf': 1}
Best accuracy for B_5Level_por: 0.5825

Processing dataset: C_5Level_por
Running 50 optimization trials for C_5Level_por...


Optimization:   0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-04-21 09:14:52,721] Trial 0 finished with value: 0.37305288461538466 and parameters: {'n_estimators': 980, 'max_depth': 15, 'min_samples_split': 6, 'min_samples_leaf': 3}. Best is trial 0 with value: 0.37305288461538466.
[I 2025-04-21 09:14:59,919] Trial 1 finished with value: 0.37762019230769234 and parameters: {'n_estimators': 298, 'max_depth': 16, 'min_samples_split': 4, 'min_samples_leaf': 1}. Best is trial 1 with value: 0.37762019230769234.
[I 2025-04-21 09:15:12,173] Trial 2 finished with value: 0.3792067307692307 and parameters: {'n_estimators': 549, 'max_depth': 28, 'min_samples_split': 2, 'min_samples_leaf': 5}. Best is trial 2 with value: 0.3792067307692307.
[I 2025-04-21 09:15:19,753] Trial 3 finished with value: 0.3762019230769231 and parameters: {'n_estimators': 333, 'max_depth': 28, 'min_samples_split': 9, 'min_samples_leaf': 9}. Best is trial 2 with value: 0.3792067307692307.
[I 2025-04-21 09:15:22,150] Trial 4 finished with value: 0.34838942307692305 and paramet


Best hyperparameters for C_5Level_por: {'n_estimators': 545, 'max_depth': 21, 'min_samples_split': 9, 'min_samples_leaf': 6}
Best accuracy for C_5Level_por: 0.3854
