In [None]:
import optuna
import pandas as pd

from typing import Literal
from warnings import simplefilter

from sklearn.datasets import fetch_openml
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, StratifiedKFold

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import QuantileTransformer

from qmlp import QMLPClassifier
from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier


# Install a global "ignore" filter for UserWarning so that warnings of that
# category do not clutter the cell outputs below.
simplefilter(action="ignore", category=UserWarning)

In [None]:
# Short dataset name -> OpenML data id (the value handed to
# `fetch_openml(data_id=...)`). Insertion order is kept as listed.
datasets = dict(
    electricity=44120,
    covertype=44121,
    pol=44122,
    house_16H=44123,
    kdd=44124,
    MagicTelescope=44125,
    bank_marketing=44126,
    phoneme=44127,
    miniboone=44128,
    higgs=44129,
    eye_movements=44130,
    jannis=44131,
    credit=44089,
    california=44090,
    wine=44091,
)

In [None]:
def objective(
    trial: optuna.Trial,
    clf_name: Literal["gbdt", "bmlp", "rmlp"],
    X_train: pd.DataFrame,
    y_train: pd.DataFrame,
    X_valid: pd.DataFrame,
    y_valid: pd.DataFrame,
) -> float:
    match clf_name:
        case "gbdt":
            params = {
                "learning_rate": trial.suggest_float("learning_rate", 1e-2, 10, log=True),
                "max_depth": trial.suggest_categorical("max_depth", [None, 2, 3, 4]),
                "min_samples_leaf": trial.suggest_categorical("min_samples_leaf", [20, 2]),
                "max_leaf_nodes": trial.suggest_categorical("max_leaf_nodes", [31, 5]),
            }
            clf = HistGradientBoostingClassifier(**params)

        case "bmlp":
            params = {
                "dim_emb": trial.suggest_categorical("dim_emb", [4, 8, 16, 24]),
                "dim_hid": trial.suggest_categorical("dim_hid", [64, 128, 256, 512]),
                "num_hid": trial.suggest_int("num_hid", 1, 4),
                "num_bins": trial.suggest_categorical("num_bins", [4, 8, 16, 32, 64, 128]),
                "dropout": trial.suggest_float("dropout", 0, 0.6),
                "lr": trial.suggest_float("lr", 3e-4, 3e-3, log=True),
                "batch_size": trial.suggest_categorical("batch_size", [128, 256, 512]),
                "patience": trial.suggest_categorical("patience", [50]),
                "max_iter": trial.suggest_categorical("max_iter", [200]),
                "valid_frac": trial.suggest_categorical("valid_frac", [0.2]),
            }
            clf = BinnedMLPClassifier(**params)

        case "rmlp":
            params = {
                "dim_hid": trial.suggest_categorical("dim_hid", [64, 128, 256, 512]),
                "num_hid": trial.suggest_int("num_hid", 1, 4),
                "dropout": trial.suggest_float("dropout", 0, 0.6),
                "lr": trial.suggest_float("lr", 3e-4, 3e-3, log=True),
                "batch_size": trial.suggest_categorical("batch_size", [128, 256, 512]),
                "patience": trial.suggest_categorical("patience", [50]),
                "max_iter": trial.suggest_categorical("max_iter", [200]),
                "valid_frac": trial.suggest_categorical("valid_frac", [0.2]),
            }
            clf = make_pipeline(
                QuantileTransformer(output_distribution="normal"),
                RawMLPClassifier(**params),
            )

        case _:
            raise ValueError(f"Unrecognised {clf_name=}")

    return accuracy_score(y_valid, clf.fit(X_train, y_train).predict(X_valid))

In [None]:
# N_FOLDS = 30
# N_TRIALS = 30
# VALID_SIZE = 0.3

# for data_key, data_id in datasets.items():
#     X: pd.DataFrame
#     y: pd.DataFrame
#     X, y = fetch_openml(data_id=data_id, return_X_y=True, as_frame=True)
#     print(f"\ndataset = {data_key} | size = {X.shape} | #NaN = {X.isna().sum().sum()}")

#     kfold = StratifiedKFold(n_splits=N_FOLDS, shuffle=True)
#     for fold_idx, (train_idx, test_idx) in enumerate(kfold.split(X, y)):
#         print(f"Fold {fold_idx}:")

#         X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
#         X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]
#         X_test, X_valid, y_test, y_valid = train_test_split(
#             X_test,
#             y_test,
#             test_size=VALID_SIZE,
#             stratify=y_test,
#             shuffle=True,
#         )

#         # --- Compute baselines using default params
#         clf_lr_default = make_pipeline(
#             QuantileTransformer(output_distribution="normal"),
#             LogisticRegressionCV(random_state=0, max_iter=1_000),
#         )
#         clf_rf_default = RandomForestClassifier()
#         clf_gbdt_default = HistGradientBoostingClassifier()

#         clf_lr_default.fit(X_train, y_train)
#         clf_rf_default.fit(X_train, y_train)
#         clf_gbdt_default.fit(X_train, y_train)

#         acc_lr_default = accuracy_score(y_test, clf_lr_default.predict(X_test))
#         acc_rf_default = accuracy_score(y_test, clf_rf_default.predict(X_test))
#         acc_gbdt_default = accuracy_score(y_test, clf_gbdt_default.predict(X_test))

#         print(f"LRCV (Default) TEST ACC = {acc_lr_default:.2%}")
#         print(f"RF   (Default) TEST ACC = {acc_rf_default:.2%}")
#         print(f"GBDT (Default) TEST ACC = {acc_gbdt_default:.2%}")

#         # --- Tune hyperparams
#         optuna.logging.set_verbosity(optuna.logging.WARNING)

#         study_gbdt = optuna.create_study(study_name="gbdt", direction="maximize")
#         study_bmlp = optuna.create_study(study_name="bmlp", direction="maximize")
#         study_rmlp = optuna.create_study(study_name="rmlp", direction="maximize")

#         study_gbdt.optimize(
#             lambda trial: objective(trial, "gbdt", X_train, y_train, X_valid, y_valid),
#             N_TRIALS,
#             show_progress_bar=True,
#         )
#         study_bmlp.optimize(
#             lambda trial: objective(trial, "bmlp", X_train, y_train, X_valid, y_valid),
#             N_TRIALS,
#             show_progress_bar=True,
#         )
#         study_rmlp.optimize(
#             lambda trial: objective(trial, "rmlp", X_train, y_train, X_valid, y_valid),
#             N_TRIALS,
#             show_progress_bar=True,
#         )

#         clf_gbdt = HistGradientBoostingClassifier(**study_gbdt.best_params)
#         clf_bmlp = BinnedMLPClassifier(**study_bmlp.best_params)
#         clf_rmlp = make_pipeline(
#             QuantileTransformer(output_distribution="normal"),
#             RawMLPClassifier(**study_rmlp.best_params),
#         )

#         clf_gbdt.fit(X_train, y_train)
#         clf_bmlp.fit(X_train, y_train)
#         clf_rmlp.fit(X_train, y_train)

#         acc_gbdt = accuracy_score(y_test, clf_gbdt.predict(X_test))
#         acc_bmlp = accuracy_score(y_test, clf_bmlp.predict(X_test))
#         acc_rmlp = accuracy_score(y_test, clf_rmlp.predict(X_test))

#         print(f"GBDT (Tuned) TEST ACC. {acc_gbdt:.2%}")
#         print(f"BMLP (Tuned) TEST ACC. {acc_bmlp:.2%}")
#         print(f"RMLP (Tuned) TEST ACC. {acc_rmlp:.2%}")
#         print("-" * 60)

#     print("=" * 60)
#     print()

In [None]:
key = "phoneme"

# Fetch the selected dataset from OpenML as pandas objects. With a single
# target column `fetch_openml(..., as_frame=True)` hands back the target as a
# Series rather than a DataFrame.
X: pd.DataFrame
y: pd.Series
X, y = fetch_openml(
    data_id=datasets[key],
    return_X_y=True,
    as_frame=True,
)
print(f"dataset = {key} | size = {X.shape} | #NaN = {X.isna().sum().sum()}")

# Split columns by dtype; this comparison assumes there are no
# object/category columns, so fail loudly (and informatively) otherwise.
cat_features = X.select_dtypes(include=["object", "category"]).columns
num_features = X.select_dtypes(exclude=["object", "category"]).columns
assert len(cat_features) == 0, (
    f"dataset {key!r} has object/category columns {list(cat_features)}; "
    "this comparison expects numerical features only"
)

# Label -> zero-argument factory. A fresh, unfitted estimator is built right
# before each fit so no state is shared between models or folds. Every
# estimator is seeded with random_state=0.
model_factories = {
    "LRCV": lambda: make_pipeline(
        QuantileTransformer(output_distribution="normal", random_state=0),
        LogisticRegressionCV(max_iter=1_000, random_state=0),
    ),
    "RF": lambda: RandomForestClassifier(random_state=0),
    "HGBC": lambda: HistGradientBoostingClassifier(random_state=0),
    "QMLP": lambda: QMLPClassifier(
        use_quantile_embedding=True, random_state=0, verbose=True
    ),
    "RMLP": lambda: make_pipeline(
        QuantileTransformer(output_distribution="normal", random_state=0),
        QMLPClassifier(use_quantile_embedding=False, random_state=0, verbose=True),
    ),
}

# Seeded, shuffled, stratified 5-fold cross-validation.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
for i, (train_idx, test_idx) in enumerate(kfold.split(X, y)):
    print(f"Fold {i}:")
    print(len(train_idx), len(test_idx))

    X_train = X.iloc[train_idx]
    y_train = y.iloc[train_idx]
    X_test = X.iloc[test_idx]
    y_test = y.iloc[test_idx]

    for label, make_clf in model_factories.items():
        clf = make_clf()
        clf.fit(X_train, y_train)
        # Labels are left-padded to 4 characters so the scores line up.
        print(f"{label:<4} {accuracy_score(y_test, clf.predict(X_test)):.2%}")
    print("-" * 60)