In [23]:
import numpy as np
import pandas as pd

from sklearn.datasets import fetch_openml
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import QuantileTransformer, OneHotEncoder, KBinsDiscretizer, MinMaxScaler, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from feature_engine.discretisation import EqualWidthDiscretiser, EqualFrequencyDiscretiser

from mlp import MLPClassifier
from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier, ExtraTreesClassifier

In [2]:
# Lookup table from a short dataset name to an integer ID. The ID of the
# selected entry is what gets passed as `data_id` to `fetch_openml` when the
# data is loaded.
datasets = {
    "electricity": 44120,
    "covertype": 44121,
    "pol": 44122,
    "house_16H": 44123,
    "kdd_ipums_la_97-small": 44124,
    "MagicTelescope": 44125,
    "bank-marketing": 44126,
    "phoneme": 44127,
    "MiniBooNE": 44128,
    "Higgs": 44129,
    "eye_movements": 44130,
    "jannis": 44131,
    "credit": 44089,
    "california": 44090,
    "wine": 44091,
    "compass": 44162,
    "rl": 44160,
    "road-safety": 44161,
}

In [146]:
# Name of the dataset to load; must be one of the keys of `datasets`.
key = "phoneme"

# With return_X_y=True and as_frame=True, fetch_openml hands back the features
# as a DataFrame and a single-column target as a Series (not a DataFrame).
X: pd.DataFrame
y: pd.Series
X, y = fetch_openml(
    data_id=datasets[key],
    return_X_y=True,
    as_frame=True,
)

# Split the column names by dtype: columns with the pandas "category" dtype
# are treated as categorical, every other column as numerical.
cat_features = X.select_dtypes(include="category").columns
num_features = X.select_dtypes(exclude="category").columns

# Quick sanity summary: which dataset, its (rows, columns) shape, and the
# total number of missing cells across the whole feature table.
print()
print(f"dataset = {key} | size = {X.shape} | #NaN = {X.isna().sum().sum()}")


datset = phoneme | size = (3172, 5) | #NaN = 0


In [147]:
# Display the feature table as the cell output (long frames are shown
# truncated, with an ellipsis row standing in for the omitted rows).
X

Unnamed: 0,V1,V2,V3,V4,V5
0,0.553823,-0.189665,-1.668229,-1.249093,-0.685406
1,-0.660158,-0.141079,2.535399,-1.544766,-0.136583
2,-0.757811,-0.176658,1.016982,-1.107165,-0.486269
3,-0.755110,-0.160173,0.880969,-1.090817,-0.491231
4,-0.565511,0.453241,0.829977,0.300258,-0.136583
...,...,...,...,...,...
3167,-0.612399,-0.179619,0.029014,2.340936,-1.220216
3168,-0.193525,-0.232036,1.681170,1.439754,-0.833708
3169,-0.044375,-0.010512,0.030989,-0.019379,1.281061
3170,0.246882,-0.793228,1.190101,1.423194,-1.303036


In [148]:
# Hold out 30% of the rows for testing, stratified on the target so both
# partitions keep the same class proportions. The split is seeded: every
# estimator below already uses random_state=0, and without a seed here the
# printed accuracies would still change on every Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    shuffle=True,
    random_state=0,
)

# One-hot encode the categorical columns (dense output, first level of each
# feature dropped) and pass every other column through unchanged.
# verbose_feature_names_out=False keeps the output column names free of the
# "cat__" / "remainder__" transformer prefixes.
cat_transformer = ColumnTransformer(
    [("cat", OneHotEncoder(sparse_output=False, drop="first"), cat_features)],
    remainder="passthrough",
    verbose_feature_names_out=False,
)
# Return DataFrames instead of NumPy arrays so column names are preserved.
cat_transformer.set_output(transform="pandas")

# Learn the encoding from the training partition only, then apply the same
# fitted encoding to the test partition.
X_train = cat_transformer.fit_transform(X_train)
X_test = cat_transformer.transform(X_test)

In [149]:
# Linear baseline: quantile-transform the features to a normal output
# distribution, then fit a cross-validated logistic regression. Pipeline.fit
# returns the fitted pipeline itself, so `clf` is the fitted estimator.
clf = make_pipeline(
    QuantileTransformer(random_state=0, output_distribution="normal"),
    LogisticRegressionCV(max_iter=1_000, random_state=0),
).fit(X_train, y_train)

# Score on the held-out partition and report it as a percentage.
test_accuracy = accuracy_score(y_test, clf.predict(X_test))
print(f"{test_accuracy:.2%}")

74.58%


In [150]:
# Tree-ensemble baseline: a seeded random forest with otherwise default
# settings, trained on the encoded training partition. `fit` returns the
# fitted estimator itself.
clf = RandomForestClassifier(random_state=0).fit(X_train, y_train)

# Held-out accuracy, printed as a percentage.
test_accuracy = accuracy_score(y_test, clf.predict(X_test))
print(f"{test_accuracy:.2%}")

88.55%


In [151]:
# Boosting baseline: a seeded histogram-based gradient boosting classifier
# with otherwise default settings. `fit` returns the fitted estimator itself.
clf = HistGradientBoostingClassifier(random_state=0).fit(X_train, y_train)

# Held-out accuracy, printed as a percentage.
test_accuracy = accuracy_score(y_test, clf.predict(X_test))
print(f"{test_accuracy:.2%}")

87.92%


In [None]:
# Network dimensions are derived from the data: one output per distinct
# target value and one input per column of the encoded training table.
n_classes = np.unique(y).size
in_features = X_train.shape[1]

# MLPClassifier comes from the project-local `mlp` module; the comments below
# only restate what is passed in.
# NOTE(review): early_stopping is disabled while validation_fraction and
# patience are still set -- confirm how mlp.MLPClassifier uses them in that case.
mlp = MLPClassifier(
    in_features=in_features,
    out_features=n_classes,
    hidden_layer_size=in_features * 2,  # hidden width is twice the input width
    num_hidden_layers=2,
    dropout_rate=0.4,
    learning_rate=1e-3,
    max_iter=500,
    batch_size=512,
    validation_fraction=0.3,
    early_stopping=False,
    patience=50,
    random_state=0,
    verbose=True,
)

# Standardise the features before they reach the network.
clf = make_pipeline(StandardScaler(), mlp)
clf.fit(X_train, y_train)

# Held-out accuracy, printed as a percentage.
test_accuracy = accuracy_score(y_test, clf.predict(X_test))
print(f"{test_accuracy:.2%}")

Iter 500/500 | Train acc. 66.80% | Valid acc. 77.93% | Best  acc.  78.68%: 100%|██████████| 500/500 [00:03<00:00, 148.42it/s]

80.88%



