In [27]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import sklearn
import warnings

# Left disabled on purpose: when uncommented this is intended to install an
# "ignore" filter for UserWarning / FutureWarning for the whole notebook.
# warnings.simplefilter("ignore", UserWarning | FutureWarning)
# Global scikit-learn setting: transformers return pandas DataFrames from
# `transform` / `fit_transform`, so column names survive preprocessing.
sklearn.set_config(transform_output="pandas")

from sklearn.datasets import fetch_openml
from sklearn.preprocessing import (
    KBinsDiscretizer,
    QuantileTransformer,
    OneHotEncoder,
    MinMaxScaler,
    StandardScaler,
    OrdinalEncoder,
)
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score

from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import HistGradientBoostingClassifier, RandomForestClassifier
from catboost import CatBoostClassifier

In [None]:
# OpenML dataset names this experiment is meant to be run on; change
# `dataset` below to switch between them.
DATASETS: list[str] = [
    "covertype",
    "adult",
    "higgs",
    "heloc",
]

dataset = "covertype"

# One seed shared by every stochastic preprocessing / splitting step so the
# cell gives the same splits and the same transformed features on each run.
SEED = 0

df: pd.DataFrame
y: pd.Series

# --- Load data (downloads from OpenML on first use)
df, y = fetch_openml(name=dataset, version=1, return_X_y=True, as_frame=True)
# Categorical targets are replaced by their integer category codes; any other
# target dtype is passed through unchanged.
y = y.cat.codes if isinstance(y.dtype, pd.CategoricalDtype) else y

# Columns are routed by dtype: `category` columns are treated as categorical,
# everything else as numeric.
cat_features = df.select_dtypes(include="category").columns
num_features = df.select_dtypes(exclude="category").columns


# --- Simple / automatic preprocessing
num_transformer = make_pipeline(
    # Missing numeric values are filled with the constant 0.
    SimpleImputer(strategy="constant", fill_value=0),
    # QuantileTransformer draws a random subsample of rows to estimate the
    # quantiles when the data exceeds its `subsample` limit, so it must be
    # seeded for the transformed features to be reproducible.
    QuantileTransformer(output_distribution="normal", random_state=SEED),
    # KBinsDiscretizer(n_bins=10, encode="ordinal", strategy="kmeans", random_state=0),
    # MinMaxScaler(),
)
cat_transformer = make_pipeline(
    # Categories not seen during fit are encoded as -1 instead of raising.
    OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
)
col_transformer = ColumnTransformer(
    [
        ("num", num_transformer, num_features),
        ("cat", cat_transformer, cat_features),
    ],
    # Keep the original column names (no "num__" / "cat__" prefixes).
    verbose_feature_names_out=False,
)

# --- Train-test split
# First hold out 10% as the test set, then 10% of the remainder as the
# validation set; both splits are stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    df,
    y,
    test_size=0.1,
    stratify=y,
    random_state=SEED,
)
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train,
    y_train,
    test_size=0.1,
    stratify=y_train,
    random_state=SEED,
)
# Sanity check: the three splits partition the full dataset.
assert len(X_train) + len(X_valid) + len(X_test) == len(df)

# The preprocessing is fitted on the training split only and then applied to
# the validation / test splits, so no held-out statistics leak into training.
X_train = col_transformer.fit_transform(X_train)
X_valid = col_transformer.transform(X_valid)
X_test = col_transformer.transform(X_test)

# --- Clf training
# NOTE(review): the ordinal-encoded categorical columns are handed to CatBoost
# as ordinary numeric columns (no `cat_features` is passed) - confirm this is
# intended.
# NOTE(review): `od_wait` is set without `od_type` / `od_pval`; confirm the
# overfitting detector is actually active with this configuration.
cat_params = {
    "iterations": 5000,
    "learning_rate": 0.277,
    "od_wait": 1000,
    "max_depth": 7,
    "task_type": "GPU",  # requires a GPU-enabled CatBoost setup
    "l2_leaf_reg": 1.037,
    "eval_metric": "Accuracy",
    "devices": "0",  # GPU device index
    "verbose": 1000,  # log every 1000 iterations
}
clf = CatBoostClassifier(**cat_params)
# The validation split is used as the evaluation set during training.
clf.fit(X_train, y_train, eval_set=(X_valid, y_valid))
# Test accuracy, with arguments in the documented (y_true, y_pred) order.
accuracy_score(y_test, clf.predict(X_test))

0:	learn: 0.6544617	test: 0.6507649	best: 0.6507649 (0)	total: 10.2ms	remaining: 50.8s
1000:	learn: 0.9399443	test: 0.8660427	best: 0.8660427 (999)	total: 10.4s	remaining: 41.5s
2000:	learn: 0.9767270	test: 0.8699678	best: 0.8699678 (1732)	total: 20.6s	remaining: 30.8s
3000:	learn: 0.9840187	test: 0.8717794	best: 0.8721820 (2996)	total: 30.6s	remaining: 20.4s
4000:	learn: 0.9859199	test: 0.8701691	best: 0.8722826 (3080)	total: 40.4s	remaining: 10.1s
bestTest = 0.8722826087
bestIteration = 3080
Shrink model to first 3081 iterations.


0.8697463768115942