In [5]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.compose import make_column_transformer, make_column_selector, ColumnTransformer 
from sklearn.pipeline import Pipeline 
from sklearn.impute import SimpleImputer 
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler 
from sklearn import set_config
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold, cross_val_score, ParameterSampler
from sklearn.metrics import roc_auc_score, make_scorer
import openml
import json
import importlib
from collections import defaultdict
from tqdm import tqdm
from scipy.stats import loguniform, randint, uniform

In [19]:
# Hyperparameter search spaces for every model family in the benchmark.
#
# Each entry holds:
#   "name"         - short label for the family,
#   "class"        - dotted import path of the estimator class (resolved lazily
#                    with importlib, so the module only needs to be installed),
#   "search_space" - mapping accepted by sklearn's ParameterSampler: lists are
#                    sampled uniformly, scipy frozen distributions via .rvs(),
#   "n_iter"       - how many configurations to draw for this family.
#
# scipy conventions used below: randint(low, high) excludes `high`, and
# uniform(loc, scale) covers [loc, loc + scale].
MODEL_SEARCH_SPACE = [
    # ===== LINEAR / HIGH-DIM =====
    {
        "name": "logreg",
        "class": "sklearn.linear_model.LogisticRegression",
        "search_space": {
            "C": loguniform(1e-4, 1e2),
            "penalty": ["l1", "l2"],
            # liblinear is kept fixed because it supports both penalties above.
            "solver": ["liblinear"],
            "max_iter": [2000]
        },
        "n_iter": 20
    },
    {
        "name": "linear_svm",
        "class": "sklearn.svm.LinearSVC",
        "search_space": {
            "C": loguniform(1e-4, 1e2)
        },
        "n_iter": 20
    },

    # ===== INSTANCE-BASED =====
    {
        "name": "knn",
        "class": "sklearn.neighbors.KNeighborsClassifier",
        "search_space": {
            "n_neighbors": randint(3, 50),
            "weights": ["uniform", "distance"],
            # Minkowski power: 1 = Manhattan, 2 = Euclidean.
            "p": [1, 2]
        },
        "n_iter": 20
    },

    # ===== TREE ENSEMBLES =====
    {
        "name": "random_forest",
        "class": "sklearn.ensemble.RandomForestClassifier",
        "search_space": {
            "n_estimators": randint(100, 800),
            # None means unlimited depth.
            "max_depth": [None] + list(range(3, 20)),
            "min_samples_leaf": randint(1, 20),
            # None means "consider every feature at each split".
            "max_features": ["sqrt", "log2", None]
        },
        "n_iter": 20
    },
    {
        "name": "extra_trees",
        "class": "sklearn.ensemble.ExtraTreesClassifier",
        "search_space": {
            "n_estimators": randint(200, 800),
            "max_depth": [None] + list(range(3, 20)),
            "min_samples_leaf": randint(1, 20),
            "max_features": ["sqrt", "log2", None]
        },
        "n_iter": 20
    },

    # ===== GRADIENT BOOSTING =====
    {
        "name": "lightgbm",
        "class": "lightgbm.LGBMClassifier",
        "search_space": {
            "n_estimators": randint(100, 800),
            "num_leaves": randint(16, 256),
            "learning_rate": loguniform(1e-3, 0.3),
            "min_child_samples": randint(5, 100),
            # Row subsampling fraction in [0.6, 1.0].
            "subsample": uniform(0.6, 0.4),
            # LightGBM only applies `subsample` (bagging_fraction) when
            # `subsample_freq` (bagging_freq) is non-zero; its default of 0
            # would silently turn the sampled `subsample` into a no-op.
            "subsample_freq": [1],
            # Column subsampling fraction in [0.6, 1.0].
            "colsample_bytree": uniform(0.6, 0.4)
        },
        "n_iter": 20
    }
]


In [20]:
def prepare_model_configs(global_seed=42):
    """Expand MODEL_SEARCH_SPACE into a flat list of concrete configurations.

    For every model family, ``n_iter`` hyperparameter settings are drawn with
    sklearn's ``ParameterSampler``. The same ``global_seed`` is passed to the
    sampler of every family.

    Args:
        global_seed: ``random_state`` handed to each ``ParameterSampler``.

    Returns:
        A list of dicts with keys ``"name"``, ``"class"`` and ``"params"``,
        ordered by family (as listed in MODEL_SEARCH_SPACE) and then by
        sampling order.
    """
    return [
        {
            "name": family["name"],
            "class": family["class"],
            "params": drawn,
        }
        for family in MODEL_SEARCH_SPACE
        for drawn in ParameterSampler(
            family["search_space"],
            n_iter=family["n_iter"],
            random_state=global_seed,
        )
    ]


In [21]:
def load_openml_datasets(max_datasets=20):
    """Download the first ``max_datasets`` datasets of OpenML suite 99.

    Args:
        max_datasets: Number of dataset ids taken from the start of the
            suite's ``data`` list.

    Returns:
        A list of ``(dataset_name, X, y)`` tuples, where ``X`` and ``y`` are
        what ``dataset.get_data`` returns in "dataframe" format when the
        dataset's default target attribute is used as target.
    """
    cc18 = openml.study.get_suite(99)  # OpenML-CC18

    def _fetch(dataset_id):
        # get_data returns a 4-tuple; only features and target are kept.
        ds = openml.datasets.get_dataset(dataset_id)
        features, target, _, _ = ds.get_data(
            dataset_format="dataframe",
            target=ds.default_target_attribute,
        )
        return (ds.name, features, target)

    return [_fetch(dataset_id) for dataset_id in cc18.data[:max_datasets]]


In [23]:
def evaluate_model(model_class_path, params, X, y, seed=42):
    """Cross-validate one model configuration on one dataset.

    The estimator is imported from its dotted path, wrapped in a preprocessing
    pipeline (imputation + scaling for numeric columns, imputation + one-hot
    encoding for categorical/object columns) and scored with 5-fold stratified
    cross-validation using balanced accuracy.

    Args:
        model_class_path: Dotted path of the estimator class, e.g.
            ``"sklearn.svm.LinearSVC"``.
        params: Keyword arguments passed to the estimator constructor.
        X: Feature table; column dtypes decide which preprocessing branch a
            column goes through.
        y: Target labels.
        seed: Seed for the CV shuffling and, when the estimator exposes a
            ``random_state`` parameter that ``params`` does not set, for the
            estimator itself.

    Returns:
        Mean balanced accuracy over the 5 folds as a Python float.
        NOTE(review): with cross_val_score's default ``error_score`` a fold
        whose fit fails is expected to yield NaN, which makes this mean NaN.
    """
    module_name, class_name = model_class_path.rsplit(".", 1)
    cls = getattr(importlib.import_module(module_name), class_name)

    model = cls(**params)

    # Without this, stochastic estimators (forests, boosting, liblinear-based
    # models) would run unseeded and the benchmark scores would change from
    # run to run even though `seed` fixes the CV splits. An explicit
    # random_state in `params` always wins.
    if "random_state" not in params and "random_state" in model.get_params(deep=False):
        model.set_params(random_state=seed)

    # Numeric columns: fill gaps with SimpleImputer's default strategy, then
    # rescale with MinMaxScaler.
    num_pipeline = Pipeline(steps=[
        ('impute', SimpleImputer()),
        ('scale', MinMaxScaler())
        ])

    # Categorical columns: fill gaps with the most frequent value, then
    # one-hot encode. NOTE(review): combining drop='first' with
    # handle_unknown='ignore' presumably encodes unseen categories the same
    # way as the dropped one (all zeros) - confirm this is intended.
    cat_pipeline= Pipeline(steps = [
        ('impute', SimpleImputer(strategy='most_frequent')),
        ('one_hot', OneHotEncoder(handle_unknown='ignore', drop='first'))
        ])

    col_trans = ColumnTransformer(
        transformers=[
            (
                "numeric_preprocessing",
                num_pipeline,
                make_column_selector(dtype_include=np.number),
            ),
            (
                "categorical_preprocessing",
                cat_pipeline,
                make_column_selector(dtype_include=['category', 'object']),
            ),
        ],
        # Columns matched by neither selector are forwarded untouched.
        remainder="passthrough",
        )

    # Preprocessing lives inside the pipeline so it is re-fitted on each
    # training fold rather than on the full data.
    pipe = Pipeline([
        ('preprocessing', col_trans),
        ("model", model)
    ])

    cv = StratifiedKFold(
        n_splits=5,
        shuffle=True,
        random_state=seed
    )

    scores = cross_val_score(
        pipe,
        X,
        y,
        scoring="balanced_accuracy",
        cv=cv,
        n_jobs=-1
    )

    return float(np.mean(scores))


In [24]:
def run_benchmark(datasets, model_configs):
    """Evaluate every distinct model configuration on every dataset.

    Args:
        datasets: Iterable of ``(dataset_name, X, y)`` tuples.
        model_configs: Iterable of dicts with ``"class"`` (dotted estimator
            path) and ``"params"`` (constructor kwargs with hashable values).

    Returns:
        A ``defaultdict(list)`` mapping ``(class_path, frozenset(params.items()))``
        to the list of scores, one per dataset, in dataset order.
    """
    results = defaultdict(list)

    # Random sampling can draw the same hyperparameters twice. Identical
    # configs share one results key, so evaluating both would append two
    # scores per dataset and inflate the per-config dataset count that the
    # ranking sorts on. Keep the first occurrence of each key only.
    unique_configs = {}
    for cfg in model_configs:
        key = (
            cfg["class"],
            frozenset(cfg["params"].items())
        )
        unique_configs.setdefault(key, cfg)
    work_items = list(unique_configs.items())

    for ds_name, X, y in datasets:
        print(f"\nDataset: {ds_name}")

        for key, cfg in tqdm(work_items):
            score = evaluate_model(
                cfg["class"],
                cfg["params"],
                X,
                y
            )

            results[key].append(score)

    return results


In [25]:
def select_top_models(results, top_n=50):
    """Rank benchmarked configurations and return the best ``top_n``.

    Configurations are ordered first by the number of recorded scores
    (more is better), then by mean balanced accuracy (higher is better).
    A configuration whose mean is NaN is ranked last within its tier;
    the NaN is still reported unchanged in the summary.

    Args:
        results: Mapping of ``(class_path, frozenset_of_param_items)`` to a
            list of per-dataset scores.
        top_n: Maximum number of entries to return.

    Returns:
        A list of dicts with keys ``"class"``, ``"params"``, ``"datasets"``
        and ``"mean_balanced_accuracy"``, best first.
    """
    summary = []

    for (cls, params), scores in results.items():
        summary.append({
            "class": cls,
            "params": dict(params),
            "datasets": len(scores),
            # An empty score list has no mean; report NaN without the numpy warning.
            "mean_balanced_accuracy": (
                float(np.mean(scores)) if len(scores) else float("nan")
            )
        })

    def _rank(entry):
        # NaN compares False against everything, which breaks the ordering
        # the sort relies on; map it to -inf so it sinks to the bottom.
        mean = entry["mean_balanced_accuracy"]
        return (entry["datasets"], float("-inf") if np.isnan(mean) else mean)

    summary.sort(key=_rank, reverse=True)

    return summary[:top_n]


In [26]:
def save_models_to_json(models, path="best_models.json"):
    """Write the selected models to a JSON file.

    Each model is stored as an object with a positional name
    (``model_0``, ``model_1``, ...), its estimator class path and its
    hyperparameters; any other keys on the input entries are dropped.

    Args:
        models: Sequence of dicts providing at least ``"class"`` and ``"params"``.
        path: Destination file; overwritten if it already exists.
    """
    payload = [
        {
            "name": f"model_{rank}",
            "class": entry["class"],
            "params": entry["params"],
        }
        for rank, entry in enumerate(models)
    ]

    with open(path, "w") as handle:
        json.dump(payload, handle, indent=2)


In [27]:
# Download the first 20 datasets of the suite once; later cells reuse this list.
datasets = load_openml_datasets(max_datasets=20)

In [28]:
# Sample the candidate configurations once, with a fixed seed, so that every
# dataset is benchmarked against the same set of candidates.
MODEL_CONFIGS = prepare_model_configs(global_seed=42)
print(f"Total model configs: {len(MODEL_CONFIGS)}")

Total model configs: 120


In [30]:
# Long-running cell: the recorded progress bars below add up to a little over
# three hours, with `letter` alone taking about 42 minutes.
RESULTS = run_benchmark(datasets, MODEL_CONFIGS)


Dataset: kr-vs-kp


100%|████████████████████████████████████████████████████████████████████████████████| 120/120 [01:52<00:00,  1.07it/s]



Dataset: letter


100%|████████████████████████████████████████████████████████████████████████████████| 120/120 [41:57<00:00, 20.98s/it]



Dataset: balance-scale


100%|████████████████████████████████████████████████████████████████████████████████| 120/120 [00:45<00:00,  2.65it/s]



Dataset: mfeat-factors


100%|████████████████████████████████████████████████████████████████████████████████| 120/120 [30:24<00:00, 15.20s/it]



Dataset: mfeat-fourier


100%|████████████████████████████████████████████████████████████████████████████████| 120/120 [21:14<00:00, 10.62s/it]



Dataset: breast-w


100%|████████████████████████████████████████████████████████████████████████████████| 120/120 [00:36<00:00,  3.29it/s]



Dataset: mfeat-karhunen


100%|████████████████████████████████████████████████████████████████████████████████| 120/120 [16:34<00:00,  8.29s/it]



Dataset: mfeat-morphological


100%|████████████████████████████████████████████████████████████████████████████████| 120/120 [04:17<00:00,  2.14s/it]



Dataset: mfeat-zernike


100%|████████████████████████████████████████████████████████████████████████████████| 120/120 [11:43<00:00,  5.87s/it]



Dataset: cmc


100%|████████████████████████████████████████████████████████████████████████████████| 120/120 [01:40<00:00,  1.19it/s]



Dataset: optdigits


100%|████████████████████████████████████████████████████████████████████████████████| 120/120 [11:23<00:00,  5.70s/it]



Dataset: credit-approval


100%|████████████████████████████████████████████████████████████████████████████████| 120/120 [00:45<00:00,  2.66it/s]



Dataset: credit-g


100%|████████████████████████████████████████████████████████████████████████████████| 120/120 [01:00<00:00,  1.99it/s]



Dataset: pendigits


100%|████████████████████████████████████████████████████████████████████████████████| 120/120 [14:38<00:00,  7.32s/it]



Dataset: diabetes


100%|████████████████████████████████████████████████████████████████████████████████| 120/120 [00:42<00:00,  2.84it/s]



Dataset: spambase


100%|████████████████████████████████████████████████████████████████████████████████| 120/120 [04:27<00:00,  2.23s/it]



Dataset: splice


100%|████████████████████████████████████████████████████████████████████████████████| 120/120 [08:46<00:00,  4.38s/it]



Dataset: tic-tac-toe


100%|████████████████████████████████████████████████████████████████████████████████| 120/120 [00:47<00:00,  2.52it/s]



Dataset: vehicle


100%|████████████████████████████████████████████████████████████████████████████████| 120/120 [01:33<00:00,  1.29it/s]



Dataset: electricity


100%|████████████████████████████████████████████████████████████████████████████████| 120/120 [18:26<00:00,  9.22s/it]


In [33]:
# Keep the 50 best-ranked configurations across all benchmarked datasets.
TOP_MODELS = select_top_models(RESULTS, top_n=50)

In [35]:
# Persist the selected configurations (class path + params) to best_models.json.
save_models_to_json(TOP_MODELS, "best_models.json")