In [None]:
import os
import pandas as pd
import optuna
import numpy as np
import openml
import logging
import matplotlib.pyplot as plt
import scipy.stats as stats
from sklearn import datasets
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from optuna.samplers import TPESampler, CmaEsSampler
from plotly.io import show
from sklearn.preprocessing import StandardScaler
from scipy.stats import zscore
from sklearn.metrics import roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder, LabelEncoder
from sklearn.impute import SimpleImputer

In [None]:
def fetch_and_prepare(openml_id):
    """Download an OpenML dataset and return preprocessed features and target.

    Numeric feature columns are mean-imputed and then min-max scaled.
    Category/object feature columns are imputed with the most frequent value
    and then ordinal-encoded. A target of object or category dtype is
    label-encoded.

    Note that the imputers, scaler and encoder are fitted on the complete
    dataset, i.e. before any train/test split done by the caller.

    Args:
        openml_id: OpenML dataset ID passed to ``openml.datasets.get_dataset``.

    Returns:
        Tuple ``(X, y)``. ``X`` is the feature DataFrame with the transformed
        columns written back in place of the originals (feature columns that
        are entirely missing are removed). ``y`` is the ``LabelEncoder`` output
        when the target was object/category typed, otherwise the target exactly
        as returned by OpenML.
    """
    dataset = openml.datasets.get_dataset(openml_id)
    print(f">>> {dataset.name} (ID: {openml_id})")
    X, y, _, _ = dataset.get_data(target=dataset.default_target_attribute, dataset_format='dataframe')

    # By default SimpleImputer discards columns that contain only missing
    # values (for the 'mean' and 'most_frequent' strategies). Its output would
    # then have fewer columns than the selection it is assigned back to and the
    # assignment would raise. Remove such columns up front, and continue on an
    # independent copy so the column assignments below only affect this frame.
    X = X.dropna(axis=1, how='all').copy()

    num_cols = X.select_dtypes(include=['number']).columns
    cat_cols = X.select_dtypes(include=['category', 'object']).columns

    if len(num_cols) > 0:
        imputer_num = SimpleImputer(strategy='mean')
        X[num_cols] = imputer_num.fit_transform(X[num_cols])
        scaler = MinMaxScaler()
        X[num_cols] = scaler.fit_transform(X[num_cols])

    if len(cat_cols) > 0:
        imputer_cat = SimpleImputer(strategy='most_frequent')
        X[cat_cols] = imputer_cat.fit_transform(X[cat_cols])
        # Cast to str first so every categorical column holds one comparable type.
        encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
        X[cat_cols] = encoder.fit_transform(X[cat_cols].astype(str))

    if y.dtype == 'object' or y.dtype.name == 'category':
        le = LabelEncoder()
        y = le.fit_transform(y)

    return X, y

In [None]:
from sklearn.model_selection import cross_val_score, StratifiedKFold

def objective(trial):
    """Optuna objective: mean 5-fold cross-validated ROC AUC of a decision tree.

    Draws ``ccp_alpha``, ``max_depth``, ``min_samples_leaf`` and
    ``min_samples_split`` from the trial and scores the resulting
    ``DecisionTreeClassifier`` on the module-level ``X_train`` / ``y_train``,
    which must be defined before the study is run.

    Args:
        trial: Optuna trial used to suggest the hyperparameters.

    Returns:
        Mean of the per-fold ``roc_auc`` scores.
    """
    # Suggest calls stay in the original order so the sampler is queried
    # in the same sequence.
    tree_params = {
        "ccp_alpha": trial.suggest_float("ccp_alpha", 0.0, 0.008),
        "max_depth": trial.suggest_int("max_depth", 12, 27),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 4, 42),
        "min_samples_split": trial.suggest_int("min_samples_split", 5, 49),
    }

    splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    fold_scores = cross_val_score(
        DecisionTreeClassifier(random_state=42, **tree_params),
        X_train,
        y_train,
        cv=splitter,
        scoring='roc_auc',
        n_jobs=-1,
    )
    return fold_scores.mean()

In [None]:
import json
import optuna
from optuna.samplers import TPESampler

# OpenML dataset IDs to tune on; each one gets its own Optuna study.
dataset_ids = [1590, 1461, 24, 40945, 31, 44, 1464, 37, 3, 59]
results_list = []  # one config dict per dataset, written to JSON after the loop

for i in dataset_ids:
    X, y = fetch_and_prepare(i)
    # objective() reads X_train / y_train as module-level globals, so they have
    # to be rebound here before each study is optimized.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)
    # Seed the sampler: the split, the tree and the CV folds are already pinned
    # to 42, so an unseeded TPESampler was the only remaining source of
    # run-to-run variation in the exported best parameters.
    study = optuna.create_study(direction='maximize', sampler=TPESampler(seed=42))
    study.optimize(objective, n_trials=1000)

    # NOTE(review): the name says "100" while n_trials is 1000 - confirm which
    # one is intended.
    new_config = {
        "name": f"dt_100_ds_{i}",
        "class": "sklearn.tree.DecisionTreeClassifier",
        "params": dict(study.best_params),
    }

    results_list.append(new_config)

with open('all_configs_dt.json', 'w') as f:
    json.dump(results_list, f, indent=2)

print("Plik JSON zawiera teraz wszystkie 10 konfiguracji.")