In [None]:
import os
import pandas as pd
import optuna
import numpy as np
import openml
from sklearn import datasets
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from optuna.samplers import TPESampler, CmaEsSampler
from xgboost import XGBClassifier

In [None]:
# OpenML dataset identifiers considered in this notebook.
dataset_ids = [
    1590,    # Adult
    1510,    # Breast Cancer Wisconsin (Diagnostic)
    1461,    # Bank Marketing
    24,      # Mushroom
    40945,   # Titanic
]

In [None]:
def fetch_and_prepare(openml_id):
    """Download an OpenML dataset and return it as one fully numeric DataFrame.

    Processing steps:
      1. Feature columns of dtype ``category`` / ``object`` are cast to ``str``
         and label-encoded to integers.
      2. Remaining missing values are replaced by the column mean.
      3. Every feature is min-max scaled.
      4. The target is appended as the last column, named ``'Target'``.
         Non-numeric targets are label-encoded first.

    The function only works on local variables; nothing is written to module
    globals, so the returned frame is the single source of the prepared data.

    NOTE(review): the imputer and the scaler are fitted on the whole dataset,
    i.e. before any train/test split performed by the caller.

    Args:
        openml_id: Integer OpenML dataset ID.

    Returns:
        pandas.DataFrame with the scaled feature columns followed by 'Target'.
    """
    dataset = openml.datasets.get_dataset(openml_id)
    print(f">>> {dataset.name} (ID: {openml_id})")
    features, target, _, _ = dataset.get_data(
        target=dataset.default_target_attribute,
        dataset_format='dataframe',
    )

    # Work on a copy so the frame handed out by OpenML is never modified in place.
    features = features.copy()
    for col in features.select_dtypes(include=['category', 'object']):
        features[col] = LabelEncoder().fit_transform(features[col].astype(str))

    imputed = SimpleImputer(strategy='mean').fit_transform(features)
    scaled = MinMaxScaler().fit_transform(imputed)
    prepared = pd.DataFrame(scaled, columns=features.columns)

    # Encode every non-numeric target (categorical as before, plus object/string
    # labels) so downstream estimators always receive integer classes.
    if not pd.api.types.is_numeric_dtype(target):
        target = LabelEncoder().fit_transform(target)

    # Assign by position: `prepared` has a fresh RangeIndex, so aligning a
    # Series on its original index could silently introduce NaN labels.
    prepared['Target'] = np.asarray(target)
    return prepared

In [None]:
# 1590 is the OpenML ID of the Adult dataset; `zbior` holds the prepared frame.
zbior = fetch_and_prepare(openml_id=1590)

# Optymalizacja hiperparametrów - zbiór Adult

## Regresja logistyczna

In [None]:
# Hold out 30% of the rows for the final evaluation. Every column except the
# last one is a feature; the 'Target' column holds the labels.
X_train, X_test, y_train, y_test = train_test_split(
    zbior.iloc[:, :-1],
    zbior['Target'],
    test_size=0.30,
    random_state=42,
)

def objective(trial):
    """Optuna objective: mean 5-fold CV accuracy of logistic regression.

    Search space (parameter names are unchanged, so ``study.best_params``
    keeps the same keys):
      * ``solver``: lbfgs, newton-cg, newton-cholesky, sag or saga.
      * ``penalty``: fixed to 'l2' for every solver except saga.
      * ``penalty_saga``: l1, l2 or elasticnet; sampled only for saga. It has
        its own name because the set of choices differs from ``penalty``.
      * ``l1_ratio``: float in [1e-5, 1.0]; sampled only for elasticnet.
      * ``C``: log-uniform float in [1e-4, 1e4].

    Reads the module-level ``X_train`` / ``y_train`` produced by
    ``train_test_split``; the held-out test split is never touched here.

    Args:
        trial: The Optuna trial used to sample hyperparameters.

    Returns:
        Mean of the per-fold ``score`` values (classification accuracy).
    """
    solver = trial.suggest_categorical(
        'solver', ['lbfgs', 'newton-cg', 'newton-cholesky', 'sag', 'saga']
    )

    l1_ratio = None
    if solver == 'saga':
        penalty = trial.suggest_categorical('penalty_saga', ['l1', 'l2', 'elasticnet'])
        if penalty == 'elasticnet':
            l1_ratio = trial.suggest_float('l1_ratio', 1e-5, 1.0)
    else:
        penalty = trial.suggest_categorical('penalty', ['l2'])

    C = trial.suggest_float('C', 1e-4, 1e4, log=True)

    # Convert once to plain arrays instead of rebuilding a Python list of row
    # tuples for every fold; the values passed to the estimator are the same.
    features = X_train.to_numpy()
    labels = y_train.to_numpy()

    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    wyniki = []
    for train_index, test_index in kf.split(features):
        model = LogisticRegression(
            penalty=penalty,
            C=C,
            solver=solver,
            l1_ratio=l1_ratio,
            max_iter=10000,
            random_state=42,
        )
        model.fit(features[train_index], labels[train_index])
        wyniki.append(model.score(features[test_index], labels[test_index]))

    return np.mean(wyniki)

In [None]:
# Seed the sampler so the sequence of suggested hyperparameters is repeatable
# after a kernel restart.
study = optuna.create_study(direction='maximize', sampler=TPESampler(seed=42))
# optimize() returns None; all results are stored on `study`.
study.optimize(objective, n_trials=100)

In [None]:
# Show the hyperparameters of the best logistic-regression trial.
study.best_params

In [None]:
# Refit logistic regression on the whole training split and report accuracy on
# the held-out test split.
# NOTE(review): the hyperparameter values look like they were pasted from an
# earlier `study.best_params` output - confirm they match the current study.
regresja_logistyczna = LogisticRegression(
    penalty='elasticnet',
    C=0.3803420691574056,
    solver='saga',
    l1_ratio=0.9175083483232548,
    max_iter=10000,  # same iteration cap as the one used during tuning
    random_state=42,
).fit(X_train, y_train)
# score() predicts internally, so no separate predict() pass is needed.
wynik = regresja_logistyczna.score(X_test, y_test)
print(wynik)

## Random forest

In [None]:
# Same split arguments and seed as in the logistic-regression section, so the
# random forest is tuned and evaluated on the same train/test rows.
X_train, X_test, y_train, y_test = train_test_split(
    zbior.iloc[:, :-1],
    zbior['Target'],
    test_size=0.30,
    random_state=42,
)

def objective(trial):
    """Optuna objective: mean 5-fold CV accuracy of a random forest.

    Search space:
      * ``n_estimators``: 100..1000 in steps of 100.
      * ``max_depth``: integer in [2, 32], sampled on a log scale.
      * ``min_samples_split``: integer in [2, 20].
      * ``min_samples_leaf``: integer in [1, 10].

    Reads the module-level ``X_train`` / ``y_train`` produced by
    ``train_test_split``; the held-out test split is never touched here.

    Args:
        trial: The Optuna trial used to sample hyperparameters.

    Returns:
        Mean of the per-fold ``score`` values (classification accuracy).
    """
    n_estimators = trial.suggest_int("n_estimators", 100, 1000, step=100)
    max_depth = trial.suggest_int("max_depth", 2, 32, log=True)
    min_samples_split = trial.suggest_int("min_samples_split", 2, 20)
    min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 10)

    # Convert once to plain arrays instead of rebuilding a Python list of row
    # tuples for every fold; the values passed to the estimator are the same.
    features = X_train.to_numpy()
    labels = y_train.to_numpy()

    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    wyniki = []
    for train_index, test_index in kf.split(features):
        Las_losowy = RandomForestClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            # Fixed seed: without it the same hyperparameters score differently
            # on every call, which adds noise to the search.
            random_state=42,
        )
        Las_losowy.fit(features[train_index], labels[train_index])
        wyniki.append(Las_losowy.score(features[test_index], labels[test_index]))

    return np.mean(wyniki)

In [None]:
# Seed the sampler so the sequence of suggested hyperparameters is repeatable
# after a kernel restart.
study = optuna.create_study(direction='maximize', sampler=TPESampler(seed=42))
# optimize() returns None; all results are stored on `study`.
study.optimize(objective, n_trials=100)

In [None]:
# Show the hyperparameters of the best random-forest trial.
study.best_params

In [None]:
# Refit the random forest on the whole training split and report accuracy on
# the held-out test split.
# NOTE(review): the hyperparameter values look like they were pasted from an
# earlier `study.best_params` output - confirm they match the current study.
Las_losowy = RandomForestClassifier(
    n_estimators=1000,
    max_depth=18,
    min_samples_split=7,
    min_samples_leaf=1,
    random_state=42,  # fixed seed so the reported score is reproducible
).fit(X_train, y_train)
# score() predicts internally, so no separate predict() pass is needed.
wynik = Las_losowy.score(X_test, y_test)
print(wynik)

## XGBoost

In [None]:
# Same split arguments and seed as in the earlier sections, so XGBoost is
# tuned and evaluated on the same train/test rows.
X_train, X_test, y_train, y_test = train_test_split(
    zbior.iloc[:, :-1],
    zbior['Target'],
    test_size=0.30,
    random_state=42,
)

def objective(trial):
    """Optuna objective: mean 5-fold CV accuracy of an XGBoost classifier.

    Search space:
      * ``max_depth``: integer in [3, 10].
      * ``learning_rate``: log-uniform float in [0.01, 0.3].
      * ``n_estimators``: integer in [100, 1000].
      * ``subsample``: float in [0.6, 1.0].
      * ``colsample_bytree``: float in [0.6, 1.0].
      * ``min_child_weight``: integer in [1, 10].

    Reads the module-level ``X_train`` / ``y_train`` produced by
    ``train_test_split``; the held-out test split is never touched here.

    Args:
        trial: The Optuna trial used to sample hyperparameters.

    Returns:
        Mean of the per-fold ``score`` values (classification accuracy).
    """
    # Dict entries are evaluated top to bottom, so the parameters are suggested
    # in the order listed here.
    params = {
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
    }

    # Convert once to plain arrays instead of rebuilding a Python list of row
    # tuples for every fold; the values passed to the estimator are the same.
    features = X_train.to_numpy()
    labels = y_train.to_numpy()

    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    wyniki = []
    for train_index, test_index in kf.split(features):
        XGBoost = XGBClassifier(**params)
        XGBoost.fit(features[train_index], labels[train_index])
        wyniki.append(XGBoost.score(features[test_index], labels[test_index]))

    return np.mean(wyniki)

In [None]:
# Seed the sampler so the sequence of suggested hyperparameters is repeatable
# after a kernel restart.
study = optuna.create_study(direction='maximize', sampler=TPESampler(seed=42))
# optimize() returns None; all results are stored on `study`.
study.optimize(objective, n_trials=100)

In [None]:
# Show the hyperparameters of the best XGBoost trial.
study.best_params

In [None]:
# Refit XGBoost on the whole training split and report accuracy on the
# held-out test split.
# NOTE(review): the hyperparameter values look like they were pasted from an
# earlier `study.best_params` output - confirm they match the current study.
XGBoost = XGBClassifier(
    max_depth=7,
    learning_rate=0.014059646736310738,
    n_estimators=811,
    subsample=0.8746931123404836,
    colsample_bytree=0.7067553318391634,
    min_child_weight=1,
).fit(X_train, y_train)
# score() predicts internally, so no separate predict() pass is needed.
wynik = XGBoost.score(X_test, y_test)
print(wynik)