# Setup

In [6]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from catboost import CatBoostClassifier
import optuna

# Optuna

In [2]:
# Raw CSV cache keyed by URL, so repeated load_data() calls (one per Optuna
# trial) do not download the same file again.
_RAW_DATA_CACHE = {}


def _read_raw_data():
    """Return the transformed rain dataset, downloading it only on the first call."""
    url = 'https://raw.githubusercontent.com/antbartash/australian_rain/main/data/data_transformed.csv'
    if url not in _RAW_DATA_CACHE:
        _RAW_DATA_CACHE[url] = pd.read_csv(url, index_col=0)
    return _RAW_DATA_CACHE[url]


def load_data():
    """Load the dataset and return a fixed 80/20 train/test split.

    The target is the 'RainTomorrow' column; 'RainTomorrow' and 'RainToday'
    are both dropped from the features. The split uses random_state=42, so
    every call yields the same partition.

    The columns 'Location', 'WindGustDir', 'WindDir9am' and 'WindDir3pm'
    (the ones passed to CatBoost as cat_features in this notebook) are cast
    to float32, have missing values replaced by -1, and are then converted
    to strings.

    Returns
    -------
    tuple
        (X_train, X_test, y_train, y_test). The feature frames are fresh
        copies on every call, so callers may modify them freely.
    """
    data = _read_raw_data()
    X, y = data.drop(columns=['RainTomorrow', 'RainToday']), data['RainTomorrow']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    # Take explicit copies before assigning columns: the split results are
    # selections from X, and writing into them can raise SettingWithCopyWarning.
    X_train, X_test = X_train.copy(), X_test.copy()
    for frame in (X_train, X_test):
        for column in ['Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm']:
            frame[column] = frame[column].astype(np.float32).fillna(-1).apply(str)
    return X_train, X_test, y_train, y_test

In [3]:
def objective(trial):
    """Optuna objective: mean 3-fold cross-validated ROC AUC of a GPU CatBoost model.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the CatBoost hyperparameters listed below.

    Returns
    -------
    float
        Mean of the three 'roc_auc' fold scores computed on the training
        split returned by load_data(); the study maximizes this value.
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 2000),
        'learning_rate': trial.suggest_float('learning_rate', 1e-6, 0.5),
        'depth': trial.suggest_int('depth', 1, 10),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 0.0, 100.0),
        # 'random_strength' is intentionally not sampled: it was marked as
        # CPU only, and this model trains with task_type='GPU'.
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.0, 100.0),
        'grow_policy': trial.suggest_categorical('grow_policy', ['SymmetricTree', 'Depthwise']),
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 1, 4)
    }
    X_train, _, y_train, _ = load_data()
    model = CatBoostClassifier(
        cat_features=['Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm'],
        custom_metric='MCC',
        random_state=42, verbose=False, task_type='GPU',
        **params
    )
    # No model.fit() on the full training set here: cross_val_score clones the
    # estimator and fits a clone on each fold, so a prior fit would only add
    # one extra full training run per trial without affecting the score.
    fold_scores = cross_val_score(model, X_train, y_train, cv=3, scoring='roc_auc')
    return np.mean(fold_scores)

In [None]:
sampler = optuna.samplers.TPESampler(
    seed=42,
    # Random sampling is used instead of the TPE algorithm until this many
    # trials have finished in the same study.
    n_startup_trials=10,
    # Number of candidate samples used to calculate the expected improvement.
    n_ei_candidates=24,
    # Use multivariate TPE when suggesting candidates (default: False).
    multivariate=True,
)
study = optuna.create_study(
    storage='sqlite:///db.sqlite3',
    sampler=sampler,
    direction='maximize',
)
study.optimize(
    objective,
    n_trials=100,
    timeout=3600,  # in seconds
    n_jobs=1,
    show_progress_bar=True,
)

[I 2023-12-29 07:04:09,517] A new study created in RDB with name: no-name-de814cb8-bac4-4245-a8e0-49e85132317f


  0%|          | 0/100 [00:00<?, ?it/s]

[I 2023-12-29 07:06:06,819] Trial 0 finished with value: 0.8693246003801333 and parameters: {'n_estimators': 780, 'learning_rate': 0.47535720249065166, 'depth': 8, 'l2_leaf_reg': 59.86584841970366, 'bagging_temperature': 15.601864044243651, 'grow_policy': 'SymmetricTree', 'scale_pos_weight': 3.5985284373248057}. Best is trial 0 with value: 0.8693246003801333.
[I 2023-12-29 07:06:56,585] Trial 1 finished with value: 0.5 and parameters: {'n_estimators': 1222, 'learning_rate': 0.3540365808254449, 'depth': 1, 'l2_leaf_reg': 96.99098521619943, 'bagging_temperature': 83.24426408004217, 'grow_policy': 'SymmetricTree', 'scale_pos_weight': 1.5502135295603015}. Best is trial 0 with value: 0.8693246003801333.
[I 2023-12-29 07:08:04,697] Trial 2 finished with value: 0.5 and parameters: {'n_estimators': 643, 'learning_rate': 0.26237869105968725, 'depth': 5, 'l2_leaf_reg': 29.122914019804192, 'bagging_temperature': 61.18528947223795, 'grow_policy': 'Depthwise', 'scale_pos_weight': 2.099085529881075}

In [24]:
# Report the best trial; Gini is derived from the ROC AUC objective as 2 * AUC - 1.
best_trial = study.best_trial
print(f"Best Gini: {best_trial.value * 2 - 1}")
print(f"Best params: {best_trial.params}")

Best Gini: 0.7881823351631307
Best params: {'n_estimators': 1185, 'learning_rate': 0.11112896316725643, 'depth': 9, 'l2_leaf_reg': 3.776668285917072, 'bagging_temperature': 2.487905997718464, 'grow_policy': 'Depthwise', 'scale_pos_weight': 1.0799300928659206}


# Plots

In [14]:
# Empirical distribution function (EDF) of the objective values recorded in the study.
optuna.visualization.plot_edf(study)

In [18]:
# Objective value of each trial in order, showing how the search progressed.
optuna.visualization.plot_optimization_history(study)

In [11]:
# Contour plot of the objective, restricted to the n_estimators / learning_rate pair.
optuna.visualization.plot_contour(study, params=['n_estimators', 'learning_rate'])

In [19]:
# Parallel-coordinate view of the sampled hyperparameters across trials.
optuna.visualization.plot_parallel_coordinate(study)

In [20]:
# Estimated importance of each hyperparameter for the objective value.
optuna.visualization.plot_param_importances(study)

In [21]:
# One slice plot per hyperparameter: sampled value against objective value.
optuna.visualization.plot_slice(study)

In [23]:
# Timeline of trial execution. plot_timeline is experimental (supported from
# Optuna v3.2.0), so its interface can change in future releases.
optuna.visualization.plot_timeline(study)


plot_timeline is experimental (supported from v3.2.0). The interface can change in the future.

