# Setup

In [1]:
! pip install catboost
! pip install optuna

Collecting catboost
  Downloading catboost-1.2.2-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.2
Collecting optuna
  Downloading optuna-3.5.0-py3-none-any.whl (413 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m413.4/413.4 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.13.1-py3-none-any.whl (233 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.4/233.4 kB[0m [31m31.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting colorlog (from optuna)
  Downloading colorlog-6.8.0-py3-none-any.whl (11 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.0-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m12.8 MB/s[0m

In [2]:
import functools

import numpy as np
import optuna
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split, cross_val_score

# Optuna

In [3]:
def load_data(
    url='https://raw.githubusercontent.com/antbartash/australian_rain/main/data/data_transformed.csv',
    test_size=0.2,
    random_state=42,
):
    """Load the transformed rain dataset and return a train/test split.

    The CSV is read with its first column as the index. ``RainTomorrow`` is the
    target; ``RainTomorrow`` and ``RainToday`` are both removed from the
    features. The columns ``Location``, ``WindGustDir``, ``WindDir9am`` and
    ``WindDir3pm`` are cast to float32, have missing values replaced by -1 and
    are then converted to strings.

    Parameters
    ----------
    url : str, optional
        Location of the CSV file (anything ``pandas.read_csv`` accepts).
    test_size : float, optional
        Fraction of rows held out for the test split.
    random_state : int, optional
        Seed passed to ``train_test_split``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    categorical_columns = ['Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm']

    data = pd.read_csv(url, index_col=0)
    y = data['RainTomorrow']
    # drop() returns a new DataFrame, so the assignments below write to an
    # independent frame rather than to a slice produced by train_test_split
    # (which can trigger SettingWithCopyWarning).
    X = data.drop(columns=['RainTomorrow', 'RainToday'])

    # The conversion is element-wise, so doing it once before the split yields
    # the same train/test frames as converting each split separately.
    for column in categorical_columns:
        X[column] = X[column].astype(np.float32).fillna(-1).map(str)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    return X_train, X_test, y_train, y_test

In [9]:
@functools.lru_cache(maxsize=1)
def _load_train_split():
    """Return ``(X_train, y_train)`` from ``load_data()``, loading it only once.

    ``load_data()`` reads the CSV from a remote URL, so calling it inside every
    trial repeats the download and preprocessing. The cached split is shared by
    all trials and is only read, never modified, by ``objective``.
    """
    X_train, _, y_train, _ = load_data()
    return X_train, y_train


def objective(trial):
    """Optuna objective: mean 3-fold cross-validated ROC AUC of a CatBoost model.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to suggest ``n_estimators`` and ``depth``.

    Returns
    -------
    float
        Mean of the ``roc_auc`` scores returned by ``cross_val_score`` with
        ``cv=3`` on the training split.
    """
    PARAMS = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 1500),
        # 'learning_rate': trial.suggest_float('learning_rate', 1e-6, 0.5),
        'depth': trial.suggest_int('depth', 1, 10),
        # Further candidate dimensions, currently not part of the search:
        # 'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 0.0, 100.0),
        # 'random_strength': trial.suggest_float('random_strength', 0.0, 100.0), # CPU only
        # 'bagging_temperature': trial.suggest_float('bagging_temperature', 0.0, 100.0),
        # 'grow_policy': trial.suggest_categorical('grow_policy', ['SymmetricTree', 'Depthwise']),
        # 'scale_pos_weight': trial.suggest_float('scale_pos_weight', 1, 4)
    }
    X_train, y_train = _load_train_split()
    model = CatBoostClassifier(
        cat_features=['Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm'],
        custom_metric='MCC',
        random_state=42, verbose=False, task_type='GPU',
        **PARAMS,
    )
    # No explicit model.fit() here: cross_val_score clones the estimator and
    # fits a fresh copy on every fold, so a fit on the full training split
    # beforehand would be discarded and only add training time to each trial.
    fold_scores = cross_val_score(model, X_train, y_train, cv=3, scoring='roc_auc')
    return np.mean(fold_scores)

In [10]:
# Grid evaluated by the sampler: 4 values of n_estimators x 10 depths (1..10).
search_space = dict(
    n_estimators=[100, 500, 1000, 1500],
    depth=list(range(1, 11)),
)
sampler = optuna.samplers.GridSampler(search_space)

# The objective is a ROC AUC, hence direction='maximize'. Trials are stored in
# a SQLite file (db.sqlite3) via the storage URL below.
study = optuna.create_study(
    storage='sqlite:///db.sqlite3',
    sampler=sampler,
    direction='maximize',
)

# No n_trials is given; the run is sequential (n_jobs=1).
study.optimize(objective, n_jobs=1)

[I 2024-01-02 09:55:41,983] A new study created in RDB with name: no-name-ba6f061f-9251-4726-81d7-73187c3c67c7
[I 2024-01-02 09:55:50,379] Trial 0 finished with value: 0.870625107629178 and parameters: {'n_estimators': 100, 'depth': 2}. Best is trial 0 with value: 0.870625107629178.
[I 2024-01-02 09:58:15,636] Trial 1 finished with value: 0.895834147676842 and parameters: {'n_estimators': 1500, 'depth': 6}. Best is trial 1 with value: 0.895834147676842.
[I 2024-01-02 10:04:00,098] Trial 2 finished with value: 0.9004952568212768 and parameters: {'n_estimators': 1500, 'depth': 9}. Best is trial 2 with value: 0.9004952568212768.
[I 2024-01-02 10:05:38,265] Trial 3 finished with value: 0.8897351476049079 and parameters: {'n_estimators': 1500, 'depth': 4}. Best is trial 2 with value: 0.9004952568212768.
[I 2024-01-02 10:06:57,437] Trial 4 finished with value: 0.8927986652692824 and parameters: {'n_estimators': 1000, 'depth': 5}. Best is trial 2 with value: 0.9004952568212768.
[I 2024-01-02 

In [11]:
# Report the best trial: its objective value rescaled as value * 2 - 1
# (labelled "Gini" in the output) and the hyperparameters that produced it.
best_trial = study.best_trial
print(f"Best Gini: {best_trial.value * 2 - 1}")
print(f"Best params: {best_trial.params}")

Best Gini: 0.801628911011546
Best params: {'n_estimators': 1000, 'depth': 10}


# Plots

In [12]:
# Empirical distribution function (EDF) of the objective values reached by the study's trials.
optuna.visualization.plot_edf(study)

In [13]:
# Objective value of every trial in the study, shown as the optimization history.
optuna.visualization.plot_optimization_history(study)

In [15]:
# Contour plot of the objective value over the two searched hyperparameters.
optuna.visualization.plot_contour(study, params=['n_estimators', 'depth'])

In [16]:
# Parallel-coordinate view relating each trial's hyperparameters to its objective value.
optuna.visualization.plot_parallel_coordinate(study)

In [17]:
# Estimated importance of each searched hyperparameter for the objective value.
optuna.visualization.plot_param_importances(study)

In [18]:
# Slice plots: objective value against each searched hyperparameter separately.
optuna.visualization.plot_slice(study)

In [19]:
# Timeline of the study's trials. Optuna reports plot_timeline as experimental
# (supported from v3.2.0), so its interface may change in later releases.
optuna.visualization.plot_timeline(study)


plot_timeline is experimental (supported from v3.2.0). The interface can change in the future.

