In [4]:
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score
import optuna
import numpy as np

In [10]:
# Read the training sample, then separate the feature columns from the
# 'flag' target column.
data = pd.read_csv('data/train_sample_cat.csv')

X_data = data.drop(columns=['flag'])
y_data = data['flag']

In [6]:
# Replace infinities of BOTH signs with 0 (the previous version left -inf in
# place). Rebinding instead of `inplace=True` keeps the cell safe to re-run.
# NOTE(review): this must run after X_data is loaded and before the
# train/test split, otherwise the cleaned frame is not the one being split.
X_data = X_data.replace([np.inf, -np.inf], 0)

In [11]:
# Two-stage split: first hold out 10% of the data as the test set, then take
# 10% of the remainder as the validation set used for hyperparameter tuning.
# Both splits are seeded so the train/validation partition (and therefore
# every metric computed from it) is reproducible across kernel restarts.
X, X_test, y, y_test = train_test_split(X_data, y_data, test_size=0.1, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=42)

In [69]:
def objective(trial):
    """Optuna objective: fit CatBoost with sampled hyperparameters and score it.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters below.

    Returns
    -------
    float
        F1 score of the fitted model's class predictions on ``(X_val, y_val)``.
        The model is trained on the notebook-level ``X_train`` / ``y_train``.
    """
    params = {
        "iterations": trial.suggest_int("iterations", 100, 1000),
        "depth": trial.suggest_int("depth", 4, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0, log=True),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-3, 10, log=True),
        "random_strength": trial.suggest_float("random_strength", 0.0, 10.0),
        "bagging_temperature": trial.suggest_float("bagging_temperature", 0.0, 1.0),
        "border_count": trial.suggest_int("border_count", 32, 255),
        # Same seed as the final evaluation fits, so the score a trial reports
        # is obtained under the same randomness as the model trained later.
        "random_state": 42,
        "verbose": 0,
    }
    model = CatBoostClassifier(**params)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    # scikit-learn metrics expect (y_true, y_pred) in that order.
    return f1_score(y_val, y_pred)


# Run the hyperparameter search. The sampler is seeded so that repeated runs
# explore the same sequence of trials; TPESampler is Optuna's default sampler,
# so only the seed changes, not the search algorithm.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)

[I 2025-05-16 13:30:56,069] A new study created in memory with name: no-name-6bfe508a-84b2-43e2-bf24-f8a03f0eb318
[I 2025-05-16 13:31:08,315] Trial 0 finished with value: 0.6603482477407979 and parameters: {'iterations': 146, 'depth': 6, 'learning_rate': 0.02524110630916377, 'subsample': 0.6845452627352011, 'l2_leaf_reg': 0.2346421085337576, 'random_strength': 2.805363432343142, 'bagging_temperature': 0.019997649608886836, 'border_count': 112}. Best is trial 0 with value: 0.6603482477407979.
[I 2025-05-16 13:31:30,660] Trial 1 finished with value: 0.6972043010752688 and parameters: {'iterations': 320, 'depth': 6, 'learning_rate': 0.27494203663430433, 'subsample': 0.7306588490746503, 'l2_leaf_reg': 0.8487989443552472, 'random_strength': 6.526886259716379, 'bagging_temperature': 0.9382432232649237, 'border_count': 56}. Best is trial 1 with value: 0.6972043010752688.
[I 2025-05-16 13:32:10,388] Trial 2 finished with value: 0.6845901639344262 and parameters: {'iterations': 669, 'depth': 4,

In [8]:
# Fixed CatBoost hyperparameters used for the final fits below.
# NOTE(review): presumably copied by hand from an earlier Optuna run — the
# keys match the search space of `objective`; confirm against study.best_params.
params = dict(
    iterations=916,
    depth=9,
    learning_rate=0.08417417879301282,
    subsample=0.6300537145115088,
    l2_leaf_reg=4.917199591062558,
    random_strength=5.3579819031797005,
    bagging_temperature=0.03127270504975477,
    border_count=237,
)

Оценка качества на валидационной выборке:

In [15]:
# Fit on the training split only and evaluate on the validation split.
cat_clf = CatBoostClassifier(**params, random_state=42, verbose=0)
cat_clf.fit(X_train, y_train)
# Column 1 of predict_proba: the probability for the class at index 1.
cat_preds = cat_clf.predict_proba(X_val)[:, 1]
# Hard labels obtained by rounding the probabilities, computed once and reused.
val_labels = cat_preds.round()
# scikit-learn metrics expect (y_true, y_pred) in that order.
print(f'Accuracy: {accuracy_score(y_val, val_labels)}')
print(f'F1 score: {f1_score(y_val, val_labels)}')
print(f'Roc auc: {roc_auc_score(y_val, cat_preds)}')

Accuracy: 0.7113294314381271
F1 score: 0.6980100590422043
Roc auc: 0.7835198841848017


Оценка качества на тестовой выборке:

In [12]:
# Refit on the combined train + validation data (X, y) and evaluate on the
# held-out test split.
cat_clf = CatBoostClassifier(**params, random_state=42, verbose=0)
cat_clf.fit(X, y)
# Column 1 of predict_proba: the probability for the class at index 1.
cat_preds = cat_clf.predict_proba(X_test)[:, 1]
# Hard labels obtained by rounding the probabilities, computed once and reused.
test_labels = cat_preds.round()
# scikit-learn metrics expect (y_true, y_pred) in that order.
print(f'Accuracy: {accuracy_score(y_test, test_labels)}')
print(f'F1 score: {f1_score(y_test, test_labels)}')
print(f'Roc auc: {roc_auc_score(y_test, cat_preds)}')

Accuracy: 0.7125658389766741
F1 score: 0.7118068653338363
Roc auc: 0.7951718568879511
