In [1]:
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score
import optuna

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
data = pd.read_csv('train_sample_final.csv')

y_data = data['flag']
X_data = data.drop(['flag'], axis=1)
X_data = X_data.fillna(0)

In [None]:
X, X_test, y, y_test = train_test_split(X_data, y_data, test_size=0.1, random_state=42)

In [7]:
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

In [None]:
def objective(trial):
    params = {
        "iterations": trial.suggest_int("iterations", 100, 1000),
        "depth": trial.suggest_int("depth", 4, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0, log=True),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-3, 10, log=True),
        "random_strength": trial.suggest_float("random_strength", 0.0, 10.0),
        "bagging_temperature": trial.suggest_float("bagging_temperature", 0.0, 1.0),
        "border_count": trial.suggest_int("border_count", 32, 255),
        "verbose": 0,
    }
    all_f1 = []
    for i, (train_ind, val_ind) in enumerate(skf.split(X.values, y)):
        X_train, X_val = X.iloc[train_ind, :], X.iloc[val_ind, :]
        y_train, y_val = y.iloc[train_ind], y.iloc[val_ind]
        model = CatBoostClassifier(**params)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_val)
        f1 = f1_score(y_pred, y_val)
        all_f1.append(f1) 
    
    return sum(all_f1) / len(all_f1)

# Запускаем оптимизацию
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)

[I 2025-05-29 01:15:54,659] A new study created in memory with name: no-name-27f94823-c06e-4b0b-8d3a-43883bea9b1a
[I 2025-05-29 01:16:11,324] Trial 0 finished with value: 0.6930043101822083 and parameters: {'iterations': 113, 'depth': 7, 'learning_rate': 0.12543383180747675, 'subsample': 0.7843747546781813, 'l2_leaf_reg': 0.0033441599746892015, 'random_strength': 1.6358464561866126, 'bagging_temperature': 0.2591995081201057, 'border_count': 60}. Best is trial 0 with value: 0.6930043101822083.
[I 2025-05-29 01:24:11,753] Trial 1 finished with value: 0.6943158985012792 and parameters: {'iterations': 911, 'depth': 9, 'learning_rate': 0.1981960343112285, 'subsample': 0.8527034680478586, 'l2_leaf_reg': 1.1352190683103394, 'random_strength': 0.878085383692494, 'bagging_temperature': 0.5597684286479548, 'border_count': 146}. Best is trial 1 with value: 0.6943158985012792.
[I 2025-05-29 01:44:06,845] Trial 2 finished with value: 0.7058610391012892 and parameters: {'iterations': 868, 'depth': 1

In [4]:
params = {'iterations': 916,
 'depth': 9,
 'learning_rate': 0.08417417879301282,
 'subsample': 0.6300537145115088,
 'l2_leaf_reg': 4.917199591062558,
 'random_strength': 5.3579819031797005,
 'bagging_temperature': 0.03127270504975477,
 'border_count': 237}

Оценка качества на валидационной выборке:

In [11]:
all_f1 = []
all_roc_auc = []
all_accuracy = []
for i, (train_ind, val_ind) in enumerate(skf.split(X.values, y)):
    X_train, X_val = X.iloc[train_ind, :], X.iloc[val_ind, :]
    y_train, y_val = y.iloc[train_ind], y.iloc[val_ind]
    model = CatBoostClassifier(**params, verbose=False)
    model.fit(X_train, y_train)
    y_pred = model.predict_proba(X_val)[:, 1]
    f1 = f1_score(y_pred.round(), y_val)
    roc_auc = roc_auc_score(y_val, y_pred)
    accuracy = accuracy_score(y_pred.round(), y_val)
    all_f1.append(f1)
    all_roc_auc.append(roc_auc)
    all_accuracy.append(accuracy)

print(f'Accuracy: {sum(all_accuracy) / len(all_accuracy)}')
print(f'F1 score: {sum(all_f1) / len(all_f1)}')
print(f'Roc auc: {sum(all_roc_auc) / len(all_roc_auc)}')

Accuracy: 0.7081400826349495
F1 score: 0.7067734217155921
Roc auc: 0.7823032131966091


Оценка качества на тестовой:

In [5]:
cat_clf = CatBoostClassifier(**params, random_state=42, verbose=0)
cat_clf.fit(X, y)
cat_preds = cat_clf.predict_proba(X_test)[:, 1]
print(f'Accuracy: {accuracy_score(cat_preds.round(), y_test)}')
print(f'F1 score: {f1_score(cat_preds.round(), y_test)}')
print(f'Roc auc: {roc_auc_score(y_test, cat_preds)}')

Accuracy: 0.7262979683972912
F1 score: 0.7262464722483537
Roc auc: 0.8006491811336547
