In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.ensemble import IsolationForest
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, average_precision_score, make_scorer
from sklearn.model_selection import StratifiedKFold

# --- Paths and experiment constants ---
# The data folder is resolved relative to the parent of the current working directory.
PROJECT_ROOT = Path.cwd().parent
DATA_PATH = PROJECT_ROOT.joinpath("data")
RANDOM_STATE, N_SPLITS = 42, 5

# --- Training data ---
# Features come from one feather file; the target is the 'Class' column of another.
X_train = pd.read_feather(DATA_PATH.joinpath('X_train.feather'))
y_train = pd.read_feather(DATA_PATH.joinpath('y_train.feather'))['Class']
print(f"X_train Shape: {X_train.shape}")
print(f"y_train Shape: {y_train.shape}")

# --- Cross-validation scheme ---
# Shuffled, seeded stratified splitter stored in `skf` for the cells that follow.
skf = StratifiedKFold(random_state=RANDOM_STATE, shuffle=True, n_splits=N_SPLITS)
print(f"KFold Splits: {N_SPLITS}")

In [None]:

# Supervised baseline: class-weighted logistic regression, scored fold by fold
# with PR-AUC (average precision) and ROC-AUC.
lr_model = LogisticRegression(
    class_weight='balanced',
    max_iter=1000,
    random_state=RANDOM_STATE,
)

pr_auc_scores_lr, roc_auc_scores_lr = [], []

print("Cross-validation for Logistic Regression:")

for fold, (fit_rows, holdout_rows) in enumerate(skf.split(X_train, y_train), start=1):
    # Fit on the training part of the fold only.
    lr_model.fit(X_train.iloc[fit_rows], y_train.iloc[fit_rows])

    # Score the held-out part using the second column of predict_proba.
    holdout_target = y_train.iloc[holdout_rows]
    holdout_proba = lr_model.predict_proba(X_train.iloc[holdout_rows])[:, 1]

    pr_auc = average_precision_score(holdout_target, holdout_proba)
    roc_auc = roc_auc_score(holdout_target, holdout_proba)
    pr_auc_scores_lr.append(pr_auc)
    roc_auc_scores_lr.append(roc_auc)

    print(f"Fold {fold}: PR-AUC = {pr_auc:.4f}, ROC-AUC = {roc_auc:.4f}")

# Mean and standard deviation of each metric across the folds.
print("\nResults:")
print(f"Average PR-AUC: {np.mean(pr_auc_scores_lr):.4f} ± {np.std(pr_auc_scores_lr):.4f}")
print(f"Average ROC-AUC: {np.mean(roc_auc_scores_lr):.4f} ± {np.std(roc_auc_scores_lr):.4f}")

In [None]:
# Unsupervised baseline: the forest is fitted on features only; the labels are
# used solely to score the held-out part of each fold.
if_model = IsolationForest(
    n_estimators=100,
    contamination=0.01,
    random_state=RANDOM_STATE,
    n_jobs=-1,
)

pr_auc_scores_if, roc_auc_scores_if = [], []

print("\nCross-validation for Isolation Forest:")

for fold, (train_idx, val_idx) in enumerate(skf.split(X_train, y_train), start=1):
    X_val_fold = X_train.iloc[val_idx]
    y_val_fold = y_train.iloc[val_idx]

    if_model.fit(X_train.iloc[train_idx])

    # The raw scores are negated before scoring so that a larger value counts
    # toward the positive class in both metrics below.
    y_pred_scores = if_model.score_samples(X_val_fold)
    y_pred_scores_inverted = np.negative(y_pred_scores)

    pr_auc = average_precision_score(y_val_fold, y_pred_scores_inverted)
    roc_auc = roc_auc_score(y_val_fold, y_pred_scores_inverted)
    pr_auc_scores_if.append(pr_auc)
    roc_auc_scores_if.append(roc_auc)

    print(f"Fold {fold}: PR-AUC = {pr_auc:.4f}, ROC-AUC = {roc_auc:.4f}")

# Mean and standard deviation of each metric across the folds.
print("\nResults:")
print(f"Average PR-AUC: {np.mean(pr_auc_scores_if):.4f} ± {np.std(pr_auc_scores_if):.4f}")
print(f"Average ROC-AUC: {np.mean(roc_auc_scores_if):.4f} ± {np.std(roc_auc_scores_if):.4f}")

In [None]:
# Class weighting: ratio of label-0 rows to label-1 rows in the training target.
# Assumes the 'Class' labels are 0 (negative) and 1 (positive); the lookups
# below raise KeyError if either label is absent.
class_counts = y_train.value_counts()
neg_count = class_counts[0]
pos_count = class_counts[1]
scale_pos_weight_value = neg_count / pos_count

print(f"scale_pos_weight: {scale_pos_weight_value:.2f}")

# CatBoost with library-default hyperparameters, apart from the seed and the
# positive-class weight computed above.
cb_model = CatBoostClassifier(
    random_state=RANDOM_STATE,
    scale_pos_weight=scale_pos_weight_value,
    verbose=False
)

pr_auc_scores_cb = []
roc_auc_scores_cb = []

print("\nCross-validation for CatBoost:")

for fold, (train_idx, val_idx) in enumerate(skf.split(X_train, y_train), 1):
    X_train_fold, X_val_fold = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_train_fold, y_val_fold = y_train.iloc[train_idx], y_train.iloc[val_idx]

    # Fit on the training part of the fold; the held-out part is only scored.
    cb_model.fit(X_train_fold, y_train_fold)
    y_pred_proba = cb_model.predict_proba(X_val_fold)[:, 1]

    pr_auc = average_precision_score(y_val_fold, y_pred_proba)
    roc_auc = roc_auc_score(y_val_fold, y_pred_proba)

    pr_auc_scores_cb.append(pr_auc)
    roc_auc_scores_cb.append(roc_auc)

    print(f"Fold {fold}: PR-AUC = {pr_auc:.4f}, ROC-AUC = {roc_auc:.4f}")

# Print the mean and standard deviation of each metric across the folds.
print("\nResults:")
print(f"Average PR-AUC: {np.mean(pr_auc_scores_cb):.4f} ± {np.std(pr_auc_scores_cb):.4f}")
print(f"Average ROC-AUC: {np.mean(roc_auc_scores_cb):.4f} ± {np.std(roc_auc_scores_cb):.4f}")

In [None]:
# CatBoost with explicit iterations / depth / learning rate and early stopping.
cb_tuned_model = CatBoostClassifier(
    iterations=2000,
    depth=6,
    learning_rate=0.05,
    random_state=RANDOM_STATE,
    scale_pos_weight=scale_pos_weight_value,
    verbose=0
)

# Early stopping must not monitor the fold that is scored afterwards: doing so
# selects the stopping iteration on the evaluation data and makes the reported
# metrics optimistically biased. An inner stratified split of each training
# fold supplies a separate early-stopping set instead, so the outer validation
# fold is used for scoring only.
EARLY_STOPPING_ROUNDS = 50
inner_skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)

pr_auc_scores_cb_tuned = []
roc_auc_scores_cb_tuned = []

print("\nCross-validation for tuned CatBoost:")

for fold, (train_idx, val_idx) in enumerate(skf.split(X_train, y_train), 1):
    X_train_fold, X_val_fold = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_train_fold, y_val_fold = y_train.iloc[train_idx], y_train.iloc[val_idx]

    # Take the first inner split: 1/N_SPLITS of the training fold is held back
    # for early stopping and the remainder is used for fitting. The indices are
    # positional within the training fold, hence .iloc on the fold objects.
    fit_idx, stop_idx = next(inner_skf.split(X_train_fold, y_train_fold))

    cb_tuned_model.fit(
        X_train_fold.iloc[fit_idx], y_train_fold.iloc[fit_idx],
        eval_set=(X_train_fold.iloc[stop_idx], y_train_fold.iloc[stop_idx]),
        early_stopping_rounds=EARLY_STOPPING_ROUNDS,
        verbose=False
    )

    # The outer validation fold is touched here for the first time.
    y_pred_proba = cb_tuned_model.predict_proba(X_val_fold)[:, 1]

    pr_auc = average_precision_score(y_val_fold, y_pred_proba)
    roc_auc = roc_auc_score(y_val_fold, y_pred_proba)

    pr_auc_scores_cb_tuned.append(pr_auc)
    roc_auc_scores_cb_tuned.append(roc_auc)

    print(f"Fold {fold}: PR-AUC = {pr_auc:.4f}, ROC-AUC = {roc_auc:.4f}")

# Mean and standard deviation of each metric across the folds.
print("\nResults:")
print(f"Average PR-AUC: {np.mean(pr_auc_scores_cb_tuned):.4f} ± {np.std(pr_auc_scores_cb_tuned):.4f}")
print(f"Average ROC-AUC: {np.mean(roc_auc_scores_cb_tuned):.4f} ± {np.std(roc_auc_scores_cb_tuned):.4f}")