In [4]:
import pandas as pd
import numpy as np
from pathlib import Path

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import IsolationForest

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, average_precision_score, make_scorer

# Tunable experiment settings.
RANDOM_STATE = 42
N_SPLITS = 5

# Paths are resolved relative to the parent of the current working directory
# (presumably the notebook lives one level below the project root — confirm).
PROJECT_ROOT = Path.cwd().parent
DATA_PATH = PROJECT_ROOT.joinpath("data")

# Features are the whole frame from the first file; labels are the 'Class'
# column of the second file.
features_file = DATA_PATH.joinpath('X_train.feather')
labels_file = DATA_PATH.joinpath('y_train.feather')
X_train = pd.read_feather(features_file)
y_train = pd.read_feather(labels_file)['Class']
print(f"X_train Shape: {X_train.shape}")
print(f"y_train Shape: {y_train.shape}")

# Shuffled, seeded stratified splitter used to generate the CV folds.
skf = StratifiedKFold(
    n_splits=N_SPLITS,
    shuffle=True,
    random_state=RANDOM_STATE,
)
print(f"KFold Splits: {N_SPLITS}")

X_train Shape: (227845, 29)
y_train Shape: (227845,)
KFold Splits: 5


In [7]:

# Class-weighted logistic regression baseline. The same estimator object is
# fitted again on each fold's training partition.
lr_model = LogisticRegression(
    class_weight='balanced',
    max_iter=1000,
    random_state=RANDOM_STATE,
)

# Per-fold metrics, appended in fold order.
pr_auc_scores_lr, roc_auc_scores_lr = [], []

print("Cross-validation for Logistic Regression:")

for fold, (train_idx, val_idx) in enumerate(skf.split(X_train, y_train), start=1):
    # Positional slices for this fold's training and validation partitions.
    X_train_fold = X_train.iloc[train_idx]
    y_train_fold = y_train.iloc[train_idx]
    X_val_fold = X_train.iloc[val_idx]
    y_val_fold = y_train.iloc[val_idx]

    lr_model.fit(X_train_fold, y_train_fold)

    # Second column of predict_proba: probability of the second entry in
    # lr_model.classes_ (the positive class, assuming 0/1 labels — confirm).
    y_pred_proba = lr_model.predict_proba(X_val_fold)[:, 1]

    pr_auc = average_precision_score(y_val_fold, y_pred_proba)
    pr_auc_scores_lr.append(pr_auc)

    roc_auc = roc_auc_score(y_val_fold, y_pred_proba)
    roc_auc_scores_lr.append(roc_auc)

    print(f"Fold {fold}: PR-AUC = {pr_auc:.4f}, ROC-AUC = {roc_auc:.4f}")

# Mean and (population, ddof=0) standard deviation across the folds.
print("\nResults:")
print(f"Average PR-AUC: {np.mean(pr_auc_scores_lr):.4f} ± {np.std(pr_auc_scores_lr):.4f}")
print(f"Average ROC-AUC: {np.mean(roc_auc_scores_lr):.4f} ± {np.std(roc_auc_scores_lr):.4f}")

Cross-validation for Logistic Regression:
Fold 1: PR-AUC = 0.7958, ROC-AUC = 0.9922
Fold 2: PR-AUC = 0.7392, ROC-AUC = 0.9804
Fold 3: PR-AUC = 0.6670, ROC-AUC = 0.9684
Fold 4: PR-AUC = 0.8085, ROC-AUC = 0.9954
Fold 5: PR-AUC = 0.7232, ROC-AUC = 0.9777

Results:
Average PR-AUC: 0.7468 ± 0.0514
Average ROC-AUC: 0.9828 ± 0.0099


In [9]:
# Unsupervised anomaly-detection baseline, evaluated with the same folds and
# metrics as the supervised model.
# NOTE: the metrics below are computed from score_samples(), which does not
# depend on `contamination`; that setting only shifts the threshold used by
# predict()/decision_function(), so tuning it will not change these numbers.
if_model = IsolationForest(n_estimators=100, contamination=0.01, random_state=RANDOM_STATE, n_jobs=-1)

# Per-fold metrics, appended in fold order.
pr_auc_scores_if = []
roc_auc_scores_if = []

print("\nCross-validation for Isolation Forest:")

for fold, (train_idx, val_idx) in enumerate(skf.split(X_train, y_train), 1):
    X_train_fold, X_val_fold = X_train.iloc[train_idx], X_train.iloc[val_idx]
    # The forest is fitted on features only, so the training labels are not
    # sliced; only the validation labels are needed, for scoring.
    y_val_fold = y_train.iloc[val_idx]

    if_model.fit(X_train_fold)

    # score_samples() is lower for more abnormal points; negate it so that
    # larger values mean "more anomalous", matching the positive class.
    y_pred_scores = if_model.score_samples(X_val_fold)
    y_pred_scores_inverted = -y_pred_scores

    pr_auc = average_precision_score(y_val_fold, y_pred_scores_inverted)
    roc_auc = roc_auc_score(y_val_fold, y_pred_scores_inverted)

    pr_auc_scores_if.append(pr_auc)
    roc_auc_scores_if.append(roc_auc)

    print(f"Fold {fold}: PR-AUC = {pr_auc:.4f}, ROC-AUC = {roc_auc:.4f}")

# Mean and (population, ddof=0) standard deviation across the folds.
print("\nResults:")
print(f"Average PR-AUC: {np.mean(pr_auc_scores_if):.4f} ± {np.std(pr_auc_scores_if):.4f}")
print(f"Average ROC-AUC: {np.mean(roc_auc_scores_if):.4f} ± {np.std(roc_auc_scores_if):.4f}")


Cross-validation for Isolation Forest:
Fold 1: PR-AUC = 0.1477, ROC-AUC = 0.9640
Fold 2: PR-AUC = 0.2332, ROC-AUC = 0.9336
Fold 3: PR-AUC = 0.1643, ROC-AUC = 0.9399
Fold 4: PR-AUC = 0.1700, ROC-AUC = 0.9722
Fold 5: PR-AUC = 0.1078, ROC-AUC = 0.9402

Results:
Average PR-AUC: 0.1646 ± 0.0406
Average ROC-AUC: 0.9500 ± 0.0152
