In [None]:
# ============================================================
#  Car Claim Fraud – Full Reproducible Experimental Pipeline
#  - Preprocessing & feature engineering (as in original code)
#  - Repeated Stratified 5x3 CV
#  - Resampling ONLY on training folds
#  - Metrics: Accuracy, Precision, Recall, F1, ROC-AUC, PR-AUC, FP/TP
#  - Saves all results for tables & figures
# ============================================================

# ---- Colab setup: install the required libs (uncomment the pin below to match scikit-learn) ----
!pip install -q scikeras catboost xgboost lightgbm imbalanced-learn
# !pip install -q "scikit-learn==1.2.2"

# ============================================================
#  Imports
# ============================================================
import os
import random
import warnings
import time

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, precision_recall_curve, average_precision_score,
    confusion_matrix
)
from sklearn.base import clone

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

from imblearn.over_sampling import SMOTE, ADASYN, BorderlineSMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler, TomekLinks, ClusterCentroids, NearMiss
from imblearn.combine import SMOTETomek, SMOTEENN

from tensorflow import keras
from tensorflow.keras import layers
import tensorflow as tf
from scikeras.wrappers import KerasClassifier

from scipy.stats import wilcoxon

warnings.filterwarnings("ignore")

# ============================================================
#  Reproducibility: seed every random number generator in use
# ============================================================
RANDOM_STATE = 42

# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
# setting it here presumably only affects child processes - confirm.
os.environ["PYTHONHASHSEED"] = f"{RANDOM_STATE}"

# Python, NumPy and TensorFlow each keep their own generator; seed them alike.
for _seed_rng in (random.seed, np.random.seed, tf.random.set_seed):
    _seed_rng(RANDOM_STATE)

# ============================================================
#  Load Dataset
# ============================================================
# The workbook 'carclaimtxt.xlsx' must be in the working directory.
data = pd.read_excel("carclaimtxt.xlsx")
print("Original data shape:", data.shape)
print(data.head())

# ============================================================
#  Basic Cleaning
# ============================================================

# Rows holding one of these placeholder values are treated as invalid.
placeholder_by_column = {
    "MonthClaimed": "0",
    "DayOfWeekClaimed": "0",
    "Days:Policy-Accident": "none",
    "Days:Policy-Claim": "none",
}
for column_name, placeholder in placeholder_by_column.items():
    is_placeholder = (data[column_name] == placeholder).to_numpy()
    data.drop(index=data.index[is_placeholder], inplace=True)

# Identifier columns are not used as features.
data.drop(columns=["PolicyNumber", "RepNumber"], inplace=True)

print("After basic cleaning:", data.shape)

# ============================================================
#  Frequency-based filtering of rare categories
# ============================================================
features_cat = [
    name
    for name, dtype in data.dtypes.items()
    if dtype == "object" and name != "FraudFound"
]

df = data.copy(deep=True)

# Category shares are always measured on the cleaned `data`, while the rows
# are removed from the working copy `df`.
n_clean_rows = len(data)
for feature in features_cat:
    share_pct = (data[feature].value_counts() / n_clean_rows * 100).round(2)
    rare_levels = share_pct.index[(share_pct <= 7).to_numpy()]
    keep_row = ~df[feature].isin(rare_levels)
    df = df[keep_row]

# A categorical column left with at most one level carries no information.
single_level_features = [
    feature
    for feature in features_cat
    if feature in df.columns and df[feature].nunique() <= 1
]
df.drop(columns=single_level_features, inplace=True)

print("After rare-category filtering:", df.shape)

# ============================================================
#  Feature Engineering (as in original code)
# ============================================================

def feature_engineering(df_in: pd.DataFrame) -> None:
    """
    Encode the car-claim features of ``df_in`` in place.

    Steps, in order:
      1. Label-encode every column with exactly two distinct values
         (this also encodes FraudFound if it is binary).
      2. Replace the temporal categorical columns by a fraud-weighted
         frequency encoding: ``(15 * share_fraud + share_non_fraud) / 2``,
         where both shares are relative to the total number of rows.
      3. Map selected ordinal string columns to integer codes.

    Columns that are absent from ``df_in`` are skipped. Nothing is returned;
    the frame passed in is modified.

    NOTE(review): step 2 derives the encoding from FraudFound over every row
    passed in. When this runs on the full dataset before the cross-validation
    split, labels of the later test folds influence the features - confirm
    this is acceptable for the reported results.
    """

    # 1) Label encode all binary features (nunique == 2).
    #    A fresh encoder per column avoids carrying fitted state between columns.
    binary_features = [col for col in df_in.columns if df_in[col].nunique() == 2]
    for feature in binary_features:
        df_in[feature] = LabelEncoder().fit_transform(df_in[feature])

    # 2) Frequency encoding for temporal categorical vars
    months_weeks = [
        "WeekOfMonth", "Month", "DayOfWeek", "DayOfWeekClaimed",
        "MonthClaimed", "WeekOfMonthClaimed"
    ]
    n_rows = len(df_in)
    fraud_rows = df_in["FraudFound"] == 1
    legit_rows = df_in["FraudFound"] == 0
    for column in months_weeks:
        if column not in df_in.columns:
            continue
        freq_1 = df_in[fraud_rows].groupby(column).size() / n_rows
        freq_0 = df_in[legit_rows].groupby(column).size() / n_rows
        # A category seen in only one class is missing from the other series;
        # fill_value=0 counts it as zero there instead of producing NaN.
        freq_encoding = (15 * freq_1).add(freq_0, fill_value=0) / 2
        df_in[column] = df_in[column].map(freq_encoding)

    # 3) Ordinal encodings for specific features.
    #    Values absent from a mapping become NaN, except for VehiclePrice,
    #    where they are set to -1.

    # AgeOfPolicyHolder
    if "AgeOfPolicyHolder" in df_in.columns:
        age_mapping = {
            "31 to 35": 0,
            "36 to 40": 1,
            "41 to 50": 2,
            "51 to 65": 3,
        }
        df_in["AgeOfPolicyHolder"] = df_in["AgeOfPolicyHolder"].map(age_mapping)

    # VehiclePrice
    if "VehiclePrice" in df_in.columns:
        encoding_mp = {
            "20,000 to 29,000": 1,
            "more than 69,000": 3,
            "30,000 to 39,000": 2,
            "less than 20,000": 0,
        }
        df_in["VehiclePrice"] = df_in["VehiclePrice"].map(encoding_mp).fillna(-1)

    # AgeOfVehicle
    if "AgeOfVehicle" in df_in.columns:
        age_mapping_vehicle = {
            "7 years": 2,
            "6 years": 1,
            "more than 7": 3,
            "5 years": 0,
        }
        df_in["AgeOfVehicle"] = df_in["AgeOfVehicle"].map(age_mapping_vehicle)

    # NumberOfSuppliments
    if "NumberOfSuppliments" in df_in.columns:
        suppliments_mapping = {
            "none": 0,
            "more than 5": 3,
            "1 to 2": 1,
            "3 to 5": 2,
        }
        df_in["NumberOfSuppliments"] = df_in["NumberOfSuppliments"].map(
            suppliments_mapping
        )

    # PastNumberOfClaims
    if "PastNumberOfClaims" in df_in.columns:
        claim_mapping = {
            "2 to 4": 2,
            "none": 0,
            "1": 1,
            "more than 4": 3,
        }
        df_in["PastNumberOfClaims"] = df_in["PastNumberOfClaims"].map(claim_mapping)


# Encode the filtered frame in place and show the outcome.
feature_engineering(df)
print("After feature engineering:", df.shape)
print(df.head())

# Age is dropped as highly correlated; a missing column is simply ignored.
df.drop(columns=["Age"], errors="ignore", inplace=True)

# ============================================================
#  Final dataset: drop some columns + one-hot
# ============================================================

def get_final_data(df_in: pd.DataFrame) -> pd.DataFrame:
    """
    Return the modelling frame derived from ``df_in`` (which is not modified).

    Removes the columns that are not used as features and one-hot encodes
    PolicyType and BasePolicy (first level dropped). Columns that are not
    present are ignored.
    """
    unused_columns = (
        "WeekOfMonth",
        "DayOfWeekClaimed",
        "DriverRating",
        "Deductible",
        "Sex",
        "AccidentArea",
        "VehiclePrice",
        "MaritalStatus",
        "AgeOfVehicle",
        "Make",
    )
    one_hot_columns = ("PolicyType", "BasePolicy")

    present_unused = [name for name in unused_columns if name in df_in.columns]
    trimmed = df_in.drop(columns=present_unused)

    to_encode = [name for name in one_hot_columns if name in trimmed.columns]
    if not to_encode:
        return trimmed

    # One call keeps the same column order as encoding the columns one by one:
    # untouched columns first, then the dummies in the order listed.
    return pd.get_dummies(trimmed, columns=to_encode, drop_first=True)


# Build the final frame and renumber its rows from 0.
df = get_final_data(df).reset_index(drop=True)

print("Final dataset shape:", df.shape)
print(df.head())

# ============================================================
#  Features and Labels
# ============================================================
assert "FraudFound" in df.columns, "FraudFound column is missing!"

y = df["FraudFound"]
X = df.drop(columns=["FraudFound"])

print("Label distribution:", y.value_counts(normalize=True))

# Printed so the label coding can be checked by eye
print("Unique labels in y:", y.unique())

# Number of input features; read by build_ann for the input layer shape.
INPUT_DIM = X.shape[1]
print("Number of features:", INPUT_DIM)

# ============================================================
#  Define Models
# ============================================================

def build_ann():
    """
    Build and compile the feed-forward network used for the "ANN" model.

    The input width is taken from the module-level INPUT_DIM. Three ReLU
    layers (64, 32, 16 units; dropout 0.3 after the first two) feed a single
    sigmoid unit. Compiled with Adam and binary cross-entropy.
    """
    # (units, dropout rate applied after the layer, or None for no dropout)
    hidden_spec = ((64, 0.3), (32, 0.3), (16, None))

    stack = [layers.Input(shape=(INPUT_DIM,))]
    for units, dropout_rate in hidden_spec:
        stack.append(layers.Dense(units, activation="relu"))
        if dropout_rate is not None:
            stack.append(layers.Dropout(dropout_rate))
    stack.append(layers.Dense(1, activation="sigmoid"))

    network = keras.Sequential(stack)
    network.compile(
        optimizer="adam",
        loss="binary_crossentropy",
        metrics=["accuracy"],
    )
    return network


# SciKeras wrapper around build_ann, so the network can be cloned and fitted
# like the other estimators.
ann_settings = {
    "epochs": 20,
    "batch_size": 32,
    "verbose": 0,
    "random_state": RANDOM_STATE,
}
ann_model = KerasClassifier(model=build_ann, **ann_settings)

def get_base_models():
    """
    Return the ``name -> estimator`` mapping of all models under comparison.

    The "ANN" entry is the shared module-level ``ann_model``; every other
    estimator is created anew on each call, seeded with RANDOM_STATE.
    """
    seed = RANDOM_STATE
    return {
        "ANN": ann_model,
        "XGBoost": XGBClassifier(
            random_state=seed,
            n_estimators=200,
            learning_rate=0.1,
            max_depth=4,
            subsample=0.8,
            colsample_bytree=0.8,
            eval_metric="logloss",
            n_jobs=-1,
        ),
        "CatBoost": CatBoostClassifier(
            random_state=seed,
            verbose=0,
            depth=6,
            learning_rate=0.1,
            iterations=300,
            loss_function="Logloss",
        ),
        "LightGBM": LGBMClassifier(
            random_state=seed,
            n_estimators=300,
            learning_rate=0.05,
            num_leaves=31,
            n_jobs=-1,
        ),
        "RandomForest": RandomForestClassifier(
            random_state=seed,
            n_estimators=300,
            n_jobs=-1,
        ),
        "AdaBoost": AdaBoostClassifier(
            random_state=seed,
            n_estimators=300,
            learning_rate=0.1,
        ),
    }


models = get_base_models()
print("Models:", list(models))

# ============================================================
#  Sampling Strategies (with explicit target ratio)
# ============================================================

SAMPLING_STRATEGY = 1.0  # approx 1:1 minority:majority on training folds

# Keyword sets shared by the samplers below.
_ratio_only = {"sampling_strategy": SAMPLING_STRATEGY}
_seeded_ratio = {"random_state": RANDOM_STATE, **_ratio_only}

samplers = {
    # The two None entries are not resamplers: the baseline, and the
    # cost-sensitive variant handled via class_weight / sample_weight.
    "No Sampling": None,
    "Cost Sensitive": None,
    "RandomOverSampler": RandomOverSampler(**_seeded_ratio),
    "SMOTE": SMOTE(k_neighbors=5, **_seeded_ratio),
    "ADASYN": ADASYN(**_seeded_ratio),
    "BorderlineSMOTE": BorderlineSMOTE(kind="borderline-1", **_seeded_ratio),
    "RandomUnderSampler": RandomUnderSampler(**_seeded_ratio),
    "NearMiss": NearMiss(version=1, **_ratio_only),
    "TomekLinks": TomekLinks(),
    "ClusterCentroids": ClusterCentroids(**_seeded_ratio),
    "SMOTETomek": SMOTETomek(**_seeded_ratio),
    "SMOTEENN": SMOTEENN(**_seeded_ratio),
}

print("Samplers:", list(samplers))

# ============================================================
#  Core Evaluation Function (Repeated Stratified CV)
# ============================================================

def evaluate_model(
    model_name,
    base_model,
    sampler_name,
    sampler,
    X_df,
    y_series,
    n_splits=5,
    n_repeats=3,
    random_state=RANDOM_STATE,
    cost_sensitive_weights=None,
):
    """
    Evaluate one (model, sampler) pair with repeated stratified CV.

    Within every fold:
      - features are min-max scaled, with the scaler fitted on the training part
      - resampling is applied ONLY to the training part
      - metrics treat label 1 as the positive class, at a fixed 0.5 threshold
      - for sampler_name == "Cost Sensitive" no resampling is done; class
        weights are passed to the model instead

    Parameters
    ----------
    model_name : str
        Key of the model; also selects how cost-sensitive weights are passed.
    base_model : estimator
        Unfitted estimator; a clone is fitted in every fold.
    sampler_name : str
        Label of the sampling strategy ("Cost Sensitive" is special-cased).
    sampler : object or None
        Object providing ``fit_resample``; None means no resampling.
    X_df, y_series : pandas objects
        Features and labels, indexed positionally with ``.iloc``.
    n_splits, n_repeats, random_state
        Passed to RepeatedStratifiedKFold.
    cost_sensitive_weights : dict or None
        ``{label: weight}`` for the cost-sensitive case.
        None means ``{0: 1.0, 1: 18.0}``.

    Returns
    -------
    (metrics_df, cms, pr_curves)
        metrics_df has one row per fold. cms is a list of dicts holding the
        2x2 confusion matrix per fold. pr_curves holds the precision-recall
        curve of the first fold only.
    """
    # Resolved here instead of using a mutable dict as the default argument.
    if cost_sensitive_weights is None:
        cost_sensitive_weights = {0: 1.0, 1: 18.0}

    cv = RepeatedStratifiedKFold(
        n_splits=n_splits, n_repeats=n_repeats, random_state=random_state
    )

    rows = []
    cms = []
    pr_curves = []

    for fold_idx, (train_idx, test_idx) in enumerate(
        cv.split(X_df, y_series), start=1
    ):
        X_train, X_test = X_df.iloc[train_idx], X_df.iloc[test_idx]
        y_train, y_test = y_series.iloc[train_idx], y_series.iloc[test_idx]

        # Scale within fold (statistics come from the training part only)
        scaler = MinMaxScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        # Resample training data only (except cost-sensitive case)
        X_train_res, y_train_res = X_train_scaled, y_train
        if sampler is not None and sampler_name != "Cost Sensitive":
            X_train_res, y_train_res = sampler.fit_resample(
                X_train_scaled, y_train
            )

        clf = clone(base_model)
        fit_kwargs = {}

        # -------- Cost-sensitive handling --------
        # Any other model name is fitted without weighting.
        if sampler_name == "Cost Sensitive":
            # Models that support class_weight directly
            if model_name in ["RandomForest", "LightGBM"]:
                clf.set_params(class_weight=cost_sensitive_weights)

            # Models that use sample_weight in fit (boosting)
            elif model_name in ["XGBoost", "CatBoost", "AdaBoost"]:
                sample_weight = np.where(
                    y_train_res == 1,
                    cost_sensitive_weights[1],
                    cost_sensitive_weights[0],
                )
                fit_kwargs["sample_weight"] = sample_weight

            # ANN via SciKeras: pass class_weight to Keras fit
            elif model_name == "ANN":
                fit_kwargs["class_weight"] = cost_sensitive_weights
        # -----------------------------------------

        start = time.perf_counter()
        clf.fit(X_train_res, y_train_res, **fit_kwargs)
        train_time = time.perf_counter() - start

        # Get scores for positive class = 1
        if hasattr(clf, "predict_proba"):
            y_proba_raw = clf.predict_proba(X_test_scaled)
            if y_proba_raw.ndim == 2 and y_proba_raw.shape[1] == 2:
                y_proba = y_proba_raw[:, 1]
            else:
                y_proba = y_proba_raw.ravel()
        else:
            # No probabilities available: min-max rescale the decision scores
            # to [0, 1]; the epsilon guards against a zero range.
            scores = clf.decision_function(X_test_scaled)
            y_proba = (scores - scores.min()) / (
                scores.max() - scores.min() + 1e-8
            )

        # Fixed threshold = 0.5
        y_pred = (y_proba >= 0.5).astype(int)

        cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
        tn, fp, fn, tp = cm.ravel()

        acc = accuracy_score(y_test, y_pred)
        prec = precision_score(y_test, y_pred, zero_division=0)
        rec = recall_score(y_test, y_pred, zero_division=0)
        f1 = f1_score(y_test, y_pred, zero_division=0)
        roc = roc_auc_score(y_test, y_proba)
        pr_auc = average_precision_score(y_test, y_proba)
        # With no true positives the ratio is undefined. NaN (instead of inf)
        # keeps mean/std aggregations over folds finite.
        fp_per_tp = fp / tp if tp > 0 else np.nan

        rows.append(
            {
                "model": model_name,
                "sampler": sampler_name,
                "fold": fold_idx,
                "accuracy": acc,
                "precision": prec,
                "recall": rec,
                "f1": f1,
                "roc_auc": roc,
                "pr_auc": pr_auc,
                "fp_per_tp": fp_per_tp,
                "train_time_sec": train_time,
                "tn": tn,
                "fp": fp,
                "fn": fn,
                "tp": tp,
            }
        )

        cms.append(
            {
                "model": model_name,
                "sampler": sampler_name,
                "fold": fold_idx,
                "cm": cm,
            }
        )

        # Store PR curve only for the first fold (for plotting)
        if fold_idx == 1:
            pr_prec, pr_rec, _ = precision_recall_curve(y_test, y_proba)
            pr_curves.append(
                {
                    "model": model_name,
                    "sampler": sampler_name,
                    "precision": pr_prec,
                    "recall": pr_rec,
                }
            )

    metrics_df = pd.DataFrame(rows)
    return metrics_df, cms, pr_curves


# ============================================================
#  Run every Model × Sampler combination
# ============================================================

all_metrics_list, all_cms_list, all_pr_curves_list = [], [], []

for model_name in models:
    base_model = models[model_name]
    for sampler_name in samplers:
        sampler = samplers[sampler_name]
        print(f"Running {model_name} with {sampler_name}...")
        metrics_df, cms, pr_curves = evaluate_model(
            model_name,
            base_model,
            sampler_name,
            sampler,
            X,
            y,
            n_splits=5,
            n_repeats=3,
            random_state=RANDOM_STATE,
        )
        # One metrics frame per combination; fold-level lists are flattened.
        all_metrics_list += [metrics_df]
        all_cms_list += cms
        all_pr_curves_list += pr_curves

all_metrics_df = pd.concat(all_metrics_list, ignore_index=True)
print("All metrics per fold shape:", all_metrics_df.shape)

# Persist the per-fold metrics
all_metrics_df.to_csv("all_metrics_per_fold.csv", index=False)

# ============================================================
#  Summary Table: mean ± std for each (model, sampler)
# ============================================================

# Output-column prefix -> per-fold column it is computed from.
metric_sources = {
    "accuracy": "accuracy",
    "precision": "precision",
    "recall": "recall",
    "f1": "f1",
    "roc_auc": "roc_auc",
    "pr_auc": "pr_auc",
    "fp_per_tp": "fp_per_tp",
    "train_time": "train_time_sec",
}

# Named aggregations "<prefix>_mean" and "<prefix>_std", in that order.
named_aggs = {
    f"{prefix}_{stat}": (column, stat)
    for prefix, column in metric_sources.items()
    for stat in ("mean", "std")
}

grouped = all_metrics_df.groupby(["model", "sampler"])
summary = grouped.agg(**named_aggs).reset_index()

summary.to_excel("summary_metrics_mean_std.xlsx", index=False)
print("Summary table saved to summary_metrics_mean_std.xlsx")
print(summary.head())

# ============================================================
#  Confusion Matrices: numeric form + optional plots
# ============================================================

# Flatten every 2x2 matrix into its four counts (ravel order: tn, fp, fn, tp).
cm_records = [
    {
        "model": entry["model"],
        "sampler": entry["sampler"],
        "fold": entry["fold"],
        **dict(zip(("tn", "fp", "fn", "tp"), entry["cm"].ravel())),
    }
    for entry in all_cms_list
]

cm_df = pd.DataFrame(cm_records)
cm_df.to_csv("confusion_matrices_all_folds.csv", index=False)
print("Confusion matrices saved to confusion_matrices_all_folds.csv")

def plot_confusion_matrix(cm, model_name, sampler_name, fold):
    """
    Draw ``cm`` as an annotated heatmap and save it as a PNG.

    The file is written to the working directory as
    ``cm_<model>_<sampler>_fold<fold>.png`` at 300 dpi.
    Returns the file name.
    """
    fig, ax = plt.subplots(figsize=(4, 4))
    sns.heatmap(
        cm,
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=["Pred 0", "Pred 1"],
        yticklabels=["True 0", "True 1"],
        ax=ax,
    )
    ax.set_title(f"Confusion Matrix: {model_name} + {sampler_name} (fold {fold})")
    ax.set_ylabel("True label")
    ax.set_xlabel("Predicted label")
    fig.tight_layout()

    fname = f"cm_{model_name}_{sampler_name}_fold{fold}.png"
    fig.savefig(fname, dpi=300)
    plt.close(fig)  # release the figure
    return fname

# Example: confusion-matrix plots for fold 1 of a few key combinations
example_cm_samplers = ("No Sampling", "SMOTEENN")
example_cm_models = ("ANN", "RandomForest", "XGBoost")

for entry in all_cms_list:
    if entry["fold"] != 1:
        continue
    if entry["sampler"] not in example_cm_samplers:
        continue
    if entry["model"] not in example_cm_models:
        continue
    plot_confusion_matrix(
        entry["cm"], entry["model"], entry["sampler"], entry["fold"]
    )

# ============================================================
#  Precision–Recall Curves
# ============================================================

def plot_pr_curves(pr_curves, model_name, samplers_to_plot):
    """
    Plot the stored precision-recall curves of one model, one line per sampler.

    Only entries of ``pr_curves`` whose model equals ``model_name`` and whose
    sampler is in ``samplers_to_plot`` are drawn. The figure is saved as
    ``pr_curves_<model>.png`` at 300 dpi. Returns the file name.
    """
    selected = [
        curve
        for curve in pr_curves
        if curve["model"] == model_name and curve["sampler"] in samplers_to_plot
    ]

    fig, ax = plt.subplots(figsize=(6, 5))
    for curve in selected:
        ax.plot(curve["recall"], curve["precision"], label=curve["sampler"])
    ax.set_xlabel("Recall")
    ax.set_ylabel("Precision")
    ax.set_title(f"Precision–Recall curves ({model_name})")
    ax.legend()
    ax.grid(True)
    fig.tight_layout()

    fname = f"pr_curves_{model_name}.png"
    fig.savefig(fname, dpi=300)
    plt.close(fig)
    return fname

# Example: PR curves of the ANN for a handful of samplers
ann_pr_samplers = ["No Sampling", "RandomOverSampler", "SMOTEENN"]
plot_pr_curves(all_pr_curves_list, "ANN", ann_pr_samplers)

# ============================================================
#  Feature Importances for best tree-based models (example)
# ============================================================

feature_importances_records = []
best_sampler_for_importance = "SMOTEENN"  # you can change this later

importance_models = [
    name
    for name in ["RandomForest", "XGBoost", "LightGBM", "CatBoost"]
    if name in models
]

if importance_models:
    # Scaling and resampling do not depend on the model, so they are done once
    # instead of once per model.
    sampler = samplers[best_sampler_for_importance]

    scaler = MinMaxScaler()
    X_scaled = scaler.fit_transform(X)
    if sampler is None:
        # "No Sampling" / "Cost Sensitive" have no resampler object.
        X_res, y_res = X_scaled, y
    else:
        X_res, y_res = sampler.fit_resample(X_scaled, y)

for model_name in importance_models:
    base_model = models[model_name]
    clf = clone(base_model)
    # Fitted on the full (resampled) data: these importances are descriptive,
    # not part of the cross-validated evaluation.
    clf.fit(X_res, y_res)

    if hasattr(clf, "feature_importances_"):
        imps = clf.feature_importances_
        tmp_df = pd.DataFrame(
            {
                "feature": X.columns,
                "importance": imps,
                "model": model_name,
                "sampler": best_sampler_for_importance,
            }
        )
        feature_importances_records.append(tmp_df)

if feature_importances_records:
    feature_importances_df = pd.concat(feature_importances_records, ignore_index=True)
    feature_importances_df.to_csv(
        "feature_importances_best_models.csv", index=False
    )
    print("Feature importances saved to feature_importances_best_models.csv")

# ============================================================
#  Example: Wilcoxon tests for significance
# ============================================================

def wilcoxon_test(all_metrics_df, model_name, sampler_a, sampler_b, metric="f1"):
    """
    Wilcoxon signed-rank test comparing two samplers for one model.

    The per-fold values of ``metric`` for ``sampler_a`` and ``sampler_b`` are
    paired by their "fold" value, so the result does not depend on the row
    order of ``all_metrics_df``. If the frame has no "fold" column, the rows
    are paired by position.

    Parameters
    ----------
    all_metrics_df : pandas.DataFrame
        Per-fold metrics with at least "model", "sampler" and ``metric`` columns.
    model_name, sampler_a, sampler_b : str
        Model and the two samplers to compare.
    metric : str
        Name of the metric column to test.

    Returns
    -------
    (stat, p) : the test statistic and p-value from ``scipy.stats.wilcoxon``.

    Raises
    ------
    ValueError
        If either sampler has no rows for the model, or the two samplers do
        not cover the same folds.
    """

    def _scores(sampler_name):
        # Values of `metric` for one sampler, indexed by the pairing key.
        picked = all_metrics_df[
            (all_metrics_df["model"] == model_name)
            & (all_metrics_df["sampler"] == sampler_name)
        ]
        if picked.empty:
            raise ValueError(
                f"No rows for model={model_name!r}, sampler={sampler_name!r}"
            )
        if "fold" in picked.columns and metric != "fold":
            return picked.set_index("fold")[metric].sort_index()
        return picked[metric].reset_index(drop=True)

    a = _scores(sampler_a)
    b = _scores(sampler_b)
    if not a.index.equals(b.index):
        raise ValueError(
            f"{sampler_a!r} and {sampler_b!r} do not cover the same folds "
            f"for model {model_name!r}"
        )

    # Plain arrays: the pairing is already fixed by the sorted fold index.
    stat, p = wilcoxon(
        a.to_numpy(), b.to_numpy(), zero_method="wilcox", correction=False
    )
    print(
        f"Wilcoxon ({metric}) for {model_name}: {sampler_a} vs {sampler_b} -> p = {p:.4g}"
    )
    return stat, p

# Example comparisons: SMOTEENN against the unsampled baseline
# (edit the list to add or remove comparisons)
example_comparisons = [
    ("ANN", "f1"),
    ("RandomForest", "f1"),
    ("XGBoost", "pr_auc"),
]
for cmp_model, cmp_metric in example_comparisons:
    wilcoxon_test(
        all_metrics_df, cmp_model, "SMOTEENN", "No Sampling", metric=cmp_metric
    )

print("DONE. All result files saved in the working directory.")


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25h

FileNotFoundError: [Errno 2] No such file or directory: 'carclaimtxt.xlsx'

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Summary table written by the experiment cell.
df = pd.read_excel("summary_metrics_mean_std.xlsx")

metric_cols = ["accuracy_mean", "precision_mean", "recall_mean",
               "f1_mean", "roc_auc_mean"]

# Values still stored as fractions (max recall <= 1) are converted to percent.
if df["recall_mean"].max() <= 1.0:
    df[metric_cols] = df[metric_cols].mul(100.0)

# Order of the samplers along the x-axis.
sampler_order = [
    "No Sampling",
    "Cost Sensitive",
    "RandomOverSampler",
    "SMOTE",
    "ADASYN",
    "BorderlineSMOTE",
    "RandomUnderSampler",
    "NearMiss",
    "TomekLinks",
    "ClusterCentroids",
    "SMOTETomek",
    "SMOTEENN",
]

# Order of the bars within each group; the labels are the legend names.
model_order = ["ANN", "XGBoost", "CatBoost", "LightGBM", "RandomForest", "AdaBoost"]
model_labels = [
    "Random Forest" if name == "RandomForest" else name for name in model_order
]

df = df.assign(
    sampler=pd.Categorical(df["sampler"], categories=sampler_order, ordered=True)
).sort_values(["sampler", "model"])

def plot_metric(metric_col, ylabel, filename):
    """
    Save a grouped bar chart of ``metric_col`` as ``<filename>.pdf``.

    One group per sampler (in ``sampler_order``) with one bar per model
    (in ``model_order``), read from the module-level summary frame ``df``.
    """
    table = df.pivot(index="sampler", columns="model", values=metric_col)
    table = table.loc[sampler_order, model_order]

    positions = np.arange(len(table.index))
    width = 0.12
    n_models = len(model_order)

    fig, ax = plt.subplots(figsize=(12, 6))

    for rank, model in enumerate(model_order):
        # Shift every bar so the whole group is centred on its tick.
        ax.bar(
            positions + (rank - n_models / 2) * width + width / 2,
            table[model].values,
            width,
            label=model_labels[rank],
        )

    # Axis labelling
    ax.set_xlabel("Sampling method", fontsize=16)
    ax.set_ylabel(ylabel, fontsize=16)
    ax.set_xticks(positions)
    ax.set_xticklabels(table.index, rotation=45, ha="right", fontsize=15)
    ax.tick_params(axis="y", labelsize=15)

    ax.legend(fontsize=15)
    ax.grid(axis="y", linestyle="--", linewidth=0.5, alpha=0.7)

    fig.tight_layout()
    fig.savefig(f"{filename}.pdf")
    plt.close(fig)

# (summary column, y-axis label, output file stem)
figure_specs = (
    ("recall_mean", "Recall (%)", "Recall"),
    ("f1_mean", "F1-score (%)", "F1"),
    ("roc_auc_mean", "AUC (%)", "AUC"),
)
for fig_metric, fig_label, fig_stem in figure_specs:
    plot_metric(fig_metric, fig_label, fig_stem)

print("Done. Files saved: Recall.pdf, F1.pdf, AUC.pdf")


Done. Files saved: Recall.pdf, F1.pdf, AUC.pdf


In [None]:
# ============================================================
#  AICD – Full Reproducible Experimental Pipeline
#  - Preprocessing & feature engineering for AICD
#  - Repeated Stratified 5x3 CV
#  - Resampling ONLY on training folds
#  - Metrics: Accuracy, Precision, Recall, F1, ROC-AUC, PR-AUC, FP/TP
#  - Saves all results for tables & figures (same as previous code)
# ============================================================

# ---- If running in Colab, uncomment these to get matching libs ----
# !pip install -q scikeras catboost xgboost lightgbm imbalanced-learn

# ============================================================
#  Imports
# ============================================================
import os
import random
import warnings
import time

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, precision_recall_curve, average_precision_score,
    confusion_matrix
)
from sklearn.base import clone

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

from imblearn.over_sampling import SMOTE, ADASYN, BorderlineSMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler, TomekLinks, ClusterCentroids, NearMiss
from imblearn.combine import SMOTETomek, SMOTEENN

from tensorflow import keras
from tensorflow.keras import layers
import tensorflow as tf
from scikeras.wrappers import KerasClassifier

from scipy.stats import wilcoxon

warnings.filterwarnings("ignore")

# ============================================================
#  Reproducibility: seed every random number generator in use
# ============================================================
RANDOM_STATE = 42

# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
# setting it here presumably only affects child processes - confirm.
os.environ["PYTHONHASHSEED"] = f"{RANDOM_STATE}"

# Python, NumPy and TensorFlow each keep their own generator; seed them alike.
for _seed_rng in (random.seed, np.random.seed, tf.random.set_seed):
    _seed_rng(RANDOM_STATE)

# ============================================================
#  Load AICD Dataset
# ============================================================
# The workbook named by DATA_PATH must be in the working directory.
DATA_PATH = "buntyshahauto-insurance-claims-data.xlsx"
data = pd.read_excel(DATA_PATH)
print("Original AICD data shape:", data.shape)
print(data.head())

# ============================================================
#  Preprocessing for AICD
# ============================================================

def preprocess_aicd(df_in: pd.DataFrame) -> pd.DataFrame:
    """
    Preprocessing / feature engineering for the AICD dataset.

    Works on a copy; ``df_in`` is not modified. Steps:
      - Map a non-numeric ``fraud_reported`` label ('Y'/'N') to {1, 0}
      - Convert dates to numeric components (year, month, day-of-week)
      - Drop obvious identifier-like / high-cardinality ID columns
      - One-hot encode remaining categorical (object) features, first level
        dropped

    Columns that are absent are skipped.

    Raises
    ------
    ValueError
        If a non-numeric ``fraud_reported`` holds values other than 'Y'/'N'
        (case and surrounding whitespace are ignored).
    """
    df = df_in.copy()

    # 0) Label -> {0, 1}. Left as an object column, the label would be
    #    one-hot encoded in step 3 and the 'fraud_reported' column would vanish.
    if "fraud_reported" in df.columns and not pd.api.types.is_numeric_dtype(
        df["fraud_reported"]
    ):
        labels = df["fraud_reported"].astype(str).str.strip().str.upper()
        mapped = labels.map({"Y": 1, "N": 0})
        if mapped.isna().any():
            unknown = sorted(set(labels[mapped.isna()]))
            raise ValueError(f"Unexpected fraud_reported values: {unknown}")
        df["fraud_reported"] = mapped.astype(int)

    # 1) Date features -> numeric components
    for col in ["policy_bind_date", "incident_date"]:
        if col not in df.columns:
            continue
        # Parse first: the .dt accessor needs a datetime column, and the
        # dates may have been read as strings.
        parsed = pd.to_datetime(df[col])
        df[col + "_year"] = parsed.dt.year
        df[col + "_month"] = parsed.dt.month
        df[col + "_dow"] = parsed.dt.dayofweek
        df.drop(columns=[col], inplace=True)

    # 2) Drop high-cardinality identifier-like columns
    id_like_cols = ["policy_number", "insured_zip", "incident_location"]
    df.drop(columns=[c for c in id_like_cols if c in df.columns], inplace=True)

    # 3) One-hot encode remaining categorical (object) features
    cat_cols = [c for c in df.columns if df[c].dtype == "object"]
    if cat_cols:
        df = pd.get_dummies(df, columns=cat_cols, drop_first=True)

    return df

df = preprocess_aicd(data)
print("After preprocessing:", df.shape)
print(df.head())

# ============================================================
#  Features and Labels
# ============================================================
assert "fraud_reported" in df.columns, "fraud_reported column is missing!"

y = df["fraud_reported"]
X = df.drop(columns=["fraud_reported"])

print("Label distribution:", y.value_counts(normalize=True))
print("Unique labels in y:", y.unique())

# Number of input features; read by build_ann for the input layer shape.
INPUT_DIM = X.shape[1]
print("Number of features:", INPUT_DIM)

# ============================================================
#  Cost-sensitive weights (based on imbalance in AICD)
# ============================================================
n_pos = y.eq(1).sum()
n_neg = y.eq(0).sum()
# The positive class is weighted by the negative-to-positive count ratio.
pos_weight = n_neg / n_pos
cost_sensitive_weights = {0: 1.0, 1: float(pos_weight)}
print("Cost-sensitive class weights:", cost_sensitive_weights)

# ============================================================
#  Define Models
# ============================================================

def build_ann():
    """
    Build and compile the feed-forward network used for the "ANN" model.

    The input width is taken from the module-level INPUT_DIM. Three ReLU
    layers (64, 32, 16 units; dropout 0.3 after the first two) feed a single
    sigmoid unit. Compiled with Adam and binary cross-entropy.
    """
    # (units, dropout rate applied after the layer, or None for no dropout)
    hidden_spec = ((64, 0.3), (32, 0.3), (16, None))

    stack = [layers.Input(shape=(INPUT_DIM,))]
    for units, dropout_rate in hidden_spec:
        stack.append(layers.Dense(units, activation="relu"))
        if dropout_rate is not None:
            stack.append(layers.Dropout(dropout_rate))
    stack.append(layers.Dense(1, activation="sigmoid"))

    network = keras.Sequential(stack)
    network.compile(
        optimizer="adam",
        loss="binary_crossentropy",
        metrics=["accuracy"],
    )
    return network


# SciKeras wrapper around build_ann, so the network can be cloned and fitted
# like the other estimators.
ann_settings = {
    "epochs": 20,
    "batch_size": 32,
    "verbose": 0,
    "random_state": RANDOM_STATE,
}
ann_model = KerasClassifier(model=build_ann, **ann_settings)

def get_base_models():
    """Return the base estimators keyed by display name.

    Keys, in insertion order: "ANN", "XGBoost", "CatBoost", "LightGBM",
    "RandomForest", "AdaBoost". The "ANN" entry is the module-level
    ``ann_model`` instance; every other estimator is created fresh on each
    call and seeded with ``RANDOM_STATE``.
    """
    return {
        "ANN": ann_model,
        "XGBoost": XGBClassifier(
            n_estimators=200,
            learning_rate=0.1,
            max_depth=4,
            subsample=0.8,
            colsample_bytree=0.8,
            eval_metric="logloss",
            n_jobs=-1,
            random_state=RANDOM_STATE,
        ),
        "CatBoost": CatBoostClassifier(
            iterations=300,
            learning_rate=0.1,
            depth=6,
            loss_function="Logloss",
            verbose=0,
            random_state=RANDOM_STATE,
        ),
        "LightGBM": LGBMClassifier(
            n_estimators=300,
            learning_rate=0.05,
            num_leaves=31,
            n_jobs=-1,
            random_state=RANDOM_STATE,
        ),
        "RandomForest": RandomForestClassifier(
            n_estimators=300,
            n_jobs=-1,
            random_state=RANDOM_STATE,
        ),
        "AdaBoost": AdaBoostClassifier(
            n_estimators=300,
            learning_rate=0.1,
            random_state=RANDOM_STATE,
        ),
    }


models = get_base_models()
print("Models:", list(models))

# ============================================================
#  Sampling Strategies (with explicit target ratio)
# ============================================================

SAMPLING_STRATEGY = 1.0  # approx 1:1 minority:majority on training folds

# Keyword arguments shared by every seeded sampler that takes a target ratio.
_seeded_ratio = {
    "random_state": RANDOM_STATE,
    "sampling_strategy": SAMPLING_STRATEGY,
}

samplers = {
    # The first two entries carry no resampler object: the baseline, and the
    # cost-sensitive variant that is handled via class_weight / sample_weight.
    "No Sampling": None,
    "Cost Sensitive": None,
    "RandomOverSampler": RandomOverSampler(**_seeded_ratio),
    "SMOTE": SMOTE(k_neighbors=5, **_seeded_ratio),
    "ADASYN": ADASYN(**_seeded_ratio),
    "BorderlineSMOTE": BorderlineSMOTE(kind="borderline-1", **_seeded_ratio),
    "RandomUnderSampler": RandomUnderSampler(**_seeded_ratio),
    # NearMiss and TomekLinks are constructed without a random seed.
    "NearMiss": NearMiss(version=1, sampling_strategy=SAMPLING_STRATEGY),
    "TomekLinks": TomekLinks(),
    "ClusterCentroids": ClusterCentroids(**_seeded_ratio),
    "SMOTETomek": SMOTETomek(**_seeded_ratio),
    "SMOTEENN": SMOTEENN(**_seeded_ratio),
}

print("Samplers:", list(samplers))

# ============================================================
#  Core Evaluation Function (Repeated Stratified CV)
# ============================================================

def evaluate_model(
    model_name,
    base_model,
    sampler_name,
    sampler,
    X_df,
    y_series,
    n_splits=5,
    n_repeats=3,
    random_state=RANDOM_STATE,
    cost_sensitive_weights=cost_sensitive_weights,
):
    """
    Evaluate one (model, sampler) pair with repeated stratified CV.

    For every fold:
      - features are min-max scaled, with the scaler fitted on the training
        part only
      - the training part is resampled with ``sampler`` (never the test part)
      - a fresh clone of ``base_model`` is fitted and timed
      - metrics are computed on the test part at a fixed 0.5 threshold, with
        label 1 as the positive class

    Parameters
    ----------
    model_name : str
        Label stored in the results. It also selects how class weights are
        applied when ``sampler_name == "Cost Sensitive"``:
        "RandomForest"/"LightGBM" get ``class_weight`` set on the estimator,
        "XGBoost"/"CatBoost"/"AdaBoost" get a per-row ``sample_weight`` in
        ``fit``, and "ANN" gets ``class_weight`` passed to ``fit``. Any other
        name is fitted without weighting.
    base_model : estimator
        Unfitted estimator; it is cloned for every fold and never fitted
        itself.
    sampler_name : str
        Label stored in the results; the value "Cost Sensitive" disables
        resampling and enables the weighting described above.
    sampler : object or None
        Object exposing ``fit_resample(X, y)``, or None for no resampling.
    X_df, y_series : pandas objects
        Full feature table and label vector, indexed positionally via
        ``.iloc``.
    n_splits, n_repeats, random_state
        Forwarded to ``RepeatedStratifiedKFold``.
    cost_sensitive_weights : dict
        Mapping ``{0: weight, 1: weight}`` used in the cost-sensitive case.

    Returns
    -------
    metrics_df : pandas.DataFrame
        One row per fold with accuracy, precision, recall, f1, roc_auc,
        pr_auc, fp_per_tp, train_time_sec and the confusion-matrix counts.
        ``fp_per_tp`` is ``fp / tp``; it is ``inf`` when there are false
        positives but no true positives, and ``NaN`` when both are zero
        (the ratio is undefined).
    cms : list of dict
        One entry per fold holding the 2x2 confusion matrix under "cm".
    pr_curves : list of dict
        Precision/recall arrays for the first fold only (for plotting).
    """
    cv = RepeatedStratifiedKFold(
        n_splits=n_splits, n_repeats=n_repeats, random_state=random_state
    )

    rows = []
    cms = []
    pr_curves = []

    # Folds are numbered from 1 across all repeats.
    for fold_idx, (train_idx, test_idx) in enumerate(
        cv.split(X_df, y_series), start=1
    ):
        X_train, X_test = X_df.iloc[train_idx], X_df.iloc[test_idx]
        y_train, y_test = y_series.iloc[train_idx], y_series.iloc[test_idx]

        # Scale within fold: fit on the training part, apply to both parts
        scaler = MinMaxScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        # Resample training data only (except cost-sensitive case)
        X_train_res, y_train_res = X_train_scaled, y_train
        if sampler is not None and sampler_name != "Cost Sensitive":
            X_train_res, y_train_res = sampler.fit_resample(
                X_train_scaled, y_train
            )

        clf = clone(base_model)
        fit_kwargs = {}

        # -------- Cost-sensitive handling --------
        if sampler_name == "Cost Sensitive":
            # Models that support class_weight directly
            if model_name in ["RandomForest", "LightGBM"]:
                clf.set_params(class_weight=cost_sensitive_weights)

            # Models that use sample_weight in fit (boosting)
            elif model_name in ["XGBoost", "CatBoost", "AdaBoost"]:
                sample_weight = np.where(
                    y_train_res == 1,
                    cost_sensitive_weights[1],
                    cost_sensitive_weights[0],
                )
                fit_kwargs["sample_weight"] = sample_weight

            # ANN via SciKeras: pass class_weight to Keras fit
            elif model_name == "ANN":
                fit_kwargs["class_weight"] = cost_sensitive_weights
        # ----------------------------------------

        # Only the fit call is timed; scaling and resampling are excluded.
        start = time.perf_counter()
        clf.fit(X_train_res, y_train_res, **fit_kwargs)
        train_time = time.perf_counter() - start

        # Get probabilities for positive class = 1
        if hasattr(clf, "predict_proba"):
            y_proba_raw = clf.predict_proba(X_test_scaled)
            if y_proba_raw.ndim == 2 and y_proba_raw.shape[1] == 2:
                y_proba = y_proba_raw[:, 1]
            else:
                y_proba = y_proba_raw.ravel()
        else:
            # No probabilities available: min-max rescale the decision scores
            # to [0, 1]; the epsilon avoids division by zero for constant
            # scores.
            scores = clf.decision_function(X_test_scaled)
            y_proba = (scores - scores.min()) / (
                scores.max() - scores.min() + 1e-8
            )

        # Fixed threshold = 0.5
        y_pred = (y_proba >= 0.5).astype(int)

        # labels=[0, 1] guarantees a 2x2 matrix even if a class is never
        # predicted.
        cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
        tn, fp, fn, tp = cm.ravel()

        acc = accuracy_score(y_test, y_pred)
        prec = precision_score(y_test, y_pred, zero_division=0)
        rec = recall_score(y_test, y_pred, zero_division=0)
        f1 = f1_score(y_test, y_pred, zero_division=0)
        roc = roc_auc_score(y_test, y_proba)
        pr_auc = average_precision_score(y_test, y_proba)

        # False positives per true positive. With no true positives the ratio
        # is infinite only if false positives exist; 0/0 is undefined and is
        # recorded as NaN rather than inf.
        if tp > 0:
            fp_per_tp = fp / tp
        elif fp > 0:
            fp_per_tp = np.inf
        else:
            fp_per_tp = np.nan

        rows.append(
            {
                "model": model_name,
                "sampler": sampler_name,
                "fold": fold_idx,
                "accuracy": acc,
                "precision": prec,
                "recall": rec,
                "f1": f1,
                "roc_auc": roc,
                "pr_auc": pr_auc,
                "fp_per_tp": fp_per_tp,
                "train_time_sec": train_time,
                "tn": tn,
                "fp": fp,
                "fn": fn,
                "tp": tp,
            }
        )

        cms.append(
            {
                "model": model_name,
                "sampler": sampler_name,
                "fold": fold_idx,
                "cm": cm,
            }
        )

        # Store PR curve only for the first fold (for plotting)
        if fold_idx == 1:
            pr_prec, pr_rec, _ = precision_recall_curve(y_test, y_proba)
            pr_curves.append(
                {
                    "model": model_name,
                    "sampler": sampler_name,
                    "precision": pr_prec,
                    "recall": pr_rec,
                }
            )

    metrics_df = pd.DataFrame(rows)
    return metrics_df, cms, pr_curves


# ============================================================
#  Run All Model × Sampler experiments
# ============================================================

# Accumulators filled across every (model, sampler) combination.
all_metrics_list, all_cms_list, all_pr_curves_list = [], [], []

for model_name, base_model in models.items():
    for sampler_name, sampler in samplers.items():
        print(f"Running {model_name} with {sampler_name} on AICD...")
        metrics_df, cms, pr_curves = evaluate_model(
            model_name,
            base_model,
            sampler_name,
            sampler,
            X,
            y,
            n_splits=5,
            n_repeats=3,
            random_state=RANDOM_STATE,
            cost_sensitive_weights=cost_sensitive_weights,
        )
        all_metrics_list.append(metrics_df)
        all_cms_list += cms
        all_pr_curves_list += pr_curves

# One long table: a row per (model, sampler, fold).
all_metrics_df = pd.concat(all_metrics_list, ignore_index=True)
print("All metrics per fold shape:", all_metrics_df.shape)

# Save per-fold metrics
all_metrics_df.to_csv("all_metrics_per_fold_AICD.csv", index=False)

# ============================================================
#  Summary Table: mean ± std for each (model, sampler)
# ============================================================

# (output prefix, source column) pairs; each yields "<prefix>_mean" and
# "<prefix>_std" in the summary, in this order.
_summary_sources = [
    ("accuracy", "accuracy"),
    ("precision", "precision"),
    ("recall", "recall"),
    ("f1", "f1"),
    ("roc_auc", "roc_auc"),
    ("pr_auc", "pr_auc"),
    ("fp_per_tp", "fp_per_tp"),
    ("train_time", "train_time_sec"),
]

_summary_spec = {
    f"{prefix}_{stat}": (column, stat)
    for prefix, column in _summary_sources
    for stat in ("mean", "std")
}

# Aggregate the per-fold rows into one row per (model, sampler).
_grouped_metrics = all_metrics_df.groupby(["model", "sampler"])
summary = _grouped_metrics.agg(**_summary_spec).reset_index()

summary.to_excel("summary_metrics_mean_std_AICD.xlsx", index=False)
print("Summary table saved to summary_metrics_mean_std_AICD.xlsx")
print(summary.head())

# ============================================================
#  Confusion Matrices: numeric form + optional plots
# ============================================================

# Flatten every stored 2x2 confusion matrix into one tabular record.
cm_records = []
for entry in all_cms_list:
    # ravel() on the 2x2 matrix yields the counts in tn, fp, fn, tp order.
    tn, fp, fn, tp = entry["cm"].ravel()
    cm_records.append(
        dict(
            model=entry["model"],
            sampler=entry["sampler"],
            fold=entry["fold"],
            tn=tn,
            fp=fp,
            fn=fn,
            tp=tp,
        )
    )

cm_df = pd.DataFrame(cm_records)
cm_df.to_csv("confusion_matrices_all_folds_AICD.csv", index=False)
print("Confusion matrices saved to confusion_matrices_all_folds_AICD.csv")

def plot_confusion_matrix(cm, model_name, sampler_name, fold):
    """Render a confusion matrix as an annotated heatmap and save it as PNG.

    Parameters
    ----------
    cm : array-like
        Confusion matrix with integer counts (rows: true label, columns:
        predicted label, both in 0/1 order as labelled on the axes).
    model_name, sampler_name, fold
        Used in the figure title and in the output file name.

    Returns
    -------
    str
        Name of the PNG file written to the working directory.
    """
    fig, ax = plt.subplots(figsize=(4, 4))
    sns.heatmap(
        cm,
        ax=ax,
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=["Pred 0", "Pred 1"],
        yticklabels=["True 0", "True 1"],
    )
    ax.set_title(
        f"Confusion Matrix (AICD): {model_name} + {sampler_name} (fold {fold})"
    )
    ax.set_xlabel("Predicted label")
    ax.set_ylabel("True label")
    fig.tight_layout()

    fname = f"cm_{model_name}_{sampler_name}_AICD_fold{fold}.png"
    fig.savefig(fname, dpi=300)
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)
    return fname

# Example: CM plots for fold 1 of some key hybrids
_cm_plot_samplers = {"No Sampling", "SMOTEENN"}
_cm_plot_models = {"ANN", "RandomForest", "XGBoost"}

for entry in all_cms_list:
    # Skip everything except the first fold of the selected combinations.
    if entry["fold"] != 1:
        continue
    if entry["sampler"] not in _cm_plot_samplers:
        continue
    if entry["model"] not in _cm_plot_models:
        continue
    plot_confusion_matrix(
        entry["cm"], entry["model"], entry["sampler"], entry["fold"]
    )

# ============================================================
#  Precision–Recall Curves
# ============================================================

def plot_pr_curves(pr_curves, model_name, samplers_to_plot):
    """Overlay stored precision-recall curves for one model and save as PNG.

    Parameters
    ----------
    pr_curves : iterable of dict
        Entries with "model", "sampler", "precision" and "recall" keys.
    model_name : str
        Only entries whose "model" equals this value are drawn.
    samplers_to_plot : container of str
        Only entries whose "sampler" is in this container are drawn; each
        curve is labelled with its sampler name.

    Returns
    -------
    str
        Name of the PNG file written to the working directory.
    """
    fig, ax = plt.subplots(figsize=(6, 5))

    selected = (
        curve
        for curve in pr_curves
        if curve["model"] == model_name and curve["sampler"] in samplers_to_plot
    )
    for curve in selected:
        ax.plot(curve["recall"], curve["precision"], label=curve["sampler"])

    ax.set_xlabel("Recall")
    ax.set_ylabel("Precision")
    ax.set_title(f"Precision–Recall curves (AICD, {model_name})")
    ax.legend()
    ax.grid(True)
    fig.tight_layout()

    fname = f"pr_curves_{model_name}_AICD.png"
    fig.savefig(fname, dpi=300)
    plt.close(fig)
    return fname

# Example: ANN PR curves for a few samplers.
# evaluate_model keeps a PR curve for the first CV fold only, so each line
# drawn here is a single-fold curve, not an average over folds.
plot_pr_curves(
    all_pr_curves_list, "ANN", ["No Sampling", "RandomOverSampler", "SMOTEENN"]
)

# ============================================================
#  Feature Importances for best tree-based models (example)
# ============================================================

feature_importances_records = []
best_sampler_for_importance = "SMOTEENN"  # same idea as before

# Only the tree-based models that are actually registered are considered.
_importance_models = [
    name
    for name in ["RandomForest", "XGBoost", "LightGBM", "CatBoost"]
    if name in models
]

# Scaling and resampling do not depend on the model, so they are done once
# instead of once per model. This also guarantees every model is fitted on
# exactly the same resampled data.
# NOTE: the models here are fitted on the full dataset (no hold-out); the
# importances are descriptive and separate from the cross-validated metrics.
if _importance_models:
    sampler = samplers[best_sampler_for_importance]

    scaler = MinMaxScaler()
    X_scaled = scaler.fit_transform(X)
    X_res, y_res = sampler.fit_resample(X_scaled, y)

for model_name in _importance_models:
    base_model = models[model_name]
    clf = clone(base_model)  # leave the registered estimator unfitted
    clf.fit(X_res, y_res)

    # Estimators without feature_importances_ are skipped silently.
    if hasattr(clf, "feature_importances_"):
        imps = clf.feature_importances_
        tmp_df = pd.DataFrame(
            {
                "feature": X.columns,
                "importance": imps,
                "model": model_name,
                "sampler": best_sampler_for_importance,
            }
        )
        feature_importances_records.append(tmp_df)

if feature_importances_records:
    feature_importances_df = pd.concat(feature_importances_records, ignore_index=True)
    feature_importances_df.to_csv(
        "feature_importances_best_models_AICD.csv", index=False
    )
    print("Feature importances saved to feature_importances_best_models_AICD.csv")

# ============================================================
#  Example: Wilcoxon tests for significance (AICD)
# ============================================================

def wilcoxon_test(all_metrics_df, model_name, sampler_a, sampler_b, metric="f1"):
    """Run a Wilcoxon signed-rank test between two samplers for one model.

    The ``metric`` values of ``sampler_a`` and ``sampler_b`` are taken from
    the rows of ``all_metrics_df`` belonging to ``model_name`` and paired
    positionally, in the order the rows appear in the frame.

    Parameters
    ----------
    all_metrics_df : pandas.DataFrame
        Table with "model" and "sampler" columns plus the ``metric`` column.
    model_name, sampler_a, sampler_b : str
        Select the two groups of rows to compare.
    metric : str, default "f1"
        Column holding the scores to compare.

    Returns
    -------
    (stat, p)
        Test statistic and p-value from ``scipy.stats.wilcoxon``. The p-value
        is also printed.
    """
    model_rows = all_metrics_df[all_metrics_df["model"] == model_name]
    scores_a = model_rows.loc[model_rows["sampler"] == sampler_a, metric]
    scores_b = model_rows.loc[model_rows["sampler"] == sampler_b, metric]

    outcome = wilcoxon(scores_a, scores_b, zero_method="wilcox", correction=False)
    stat, p = outcome

    message = (
        f"Wilcoxon ({metric}) for {model_name} (AICD): "
        f"{sampler_a} vs {sampler_b} -> p = {p:.4g}"
    )
    print(message)
    return stat, p

# Example comparisons (you can comment/uncomment as needed).
# Each call compares the per-fold scores of two samplers for one model
# (5 splits x 3 repeats = 15 paired values per comparison with the settings
# used above). No multiple-comparison correction is applied to the p-values.
wilcoxon_test(all_metrics_df, "ANN", "SMOTEENN", "No Sampling", metric="f1")
wilcoxon_test(all_metrics_df, "RandomForest", "SMOTEENN", "No Sampling", metric="f1")
wilcoxon_test(all_metrics_df, "XGBoost", "SMOTEENN", "No Sampling", metric="pr_auc")

print("DONE (AICD). All result files saved in the working directory.")


Original AICD data shape: (1000, 39)
   months_as_customer  age  policy_number policy_bind_date policy_state  \
0                 328   48         521585       2014-10-17           OH   
1                 228   42         342868       2006-06-27           IN   
2                 134   29         687698       2000-09-06           OH   
3                 256   41         227811       1990-05-25           IL   
4                 228   44         367455       2014-06-06           IL   

   policy_csl  policy_deductable  policy_annual_premium  umbrella_limit  \
0       250.5               1000                1406.91               0   
1       250.5               2000                1197.22         5000000   
2       100.3               2000                1413.14         5000000   
3       250.5               2000                1415.74         6000000   
4       500.1               1000                1583.91         6000000   

   insured_zip  ... witnesses police_report_available total_c



Streaming output truncated to the last 5000 lines.
[LightGBM] [Info] Number of positive: 198, number of negative: 198
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001989 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2290
[LightGBM] [Info] Number of data points in the train set: 396, number of used features: 134
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 197, number of negative: 197
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.094631 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2332
[LightGBM] [Info] Number of data points in the train set: 394, number of used features: 136
[LightGBM] [Info] 

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the per-(model, sampler) summary table written by the experiment cell.
df = pd.read_excel("summary_metrics_mean_std_AICD.xlsx")

metric_cols = ["accuracy_mean", "precision_mean", "recall_mean",
               "f1_mean", "roc_auc_mean"]

# If the scores are stored as fractions (recall never above 1.0), convert all
# listed metrics to percentages in one step.
if df["recall_mean"].max() <= 1.0:
    df[metric_cols] = df[metric_cols] * 100.0

# Display order of the sampling methods on the x-axis.
sampler_order = [
    "No Sampling",
    "Cost Sensitive",
    "RandomOverSampler",
    "SMOTE",
    "ADASYN",
    "BorderlineSMOTE",
    "RandomUnderSampler",
    "NearMiss",
    "TomekLinks",
    "ClusterCentroids",
    "SMOTETomek",
    "SMOTEENN",
]

# Column keys in the table and the matching legend labels (same positions).
model_order  = ["ANN", "XGBoost", "CatBoost", "LightGBM", "RandomForest", "AdaBoost"]
model_labels = ["ANN", "XGBoost", "CatBoost", "LightGBM", "Random Forest", "AdaBoost"]

# An ordered categorical makes the sort follow sampler_order, not the alphabet.
df = df.assign(
    sampler=pd.Categorical(df["sampler"], categories=sampler_order, ordered=True)
).sort_values(["sampler", "model"])

def plot_metric(metric_col, ylabel, filename):
    """Draw a grouped bar chart of one summary metric and save it as a PDF.

    Reads the module-level ``df``, ``sampler_order``, ``model_order`` and
    ``model_labels``. Sampling methods form the groups on the x-axis and each
    group holds one bar per model.

    Parameters
    ----------
    metric_col : str
        Column of ``df`` to plot.
    ylabel : str
        Label for the y-axis.
    filename : str
        Output name without extension; the figure is written to
        ``<filename>.pdf``.
    """
    table = df.pivot(index="sampler", columns="model", values=metric_col)
    table = table.loc[sampler_order, model_order]

    group_pos = np.arange(len(table.index))
    bar_w = 0.12
    n_models = len(model_order)

    fig, ax = plt.subplots(figsize=(12, 6))

    # Spread the bars of each group symmetrically around the group position.
    for slot, model in enumerate(model_order):
        ax.bar(
            group_pos + (slot - n_models / 2) * bar_w + bar_w / 2,
            table[model].values,
            bar_w,
            label=model_labels[slot],
        )

    ax.set_xticks(group_pos)
    ax.set_xticklabels(table.index, rotation=45, ha="right", fontsize=15)
    ax.set_xlabel("Sampling method", fontsize=16)
    ax.set_ylabel(ylabel, fontsize=16)
    ax.tick_params(axis="y", labelsize=15)
    ax.legend(fontsize=15)
    ax.grid(axis="y", linestyle="--", linewidth=0.5, alpha=0.7)

    fig.tight_layout()
    fig.savefig(f"{filename}.pdf")
    plt.close(fig)

# (summary column, y-axis label, output file stem) for each figure.
_figures = (
    ("recall_mean", "Recall (%)", "Recall"),
    ("f1_mean", "F1-score (%)", "F1"),
    ("roc_auc_mean", "AUC (%)", "AUC"),
)
for _metric_col, _ylabel, _stem in _figures:
    plot_metric(_metric_col, _ylabel, _stem)

print("Done. Files saved: Recall.pdf, F1.pdf, AUC.pdf")


Done. Files saved: Recall_AICD.pdf, F1_AICD.pdf, AUC_AICD.pdf
