In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from scipy.stats import wilcoxon

In [17]:
# Per-fold metrics for the Logistic Regression and XGBoost runs.
# Column labels are lower-cased on load so later cells can address them with a
# single casing (e.g. 'model name', 'macro f1').
lg_xg_fold_df = pd.read_csv('./data/4_final_fold_results.csv').rename(columns=str.lower)
lg_xg_fold_df

Unnamed: 0,fold,model name,accuracy,macro precision,macro recall,macro f1,roc auc,class 0.0 precision,class 0.0 recall,class 1.0 precision,class 1.0 recall,class 2.0 precision,class 2.0 recall
0,1,Logistic Regression,0.656188,0.636743,0.631663,0.630089,0.813556,0.684241,0.537775,0.681942,0.736786,0.544047,0.620429
1,2,Logistic Regression,0.654219,0.633989,0.631809,0.630154,0.810149,0.66574,0.548535,0.682755,0.726883,0.553471,0.620008
2,3,Logistic Regression,0.658063,0.638786,0.631811,0.631687,0.813442,0.682997,0.542707,0.68234,0.739864,0.55102,0.612863
3,4,Logistic Regression,0.656657,0.637401,0.630989,0.63018,0.815688,0.674949,0.528738,0.682282,0.74254,0.554972,0.62169
4,5,Logistic Regression,0.649905,0.629709,0.627788,0.624831,0.80979,0.670893,0.533089,0.678795,0.726482,0.53944,0.623792
5,1,Logistic Regression ROS,0.698515,0.696682,0.698515,0.695504,0.827916,0.727698,0.686605,0.649287,0.584906,0.713062,0.824033
6,2,Logistic Regression ROS,0.700566,0.698204,0.700566,0.697366,0.826176,0.727324,0.693162,0.647753,0.58049,0.719535,0.828048
7,3,Logistic Regression ROS,0.695526,0.694199,0.695526,0.692593,0.825359,0.732762,0.675498,0.640902,0.585842,0.708932,0.825238
8,4,Logistic Regression ROS,0.704804,0.703546,0.704804,0.701771,0.831105,0.736304,0.694233,0.663213,0.591061,0.711121,0.829118
9,5,Logistic Regression ROS,0.696686,0.69508,0.696686,0.693669,0.824775,0.731252,0.686337,0.645713,0.581426,0.708276,0.822294


In [16]:
# Per-fold metrics for the MLP and TabTransformer runs.
# Fold numbers and model labels are attached by row position: five consecutive
# rows per variant, in the order listed below.
# NOTE(review): this assumes the CSV rows are ordered as 6 variants x 5 folds —
# confirm against the script that wrote the file.
model_variants = [
    'MLP',
    'MLP with ROS',
    'MLP with SMOTE',
    'TabTransformer',
    'TabTransformer with ROS',
    'TabTransformer with SMOTE',
]
folds = list(range(1, 6)) * len(model_variants)
model_names = [variant for variant in model_variants for _ in range(5)]

# Identifier columns first, then the metrics.
ordered_columns = ['Fold', 'Model Name', 'accuracy', 'precision', 'recall', 'f1_score', 'roc_auc', 'attention_weights']
mlp_tabtrans_fold_df = (
    pd.read_csv('./data/5_final_fold_results.csv')
    .assign(**{'Fold': folds, 'Model Name': model_names})
    [ordered_columns]
    .rename(columns=str.lower)
)
mlp_tabtrans_fold_df

Unnamed: 0,fold,model name,accuracy,precision,recall,f1_score,roc_auc,attention_weights
0,1,MLP,0.757032,0.739363,0.748921,0.743902,0.898574,
1,2,MLP,0.749086,0.729895,0.732064,0.730798,0.89187,
2,3,MLP,0.750686,0.729153,0.745876,0.736666,0.893049,
3,4,MLP,0.757015,0.744099,0.747905,0.745501,0.898466,
4,5,MLP,0.758,0.740146,0.749656,0.74463,0.898536,
5,1,MLP with ROS,0.848967,0.853474,0.848967,0.842633,0.939195,
6,2,MLP with ROS,0.841251,0.848059,0.841251,0.833061,0.936388,
7,3,MLP with ROS,0.843659,0.84876,0.843659,0.836824,0.938925,
8,4,MLP with ROS,0.844105,0.848669,0.844105,0.83747,0.938506,
9,5,MLP with ROS,0.844016,0.846548,0.844016,0.83808,0.938037,


In [24]:
def _fold_scores(fold_df, model_name, metric):
    """Return the values of column `metric` for one model as a plain Python list.

    Rows are selected where the 'model name' column equals `model_name`; the
    values keep the row order of `fold_df`.

    Raises:
        KeyError: if no row matches `model_name`, so a misspelled label fails
            here instead of handing an empty list to the statistical tests.
    """
    selected = fold_df.loc[fold_df['model name'] == model_name, metric]
    if selected.empty:
        raise KeyError(f"no rows with model name {model_name!r}")
    return selected.tolist()


# Logistic Regression / XGBoost results expose the score as 'macro f1'.
lg_original_f1 = _fold_scores(lg_xg_fold_df, 'Logistic Regression', 'macro f1')
lg_ros_f1 = _fold_scores(lg_xg_fold_df, 'Logistic Regression ROS', 'macro f1')
lg_smote_f1 = _fold_scores(lg_xg_fold_df, 'Logistic Regression SMOTE', 'macro f1')
xg_original_f1 = _fold_scores(lg_xg_fold_df, 'XGBoost', 'macro f1')
xg_ros_f1 = _fold_scores(lg_xg_fold_df, 'XGBoost ROS', 'macro f1')
xg_smote_f1 = _fold_scores(lg_xg_fold_df, 'XGBoost SMOTE', 'macro f1')

# MLP / TabTransformer results expose the score as 'f1_score'.
# NOTE(review): presumably the same averaging as 'macro f1' above — confirm,
# since these lists are compared against each other in the tests below.
mlp_original_f1 = _fold_scores(mlp_tabtrans_fold_df, 'MLP', 'f1_score')
mlp_ros_f1 = _fold_scores(mlp_tabtrans_fold_df, 'MLP with ROS', 'f1_score')
mlp_smote_f1 = _fold_scores(mlp_tabtrans_fold_df, 'MLP with SMOTE', 'f1_score')
tabtrans_original_f1 = _fold_scores(mlp_tabtrans_fold_df, 'TabTransformer', 'f1_score')
tabtrans_ros_f1 = _fold_scores(mlp_tabtrans_fold_df, 'TabTransformer with ROS', 'f1_score')
tabtrans_smote_f1 = _fold_scores(mlp_tabtrans_fold_df, 'TabTransformer with SMOTE', 'f1_score')

In [29]:
# RQ1
from scipy.stats import wilcoxon
import pandas as pd

# Simulated data structure (already defined by the user)
# NOTE(review): `results` is not read by any of the visible cells — confirm it is still needed.
results = {}

# RQ1 - model comparison (original data)
def run_rq1_tests():
    """Compare models on the original data with paired Wilcoxon signed-rank tests.

    Each pair is tested on the notebook-level per-fold F1 lists
    (lg_original_f1, xg_original_f1, mlp_original_f1, tabtrans_original_f1),
    passed to scipy.stats.wilcoxon with its default arguments.

    Returns:
        pd.DataFrame: one row per pair with the columns "Comparison",
        "Statistic" and "p-value".

    NOTE(review): the recorded outputs bottom out at p = 0.0625. If every list
    holds five folds, as the displayed tables suggest, that is the smallest
    two-sided p-value an exact signed-rank test can return, so none of these
    comparisons can reach the 0.05 level — keep this in mind when reading the
    table.
    """
    f1_by_model = {
        "Logistic Regression": lg_original_f1,
        "XGBoost": xg_original_f1,
        "MLP": mlp_original_f1,
        "TabTransformer": tabtrans_original_f1,
    }
    model_pairs = (
        ("MLP", "Logistic Regression"),
        ("MLP", "XGBoost"),
        ("TabTransformer", "Logistic Regression"),
        ("TabTransformer", "XGBoost"),
        ("TabTransformer", "MLP"),  # optional
    )

    def _test_pair(first, second):
        # The first model's scores go in as the first sample of the paired test.
        statistic, p_value = wilcoxon(f1_by_model[first], f1_by_model[second])
        return {
            "Comparison": f"{first} vs {second}",
            "Statistic": statistic,
            "p-value": p_value,
        }

    return pd.DataFrame([_test_pair(first, second) for first, second in model_pairs])

# RQ2 - sampling method comparison (each model: original vs ROS / SMOTE)
def run_rq2_tests():
    """Test each resampled variant against its own model trained on original data.

    For every (model, method) combination the resampled per-fold F1 list is
    passed as the first sample and the original list as the second sample to
    scipy.stats.wilcoxon (default arguments). All lists are read from
    notebook-level variables.

    Returns:
        pd.DataFrame: one row per combination with the columns "Model",
        "Method", "Statistic" and "p-value".
    """
    original_f1 = {
        "Logistic Regression": lg_original_f1,
        "XGBoost": xg_original_f1,
        "MLP": mlp_original_f1,
        "TabTransformer": tabtrans_original_f1,
    }
    resampled_runs = [
        ("Logistic Regression", "ROS", lg_ros_f1),
        ("Logistic Regression", "SMOTE", lg_smote_f1),
        ("XGBoost", "ROS", xg_ros_f1),
        ("XGBoost", "SMOTE", xg_smote_f1),
        ("MLP", "ROS", mlp_ros_f1),
        ("MLP", "SMOTE", mlp_smote_f1),
        ("TabTransformer", "ROS", tabtrans_ros_f1),
        ("TabTransformer", "SMOTE", tabtrans_smote_f1),
    ]

    rows = []
    for model_label, method_label, sampled_f1 in resampled_runs:
        statistic, p_value = wilcoxon(sampled_f1, original_f1[model_label])
        rows.append({
            "Model": model_label,
            "Method": method_label,
            "Statistic": statistic,
            "p-value": p_value,
        })
    return pd.DataFrame(rows)

# Run the tests: one Wilcoxon results table per research question.
rq1_df = run_rq1_tests()
rq2_df = run_rq2_tests()


In [30]:
# Display the RQ1 Wilcoxon results table returned by run_rq1_tests().
rq1_df

Unnamed: 0,Comparison,Statistic,p-value
0,MLP vs Logistic Regression,0.0,0.0625
1,MLP vs XGBoost,0.0,0.0625
2,TabTransformer vs Logistic Regression,0.0,0.0625
3,TabTransformer vs XGBoost,1.0,0.125
4,TabTransformer vs MLP,0.0,0.0625


In [31]:
# Display the RQ2 Wilcoxon results table returned by run_rq2_tests().
rq2_df

Unnamed: 0,Model,Method,Statistic,p-value
0,Logistic Regression,ROS,0.0,0.0625
1,Logistic Regression,SMOTE,0.0,0.0625
2,XGBoost,ROS,0.0,0.0625
3,XGBoost,SMOTE,0.0,0.0625
4,MLP,ROS,0.0,0.0625
5,MLP,SMOTE,0.0,0.0625
6,TabTransformer,ROS,0.0,0.0625
7,TabTransformer,SMOTE,0.0,0.0625


In [None]:
import numpy as np
import pandas as pd
from itertools import combinations
from typing import List, Tuple

# Define permutation test function
def permutation_test(x: List[float], y: List[float], n_permutations=10000, metric='mean', alternative='two-sided', seed=42) -> Tuple[float, float]:
    """Paired (sign-flip) Monte-Carlo permutation test on the differences x - y.

    In every permutation each pair (x[i], y[i]) is independently kept or
    swapped, which is the same as flipping the sign of its difference; the
    chosen summary statistic of the permuted differences is then compared with
    the observed one.

    Args:
        x, y: Paired 1-D samples of equal, non-zero length.
        n_permutations: Number of random sign-flip permutations to draw (>= 1).
        metric: 'mean' or 'median' — summary statistic of the paired differences.
        alternative: 'two-sided', 'greater' (x > y) or 'less' (x < y).
        seed: Seed for the local random generator; None draws a fresh seed.

    Returns:
        (observed_diff, p): the observed statistic of x - y and the fraction of
        permutations whose statistic is at least as extreme.

    Raises:
        ValueError: for an unknown `metric` / `alternative`, for
            `n_permutations` < 1, or when x and y are not non-empty 1-D
            sequences of the same length.

    Note:
        With n pairs there are only 2**n distinct sign patterns, so for small n
        the p-value cannot drop below roughly 2 / 2**n (two-sided).
    """
    if metric not in ('mean', 'median'):
        raise ValueError(f"metric must be 'mean' or 'median', got {metric!r}")
    if alternative not in ('two-sided', 'greater', 'less'):
        raise ValueError(
            f"alternative must be 'two-sided', 'greater' or 'less', got {alternative!r}"
        )
    if n_permutations < 1:
        raise ValueError("n_permutations must be at least 1")

    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    # A paired test needs one y for every x; without this check a length-1
    # input would silently broadcast against the other sample.
    if x.ndim != 1 or x.shape != y.shape:
        raise ValueError("x and y must be 1-D sequences of the same length")
    if x.size == 0:
        raise ValueError("x and y must not be empty")

    summarize = np.mean if metric == 'mean' else np.median
    paired_diff = x - y
    observed_diff = summarize(paired_diff)

    # A local generator keeps the call reproducible without reseeding NumPy's
    # global RNG. RandomState(seed).rand draws the same stream as
    # np.random.seed(seed) followed by np.random.rand.
    rng = np.random.RandomState(seed)
    diffs = np.empty(n_permutations)
    for i in range(n_permutations):
        keep = rng.rand(paired_diff.size) > 0.5
        # Swapping x[i] and y[i] negates that pair's difference.
        diffs[i] = summarize(np.where(keep, paired_diff, -paired_diff))

    if alternative == 'two-sided':
        p = np.mean(np.abs(diffs) >= np.abs(observed_diff))
    elif alternative == 'greater':
        p = np.mean(diffs >= observed_diff)
    else:
        p = np.mean(diffs <= observed_diff)

    return observed_diff, p

# Per-fold F1 lists keyed by the model / sampling label used in the
# comparison pairs of this cell.
scores = {
    "Logistic Regression": lg_original_f1,
    "Logistic Regression ROS": lg_ros_f1,
    "Logistic Regression SMOTE": lg_smote_f1,
    "XGBoost": xg_original_f1,
    "XGBoost ROS": xg_ros_f1,
    "XGBoost SMOTE": xg_smote_f1,
    "MLP": mlp_original_f1,
    "MLP with ROS": mlp_ros_f1,
    "MLP with SMOTE": mlp_smote_f1,
    "TabTransformer": tabtrans_original_f1,
    "TabTransformer with ROS": tabtrans_ros_f1,
    "TabTransformer with SMOTE": tabtrans_smote_f1,
}

# RQ1: pairs of models compared on the original data, as (first, second).
rq1_comparisons = [
    (challenger, reference)
    for challenger, references in (
        ("MLP", ("Logistic Regression", "XGBoost")),
        ("TabTransformer", ("Logistic Regression", "XGBoost", "MLP")),
    )
    for reference in references
]

# RQ2: each model paired with its ROS and SMOTE variants, as (original, resampled).
# The resampled labels use " " for Logistic Regression / XGBoost and " with "
# for MLP / TabTransformer.
rq2_comparisons = [
    (model, model + joiner + sampler)
    for model, joiner in (
        ("Logistic Regression", " "),
        ("XGBoost", " "),
        ("MLP", " with "),
        ("TabTransformer", " with "),
    )
    for sampler in ("ROS", "SMOTE")
]

# Run permutation tests
def run_tests(pairs, label):
    """Run permutation_test for every pair of score labels and tabulate the results.

    Args:
        pairs: Iterable of (a, b) keys into the notebook-level `scores` dict.
        label: Accepted for the caller's convenience; not used by the body.

    Returns:
        pd.DataFrame: one row per pair with the columns "Comparison",
        "Mean Difference" and "p-value", both numbers rounded to 4 decimals.
        permutation_test is called with its default arguments.
    """
    def _row(first, second):
        observed, p_value = permutation_test(scores[first], scores[second])
        return {
            "Comparison": f"{first} vs {second}",
            "Mean Difference": round(observed, 4),
            "p-value": round(p_value, 4),
        }

    return pd.DataFrame([_row(first, second) for first, second in pairs])

# Permutation-test tables for both research questions; the second argument is
# only a label and is not used inside run_tests.
rq1_results = run_tests(rq1_comparisons, "RQ1")
rq2_results = run_tests(rq2_comparisons, "RQ2")

In [37]:
# Inspect the per-fold 'f1_score' values selected for the 'TabTransformer' rows.
tabtrans_original_f1

[0.782872988384616,
 0.7652688371220492,
 0.7649799252842872,
 0.7633080228917368,
 0.7687255576920458]

In [38]:
# Inspect the per-fold 'macro f1' values selected for the 'XGBoost' rows.
xg_original_f1

[0.7612187122225885,
 0.7610477202007528,
 0.7653837374656515,
 0.7616275090757973,
 0.7609683057688739]

In [35]:
# Display the RQ1 permutation-test table produced by run_tests(rq1_comparisons, "RQ1").
rq1_results

Unnamed: 0,Comparison,Mean Difference,p-value
0,MLP vs Logistic Regression,0.1109,0.0619
1,MLP vs XGBoost,-0.0217,0.0619
2,TabTransformer vs Logistic Regression,0.1396,0.0619
3,TabTransformer vs XGBoost,0.007,0.1257
4,TabTransformer vs MLP,0.0287,0.0619


In [36]:
# Display the RQ2 permutation-test table produced by run_tests(rq2_comparisons, "RQ2").
rq2_results

Unnamed: 0,Comparison,Mean Difference,p-value
0,Logistic Regression vs Logistic Regression ROS,-0.0668,0.0619
1,Logistic Regression vs Logistic Regression SMOTE,-0.0895,0.0619
2,XGBoost vs XGBoost ROS,-0.0659,0.0619
3,XGBoost vs XGBoost SMOTE,-0.0766,0.0619
4,MLP vs MLP with ROS,-0.0973,0.0619
5,MLP vs MLP with SMOTE,-0.0918,0.0619
6,TabTransformer vs TabTransformer with ROS,-0.0914,0.0619
7,TabTransformer vs TabTransformer with SMOTE,-0.0826,0.0619
