In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from scipy.stats import wilcoxon

In [3]:
# Per-fold metrics for the Logistic Regression and XGBoost runs.
# Column labels are lower-cased on load so that later cells can address them
# with one spelling (e.g. 'model name', 'macro f1').
lg_xg_fold_df = pd.read_csv('./data/4_final_fold_results.csv').rename(columns=str.lower)
lg_xg_fold_df

Unnamed: 0,fold,model name,accuracy,macro precision,macro recall,macro f1,roc auc,class 0.0 precision,class 0.0 recall,class 1.0 precision,class 1.0 recall,class 2.0 precision,class 2.0 recall
0,1,Logistic Regression,0.662447,0.64266,0.635844,0.634841,0.815304,0.694526,0.540293,0.687931,0.747391,0.545522,0.619849
1,2,Logistic Regression,0.648805,0.629192,0.624645,0.623378,0.811437,0.672206,0.537088,0.674789,0.726251,0.540581,0.610597
2,3,Logistic Regression,0.654149,0.633419,0.631316,0.629259,0.811161,0.670601,0.546496,0.683325,0.728124,0.546331,0.619328
3,4,Logistic Regression,0.655274,0.635565,0.631775,0.631068,0.809459,0.663889,0.547412,0.682591,0.730265,0.560213,0.617647
4,5,Logistic Regression,0.652461,0.633642,0.631076,0.628585,0.808803,0.671461,0.534585,0.678803,0.728392,0.550661,0.630252
5,6,Logistic Regression,0.667229,0.648072,0.63679,0.639103,0.818342,0.697595,0.557692,0.688372,0.752677,0.558249,0.6
6,7,Logistic Regression,0.660338,0.640918,0.636118,0.634445,0.816665,0.683353,0.537546,0.686556,0.742238,0.552846,0.628571
7,8,Logistic Regression,0.651287,0.631882,0.623944,0.62394,0.814761,0.666862,0.520604,0.676694,0.740632,0.552091,0.610597
8,9,Logistic Regression,0.651006,0.629906,0.627827,0.625117,0.810023,0.680754,0.545788,0.680322,0.724572,0.528644,0.61312
9,10,Logistic Regression,0.651287,0.632062,0.628146,0.626266,0.809271,0.665701,0.526099,0.678147,0.732602,0.552339,0.625736


In [4]:
# Per-fold metrics for the MLP and Transformer-family runs.
mlp_trans_fold_df = pd.read_csv('./data/5_final_fold_results.csv')
num_folds = 10

# One label per (architecture, sampling) variant. Each label is repeated for
# `num_folds` consecutive rows, in exactly this order.
# NOTE(review): labels are assigned purely by row position -- confirm that the
# CSV rows are grouped by variant in this order.
variant_labels = [
    'MLP', 'MLP with ROS', 'MLP with SMOTE',
    'Transformer', 'Transformer with ROS', 'Transformer with SMOTE',
    'TabTransformer', 'TabTransformer with ROS', 'TabTransformer with SMOTE',
    'FTTransformer', 'FTTransformer with ROS', 'FTTransformer with SMOTE',
]
model_names = [label for label in variant_labels for _ in range(num_folds)]
mlp_trans_fold_df['Model Name'] = model_names

# Lower-case every column label (this renames the columns; it does not reorder them).
mlp_trans_fold_df.columns = [column.lower() for column in mlp_trans_fold_df.columns]
mlp_trans_fold_df

Unnamed: 0,fold,model name,accuracy,precision,recall,f1_score,roc_auc,attention_weights
0,1,MLP,0.772293,0.756238,0.760196,0.757818,0.905977,
1,2,MLP,0.763010,0.746337,0.755540,0.750704,0.899468,
2,3,MLP,0.763010,0.743496,0.765572,0.753224,0.902575,
3,4,MLP,0.774824,0.759398,0.771537,0.764981,0.906906,
4,5,MLP,0.762447,0.745330,0.757961,0.751172,0.898570,
...,...,...,...,...,...,...,...,...
115,6,FTTransformer with SMOTE,0.873851,0.874368,0.873848,0.872224,0.960346,
116,7,FTTransformer with SMOTE,0.874565,0.875111,0.874562,0.872525,0.958017,
117,8,FTTransformer with SMOTE,0.877777,0.877824,0.877774,0.876277,0.961730,
118,9,FTTransformer with SMOTE,0.880364,0.880113,0.880361,0.879117,0.959837,


In [5]:
def _fold_scores(fold_df, model_name, metric):
    """Return one model's per-fold scores as a plain list, ordered by fold.

    Parameters
    ----------
    fold_df : pd.DataFrame
        Fold-results frame containing 'model name', 'fold' and `metric` columns.
    model_name : str
        Exact value to match in the 'model name' column.
    metric : str
        Name of the score column to extract.

    Returns
    -------
    list
        The `metric` values of the matching rows, sorted by fold number.
        Empty when no row matches `model_name`.
    """
    model_rows = fold_df.loc[fold_df['model name'] == model_name]
    # The Wilcoxon signed-rank tests in later cells pair two score lists by
    # position, so fix the order by fold instead of relying on CSV row order.
    return model_rows.sort_values('fold', kind='stable')[metric].tolist()


# Logistic Regression / XGBoost: macro-averaged F1 per fold.
lg_original_f1 = _fold_scores(lg_xg_fold_df, 'Logistic Regression', 'macro f1')
lg_ros_f1 = _fold_scores(lg_xg_fold_df, 'Logistic Regression ROS', 'macro f1')
lg_smote_f1 = _fold_scores(lg_xg_fold_df, 'Logistic Regression SMOTE', 'macro f1')
xg_original_f1 = _fold_scores(lg_xg_fold_df, 'XGBoost', 'macro f1')
xg_ros_f1 = _fold_scores(lg_xg_fold_df, 'XGBoost ROS', 'macro f1')
xg_smote_f1 = _fold_scores(lg_xg_fold_df, 'XGBoost SMOTE', 'macro f1')

# MLP / Transformer family: the F1 column is named 'f1_score' in this frame.
mlp_original_f1 = _fold_scores(mlp_trans_fold_df, 'MLP', 'f1_score')
mlp_ros_f1 = _fold_scores(mlp_trans_fold_df, 'MLP with ROS', 'f1_score')
mlp_smote_f1 = _fold_scores(mlp_trans_fold_df, 'MLP with SMOTE', 'f1_score')
transformer_original_f1 = _fold_scores(mlp_trans_fold_df, 'Transformer', 'f1_score')
transformer_ros_f1 = _fold_scores(mlp_trans_fold_df, 'Transformer with ROS', 'f1_score')
transformer_smote_f1 = _fold_scores(mlp_trans_fold_df, 'Transformer with SMOTE', 'f1_score')
tabtrans_original_f1 = _fold_scores(mlp_trans_fold_df, 'TabTransformer', 'f1_score')
tabtrans_ros_f1 = _fold_scores(mlp_trans_fold_df, 'TabTransformer with ROS', 'f1_score')
tabtrans_smote_f1 = _fold_scores(mlp_trans_fold_df, 'TabTransformer with SMOTE', 'f1_score')
# NOTE: the `tftransformer_*` names hold the FTTransformer scores; the spelling
# is kept because later cells refer to these exact names.
tftransformer_original_f1 = _fold_scores(mlp_trans_fold_df, 'FTTransformer', 'f1_score')
tftransformer_ros_f1 = _fold_scores(mlp_trans_fold_df, 'FTTransformer with ROS', 'f1_score')
tftransformer_smote_f1 = _fold_scores(mlp_trans_fold_df, 'FTTransformer with SMOTE', 'f1_score')

Baseline

In [32]:
# Paired Wilcoxon signed-rank tests on the per-fold F1 scores of the models
# trained without resampling. Keys are the labels printed below; values are
# the two score lists handed to the test, in argument order.
baseline_pairs = {
    "MLP vs LR": (mlp_original_f1, lg_original_f1),
    "MLP vs XGBoost": (mlp_original_f1, xg_original_f1),
    "Transformer vs LR": (transformer_original_f1, lg_original_f1),
    "Transformer vs XGBoost": (transformer_original_f1, xg_original_f1),
    "Transformer vs MLP": (transformer_original_f1, mlp_original_f1),
}
baseline_tests = {
    pair_label: wilcoxon(first_scores, second_scores)
    for pair_label, (first_scores, second_scores) in baseline_pairs.items()
}

# Bind each (statistic, p-value) to its own name; the p-values are gathered
# again in the multiple-comparison cell.
stat_mlp_lg, p_value_mlp_lg = baseline_tests["MLP vs LR"]
stat_mlp_xg, p_value_mlp_xg = baseline_tests["MLP vs XGBoost"]
stat_transformer_lg, p_value_transformer_lg = baseline_tests["Transformer vs LR"]
stat_transformer_xg, p_value_transformer_xg = baseline_tests["Transformer vs XGBoost"]
stat_transformer_mlp, p_value_transformer_mlp = baseline_tests["Transformer vs MLP"]

for pair_label, (test_statistic, test_p_value) in baseline_tests.items():
    print(f"{pair_label}: statistic={test_statistic}, p-value={test_p_value}")

MLP vs LR: statistic=0.0, p-value=0.001953125
MLP vs XGBoost: statistic=0.0, p-value=0.001953125
Transformer vs LR: statistic=0.0, p-value=0.001953125
Transformer vs XGBoost: statistic=0.0, p-value=0.001953125
Transformer vs MLP: statistic=0.0, p-value=0.001953125


In [33]:
# Paired Wilcoxon signed-rank tests among the neural models (no resampling).
# Each entry is (printed label, first score list, second score list).
neural_pairs = [
    ("MLP vs TabTransformer", mlp_original_f1, tabtrans_original_f1),
    ("MLP vs FTTransformer", mlp_original_f1, tftransformer_original_f1),
    ("Transformer vs TabTransformer", transformer_original_f1, tabtrans_original_f1),
    ("Transformer vs FTTransformer", transformer_original_f1, tftransformer_original_f1),
    ("TabTransformer vs FTTransformer", tabtrans_original_f1, tftransformer_original_f1),
]
neural_tests = [wilcoxon(entry[1], entry[2]) for entry in neural_pairs]

# Results come back in the same order as `neural_pairs`.
(
    (stat_mlp_tt, p_mlp_tt),
    (stat_mlp_ttf, p_mlp_ttf),
    (stat_tt, p_tt),
    (stat_ft, p_ft),
    (stat_ttf, p_ttf),
) = neural_tests


for pair_entry, (test_statistic, test_p_value) in zip(neural_pairs, neural_tests):
    print(f"{pair_entry[0]}: statistic={test_statistic}, p-value={test_p_value}")

MLP vs TabTransformer: statistic=0.0, p-value=0.001953125
MLP vs FTTransformer: statistic=0.0, p-value=0.001953125
Transformer vs TabTransformer: statistic=0.0, p-value=0.001953125
Transformer vs FTTransformer: statistic=9.0, p-value=0.064453125
TabTransformer vs FTTransformer: statistic=0.0, p-value=0.001953125


In [39]:
from statsmodels.stats.multitest import multipletests

# The family of baseline comparisons: each label sits next to its raw Wilcoxon
# p-value so the two cannot drift out of step.
baseline_family = [
    ("MLP vs LR", p_value_mlp_lg),
    ("MLP vs XGBoost", p_value_mlp_xg),
    ("Transformer vs LR", p_value_transformer_lg),
    ("Transformer vs XGBoost", p_value_transformer_xg),
    ("Transformer vs MLP", p_value_transformer_mlp),
    ("MLP vs TabTransformer", p_mlp_tt),
    ("MLP vs FTTransformer", p_mlp_ttf),
    ("Transformer vs TabTransformer", p_tt),
    ("Transformer vs FTTransformer", p_ft),
    ("TabTransformer vs FTTransformer", p_ttf),
]
p_values = [raw_p for pair_label, raw_p in baseline_family]
comparison_names = [pair_label for pair_label, raw_p in baseline_family]

# Adjust for multiple comparisons with method='fdr_bh' (false discovery rate).
# NOTE: despite its name, `pvals_bonf` holds the FDR-adjusted p-values, not
# Bonferroni-adjusted ones (that would be method='bonferroni'). The name is
# kept because the reporting loop below reads it.
_, pvals_bonf, _, _ = multipletests(p_values, method='fdr_bh')

# Significance function
def get_sig(p):
    """Translate a p-value into a significance marker.

    Returns '***' when p < 0.001, '**' when p < 0.01, '*' when p < 0.05,
    and 'ns' (not significant) in every other case.
    """
    # Cut-offs are checked from strictest to loosest; the first match wins.
    markers_by_cutoff = ((0.001, '***'), (0.01, '**'), (0.05, '*'))
    for cutoff, marker in markers_by_cutoff:
        if p < cutoff:
            return marker
    return 'ns'

# Report: raw p-value, adjusted p-value and the significance marker derived
# from the adjusted value, one comparison per line.
print("Comparison  \t\t\t Raw p \t\t FDR p\t\tSignificance")
for report_row in zip(comparison_names, p_values, pvals_bonf):
    pair_label, raw_p, adjusted_p = report_row
    print(f"{pair_label:<30}\t {raw_p:.4f}\t\t {adjusted_p:.4f}\t\t{get_sig(adjusted_p)}")


Comparison  			 Raw p 		 FDR p		Significance
MLP vs LR                     	 0.0020		 0.0022		**
MLP vs XGBoost                	 0.0020		 0.0022		**
Transformer vs LR             	 0.0020		 0.0022		**
Transformer vs XGBoost        	 0.0020		 0.0022		**
Transformer vs MLP            	 0.0020		 0.0022		**
MLP vs TabTransformer         	 0.0020		 0.0022		**
MLP vs FTTransformer          	 0.0020		 0.0022		**
Transformer vs TabTransformer 	 0.0020		 0.0022		**
Transformer vs FTTransformer  	 0.0645		 0.0645		ns
TabTransformer vs FTTransformer	 0.0020		 0.0022		**


ROS / SMOTE

In [41]:
# Effect of resampling: for every model, a paired Wilcoxon signed-rank test of
# the per-fold F1 on the original data against ROS and against SMOTE.
# Each entry is (printed label, original scores, resampled scores).
sampling_pairs = [
    ("Logistic Regression Original vs ROS", lg_original_f1, lg_ros_f1),
    ("Logistic Regression Original vs SMOTE", lg_original_f1, lg_smote_f1),
    ("XGBoost Original vs ROS", xg_original_f1, xg_ros_f1),
    ("XGBoost Original vs SMOTE", xg_original_f1, xg_smote_f1),
    ("MLP Original vs ROS", mlp_original_f1, mlp_ros_f1),
    ("MLP Original vs SMOTE", mlp_original_f1, mlp_smote_f1),
    ("Transformer Original vs ROS", transformer_original_f1, transformer_ros_f1),
    ("Transformer Original vs SMOTE", transformer_original_f1, transformer_smote_f1),
    ("TabTransformer Original vs ROS", tabtrans_original_f1, tabtrans_ros_f1),
    ("TabTransformer Original vs SMOTE", tabtrans_original_f1, tabtrans_smote_f1),
    ("FTTransformer Original vs ROS", tftransformer_original_f1, tftransformer_ros_f1),
    ("FTTransformer Original vs SMOTE", tftransformer_original_f1, tftransformer_smote_f1),
]

sampling_tests = []
for pair_label, original_scores, resampled_scores in sampling_pairs:
    test_statistic, test_p_value = wilcoxon(original_scores, resampled_scores)
    sampling_tests.append((test_statistic, test_p_value))
    print("{}: statistic={}, p-value={}".format(pair_label, test_statistic, test_p_value))

# Bind each (statistic, p-value) to its own name, in the order of
# `sampling_pairs`; the p-values are collected again for the correction below.
(
    (stat_lo_ros, p_lo_ros),
    (stat_lo_smote, p_lo_smote),
    (stat_xg_ros, p_xg_ros),
    (stat_xg_smote, p_xg_smote),
    (stat_mlp_ros, p_mlp_ros),
    (stat_mlp_smote, p_mlp_smote),
    (stat_transformer_ros, p_transformer_ros),
    (stat_transformer_smote, p_transformer_smote),
    (stat_tabtrans_ros, p_tabtrans_ros),
    (stat_tabtrans_smote, p_tabtrans_smote),
    (stat_tftransformer_ros, p_tftransformer_ros),
    (stat_tftransformer_smote, p_tftransformer_smote),
) = sampling_tests

# The family of resampling comparisons, as an ordered mapping from the printed
# label to the raw Wilcoxon p-value.
sampling_family = {
    "Logistic Regression Original vs ROS": p_lo_ros,
    "Logistic Regression Original vs SMOTE": p_lo_smote,
    "XGBoost Original vs ROS": p_xg_ros,
    "XGBoost Original vs SMOTE": p_xg_smote,
    "MLP Original vs ROS": p_mlp_ros,
    "MLP Original vs SMOTE": p_mlp_smote,
    "Transformer Original vs ROS": p_transformer_ros,
    "Transformer Original vs SMOTE": p_transformer_smote,
    "TabTransformer Original vs ROS": p_tabtrans_ros,
    "TabTransformer Original vs SMOTE": p_tabtrans_smote,
    "FTTransformer Original vs ROS": p_tftransformer_ros,
    "FTTransformer Original vs SMOTE": p_tftransformer_smote,
}
p_values = list(sampling_family.values())

comparison_names = list(sampling_family.keys())

# Adjust for multiple comparisons with method='fdr_bh' (false discovery rate).
# NOTE: despite its name, `pvals_bonf` holds the FDR-adjusted p-values, not
# Bonferroni-adjusted ones (that would be method='bonferroni'). The name is
# kept because the reporting loop below reads it.
_, pvals_bonf, _, _ = multipletests(p_values, method='fdr_bh')

# Significance function
def get_sig(p):
    """Return the significance marker for a p-value.

    '***' for p < 0.001, '**' for p < 0.01, '*' for p < 0.05, otherwise 'ns'.
    """
    # Evaluated left to right, so the strictest threshold that holds wins.
    return (
        '***' if p < 0.001
        else '**' if p < 0.01
        else '*' if p < 0.05
        else 'ns'
    )
    
# Report: raw p-value, adjusted p-value and the significance marker derived
# from the adjusted value, one comparison per line.
print("Comparison  \t\t\t\t\t Raw p\t\t FDR p\t\tSignificance")
for report_row in zip(comparison_names, p_values, pvals_bonf):
    pair_label, raw_p, adjusted_p = report_row
    print(f"{pair_label:<40}\t {raw_p:.4f}\t\t {adjusted_p:.4f}\t\t{get_sig(adjusted_p)}")

Logistic Regression Original vs ROS: statistic=0.0, p-value=0.001953125
Logistic Regression Original vs SMOTE: statistic=0.0, p-value=0.001953125
XGBoost Original vs ROS: statistic=0.0, p-value=0.001953125
XGBoost Original vs SMOTE: statistic=0.0, p-value=0.001953125
MLP Original vs ROS: statistic=0.0, p-value=0.001953125
MLP Original vs SMOTE: statistic=0.0, p-value=0.001953125
Transformer Original vs ROS: statistic=0.0, p-value=0.001953125
Transformer Original vs SMOTE: statistic=0.0, p-value=0.001953125
TabTransformer Original vs ROS: statistic=0.0, p-value=0.001953125
TabTransformer Original vs SMOTE: statistic=0.0, p-value=0.001953125
FTTransformer Original vs ROS: statistic=0.0, p-value=0.001953125
FTTransformer Original vs SMOTE: statistic=0.0, p-value=0.001953125
Comparison  					 Raw p		 FDR p		Significance
Logistic Regression Original vs ROS     	 0.0020		 0.0020		**
Logistic Regression Original vs SMOTE   	 0.0020		 0.0020		**
XGBoost Original vs ROS                 	 0.0020

In [None]:
# # RQ1
# from scipy.stats import wilcoxon
# import pandas as pd

# # Mock data structure (already defined by the user)
# results = {}

# # Mock: RQ1 - model comparison (original data)
# def run_rq1_tests():
#     comparisons = [
#         ("MLP", "Logistic Regression"),
#         ("MLP", "XGBoost"),
#         ("TabTransformer", "Logistic Regression"),
#         ("TabTransformer", "XGBoost"),
#         ("TabTransformer", "MLP")  # optional
#     ]
    
#     model_f1 = {
#         "Logistic Regression": lg_original_f1,
#         "XGBoost": xg_original_f1,
#         "MLP": mlp_original_f1,
#         "TabTransformer": tabtrans_original_f1
#     }

#     rq1_results = []
#     for model_a, model_b in comparisons:
#         stat, p = wilcoxon(model_f1[model_a], model_f1[model_b])
#         rq1_results.append({
#             "Comparison": f"{model_a} vs {model_b}",
#             "Statistic": stat,
#             "p-value": p
#         })
#     return pd.DataFrame(rq1_results)

# # Mock: RQ2 - sampling-method comparison (each model: original vs ROS / SMOTE)
# def run_rq2_tests():
#     comparisons = [
#         ("Logistic Regression", lg_original_f1, lg_ros_f1, "ROS"),
#         ("Logistic Regression", lg_original_f1, lg_smote_f1, "SMOTE"),
#         ("XGBoost", xg_original_f1, xg_ros_f1, "ROS"),
#         ("XGBoost", xg_original_f1, xg_smote_f1, "SMOTE"),
#         ("MLP", mlp_original_f1, mlp_ros_f1, "ROS"),
#         ("MLP", mlp_original_f1, mlp_smote_f1, "SMOTE"),
#         ("TabTransformer", tabtrans_original_f1, tabtrans_ros_f1, "ROS"),
#         ("TabTransformer", tabtrans_original_f1, tabtrans_smote_f1, "SMOTE"),
#     ]

#     rq2_results = []
#     for model, original, sampled, method in comparisons:
#         stat, p = wilcoxon(sampled, original)
#         rq2_results.append({
#             "Model": model,
#             "Method": method,
#             "Statistic": stat,
#             "p-value": p
#         })
#     return pd.DataFrame(rq2_results)

# # Run the tests
# rq1_df = run_rq1_tests()
# rq2_df = run_rq2_tests()


In [None]:
# import numpy as np
# import pandas as pd
# from itertools import combinations
# from typing import List, Tuple

# # Define permutation test function
# def permutation_test(x: List[float], y: List[float], n_permutations=10000, metric='mean', alternative='two-sided', seed=42) -> Tuple[float, float]:
#     np.random.seed(seed)
#     x = np.array(x)
#     y = np.array(y)
#     observed_diff = np.mean(x - y) if metric == 'mean' else np.median(x - y)
#     diffs = []

#     for _ in range(n_permutations):
#         mask = np.random.rand(len(x)) > 0.5
#         x_perm = np.where(mask, x, y)
#         y_perm = np.where(mask, y, x)
#         diff = np.mean(x_perm - y_perm) if metric == 'mean' else np.median(x_perm - y_perm)
#         diffs.append(diff)

#     diffs = np.array(diffs)

#     if alternative == 'two-sided':
#         p = np.mean(np.abs(diffs) >= np.abs(observed_diff))
#     elif alternative == 'greater':
#         p = np.mean(diffs >= observed_diff)
#     else:
#         p = np.mean(diffs <= observed_diff)

#     return observed_diff, p

# # Placeholder for actual score lists (to be filled from user data)
# scores = {
#     "Logistic Regression": [],
#     "Logistic Regression ROS": [],
#     "Logistic Regression SMOTE": [],
#     "XGBoost": [],
#     "XGBoost ROS": [],
#     "XGBoost SMOTE": [],
#     "MLP": [],
#     "MLP with ROS": [],
#     "MLP with SMOTE": [],
#     "TabTransformer": [],
#     "TabTransformer with ROS": [],
#     "TabTransformer with SMOTE": [],
# }

# # Replace this with actual values from user
# scores["Logistic Regression"] = lg_original_f1
# scores["Logistic Regression ROS"] = lg_ros_f1
# scores["Logistic Regression SMOTE"] = lg_smote_f1
# scores["XGBoost"] = xg_original_f1
# scores["XGBoost ROS"] = xg_ros_f1
# scores["XGBoost SMOTE"] = xg_smote_f1
# scores["MLP"] = mlp_original_f1
# scores["MLP with ROS"] = mlp_ros_f1
# scores["MLP with SMOTE"] = mlp_smote_f1
# scores["TabTransformer"] = tabtrans_original_f1
# scores["TabTransformer with ROS"] = tabtrans_ros_f1
# scores["TabTransformer with SMOTE"] = tabtrans_smote_f1

# # Define RQ1 comparisons (between models on original data)
# rq1_comparisons = [
#     ("MLP", "Logistic Regression"),
#     ("MLP", "XGBoost"),
#     ("TabTransformer", "Logistic Regression"),
#     ("TabTransformer", "XGBoost"),
#     ("TabTransformer", "MLP"),
# ]

# # Define RQ2 comparisons (same model with different sampling)
# rq2_comparisons = [
#     ("Logistic Regression", "Logistic Regression ROS"),
#     ("Logistic Regression", "Logistic Regression SMOTE"),
#     ("XGBoost", "XGBoost ROS"),
#     ("XGBoost", "XGBoost SMOTE"),
#     ("MLP", "MLP with ROS"),
#     ("MLP", "MLP with SMOTE"),
#     ("TabTransformer", "TabTransformer with ROS"),
#     ("TabTransformer", "TabTransformer with SMOTE"),
# ]

# # Run permutation tests
# def run_tests(pairs, label):
#     results = []
#     for a, b in pairs:
#         diff, p = permutation_test(scores[a], scores[b])
#         results.append({
#             "Comparison": f"{a} vs {b}",
#             "Mean Difference": round(diff, 4),
#             "p-value": round(p, 4)
#         })
#     return pd.DataFrame(results)

# rq1_results = run_tests(rq1_comparisons, "RQ1")
# rq2_results = run_tests(rq2_comparisons, "RQ2")

In [None]:
# import pandas as pd
# from scipy.stats import wilcoxon

# # Define the model comparison groups (RQ1) -- deep vs. traditional
# rq1_comparisons = {
#     "MLP vs Logistic": ("mlp_original_f1", "lg_original_f1"),
#     "MLP vs XGBoost": ("mlp_original_f1", "xg_original_f1"),
#     "TabTransformer vs Logistic": ("tabtrans_original_f1", "lg_original_f1"),
#     "TabTransformer vs XGBoost": ("tabtrans_original_f1", "xg_original_f1"),
#     "MLP vs TabTransformer": ("mlp_original_f1", "tabtrans_original_f1"),
# }

# # Define the sampling-strategy comparison groups (RQ2)
# rq2_comparisons = {
#     "Logistic: Original vs ROS": ("lg_original_f1", "lg_ros_f1"),
#     "Logistic: Original vs SMOTE": ("lg_original_f1", "lg_smote_f1"),
#     "XGBoost: Original vs ROS": ("xg_original_f1", "xg_ros_f1"),
#     "XGBoost: Original vs SMOTE": ("xg_original_f1", "xg_smote_f1"),
#     "MLP: Original vs ROS": ("mlp_original_f1", "mlp_ros_f1"),
#     "MLP: Original vs SMOTE": ("mlp_original_f1", "mlp_smote_f1"),
#     "TabTransformer: Original vs ROS": ("tabtrans_original_f1", "tabtrans_ros_f1"),
#     "TabTransformer: Original vs SMOTE": ("tabtrans_original_f1", "tabtrans_smote_f1"),
# }

# # Look up each model's list of F1 scores in the global namespace
# global_vars = globals()

# # Run the Wilcoxon tests
# def run_wilcoxon_tests(comparison_dict):
#     results = []
#     for label, (a_name, b_name) in comparison_dict.items():
#         a = global_vars[a_name]
#         b = global_vars[b_name]
#         stat, p = wilcoxon(a, b)
#         results.append({
#             "Comparison": label,
#             "Mean A": round(pd.Series(a).mean(), 4),
#             "Mean B": round(pd.Series(b).mean(), 4),
#             "Statistic": round(stat, 4),
#             'Mean Difference': round(pd.Series(a).mean() - pd.Series(b).mean(), 4),
#             "p-value": round(p, 4),
#             "Significant (p < 0.05)": p < 0.05
#         })
#     return pd.DataFrame(results)

# rq1_results = run_wilcoxon_tests(rq1_comparisons)
# rq2_results = run_wilcoxon_tests(rq2_comparisons)

# print("RQ1 Results:")
# print(rq1_results)
# print("\nRQ2 Results:")
# print(rq2_results)


RQ1 Results:
                   Comparison  Mean A  Mean B  \
0             MLP vs Logistic  0.7547  0.6296   
1              MLP vs XGBoost  0.7547  0.7661   
2  TabTransformer vs Logistic  0.7794  0.6296   
3   TabTransformer vs XGBoost  0.7794  0.7661   
4       MLP vs TabTransformer  0.7547  0.7794   

   Statistic": round(stat, 4),Mean Difference  p-value  Significant (p < 0.05)  
0                                      0.1251   0.0020                    True  
1                                     -0.0114   0.0020                    True  
2                                      0.1498   0.0020                    True  
3                                      0.0133   0.0039                    True  
4                                     -0.0247   0.0020                    True  

RQ2 Results:
                          Comparison  Mean A  Mean B  \
0          Logistic: Original vs ROS  0.6296  0.6961   
1        Logistic: Original vs SMOTE  0.6296  0.7188   
2           XGBoost: Ori