In [1]:
import os
import json
import pandas as pd
from tqdm import tqdm

In [2]:
# Parameter grid: every (split type, split index, model) combination is looked up below.
# Split indices are kept as strings because they are used as path components.
split_indices = [str(index) for index in range(5)]
split_types = [
    "random_split",
    "mmseqs2_split",
    "cdhit_split",
]
model_names = [
    "rf",
    "adaboost",
    "gradboost",
    "knn",
    "xgboost",
    "lightgbm",
]

In [3]:
# Collect one metrics row per run into a single table.
results = []
task = "Antibacterial"
# task = "Nonfouling"
# Which row of metrics.csv to keep; matched case-insensitively against its "Split" column.
mode = "test"
# Row position used only when a metrics.csv has no "Split" column
# (this is the position the notebook originally read unconditionally).
fallback_row = 2

# Walk every (split type, split index, model) combination.
for split_type in split_types:
    for split_index in split_indices:
        for model_name in model_names:
            result_dir = os.path.join(
                "checkpoints", task, split_type, model_name, split_index
            )

            if not os.path.exists(result_dir):
                print(f"Directory {result_dir} does not exist, skipping...")
                continue

            metric_file = os.path.join(result_dir, "metrics.csv")
            # A run directory may exist without its metrics file; skip that run
            # rather than letting read_csv abort the whole sweep.
            if not os.path.isfile(metric_file):
                print(f"Metrics file {metric_file} does not exist, skipping...")
                continue

            res = pd.read_csv(metric_file)

            # Pick the row by its split label instead of trusting row order.
            if "Split" in res.columns:
                selected = res[res["Split"].astype(str).str.lower() == mode]
            else:
                selected = res.iloc[fallback_row : fallback_row + 1]

            if selected.empty:
                print(f"No '{mode}' row found in {metric_file}, skipping...")
                continue

            # Identify the run, then append the metric columns of the selected row.
            row = {
                "split_type": split_type,
                "split_index": split_index,
                "model_name": model_name,
            }
            row.update(selected.iloc[0].to_dict())

            results.append(row)

# Build the DataFrame once from the list of row dicts.
df = pd.DataFrame(results)


In [4]:
# Display the collected metrics: one row per (split_type, split_index, model_name) run found on disk.
df

Unnamed: 0,split_type,split_index,model_name,Split,Model,Fingerprint,accuracy,balanced-accuracy,precision,recall,...,macro-f1,weighted-f1,mcc,kappa,g-mean,roc-auc,avg-roc-auc,pr-auc,brier-score,log-loss
0,random_split,0,rf,Test,rf,ecfp6,0.808333,0.807972,0.829630,0.771232,...,0.807950,0.808034,0.617871,0.616366,0.807136,0.887144,0.887144,0.884876,0.138650,0.467811
1,random_split,0,adaboost,Test,adaboost,ecfp6,0.739773,0.739796,0.734848,0.742158,...,0.739766,0.739779,0.479569,0.479545,0.739792,0.811292,0.811292,0.817079,0.204362,0.599287
2,random_split,0,gradboost,Test,gradboost,ecfp6,0.771212,0.771227,0.766894,0.772762,...,0.771204,0.771217,0.542430,0.542414,0.771226,0.848948,0.848948,0.846459,0.159023,0.486544
3,random_split,0,knn,Test,knn,ecfp6,0.793939,0.794337,0.768851,0.834736,...,0.793720,0.793654,0.590309,0.588183,0.793309,0.864713,0.864713,0.827921,0.151303,1.519038
4,random_split,0,xgboost,Test,xgboost,ecfp6,0.799621,0.799561,0.800154,0.793420,...,0.799582,0.799610,0.599191,0.599171,0.799537,0.878959,0.878959,0.875044,0.140812,0.437795
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85,cdhit_split,4,adaboost,Test,adaboost,ecfp6,0.684729,0.646152,0.565728,0.510593,...,0.648903,0.680819,0.299657,0.298754,0.631772,0.719678,0.719678,0.577957,0.215937,0.623111
86,cdhit_split,4,gradboost,Test,gradboost,ecfp6,0.705191,0.672876,0.593258,0.559322,...,0.674945,0.703162,0.350547,0.350190,0.663225,0.741005,0.741005,0.606326,0.194264,0.569612
87,cdhit_split,4,knn,Test,knn,ecfp6,0.635089,0.625344,0.491630,0.591102,...,0.617882,0.640958,0.242708,0.239934,0.624406,0.664853,0.664853,0.489130,0.246093,2.312845
88,cdhit_split,4,xgboost,Test,xgboost,ecfp6,0.693066,0.658039,0.576484,0.534958,...,0.660353,0.690349,0.321727,0.321204,0.646426,0.733111,0.733111,0.588611,0.203362,0.601823


In [6]:
# Summarise across split indices: group by (split_type, model_name) and compute the
# mean and standard deviation of every numeric column.
group_keys = ["split_type", "model_name"]
numeric_cols = df.select_dtypes(include="number").columns
metric_stats = df.groupby(group_keys)[numeric_cols].agg(["mean", "std"])
# reset_index turns the group keys back into ordinary columns.
grouped_df = metric_stats.reset_index()


In [7]:
# Keep only the group keys and the roc-auc statistics from the aggregated table.
summary_columns = ["split_type", "model_name", "roc-auc"]
df_result = grouped_df[summary_columns]