# Analysis of the similarity between train and test data in AM-I, AM-II, and AM-III

In [6]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem, DataStructs
from multiprocessing import Pool, cpu_count

# Similarity thresholds, listed from highest to lowest
THRESHOLDS = [0.95, 0.90, 0.80, 0.70, 0.60, 0.50]
PERCENTILES = [50, 80, 95]  # percentiles of the max-similarity distribution to report
MAX_CORES = 26  # upper bound on worker processes used for the parallel computation

# Output directory, asked for interactively; an empty answer falls back to ./similarity1
OUTPUT_DIR = input("请输入输出目录路径（默认 ./similarity1）: ").strip() or "./similarity1"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Directory that holds the train/test split CSV files
DATA_DIR = './train_test_split/'

def smiles_to_fp(smiles, radius=2, nBits=2048):
    """Convert a SMILES string into a Morgan fingerprint bit vector.

    Args:
        smiles: SMILES string to parse with RDKit.
        radius: Morgan radius forwarded to the fingerprint generator.
        nBits: Length of the bit vector forwarded to the fingerprint generator.

    Returns:
        The fingerprint bit vector, or None when RDKit cannot parse the SMILES.
    """
    parsed = Chem.MolFromSmiles(smiles)
    return (
        None
        if parsed is None
        else AllChem.GetMorganFingerprintAsBitVect(parsed, radius, nBits)
    )

def compute_max_sim(args):
    """Return the highest Tanimoto similarity between one query and a reference set.

    Args:
        args: A ``(test_fp, train_fps)`` tuple, packed into one argument so the
            function can be used with ``Pool.map``.

    Returns:
        The maximum similarity, or 0.0 when the query fingerprint is None or
        the similarity list comes back empty.
    """
    query_fp, reference_fps = args
    if query_fp is None:
        return 0.0
    similarities = DataStructs.BulkTanimotoSimilarity(query_fp, reference_fps)
    if not similarities:
        return 0.0
    return max(similarities)

# Main loop: for every "<prefix>_test.csv" in DATA_DIR, measure how similar each
# test molecule is to its nearest neighbour in "<prefix>_train.csv", then export
# per-threshold subsets, summary statistics, a histogram and a CDF plot.
for file in os.listdir(DATA_DIR):
    if not file.endswith('_test.csv'):
        continue

    dataset_name = os.path.splitext(file)[0]
    # Strip only the trailing suffix; str.replace would also remove any earlier
    # occurrence of '_test.csv' inside the file name.
    prefix = file[:-len('_test.csv')]
    train_file = os.path.join(DATA_DIR, prefix + '_train.csv')
    test_file = os.path.join(DATA_DIR, file)  # full path of the test file

    if not os.path.exists(train_file):
        print(f"❌ 未找到对应的 train 文件: {train_file}")
        continue

    print(f"处理: {file} vs {train_file}")

    # Read both splits using their full paths
    test_df = pd.read_csv(test_file)
    train_df = pd.read_csv(train_file)

    # An empty test set would crash np.percentile and divide by zero below
    if test_df.empty:
        print(f"⚠️ 测试集为空，已跳过: {test_file}")
        continue

    # Fingerprints for the training set; unparsable SMILES are discarded
    train_fps = [smiles_to_fp(s) for s in train_df['SMILES'].dropna()]
    train_fps = [fp for fp in train_fps if fp is not None]

    # Keep exactly one entry per test row (None for missing/non-string SMILES)
    # so that max_sims stays aligned with test_df. Dropping NaN rows here would
    # make the column assignment below fail and shift the positional indices.
    test_fps = [
        smiles_to_fp(s) if isinstance(s, str) else None
        for s in test_df['SMILES']
    ]

    # Compute the maximum similarity of every test molecule in parallel
    with Pool(min(MAX_CORES, cpu_count())) as pool:
        max_sims = pool.map(compute_max_sim, [(fp, train_fps) for fp in test_fps])

    test_df["Max_Similarity"] = max_sims

    # Positional indices of the test rows that reach each threshold
    results = {
        thr: [idx for idx, sim in enumerate(max_sims) if sim >= thr]
        for thr in THRESHOLDS
    }

    # Save one CSV per threshold that has at least one matching row
    total_test = len(test_df)
    stats = []
    for thr in THRESHOLDS:
        indices = results[thr]
        if not indices:
            continue
        out_df = test_df.iloc[indices]
        # round() instead of int(): int() truncates when thr * 100 lands just
        # below an integer because of floating-point representation.
        out_name = os.path.join(OUTPUT_DIR, f"{dataset_name}_sim{round(thr * 100)}.csv")
        out_df.to_csv(out_name, index=False)
        count = len(out_df)
        stats.append((thr, count, count / total_test))
        print(f"阈值 {thr:.2f}: 样本数={count}, 占比={count/total_test:.2%}, 已保存 {out_name}")

    # Percentiles of the max-similarity distribution
    percentiles_values = np.percentile(max_sims, PERCENTILES)
    percentile_stats = [(f"P{p}", val) for p, val in zip(PERCENTILES, percentiles_values)]

    # Save threshold statistics and percentiles as two sheets of one workbook
    stats_df = pd.DataFrame(stats, columns=["Threshold", "Count", "Proportion"])
    percentiles_df = pd.DataFrame(percentile_stats, columns=["Percentile", "Similarity"])
    stats_path = os.path.join(OUTPUT_DIR, f"{dataset_name}_similarity_stats.xlsx")
    with pd.ExcelWriter(stats_path) as writer:
        stats_df.to_excel(writer, sheet_name="Threshold_Stats", index=False)
        percentiles_df.to_excel(writer, sheet_name="Percentiles", index=False)
    print(f"📑 已保存统计结果和分位数: {stats_path}")

    # Histogram of the maximum similarities
    plt.figure(figsize=(8,6))
    plt.hist(max_sims, bins=30, edgecolor='black', alpha=0.7)
    plt.xlabel("Maximum Similarity to Train Set")
    plt.ylabel("Frequency")
    plt.title(f"{dataset_name} - Max Similarity Distribution")
    plt.tight_layout()
    hist_path = os.path.join(OUTPUT_DIR, f"{dataset_name}_max_similarity_hist.png")
    plt.savefig(hist_path, dpi=300)
    plt.close()
    print(f"📊 已保存最大相似度分布直方图: {hist_path}")

    # Empirical CDF with the requested percentiles marked
    sorted_sims = np.sort(max_sims)
    cdf = np.arange(1, len(sorted_sims)+1) / len(sorted_sims)

    plt.figure(figsize=(8,6))
    plt.plot(sorted_sims, cdf, marker='.', linestyle='-', label='CDF')
    for p, val in zip(PERCENTILES, percentiles_values):
        plt.axvline(x=val, linestyle='--', alpha=0.7, label=f"P{p}={val:.2f}")
        plt.scatter([val], [p/100], color='red')
        plt.text(val, p/100, f" P{p}", fontsize=9, verticalalignment='bottom')
    plt.xlabel("Maximum Similarity to Train Set")
    plt.ylabel("Cumulative Proportion")
    plt.title(f"{dataset_name} - Max Similarity CDF")
    plt.legend()
    plt.grid(True, linestyle='--', alpha=0.6)
    plt.tight_layout()
    cdf_path = os.path.join(OUTPUT_DIR, f"{dataset_name}_max_similarity_cdf.png")
    plt.savefig(cdf_path, dpi=600)
    plt.close()
    print(f"📈 已保存最大相似度累积分布曲线: {cdf_path}")

print("✅ 所有文件处理完成!")

处理: AM-II-filtered_with_labels_k3_test.csv vs ./train_test_split/AM-II-filtered_with_labels_k3_train.csv




阈值 0.95: 样本数=1, 占比=0.54%, 已保存 ./similarity1/AM-II-filtered_with_labels_k3_test_sim95.csv
阈值 0.90: 样本数=1, 占比=0.54%, 已保存 ./similarity1/AM-II-filtered_with_labels_k3_test_sim90.csv
阈值 0.80: 样本数=15, 占比=8.06%, 已保存 ./similarity1/AM-II-filtered_with_labels_k3_test_sim80.csv
阈值 0.70: 样本数=51, 占比=27.42%, 已保存 ./similarity1/AM-II-filtered_with_labels_k3_test_sim70.csv
阈值 0.60: 样本数=120, 占比=64.52%, 已保存 ./similarity1/AM-II-filtered_with_labels_k3_test_sim60.csv
阈值 0.50: 样本数=171, 占比=91.94%, 已保存 ./similarity1/AM-II-filtered_with_labels_k3_test_sim50.csv
📑 已保存统计结果和分位数: ./similarity1/AM-II-filtered_with_labels_k3_test_similarity_stats.xlsx
📊 已保存最大相似度分布直方图: ./similarity1/AM-II-filtered_with_labels_k3_test_max_similarity_hist.png
📈 已保存最大相似度累积分布曲线: ./similarity1/AM-II-filtered_with_labels_k3_test_max_similarity_cdf.png
处理: AM-III-filtered_with_labels_k4_test.csv vs ./train_test_split/AM-III-filtered_with_labels_k4_train.csv




阈值 0.95: 样本数=1, 占比=0.81%, 已保存 ./similarity1/AM-III-filtered_with_labels_k4_test_sim95.csv
阈值 0.90: 样本数=2, 占比=1.63%, 已保存 ./similarity1/AM-III-filtered_with_labels_k4_test_sim90.csv
阈值 0.80: 样本数=10, 占比=8.13%, 已保存 ./similarity1/AM-III-filtered_with_labels_k4_test_sim80.csv
阈值 0.70: 样本数=37, 占比=30.08%, 已保存 ./similarity1/AM-III-filtered_with_labels_k4_test_sim70.csv
阈值 0.60: 样本数=81, 占比=65.85%, 已保存 ./similarity1/AM-III-filtered_with_labels_k4_test_sim60.csv
阈值 0.50: 样本数=112, 占比=91.06%, 已保存 ./similarity1/AM-III-filtered_with_labels_k4_test_sim50.csv
📑 已保存统计结果和分位数: ./similarity1/AM-III-filtered_with_labels_k4_test_similarity_stats.xlsx
📊 已保存最大相似度分布直方图: ./similarity1/AM-III-filtered_with_labels_k4_test_max_similarity_hist.png
📈 已保存最大相似度累积分布曲线: ./similarity1/AM-III-filtered_with_labels_k4_test_max_similarity_cdf.png
处理: AM-I-filtered_with_labels_k4_test.csv vs ./train_test_split/AM-I-filtered_with_labels_k4_train.csv




阈值 0.95: 样本数=37, 占比=5.43%, 已保存 ./similarity1/AM-I-filtered_with_labels_k4_test_sim95.csv
阈值 0.90: 样本数=41, 占比=6.02%, 已保存 ./similarity1/AM-I-filtered_with_labels_k4_test_sim90.csv
阈值 0.80: 样本数=60, 占比=8.81%, 已保存 ./similarity1/AM-I-filtered_with_labels_k4_test_sim80.csv
阈值 0.70: 样本数=208, 占比=30.54%, 已保存 ./similarity1/AM-I-filtered_with_labels_k4_test_sim70.csv
阈值 0.60: 样本数=458, 占比=67.25%, 已保存 ./similarity1/AM-I-filtered_with_labels_k4_test_sim60.csv
阈值 0.50: 样本数=622, 占比=91.34%, 已保存 ./similarity1/AM-I-filtered_with_labels_k4_test_sim50.csv
📑 已保存统计结果和分位数: ./similarity1/AM-I-filtered_with_labels_k4_test_similarity_stats.xlsx
📊 已保存最大相似度分布直方图: ./similarity1/AM-I-filtered_with_labels_k4_test_max_similarity_hist.png
📈 已保存最大相似度累积分布曲线: ./similarity1/AM-I-filtered_with_labels_k4_test_max_similarity_cdf.png
✅ 所有文件处理完成!


# Retention time prediction with respect to chemical similarity

In [8]:
import os
import glob
import re
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import lightgbm as lgb
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
from matplotlib.colors import LinearSegmentedColormap

# ================= Configuration =================
# DATA_FOLDER is scanned for per-threshold test CSVs; results go to OUTPUT_FOLDER.
DATA_FOLDER     = './similarity1'
OUTPUT_FOLDER   = './similarity-prediction'
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

# Folder holding the saved model artefacts for each dataset/model name
MODEL_FOLDERS = {
    'AM-I-filtered_with_labels_k4': './svr-models/AM-I-svr-model',
    'AM-II-filtered_with_labels_k3': './svr-models/AM-II-svr-model',
    'AM-III-filtered_with_labels_k4': './lgb-models'
}
SUMMARY_CSV_PATH = os.path.join(OUTPUT_FOLDER, 'similarity_prediction_summary.csv')
BOXPLOT_PATH    = os.path.join(OUTPUT_FOLDER, 'mae_mre_boxplot.png')
BOXSUMMARY_PATH = os.path.join(OUTPUT_FOLDER, 'mae_mre_box_summary.csv')

# Feature column definitions (consistent with training script)
# NOTE: NUMERIC_FEATS must stay first in FEATURE_COLS; evaluate_svr scales the
# leading len(NUMERIC_FEATS) columns of the feature matrix by position.
NUMERIC_FEATS   = ['MolWt', 'logP', 'TPSA', 'H_bond_donors', 'H_bond_acceptors']
MORGAN_FP       = [f'fp_{i}' for i in range(1024)]
OTHER_FP        = [f'col{i}' for i in range(823)]
FEATURE_COLS    = NUMERIC_FEATS + OTHER_FP + MORGAN_FP
TARGET_COL      = 'UV_RT-s'

# Evaluation thresholds; only files whose "simNN" suffix maps to one of these are evaluated
SIMILARITY_THRESHOLDS = [0.5, 0.6, 0.7, 0.8, 0.9]

# Module-level accumulators filled by main(): one summary dict per evaluated
# file, and one per-sample error DataFrame per evaluated file.
all_eval_results = []
all_pred_details = []

# ---------------------------- Utilities -----------------------------
def get_model_name(file_name: str):
    """Map a similarity-subset file name to the model/dataset it belongs to.

    Args:
        file_name: File name (without directory), e.g.
            ``AM-I-filtered_with_labels_k4_test_sim60``.

    Returns:
        The dataset name whose ``<name>_test_sim`` prefix matches the start of
        ``file_name``, or None when no known prefix matches.
    """
    known_models = (
        'AM-I-filtered_with_labels_k4',
        'AM-II-filtered_with_labels_k3',
        'AM-III-filtered_with_labels_k4',
    )
    for candidate in known_models:
        if file_name.startswith(candidate + '_test_sim'):
            return candidate
    return None

def extract_similarity(file_name: str):
    """Parse the similarity threshold encoded as ``sim<digits>`` in a file name.

    Args:
        file_name: Name to search; the first ``sim`` followed by digits is used.

    Returns:
        The digits interpreted as a percentage and divided by 100 (``sim60``
        gives 0.6), or None when the name has no ``sim<digits>`` token.
    """
    found = re.search(r'sim(\d+)', file_name)
    if found is None:
        return None
    digits = found.group(1)
    return int(digits) / 100.0

# ---------------------------- Evaluation Functions -----------------------------
def evaluate_svr(model, scaler, X, y_true):
    """Predict with an SVR model and compute absolute/relative error metrics.

    Args:
        model: Fitted estimator exposing ``predict``.
        scaler: Fitted scaler exposing ``transform``; it is applied only to the
            first ``len(NUMERIC_FEATS)`` columns of ``X``.
        X: 2-D feature matrix whose leading columns are the numeric features.
        y_true: Array of observed target values.

    Returns:
        Tuple ``(y_pred, mae, rmse, mre, abs_err, rel_err)`` where ``mre`` and
        ``rel_err`` are expressed in percent.
    """
    # Work on a float copy: writing scaled values into an integer-typed array
    # would silently truncate them, and the caller's array must stay untouched.
    X_scaled = np.array(X, dtype=float)
    n_numeric = len(NUMERIC_FEATS)
    X_scaled[:, :n_numeric] = scaler.transform(X_scaled[:, :n_numeric])
    y_pred = model.predict(X_scaled)

    abs_err = np.abs(y_true - y_pred)
    # NOTE(review): a zero in y_true yields inf/nan here; presumably targets
    # are strictly positive retention times - confirm against the data.
    rel_err = np.abs((y_true - y_pred) / y_true) * 100

    mae  = abs_err.mean()
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mre  = rel_err.mean()

    return y_pred, mae, rmse, mre, abs_err, rel_err


def evaluate_lgb(txt_path, imputer_path, feature_path, X, y_true):
    """Predict with a saved LightGBM booster and compute error metrics.

    Args:
        txt_path: Path of the booster model file.
        imputer_path: Path of the joblib-saved imputer applied before predicting.
        feature_path: Path of the joblib-saved list of feature columns to select.
        X: 2-D feature matrix with columns ordered as ``FEATURE_COLS``.
        y_true: Array of observed target values.

    Returns:
        Tuple ``(y_pred, mae, rmse, mre, abs_err, rel_err)`` where ``mre`` and
        ``rel_err`` are expressed in percent.
    """
    model = lgb.Booster(model_file=txt_path)
    fitted_imputer = joblib.load(imputer_path)
    selected_features = joblib.load(feature_path)

    # Re-attach column names so the saved feature subset can be selected by name
    frame = pd.DataFrame(X, columns=FEATURE_COLS)
    model_input = fitted_imputer.transform(frame[selected_features].values)

    y_pred = model.predict(model_input)

    residual = y_true - y_pred
    abs_err = np.abs(residual)
    rel_err = np.abs(residual / y_true) * 100

    return (
        y_pred,
        abs_err.mean(),
        np.sqrt(mean_squared_error(y_true, y_pred)),
        rel_err.mean(),
        abs_err,
        rel_err,
    )

# ---------------------------- Plotting Functions -----------------------------
# Apple 2023 Color Palette (RGB 8-bit → 0-1)
# Each list holds five RGB tuples ordered from the darkest to the lightest shade.
APPLE_BLUE  = [(0, 102/255, 204/255),  # Dark blue
               (51/255, 153/255, 255/255),
               (102/255, 178/255, 255/255),
               (153/255, 204/255, 255/255),
               (204/255, 229/255, 255/255)]

APPLE_GREEN = [(0, 133/255, 50/255),  # Dark green
               (0, 166/255, 81/255),
               (77/255, 196/255, 110/255),
               (153/255, 221/255, 176/255),
               (230/255, 242/255, 230/255)]

APPLE_ORANGE= [(204/255, 88/255, 4/255),  # Dark orange
               (230/255, 113/255, 0),
               (242/255, 140/255, 40/255),
               (255/255, 179/255, 102/255),
               (255/255, 217/255, 179/255)]

# Color palettes for each model
# The lists are reversed (lightest first) because _get_color indexes them with
# the position of the similarity in SIMILARITY_THRESHOLDS, which is ascending:
# the lowest threshold gets the lightest shade, the highest the darkest.
APPLE_PALETTES = {
    'AM-I-filtered_with_labels_k4': APPLE_BLUE[::-1],  # 0.9→0.5 dark to light
    'AM-II-filtered_with_labels_k3': APPLE_GREEN[::-1],
    'AM-III-filtered_with_labels_k4': APPLE_ORANGE[::-1]
}

def _get_color(dataset: str):
    """Return the Apple palette colour for a similarity-subset dataset name.

    Args:
        dataset: Dataset label such as
            ``AM-I-filtered_with_labels_k4_test_sim60``.

    Returns:
        The RGB tuple from the model's palette at the position of the
        dataset's similarity in ``SIMILARITY_THRESHOLDS``, or ``'gray'`` when
        the model is unknown, the similarity is not one of the configured
        thresholds, or the palette has no entry for that position.
    """
    # Reuse the shared prefix matcher instead of repeating the three prefixes
    model = get_model_name(dataset)
    if model is None:
        return 'gray'

    try:
        idx = SIMILARITY_THRESHOLDS.index(extract_similarity(dataset))
    except ValueError:
        # Similarity missing or not a configured threshold: use the fallback
        # colour rather than aborting the whole plot.
        return 'gray'

    palette = APPLE_PALETTES[model]
    return palette[idx] if idx < len(palette) else 'gray'

def generate_fixed_order():
    """Build the fixed plotting order of dataset labels.

    Datasets are listed model by model (AM-I, AM-II, AM-III) and, within each
    model, by ascending similarity threshold.

    Returns:
        List of labels of the form ``<model>_test_sim<NN>``.
    """
    model_order = [
        'AM-I-filtered_with_labels_k4',
        'AM-II-filtered_with_labels_k3',
        'AM-III-filtered_with_labels_k4',
    ]
    # round() rather than int(): int() truncates, so a threshold whose product
    # with 100 lands just below an integer (e.g. 0.57 * 100) would be
    # mislabelled as one percent lower.
    return [
        f"{model}_test_sim{round(sim * 100):02d}"
        for model in model_order
        for sim in sorted(SIMILARITY_THRESHOLDS)
    ]

def plot_boxplots(df_details, save_path):
    """Draw one box plot per error metric with datasets in a fixed order.

    Args:
        df_details: DataFrame with a ``Dataset`` column plus ``AbsError`` and
            ``RelError`` columns holding per-sample errors.
        save_path: Path ending in ``.png``; each figure is written to this path
            with ``_<metric>`` inserted before the extension.

    Returns:
        The list of dataset labels used for the x-axis, in plotting order.
    """
    FIXED_ORDER = generate_fixed_order()
    group_size = len(SIMILARITY_THRESHOLDS)

    metrics = ["AbsError", "RelError"]
    titles  = ["MAE (s)", "MRE (%)"]

    for metric, title in zip(metrics, titles):
        plt.figure(figsize=(10, 4.8))
        plt.rcParams['font.size'] = 12

        # One entry per label in FIXED_ORDER; a dataset with no rows yields an
        # empty array so every box keeps its position on the axis.
        labels_sorted = list(FIXED_ORDER)
        data_sorted = [
            df_details.loc[df_details['Dataset'] == ds, metric].values
            for ds in labels_sorted
        ]

        # Tick labels are set explicitly further down instead of through the
        # deprecated ``labels=`` keyword of boxplot.
        bp = plt.boxplot(
            data_sorted,
            patch_artist=True,
            # alpha=0 makes the outlier markers fully transparent
            flierprops=dict(marker='o', color='red', markersize=6, alpha=0),
            medianprops=dict(linewidth=2.2, color='black'),
            whiskerprops=dict(linewidth=1.5),
            capprops=dict(linewidth=1.5)
        )

        # Apply the per-dataset Apple colours
        for patch, label in zip(bp['boxes'], labels_sorted):
            patch.set_facecolor(_get_color(label))
            patch.set_alpha(0.9)

        # Thicker spines and ticks
        ax = plt.gca()
        for spine in ax.spines.values():
            spine.set_linewidth(1.5)
        ax.tick_params(axis='both', which='major', width=1.5, length=6)
        ax.tick_params(axis='both', which='minor', width=1.0, length=4)

        # Fixed y-axis range per metric
        # NOTE(review): for MAE the last tick (12) lies above the requested
        # upper limit (11); kept as in the original - confirm the intent.
        if title.startswith("MAE"):
            plt.ylim(-1, 11)
            ax.set_yticks(range(0, 13, 2))
        else:
            plt.ylim(-1, 13)
            ax.set_yticks(range(0, 13, 2))

        plt.ylabel(title, fontsize=14, fontweight='bold')

        # Boxes sit at positions 1..N by default; label and rotate the ticks
        positions = list(range(1, len(labels_sorted) + 1))
        plt.xticks(positions, labels_sorted, rotation=45, ha='right', fontsize=11)
        plt.yticks(fontsize=11)

        # Dashed separators between consecutive models (after every full
        # group of thresholds, except after the last one)
        if group_size:
            for boundary in range(group_size, len(FIXED_ORDER), group_size):
                ax.axvline(x=boundary + 0.5, color='gray', linestyle='--', linewidth=1, alpha=0.7)

        # Horizontal grid drawn behind the boxes
        ax.yaxis.grid(True, linestyle='--', alpha=0.7, linewidth=0.5)
        ax.set_axisbelow(True)

        plt.tight_layout()

        out_path = save_path.replace(".png", f"_{metric}.png")
        plt.savefig(out_path, dpi=600, bbox_inches='tight')
        plt.close()
        print(f"📊 Apple color box plot saved to: {out_path}")

    return FIXED_ORDER

def export_box_summary(df_details, save_path, FIXED_ORDER):
    """Write quartile statistics of both error metrics to a CSV file.

    Args:
        df_details: DataFrame with ``Dataset``, ``AbsError`` and ``RelError``
            columns.
        save_path: Destination CSV path.
        FIXED_ORDER: Dataset labels in the order rows should be written;
            labels without any rows in ``df_details`` are skipped.
    """
    summary_rows = []
    for metric in ("AbsError", "RelError"):
        for dataset in FIXED_ORDER:
            subset = df_details.loc[df_details['Dataset'] == dataset]
            if subset.empty:
                continue
            lower_q, median, upper_q = np.percentile(subset[metric], [25, 50, 75])
            summary_rows.append(dict(
                Dataset=dataset,
                Metric=metric,
                Q1=lower_q,
                Median=median,
                Q3=upper_q,
                IQR=upper_q - lower_q,
                Count=len(subset),
            ))

    summary = pd.DataFrame(summary_rows)
    summary.to_csv(save_path, index=False)
    print(f"📑 Box plot quartile statistics saved to: {save_path}")

def calculate_outlier_percentage(df_details, metric, FIXED_ORDER):
    """Compute the share of 1.5*IQR outliers of one metric for each dataset.

    Args:
        df_details: DataFrame with a ``Dataset`` column and the ``metric`` column.
        metric: Name of the error column to analyse.
        FIXED_ORDER: Dataset labels to process; labels without rows are skipped.

    Returns:
        Dict mapping each present dataset label to the percentage of its rows
        lying strictly below ``Q1 - 1.5*IQR`` or strictly above ``Q3 + 1.5*IQR``.
    """
    percentages = {}
    for dataset in FIXED_ORDER:
        values = df_details.loc[df_details['Dataset'] == dataset, metric].to_numpy()
        total = len(values)
        if total == 0:
            continue
        first_q, third_q = np.percentile(values, [25, 75])
        spread = third_q - first_q
        low_fence = first_q - 1.5 * spread
        high_fence = third_q + 1.5 * spread
        flagged = int(((values < low_fence) | (values > high_fence)).sum())
        percentages[dataset] = flagged / total * 100
    return percentages

# ======================== Main Pipeline =========================
def main():
    """Evaluate the saved models on every similarity-filtered test subset.

    For each ``*.csv`` in ``DATA_FOLDER`` that belongs to a known model and a
    configured similarity threshold, the matching model is applied, the
    predictions are written to ``OUTPUT_FOLDER`` and the metrics are appended
    to the module-level ``all_eval_results`` / ``all_pred_details`` lists.
    Afterwards a summary CSV, box plots, quartile statistics and an outlier
    report are produced.
    """
    svr_models = ('AM-I-filtered_with_labels_k4', 'AM-II-filtered_with_labels_k3')
    lgb_model = 'AM-III-filtered_with_labels_k4'
    model_order = [*svr_models, lgb_model]

    # glob returns files in arbitrary order; sort for reproducible processing/logging
    csv_files = sorted(glob.glob(os.path.join(DATA_FOLDER, '*.csv')))

    for csv_file in csv_files:
        base_name = os.path.splitext(os.path.basename(csv_file))[0]
        model_name = get_model_name(base_name)
        if model_name is None:
            continue

        similarity = extract_similarity(base_name)
        if similarity not in SIMILARITY_THRESHOLDS:
            continue

        # Resolve and verify the model artefacts before touching the data
        if model_name in svr_models:
            model_folder = MODEL_FOLDERS[model_name]
            model_path = os.path.join(model_folder, f"{model_name}_svr_model.joblib")
            scaler_path = os.path.join(model_folder, f"{model_name}_scaler.joblib")

            if not (os.path.exists(model_path) and os.path.exists(scaler_path)):
                print(f"⚠️ Warning: Model files not found for {model_name}")
                print(f"  Model path: {model_path}")
                print(f"  Scaler path: {scaler_path}")
                continue

        elif model_name == lgb_model:
            model_folder = MODEL_FOLDERS[model_name]
            txt_path = os.path.join(model_folder, f"{model_name}_lgb.txt")
            imp_path = os.path.join(model_folder, f"{model_name}_imputer.pkl")
            feat_path = os.path.join(model_folder, f"{model_name}_feature_list.pkl")

            if not all(map(os.path.exists, [txt_path, imp_path, feat_path])):
                print(f"⚠️ Warning: Model files not found for {model_name}")
                print(f"  Model path: {txt_path}")
                print(f"  Imputer path: {imp_path}")
                print(f"  Feature path: {feat_path}")
                continue

        else:
            continue

        df = pd.read_csv(csv_file).dropna(subset=FEATURE_COLS + [TARGET_COL])
        if df.empty:
            # Nothing left to evaluate; the metric code cannot handle zero rows
            print(f"⚠️ Warning: No complete rows in {csv_file}, skipped")
            continue
        X, y_true = df[FEATURE_COLS].values, df[TARGET_COL].values

        if model_name in svr_models:
            # NOTE: joblib.load unpickles arbitrary objects - only load trusted files
            model = joblib.load(model_path)
            scaler = joblib.load(scaler_path)
            y_pred, mae, rmse, mre, abs_err, rel_err = evaluate_svr(model, scaler, X, y_true)
        else:
            y_pred, mae, rmse, mre, abs_err, rel_err = evaluate_lgb(txt_path, imp_path, feat_path, X, y_true)

        # Save predictions
        df['y_pred'] = y_pred
        output_file = os.path.join(OUTPUT_FOLDER, f"{base_name}_predicted.csv")
        df.to_csv(output_file, index=False)

        # Collect evaluation results
        all_eval_results.append({
            'Dataset': base_name,
            'Model': model_name,
            'Similarity': similarity,
            'MAE': mae,
            'RMSE': rmse,
            'MRE': mre,
            'Sample_Size': len(df)
        })

        # Collect detailed error data for box plots
        all_pred_details.append(pd.DataFrame({
            "Dataset": base_name,
            "AbsError": abs_err,
            "RelError": rel_err
        }))

        print(f"{base_name} | {model_name} | sim={similarity:.2f} | "
              f"MAE={mae:.3g} | RMSE={rmse:.3g} | MRE={mre:.3g}% | Samples={len(df)}")

    # Save summary of all evaluations
    if not all_eval_results:
        print("⚠️ No evaluation results were generated!")
        return

    summary_df = pd.DataFrame(all_eval_results)
    # Sort by model (AM-I, AM-II, AM-III) and then by similarity threshold
    summary_df['Sim_Order'] = summary_df['Similarity'].apply(SIMILARITY_THRESHOLDS.index)
    summary_df['Model_Order'] = summary_df['Model'].apply(model_order.index)
    summary_df = summary_df.sort_values(['Model_Order', 'Sim_Order'])
    summary_df = summary_df.drop(['Sim_Order', 'Model_Order'], axis=1)

    summary_df.to_csv(SUMMARY_CSV_PATH, index=False)
    print(f"\n✅ Summary table saved to: {SUMMARY_CSV_PATH}")

    # Print the summary grouped by model
    print("\n📊 Summary Statistics:")
    print("=" * 80)
    for model in model_order:
        model_df = summary_df[summary_df['Model'] == model]
        print(f"\n{model}:")
        for sim in SIMILARITY_THRESHOLDS:
            sim_df = model_df[model_df['Similarity'] == sim]
            if not sim_df.empty:
                row = sim_df.iloc[0]
                print(f"  sim={sim:.1f}: MAE={row['MAE']:.3g}, RMSE={row['RMSE']:.3g}, "
                      f"MRE={row['MRE']:.3g}%, Samples={row['Sample_Size']}")

    # Generate box plots and statistics
    if not all_pred_details:
        print("⚠️ No prediction details were collected for box plots!")
        return

    details_df = pd.concat(all_pred_details, ignore_index=True)
    FIXED_ORDER = generate_fixed_order()

    # Create box plots
    print(f"\n📈 Generating box plots with order:")
    print(" -> ".join(FIXED_ORDER))
    plot_boxplots(details_df, BOXPLOT_PATH)

    # Export box plot statistics
    export_box_summary(details_df, BOXSUMMARY_PATH, FIXED_ORDER)

    # Calculate and display outlier percentages
    print("\n📊 Outlier Analysis:")
    print("=" * 80)
    for metric in ["AbsError", "RelError"]:
        print(f"\n{metric} Outlier Percentages:")
        outlier_percentage = calculate_outlier_percentage(details_df, metric, FIXED_ORDER)
        for dataset in FIXED_ORDER:
            if dataset in outlier_percentage:
                # Derive the model label with the shared helper; slicing the
                # underscore-split name by a hard-coded count left a stray
                # "_test" on the AM-III labels.
                model = get_model_name(dataset) or dataset
                sim = dataset.split("_")[-1]
                print(f"  {model} ({sim}): {outlier_percentage[dataset]:.2f}%")


if __name__ == "__main__":
    main()

AM-I-filtered_with_labels_k4_test_sim60 | AM-I-filtered_with_labels_k4 | sim=0.60 | MAE=2.68 | RMSE=3.54 | MRE=3.98% | Samples=458
AM-III-filtered_with_labels_k4_test_sim80 | AM-III-filtered_with_labels_k4 | sim=0.80 | MAE=1.98 | RMSE=2.39 | MRE=2.8% | Samples=10
AM-II-filtered_with_labels_k3_test_sim60 | AM-II-filtered_with_labels_k3 | sim=0.60 | MAE=1.85 | RMSE=2.59 | MRE=3.41% | Samples=120
AM-III-filtered_with_labels_k4_test_sim60 | AM-III-filtered_with_labels_k4 | sim=0.60 | MAE=2.75 | RMSE=4.04 | MRE=3.86% | Samples=81
AM-II-filtered_with_labels_k3_test_sim50 | AM-II-filtered_with_labels_k3 | sim=0.50 | MAE=1.92 | RMSE=2.65 | MRE=3.51% | Samples=171
AM-II-filtered_with_labels_k3_test_sim80 | AM-II-filtered_with_labels_k3 | sim=0.80 | MAE=1.59 | RMSE=2.47 | MRE=2.77% | Samples=15
AM-II-filtered_with_labels_k3_test_sim90 | AM-II-filtered_with_labels_k3 | sim=0.90 | MAE=5.4 | RMSE=5.4 | MRE=10.2% | Samples=1
AM-III-filtered_with_labels_k4_test_sim70 | AM-III-filtered_with_labels_k4 

  bp = plt.boxplot(


📊 Apple color box plot saved to: ./similarity-prediction/mae_mre_boxplot_AbsError.png


  bp = plt.boxplot(


📊 Apple color box plot saved to: ./similarity-prediction/mae_mre_boxplot_RelError.png
📑 Box plot quartile statistics saved to: ./similarity-prediction/mae_mre_box_summary.csv

📊 Outlier Analysis:

AbsError Outlier Percentages:
  AM-I-filtered_with_labels_k4 (sim50): 3.86%
  AM-I-filtered_with_labels_k4 (sim60): 3.49%
  AM-I-filtered_with_labels_k4 (sim70): 4.33%
  AM-I-filtered_with_labels_k4 (sim80): 3.33%
  AM-I-filtered_with_labels_k4 (sim90): 4.88%
  AM-II-filtered_with_labels_k3 (sim50): 7.02%
  AM-II-filtered_with_labels_k3 (sim60): 7.50%
  AM-II-filtered_with_labels_k3 (sim70): 5.88%
  AM-II-filtered_with_labels_k3 (sim80): 13.33%
  AM-II-filtered_with_labels_k3 (sim90): 0.00%
  AM-III-filtered_with_labels_k4_test (sim50): 6.25%
  AM-III-filtered_with_labels_k4_test (sim60): 4.94%
  AM-III-filtered_with_labels_k4_test (sim70): 2.70%
  AM-III-filtered_with_labels_k4_test (sim80): 0.00%
  AM-III-filtered_with_labels_k4_test (sim90): 0.00%

RelError Outlier Percentages:
  AM-I-filt