In [6]:
import os
import re
import pandas as pd

In [7]:
# Root directory that holds one sub-folder per experiment run. The default is
# the original hard-coded location; set the CHECKGPT_EXP_DIR environment
# variable to point the notebook elsewhere without editing this cell.
EXP_DIR = os.environ.get(
    "CHECKGPT_EXP_DIR",
    "/home/cc/CheckGPT-reproduction/artifact_checkgpt/CheckGPT/exp",
)

# Matches a single log line that reports all four metrics and captures, in
# order: test accuracy, Acc_GPT, Acc_Human (each followed by "%") and F1.
# The captured groups are strings; callers convert them to float.
METRIC_PATTERN = re.compile(
    r"Test accuracy:\s*([\d.]+)%.*, "
    r"Acc_GPT:\s*([\d.]+)%.*, "
    r"Acc_Human:\s*([\d.]+)%.*, "
    r"F1:\s*([\d.]+)"
)

In [8]:
def get_classification(folder_name):
    """Return the classification label found in an experiment folder name.

    The known labels are tried in a fixed priority order and the first one
    that occurs as a (case-sensitive) substring of ``folder_name`` wins.

    Args:
        folder_name: Name of the experiment folder to inspect.

    Returns:
        The matching label string, or ``None`` when no known label occurs.
    """
    # Order matters: earlier entries take precedence when several match.
    known_labels = ("CheckGPTArch", "RCH", "MLP", "CNN", "BiLSTMwoAttention")
    return next((label for label in known_labels if label in folder_name), None)

In [9]:
# Build one record per "*_Test_*" experiment folder under EXP_DIR by reading
# the first metrics line of that folder's train.log.
record_columns = [
    "domain", "task", "prompt", "classification",
    "test_accuracy", "acc_gpt", "acc_human", "f1", "expid",
]
records = []
# sorted(): os.listdir returns entries in arbitrary order; sorting keeps the
# scan (and any messages printed below) deterministic between runs.
for folder in sorted(os.listdir(EXP_DIR)):
    if "_Test_" not in folder:
        continue

    folder_path = os.path.join(EXP_DIR, folder)
    log_path = os.path.join(folder_path, "train.log")

    if not os.path.isfile(log_path):
        continue

    # The last two "_"-separated fields are expected to be "Task<N>" and
    # "Prompt<M>"; the first field is the domain (CS / PHX / HSS).
    parts = folder.split("_")
    domain = parts[0]
    try:
        task = int(parts[-2].replace("Task", ""))
        prompt = int(parts[-1].replace("Prompt", ""))
    except ValueError:
        # One oddly named folder should not abort the whole scan.
        print(f"Skipping {folder!r}: name does not end in Task<N>_Prompt<M>")
        continue
    classification = get_classification(folder)

    with open(log_path, "r", encoding="utf-8", errors="ignore") as f:
        for line in f:
            match = METRIC_PATTERN.search(line)
            if match:
                test_acc, acc_gpt, acc_human, f1 = match.groups()
                records.append({
                    "domain": domain,
                    "task": task,
                    "prompt": prompt,
                    "classification": classification,
                    "test_accuracy": float(test_acc),
                    "acc_gpt": float(acc_gpt),
                    "acc_human": float(acc_human),
                    "f1": float(f1),
                    "expid": folder
                })
                # Only the first metrics line of each log is recorded.
                break
        else:
            # for/else: reached only when no line matched, so the run would
            # otherwise vanish from the table without any trace.
            print(f"No metrics line found in {log_path}")

# Explicit columns keep the frame well-formed (and sortable) even when no
# records were collected; without them an empty frame has no columns and
# sort_values raises KeyError.
df = pd.DataFrame(records, columns=record_columns)
df = df.sort_values(
    by=["domain", "classification", "task", "prompt"]
).reset_index(drop=True)

In [10]:
# Leave the frame as the cell's last expression so the notebook renders it
# with its rich table display instead of print()'s plain-text dump.
df

    domain  task  prompt     classification  test_accuracy  acc_gpt  \
0       CS     1       1  BiLSTMwoAttention          99.90     99.9   
1       CS     1       2  BiLSTMwoAttention          99.85     99.7   
2       CS     1       3  BiLSTMwoAttention          99.85     99.8   
3       CS     1       4  BiLSTMwoAttention          99.95     99.9   
4       CS     2       1  BiLSTMwoAttention          99.40     99.2   
..     ...   ...     ...                ...            ...      ...   
175    PHX     2       4                RCH          98.80     98.3   
176    PHX     3       1                RCH          95.30     97.4   
177    PHX     3       2                RCH          94.80     95.4   
178    PHX     3       3                RCH          95.20     96.4   
179    PHX     3       4                RCH          96.50     96.8   

     acc_human      f1                                    expid  
0         99.9  0.9990  CS_Test_BiLSTMwoAttention_Task1_Prompt1  
1        100.0 

In [12]:
# Write the collected metrics table to an Excel workbook in the current
# working directory; index=False leaves the positional row index out.
output_path = "repro_checkpoint_2_validation_result.xlsx"
df.to_excel(output_path, index=False, engine="openpyxl")