In [1]:
import pandas as pd
from itables import show
from IPython.display import HTML

In [2]:
def display_multiline(df, max_rows=20):
    """
    Display a DataFrame with automatic multiline wrapping:
    - Converts \n and \\n into HTML <br> for proper line breaks
    - Makes table cells wrap normally (no nowrap)
    - Limits displayed rows for readability
    """

    pd.set_option('display.max_colwidth', None)
    pd.set_option('display.width', 2000)
    pd.set_option('display.max_columns', None)

    df_disp = df.head(max_rows).copy()

    # Convert \n to <br> for multiline rendering
    def _ml(x):
        if isinstance(x, str):
            return x.replace("\\n", "<br>").replace("\n", "<br>")
        return x

    for col in df_disp.columns:
        df_disp[col] = df_disp[col].astype(str).apply(_ml)

    # Return a Styler that forces normal wrapping
    return df_disp.style.set_properties(**{
        "white-space": "normal",
        "word-wrap": "break-word"
    })

In [28]:
# Load the results CSV (path relative to the working directory) into `df`.
df = pd.read_csv("data/results/freshQA_since24_full.csv")

In [29]:
# Show the column labels of `df` as the cell output.
df.columns

Index(['id', 'split', 'question', 'effective_year', 'next_review', 'false_premise', 'num_hops', 'fact_type', 'source', 'model_response', 'answer_0', 'answer_1', 'answer_2', 'answer_3', 'answer_4', 'answer_5', 'answer_6', 'answer_7', 'answer_8', 'answer_9', 'note', 'eval_rating_robust', 'eval_explanation_robust', 'model_response_agent_v1', 'model_response_agent_v1_direct', 'model_response_agent_v1_directAnswer', 'model_response_freshprompt'], dtype='object')

In [11]:
import re

# Label patterns, compiled once. Each one is case-insensitive and multi-line,
# and tolerates leading whitespace, list bullets, block-quote markers,
# Markdown emphasis and Markdown heading markers before the label.
_LABEL_PREFIX = r"(?im)^[\s\-•>*#]*"
# Accepted label/value separators: colon, hyphen, en dash, full-width colon.
_SEPARATOR = r"[:\-–：]"
# "Direct Answer <sep> value"; emphasis markers may sit between label and separator.
_DIRECT_ANSWER_RE = re.compile(
    _LABEL_PREFIX + r"Direct\s*Answer[\s*]*" + _SEPARATOR + r"\s*(.*)$"
)
# "Final Answer" heading. Only horizontal whitespace is allowed after the
# label, so the match never swallows the following line. Group 1 is the
# optional separator, group 2 whatever follows on the same line.
_FINAL_ANSWER_RE = re.compile(
    _LABEL_PREFIX + r"Final\s*Answer[ \t*]*(" + _SEPARATOR + r")?[ \t]*(.*)$"
)
# "Verdict <sep> value".
_VERDICT_RE = re.compile(
    _LABEL_PREFIX + r"Verdict[\s*]*" + _SEPARATOR + r"\s*(.+)$"
)


def _strip_emphasis(value: str) -> str:
    """Trim whitespace and surrounding Markdown asterisks from a captured value."""
    return value.strip().strip("*").strip()


def _first_nonempty_line(chunk: str) -> str:
    """Return the first non-blank line of ``chunk`` (stripped), or "" if none."""
    for line in chunk.splitlines():
        stripped = line.strip()
        if stripped:
            return stripped
    return ""


def extract_direct_answer(text: str) -> str:
    """
    Best-effort extraction of the "Direct Answer" from an answer-contract text.

    Strategies, tried in order:
      1. A "Direct Answer: <value>" line (optionally bulleted, quoted, bold or
         under a heading marker). If the value is empty, the next non-blank
         line after it is used.
      2. A "Final Answer" heading: the value after a separator on the same
         line if there is one, otherwise the next non-blank line.
      3. Fallback: a "Verdict: <value>" line (suits yes/no questions).
      4. Last resort: the first non-blank line of the whole text.

    Parameters
    ----------
    text : str
        Full response text. ``None`` and float NaN (how pandas represents a
        missing cell) are treated as empty input; any other non-string value
        is converted with ``str``.

    Returns
    -------
    str
        The extracted answer, or "" when the input is empty/missing or a
        "Direct Answer" label has nothing after it.
    """
    # NaN is the only float that differs from itself; without this check a
    # missing cell would be stringified to the literal text "nan".
    if text is None or (isinstance(text, float) and text != text):
        return ""
    t = str(text).strip()
    if not t:
        return ""

    # 1) Explicit "Direct Answer" line.
    m = _DIRECT_ANSWER_RE.search(t)
    if m:
        # Empty same-line value -> take the first non-blank line that follows.
        return _strip_emphasis(m.group(1)) or _first_nonempty_line(t[m.end():])

    # 2) "Final Answer" heading.
    fa = _FINAL_ANSWER_RE.search(t)
    if fa:
        # Same-line text counts as the answer only when a separator introduced
        # it ("Final Answer: Paris"); otherwise the line is treated as a
        # heading and the answer is looked for on the following lines.
        inline = _strip_emphasis(fa.group(2)) if fa.group(1) else ""
        answer = inline or _first_nonempty_line(t[fa.end():])
        if answer:
            return answer
        # Nothing usable after the heading: fall through to the next strategies.

    # 3) Fallback: "Verdict" line.
    v = _VERDICT_RE.search(t)
    if v:
        return _strip_emphasis(v.group(1))

    # 4) Last resort: first non-blank line (t is non-empty here, so one exists).
    return _first_nonempty_line(t) or t

# ===== Configuration: edit as needed =====
CSV_PATH = "data/results/freshQA_since24_full.csv"
FULL_COL = "model_response_agent_v1"
DIRECT_COL = FULL_COL + "_directAnswer"
# =========================================

# Load the data.
df = pd.read_csv(CSV_PATH)

# Add the direct-answer column when it does not exist yet.
if DIRECT_COL not in df.columns:
    df[DIRECT_COL] = None
# An existing but completely empty column is parsed by read_csv as float64;
# writing strings into such a column relies on implicit upcasting, which
# pandas has deprecated. Casting to object first makes the assignment safe.
df[DIRECT_COL] = df[DIRECT_COL].astype(object)

# Only fill rows whose direct answer is missing or blank.
mask = df[DIRECT_COL].isna() | (df[DIRECT_COL].astype(str).str.strip() == "")
# A missing full response yields "" (not the text "nan"), so the row stays
# blank and is picked up again on the next run.
df.loc[mask, DIRECT_COL] = df.loc[mask, FULL_COL].apply(
    lambda resp: "" if pd.isna(resp) else extract_direct_answer(resp)
)

# Quick sanity check.
print("Filled rows:", int(mask.sum()))
print(df[[FULL_COL, DIRECT_COL]].head(3))

# Write back to the same file that was read (the input CSV is overwritten).
# "utf-8-sig" prepends a byte-order mark to the file.
df.to_csv(CSV_PATH, index=False, encoding="utf-8-sig")
print("saved:", CSV_PATH)


Filled rows: 184
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       

In [30]:
# Show the column labels of `df` as the cell output.
df.columns

Index(['id', 'split', 'question', 'effective_year', 'next_review', 'false_premise', 'num_hops', 'fact_type', 'source', 'model_response', 'answer_0', 'answer_1', 'answer_2', 'answer_3', 'answer_4', 'answer_5', 'answer_6', 'answer_7', 'answer_8', 'answer_9', 'note', 'eval_rating_robust', 'eval_explanation_robust', 'model_response_agent_v1', 'model_response_agent_v1_direct', 'model_response_agent_v1_directAnswer', 'model_response_freshprompt'], dtype='object')

In [32]:
# Preview up to 50 rows of three columns of `df` with display_multiline:
# model_response_freshprompt, model_response_agent_v1_directAnswer and answer_0.
display_multiline(df[["model_response_freshprompt", "model_response_agent_v1_directAnswer", "answer_0"]], max_rows=50)


Unnamed: 0,model_response_freshprompt,model_response_agent_v1_directAnswer,answer_0
0,The information provided is inconsistent and does not provide a clear current count of Democratic seats in the U.S. Senate. Please refer to a reliable and up-to-date source for the most accurate information.,Democrats do not currently control the United States Senate; Republicans have 6 more seats.,Democrats currently do not control the US Senate.
1,"The CEO of X Corp. was Linda Yaccarino, but she stepped down in 2025. The current CEO is not specified in the provided sources.",The current CEO of X Corp is unclear as of November 2025 due to conflicting reports.,X Corp. currently has no CEO following Linda Yaccarino's recent resignation.
2,The sources provided do not specify the two federal crimes the most recent former President of the United States has been convicted of.,There is no verified information that Joe Biden has,"The most recent former President of the United States, Joe Biden, hasn't been convicted of any federal crimes."
3,The sources do not provide information on Alphabet's highest market capitalization in 2021.,Alphabet's market capitalization reached its highest-ever recorded value in December 2021.,"The all-time highest value of Alphabet was in September 2025, not in 2021."
4,The information provided does not specify who Michael van Gerwen beat to win this year's PDC World Darts Championship.,Michael van Gerwen did not win the 2025 PDC World Darts Championship; he was defeated by Luke Littler in the final.,"Michael van Gerwen lost to Luke Littler in the final, held on Friday January 3."
5,The provided sources do not specify how Elon Musk refers to his position as X Corp.'s chairman and CTO in his current Twitter/X bio.,It is unclear what exact wording Elon Musk uses in his Twitter/X bio to refer to his position as chairman and CTO of X Corp.,Elon Musk's current Twitter/X bio does not mention anything about his position as X Corp.'s chairman and CTO.
6,"The Weeknd's last studio album is titled ""Hurry Up Tomorrow.""","The Weeknd's last studio album is titled ""Hurry Up Tomorrow.""",Hurry Up Tomorrow
7,"Elon Musk has 14 children, including his deceased child.","Elon Musk has 14 children, including his deceased child, Nevada Alexander.",14
8,The name of NASA's latest telescope that can map the entire celestial sky in infrared light is SPHEREx.,NASA's latest telescope that can map the entire celestial sky in infrared light is called SPHEREx.,SPHEREx
9,The German Bundestag has 630 seats.,The German Bundestag currently has 630 seats as of the 2025 federal election.,630


In [None]:
# Run the `experiments.evaluate_results` module in a shell, reading the results
# CSV and writing to data/results/eval_agent_llm.csv.
# NOTE(review): --response_cols names `model_response_agent_v1_direct`, while the
# extraction cell in this notebook fills `model_response_agent_v1_directAnswer`
# - confirm which column is meant to be evaluated.
!python -m experiments.evaluate_results --input data/results/freshQA_since24_full.csv --output data/results/eval_agent_llm.csv --question_col question --response_cols model_response_agent_v1_direct --mode relaxed-llm --model gpt-5