In [1]:
import pandas as pd
from itables import show
from IPython.display import HTML

In [2]:
def display_multiline(df, max_rows=20):
    r"""
    Build a Styler preview of ``df`` whose cells wrap and show line breaks.

    The first ``max_rows`` rows are copied, every cell is converted to text,
    HTML-escaped, and then both real newlines and the literal two-character
    sequence ``\n`` are replaced with ``<br>``.

    Escaping happens *before* the ``<br>`` insertion because the Styler emits
    cell text verbatim: without it, any ``<``, ``>`` or ``&`` inside a cell
    (e.g. markup in a model response) would be parsed as HTML and silently
    disappear or break the table.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to preview. It is not modified.
    max_rows : int, default 20
        Number of leading rows to include.

    Returns
    -------
    pandas.io.formats.style.Styler
        Styler over the converted copy with ``white-space: normal`` and
        ``word-wrap: break-word`` applied to every cell.

    Notes
    -----
    The three ``pd.set_option`` calls change global pandas display options
    and remain in effect after this function returns.
    """
    import html  # stdlib; imported locally so the helper is self-contained

    # Global display options (persist after the call, see Notes).
    pd.set_option('display.max_colwidth', None)
    pd.set_option('display.width', 2000)
    pd.set_option('display.max_columns', None)

    df_disp = df.head(max_rows).copy()

    def _ml(x):
        # Non-string values (e.g. missing values that survive astype(str))
        # are passed through untouched.
        if isinstance(x, str):
            # Escape first so that the <br> tags added below are the only
            # markup in the cell.
            safe = html.escape(x, quote=False)
            return safe.replace("\\n", "<br>").replace("\n", "<br>")
        return x

    for col in df_disp.columns:
        df_disp[col] = df_disp[col].astype(str).apply(_ml)

    # Return a Styler that forces normal wrapping
    return df_disp.style.set_properties(**{
        "white-space": "normal",
        "word-wrap": "break-word"
    })

In [55]:
# Load the results CSV into `df`.
df = pd.read_csv("data/results/freshQA_since24_full.csv")

In [47]:
# List the column names available in `df`.
df.columns

Index(['id', 'split', 'question', 'effective_year', 'next_review', 'false_premise', 'num_hops', 'fact_type', 'source', 'model_response', 'answer_0', 'answer_1', 'answer_2', 'answer_3', 'answer_4', 'answer_5', 'answer_6', 'answer_7', 'answer_8', 'answer_9', 'note', 'eval_rating_robust', 'eval_explanation_robust', 'model_response_agent_v1', 'model_response_freshprompt', 'model_response_agent_v1_directAnswer'], dtype='object')

In [53]:
import re

def _first_nonblank_line(text: str) -> str:
    """Return the first non-blank line of ``text``, stripped; '' if there is none."""
    for line in text.splitlines():
        stripped = line.strip()
        if stripped:
            return stripped
    return ""


def extract_direct_answer(text: str) -> str:
    """
    Best-effort extraction of the 'Direct Answer' from an Answer Contract text.

    Strategies are tried in this order and the first hit wins:
      1. A ``Direct Answer: <content>`` line (leading bullets such as
         ``-``, ``•``, ``>``, ``*`` and the separators ``:``, ``-``, ``–``
         are tolerated). If nothing follows the separator, the first
         non-blank line after it is used, or '' when there is none.
      2. A ``Final Answer`` heading: the first non-blank line *after the
         heading line* is returned.
      3. A ``Verdict: <content>`` line (useful for yes/no questions).
      4. The first non-blank line of the whole text.

    Parameters
    ----------
    text : str
        Raw model response. ``None`` and float NaN (what pandas produces for
        an empty CSV cell) are treated as missing; any other value is
        converted with ``str()``.

    Returns
    -------
    str
        The extracted answer, or '' when the input is missing or blank.
    """
    # Missing input: None, or float NaN. NaN is the only float that is not
    # equal to itself; without this check it would be stringified to "nan"
    # and returned as if it were an answer.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    t = str(text).strip()
    if not t:
        return ""

    # 1) Direct Answer line (tolerates leading symbols and several separators)
    m = re.search(r"(?im)^[\s\-•>*]*Direct\s*Answer\s*[:\-–]\s*(.*)$", t)
    if m:
        val = m.group(1).strip()
        if val:
            return val
        # Nothing captured: fall back to the first non-blank line after the match
        return _first_nonblank_line(t[m.end():])

    # 2) 'Final Answer' heading: take the first non-blank line after it.
    #    The whitespace after "Answer" is limited to the heading line
    #    ([^\S\n] = whitespace except newline). With plain \s* a bare
    #    "Final Answer" heading consumed the newline and the whole next line,
    #    so the line right after the heading was skipped.
    fa = re.search(r"(?im)^[\s\-•>*]*Final\s*Answer[^\S\n]*[:\-–]?(.*)$", t)
    if fa:
        following = _first_nonblank_line(t[fa.end():])
        if following:
            return following

    # 3) Fallback: content of a Verdict line
    v = re.search(r"(?im)^[\s\-•>*]*Verdict\s*[:\-–]\s*(.+)$", t)
    if v:
        return v.group(1).strip()

    # 4) Last resort: first non-blank line of the whole text
    return _first_nonblank_line(t) or t

# ===== Configuration: adjust as needed =====
CSV_PATH = "data/results/freshQA_since24_full.csv"
FULL_COL = "model_response_agent_v1"
DIRECT_COL = FULL_COL + "_directAnswer"
# ===========================================

# Load the data
df = pd.read_csv(CSV_PATH)

# Add the direct-answer column if the file does not have it yet
if DIRECT_COL not in df.columns:
    df[DIRECT_COL] = None

# Fill only rows whose direct answer is empty AND that actually have a full
# response. Rows with a missing response are skipped: pandas represents them
# as NaN, which would otherwise be stringified and stored as the answer "nan"
# (and be counted as filled).
direct_is_empty = df[DIRECT_COL].isna() | (df[DIRECT_COL].astype(str).str.strip() == "")
mask = direct_is_empty & df[FULL_COL].notna()
df.loc[mask, DIRECT_COL] = df.loc[mask, FULL_COL].apply(extract_direct_answer)

# Quick sanity check
print("Filled rows:", int(mask.sum()))
print(df[[FULL_COL, DIRECT_COL]].head(3))

# Write the result back to the same file (overwrites the input CSV)
df.to_csv(CSV_PATH, index=False, encoding="utf-8-sig")
print("saved:", CSV_PATH)


Filled rows: 144
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       

In [42]:
# List the column names of `df` again.
df.columns

Index(['id', 'split', 'question', 'effective_year', 'next_review', 'false_premise', 'num_hops', 'fact_type', 'source', 'model_response', 'answer_0', 'answer_1', 'answer_2', 'answer_3', 'answer_4', 'answer_5', 'answer_6', 'answer_7', 'answer_8', 'answer_9', 'note', 'eval_rating_robust', 'eval_explanation_robust', 'model_response_agent_v1', 'model_response_freshprompt', 'model_response_agent_v1_directAnswer'], dtype='object')

In [56]:
# Show the two model responses, the extracted direct answer and answer_0
# (presumably the first reference answer) side by side, up to 186 rows.
display_multiline(df[["model_response_freshprompt","model_response_agent_v1", "model_response_agent_v1_directAnswer", "answer_0"]], max_rows=186)


Unnamed: 0,model_response_freshprompt,model_response_agent_v1,model_response_agent_v1_directAnswer,answer_0
0,45 seats,"1) Prerequisite You Were Checking:  - I was verifying the definition/status of the current seat distribution in the U.S. Senate to determine by how many seats the Republicans control the Senate. 2) Evidence-Based Status:  - Verified. The evidence confirms that the Republicans hold 53 seats, while the Democrats hold 45 seats, with 2 independents caucusing with the Democrats. 3) Updated Fact Ladder:  - Identity: Who currently holds the majority in the United States Senate? [Verified]  - Definition / Status: By how many seats does the majority party control the Senate? [Verified] 4) Next Step:  - All prerequisites are now verified. I will proceed to provide the Final Answer using the Answer Contract. Premise: True Verdict: No Direct Answer: Democrats do not currently control the United States Senate; Republicans hold the majority. Key Facts: - Republicans hold 53 seats in the Senate, while Democrats hold 45 seats, with 2 independents caucusing with them (source: en.wikipedia.org, www.senate.gov). - This gives Republicans a majority by 6 seats over the Democrats and independents combined.",Democrats do not currently control the United States Senate; Republicans hold the majority.,Democrats currently do not control the US Senate.
1,"The CEO of X Corp. was Linda Yaccarino, but she stepped down in 2025. The current CEO is not specified in the provided sources.","Premise: Uncertain - The evidence does not confirm who is the current CEO of X Corp as of November 2025. It only confirms that Linda Yaccarino stepped down in July 2025. Verdict: Uncertain Direct Answer: The current CEO of X Corp as of November 2025 is not confirmed based on the available evidence. Key Facts: - Linda Yaccarino stepped down as CEO of X Corp in July 2025 (source: Fortune, July 10, 2025). - No successor has been confirmed in the available evidence. If Needed: Further investigation or updated sources are required to determine the current CEO of X Corp.",The current CEO of X Corp as of November 2025 is not confirmed based on the available evidence.,X Corp. currently has no CEO following Linda Yaccarino's recent resignation.
2,The sources provided do not specify the two federal crimes the most recent former President of the United States has been convicted of.,"1) Prerequisite You Were Checking:  - I was checking if Joe Biden has been convicted of any federal crimes. 2) Evidence-Based Status:  - Still Unclear. The evidence provided does not mention any federal crime convictions for Joe Biden. It discusses clemency actions and other legal matters but does not confirm any convictions. 3) Updated Fact Ladder:  - Identity: Determine who the most recent former President of the United States is as of today. [Verified]  - Status: Check if Joe Biden has been convicted of any federal crimes. [Unresolved]  - Definition / Status: Identify the specific federal crimes, if any, Joe Biden has been convicted of. [Unresolved] 4) Next Step:  - Given the lack of clear evidence, I will conclude that there is no verified information about Joe Biden being convicted of federal crimes. I will now provide the final answer based on the available evidence. Final Answer — Answer Contract: - Premise: Uncertain (There is no verified evidence that Joe Biden, the most recent former President, has been convicted of any federal crimes.) - Verdict: Uncertain - Direct Answer: There is no verified information that Joe Biden has been convicted of any federal crimes. -",There is no verified information that Joe Biden has been convicted of any federal crimes.,"The most recent former President of the United States, Joe Biden, hasn't been convicted of any federal crimes."
3,The sources do not provide information on Alphabet's highest market capitalization in 2021.,"Premise: Uncertain - The evidence does not clearly specify the exact month in 2021 when Alphabet's market capitalization reached its highest-ever recorded value. Verdict: Uncertain Direct Answer: The specific month in 2021 when Alphabet's market capitalization reached its highest-ever recorded value is not clearly identified in the available evidence. Key Facts: - Evidence indicates Alphabet's market capitalization peaked on November 19, 2021, but does not confirm if this was the highest-ever recorded value. (source: investors.com) - Alphabet surpassed a $3 trillion market capitalization, but the timing of this milestone is not specified for 2021. (source: gov.ca.gov) If Needed: Further detailed financial records or reports from 2021 would be needed to confirm the exact month of the highest-ever market capitalization value for Alphabet.",The specific month in 2021 when Alphabet's market capitalization reached its highest-ever recorded value is not clearly identified in the available evidence.,"The all-time highest value of Alphabet was in September 2025, not in 2021."
4,Michael van Gerwen did not win this year's PDC World Darts Championship. It was won by Luke Littler.,"1) Prerequisite You Were Checking:  - Confirm that Michael van Gerwen participated in the 2025 PDC World Darts Championship. 2) Evidence-Based Status:  - Contradicted. The evidence suggests that Michael van Gerwen did not participate in the 2025 PDC World Darts Championship due to personal reasons, including family commitments and his father's health issues. 3) Updated Fact Ladder:  - Identity: Determine who won the 2025 PDC World Darts Championship. [Verified]  - Definition / Status: Confirm that Michael van Gerwen participated in the 2025 PDC World Darts Championship. [Contradicted]  - Mapping / Other: Identify who Michael van Gerwen beat in the final if he won. [Unresolved] 4) Next Step:  - Since Michael van Gerwen did not participate in the 2025 PDC World Darts Championship, it is impossible for him to have beaten anyone in the final. Therefore, I can now provide the Final Answer. Final Answer — Answer Contract: - Premise: False - Verdict: No - Direct Answer: Michael van Gerwen did not win the 2025 PDC World Darts Championship, so he did not beat anyone in","Michael van Gerwen did not win the 2025 PDC World Darts Championship, so he did not beat anyone in","Michael van Gerwen lost to Luke Littler in the final, held on Friday January 3."
5,The provided sources do not specify how Elon Musk refers to his position as X Corp.'s chairman and CTO in his current Twitter/X bio.,Premise: Uncertain - The evidence did not provide specific information about the exact wording used in Elon Musk's current Twitter/X bio to refer to his position at X Corp. Verdict: Uncertain Direct Answer: It is unclear what exact wording Elon Musk uses in his current Twitter/X bio to refer to his position as X Corp.'s chairman and CTO. Key Facts: - Elon Musk is reportedly the Executive Chairman and CTO of X Corporation as of July 2025 (source: hostafrica.ng). - No specific evidence was found regarding the exact wording in Elon Musk's current Twitter/X bio. If Needed: Further direct access to Elon Musk's current Twitter/X profile would be required to verify the exact wording used in his bio.,It is unclear what exact wording Elon Musk uses in his current Twitter/X bio to refer to his position as X Corp.'s chairman and CTO.,Elon Musk's current Twitter/X bio does not mention anything about his position as X Corp.'s chairman and CTO.
6,"The Weeknd's last studio album is titled ""Hurry Up Tomorrow.""","1) Prerequisite You Were Checking:  - Confirm the exact title of the most recent studio album. 2) Evidence-Based Status:  - Verified. The evidence confirms that the exact title of The Weeknd's latest studio album is ""Hurry Up Tomorrow."" 3) Updated Fact Ladder:  1. Identity: Verify who ""The Weeknd"" is to ensure there are no recent changes or confusions in artist identity. [Verified]  2. Timeframe: Determine the release date of The Weeknd's most recent studio album. [Verified]  3. Wording/Quote: Confirm the exact title of the most recent studio album. [Verified] 4) Next Step:  - All prerequisites are now verified. I will provide the final answer using the Answer Contract. Final Answer — Answer Contract: - Premise: True - Verdict: Yes - Direct Answer: The Weeknd's last studio album is titled ""Hurry Up Tomorrow."" - Key Facts:  - ""Hurry Up Tomorrow"" is the sixth studio album by The Weeknd, released on January 31, 2025. (Source: en.wikipedia.org)  - It is the final album under his stage name of The Week","The Weeknd's last studio album is titled ""Hurry Up Tomorrow.""",Hurry Up Tomorrow
7,"Elon Musk has 14 children, including his deceased child.","1) Prerequisite You Were Checking:  - Determine the number of children Elon Musk has, including any deceased child. 2) Evidence-Based Status:  - Verified. The evidence indicates that Elon Musk has welcomed 14 children over 20 years, including a deceased child named Nevada Alexander who died at 10 weeks old. 3) Updated Fact Ladder:  - Identity: Verify who Elon Musk is, to ensure we are discussing the correct individual. [Verified]  - Definition / Status: Determine the number of children Elon Musk has, including any deceased child. [Verified] 4) Next Step:  - All prerequisites are now verified. I will proceed to provide the final answer using the Answer Contract. Final Answer — Answer Contract: - Premise: True - Verdict: Yes - Direct Answer: Elon Musk has 14 children, including his deceased child. - Key Facts:  - Elon Musk has welcomed 14 children over 20 years. (Source: people.com, Jun 06, 2025)  - His first child, Nevada Alexander, tragically died at 10 weeks old. (Source: people.com, Jun 06, 2025)","Elon Musk has 14 children, including his deceased child.",14
8,The name of NASA's latest telescope that can map the entire celestial sky in infrared light is SPHEREx.,"1) Prerequisite You Were Checking:  - Determine the name of NASA's latest telescope designed to map the entire celestial sky in infrared light. 2) Evidence-Based Status:  - Verified. The evidence indicates that NASA's latest telescope for mapping the entire celestial sky in infrared light is called SPHEREx. It is set to launch no earlier than February 27, 2025, and will conduct an all-sky spectroscopic survey in the near-infrared. 3) Updated Fact Ladder:  - Identity: Determine the name of NASA's latest telescope designed to map the entire celestial sky in infrared light. [Verified] 4) Next Step:  - All prerequisites are now verified. I will proceed to provide the Final Answer using the Answer Contract. Final Answer — Answer Contract: - Premise: True - Verdict: Yes - Direct Answer: NASA's latest telescope that can map the entire celestial sky in infrared light is called SPHEREx. - Key Facts:  - SPHEREx will conduct an all-sky spectroscopic survey in the near-infrared.  - It is scheduled to launch no earlier than February 27, 2025. (Source: jpl.nasa.gov, Jan 31,",NASA's latest telescope that can map the entire celestial sky in infrared light is called SPHEREx.,SPHEREx
9,The German Bundestag has 630 seats.,"1) Prerequisite You Were Checking:  - Verify the current composition of the German Bundestag. 2) Evidence-Based Status:  - Verified. The evidence indicates that the 2025 German federal election was held to elect 630 members of the Bundestag, down from 736 in 2021 due to reforms in seat distribution. 3) Updated Fact Ladder:  - Identity: Verify the current composition of the German Bundestag. [Verified] 4) Next Step:  - All prerequisites are now verified. I will proceed to provide the Final Answer using the Answer Contract. Final Answer — Answer Contract: - Premise: True - Verdict: Yes - Direct Answer: There are currently 630 seats in the German Bundestag as of 2025. - Key Facts: - The 2025 German federal election was held to elect 630 members of the Bundestag. (Source: en.wikipedia.org)",There are currently 630 seats in the German Bundestag as of 2025.,630


In [None]:
!python -m experiments.evaluate_results --input data/results/freshQA_since24_full.csv --output data/results/eval_agent_llm.csv --question_col question --response_cols model_response_agent_v1_direct --mode relaxed-llm --model gpt-5