# Fever 2.0 Evaluation Code

Assumes the dataset is in `datasets/fever2-fixers-dev.jsonl` relative to this notebook



In [1]:
import pandas as pd
from sklearn.metrics import f1_score

# Load the dev split: a JSON Lines file, path relative to this notebook.
DATASET_PATH = 'datasets/fever2-fixers-dev.jsonl'
dataset = pd.read_json(DATASET_PATH, lines=True)

# Quick look at the raw records before any normalisation is applied.
print(dataset.head(3))
print(f"Total rows: {dataset.shape[0]}")

# Upper-case every label so that later comparisons against the
# 'SUPPORTS' / 'REFUTES' / 'NOT ENOUGH INFO' constants are exact matches.
dataset = dataset.assign(label=dataset['label'].str.upper())

def check_claim(row):
    """Validate one dataset row and report whether it is usable.

    A row is valid when ``row['claim']`` is a ``str`` and ``row['label']`` is a
    ``str`` equal to one of 'SUPPORTS', 'REFUTES' or 'NOT ENOUGH INFO'.

    Each failed check prints the offending value, so a non-string label is
    printed twice (once for the type check, once for the membership check).

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with 'claim' and
            'label' entries.

    Returns:
        True if every check passed, False otherwise.
    """
    claim = row['claim']
    claim_is_text = isinstance(claim, str)
    if not claim_is_text:
        print(claim)

    label = row['label']
    label_is_text = isinstance(label, str)
    if not label_is_text:
        print(label)

    label_is_known = label in ('SUPPORTS', 'REFUTES', 'NOT ENOUGH INFO')
    if not label_is_known:
        print(label)

    return claim_is_text and label_is_text and label_is_known


# Check that every row has a valid claim (str) and label ("SUPPORTS", "REFUTES", "NOT ENOUGH INFO").
# check_claim prints each offending value as a side effect of this pass.
valid_rows = dataset.apply(check_claim, axis=1)

valid_row_count = valid_rows.sum()
print(f"Number of valid rows: {valid_row_count}")

       id            label                                              claim  \
0  500000  NOT ENOUGH INFO  There is a convicted statutory rapist called C...   
1  500001         SUPPORTS  There exists a producer and an actor called Si...   
2  500002          REFUTES  Exotic Birds rejected to be an opening band fo...   

                                            evidence  original_id  \
0                     [[[269158, None, None, None]]]     225798.0   
1                [[[141141, 156349, Simon_Pegg, 0]]]     120126.0   
2  [[[25977, 31918, Exotic_Birds, 2], [25977, 319...          NaN   

     transformation                 attack  \
0  label_preserving      there.is.a.called   
1  label_preserving  there.exists.a.called   
2               NaN       word replacement   

                                          annotation  
0                                                N/A  
1                                                N/A  
2  OK - Claim is grammatical and label supported

In [2]:
from tqdm import tqdm
from core.processing import process_query


def translate_label(label: str) -> str:
    """Map a newsagent verdict onto the corresponding Fever2 label.

    Matching is exact and case-sensitive: 'true' becomes 'SUPPORTS', 'false'
    becomes 'REFUTES', and every other value falls back to 'NOT ENOUGH INFO'.
    """
    known_verdicts = (('true', 'SUPPORTS'), ('false', 'REFUTES'))
    for agent_verdict, fever_label in known_verdicts:
        if label == agent_verdict:
            return fever_label
    return 'NOT ENOUGH INFO'


async def evaluate_df(df: pd.DataFrame, builtin_tools: "list[str] | None" = None, user_tool_kwargs=None):
    """Run the agent on every claim in ``df`` and score it against the gold label.

    Rows are processed sequentially, with one awaited ``process_query`` call
    per claim. An exception raised by ``process_query`` propagates and aborts
    the whole run.

    Args:
        df: Frame with a 'claim' column (text sent to the agent) and a 'label'
            column (gold Fever2 label).
        builtin_tools: Forwarded to ``process_query`` as ``builtin_tools``.
            When omitted, a fresh empty list is used.
        user_tool_kwargs: Forwarded to ``process_query`` as
            ``user_tool_kwargs``. When omitted, a fresh empty list is used.

    Returns:
        A tuple of three lists aligned with the rows of ``df``: the raw agent
        responses, the predicted Fever2 labels, and booleans telling whether
        each prediction equals the gold label.
    """
    # Build the defaults per call. The former ``=[]`` defaults were single
    # shared objects handed straight to process_query, so any mutation there
    # would have leaked into every later call.
    if builtin_tools is None:
        builtin_tools = []
    if user_tool_kwargs is None:
        user_tool_kwargs = []

    agent_responses, predicted_labels, correct_prediction = [], [], []
    # Iterate the two needed columns directly instead of materialising a
    # full row Series per index; `total` keeps the progress bar sized.
    for claim, label in tqdm(zip(df['claim'], df['label']), total=len(df)):
        agent_response = await process_query(text=claim, builtin_tools=builtin_tools, user_tool_kwargs=user_tool_kwargs)
        predicted_fever_label = translate_label(agent_response['final_label'])

        # Compile results
        agent_responses.append(agent_response)
        predicted_labels.append(predicted_fever_label)
        correct_prediction.append(predicted_fever_label == label)
    return agent_responses, predicted_labels, correct_prediction

# Applied once for the whole notebook. Presumably this lets code that starts its
# own event loop run inside the notebook's already-running loop — TODO confirm
# it is still required, since the cells below call the evaluation with top-level `await`.
import nest_asyncio
nest_asyncio.apply()



In [3]:
async def run_evaluation(dataset, builtin_tools, user_tool_kwargs):
    """Evaluate the agent on ``dataset`` and report the macro-averaged F1.

    Args:
        dataset: Frame with 'claim' and 'label' columns; it is copied, so the
            caller's frame is left unmodified.
        builtin_tools: Passed through to ``evaluate_df``.
        user_tool_kwargs: Passed through to ``evaluate_df``.

    Returns:
        ``(df, f1)`` where ``df`` is the copy extended with the columns
        'agent_response', 'predicted_label', 'is_correct', 'label_int' and
        'predicted_label_int', and ``f1`` is the macro F1 score (also printed
        rounded to four decimals).
    """
    df = dataset.copy()

    # Evaluate the entire dataset and attach the per-row results.
    responses, predictions, correctness = await evaluate_df(
        df,
        builtin_tools=builtin_tools,
        user_tool_kwargs=user_tool_kwargs,
    )
    df['agent_response'] = responses
    df['predicted_label'] = predictions
    df['is_correct'] = correctness

    # Encode both label columns as integers for scoring:
    # "REFUTES" -> 0, "SUPPORTS" -> 1, "NOT ENOUGH INFO" -> 2.
    label_mapping = {'REFUTES': 0, 'SUPPORTS': 1, 'NOT ENOUGH INFO': 2}
    for text_column, int_column in (('label', 'label_int'),
                                    ('predicted_label', 'predicted_label_int')):
        df[int_column] = df[text_column].map(label_mapping)

    f1 = f1_score(df['label_int'], df['predicted_label_int'], average='macro')
    print(f"F1 score: {f1:.4f}")
    return df, f1



In [4]:
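# NOTE: Scratch cell kept for reference only; the same F1 computation now lives in run_evaluation above.
# # Compute F1 score for the 3 labels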
# # Convert "REFUTES" to 0, "SUPPORTS" to 1, and "NOT ENOUGH INFO" to 2
# label_mapping = {'REFUTES': 0, 'SUPPORTS': 1, 'NOT ENOUGH INFO': 2}
# mini_df['label_int'] = mini_df['label'].map(label_mapping)
# mini_df['predicted_label_int'] = mini_df['predicted_label'].map(label_mapping)

# print(mini_df)
# f1 = f1_score(mini_df['label_int'],
#               mini_df['predicted_label_int'], average='macro')
# print(f"F1 score: {f1:.4f}")

Run with calculator, Wikipedia, and web search

In [8]:
# Independent copy of the first 100 rows, used as a small evaluation subset.
mini_df = dataset.head(100).copy()

In [9]:
mini_results, mini_f1 = await run_evaluation(dataset=mini_df, builtin_tools=['calculator', 'web_search', 'wikipedia'], user_tool_kwargs=[])

  8%|▊         | 8/100 [06:11<1:19:42, 51.98s/it]

Error fetching page 'Berjon': "Berjon" may refer to: 
Saúl Berjón
Antoine Berjon
Robin Berjon


 12%|█▏        | 12/100 [08:03<48:44, 33.23s/it] 

Error: artifact in tool message didn't validate as list[Evidence]. Attempting to load from message.content. Error message: 1 validation error for RootModel[list[Evidence]]
  Input should be a valid list [type=list_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.11/v/list_type
Error: message.content didn't parse as json to a valid list[Evidence]. Attempting to load from message.content. Error message: 1 validation error for RootModel[list[Evidence]]
  Invalid JSON: expected value at line 1 column 1 [type=json_invalid, input_value="Error: 1 validation erro...ease fix your mistakes.", input_type=str]
    For further information visit https://errors.pydantic.dev/2.11/v/json_invalid
Error: artifact in tool message didn't validate as list[Evidence]. Attempting to load from message.content. Error message: 1 validation error for RootModel[list[Evidence]]
  Input should be a valid list [type=list_type, input_value=None, input_type=Non

 15%|█▌        | 15/100 [11:40<1:30:07, 63.62s/it]

Error: artifact in tool message didn't validate as list[Evidence]. Attempting to load from message.content. Error message: 1 validation error for RootModel[list[Evidence]]
  Input should be a valid list [type=list_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.11/v/list_type
Error: message.content didn't parse as json to a valid list[Evidence]. Attempting to load from message.content. Error message: 1 validation error for RootModel[list[Evidence]]
  Invalid JSON: expected value at line 1 column 1 [type=json_invalid, input_value="Error: 1 validation erro...ease fix your mistakes.", input_type=str]
    For further information visit https://errors.pydantic.dev/2.11/v/json_invalid
Error: artifact in tool message didn't validate as list[Evidence]. Attempting to load from message.content. Error message: 1 validation error for RootModel[list[Evidence]]
  Input should be a valid list [type=list_type, input_value=None, input_type=Non

 16%|█▌        | 16/100 [13:50<1:57:01, 83.58s/it]

Error: artifact in tool message didn't validate as list[Evidence]. Attempting to load from message.content. Error message: 1 validation error for RootModel[list[Evidence]]
  Input should be a valid list [type=list_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.11/v/list_type
Error: message.content didn't parse as json to a valid list[Evidence]. Attempting to load from message.content. Error message: 1 validation error for RootModel[list[Evidence]]
  Invalid JSON: expected value at line 1 column 1 [type=json_invalid, input_value='Error: 1 validation erro...ease fix your mistakes.', input_type=str]
    For further information visit https://errors.pydantic.dev/2.11/v/json_invalid


 20%|██        | 20/100 [16:35<1:02:01, 46.51s/it]

Error fetching page 'Thunderstorm (disambiguation)': "Thunderstorm (disambiguation)" may refer to: 
Thunderstorm (band)
Thunderstorm (album)
Thunderstorm (film)
Thunderstorm (play)
Thunderstorm (opera)
The Thunderstorm
"Thunderstorm" (Peppa Pig)
Thunderstorms and Neon Signs
Electrical storm (disambiguation)
Storm (disambiguation)
Thunder (disambiguation)


 22%|██▏       | 22/100 [17:40<51:56, 39.96s/it]  

Error: artifact in tool message didn't validate as list[Evidence]. Attempting to load from message.content. Error message: 1 validation error for RootModel[list[Evidence]]
  Input should be a valid list [type=list_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.11/v/list_type
Error: message.content didn't parse as json to a valid list[Evidence]. Attempting to load from message.content. Error message: 1 validation error for RootModel[list[Evidence]]
  Invalid JSON: expected value at line 1 column 1 [type=json_invalid, input_value='Error: 1 validation erro...ease fix your mistakes.', input_type=str]
    For further information visit https://errors.pydantic.dev/2.11/v/json_invalid
Error: artifact in tool message didn't validate as list[Evidence]. Attempting to load from message.content. Error message: 1 validation error for RootModel[list[Evidence]]
  Input should be a valid list [type=list_type, input_value=None, input_type=Non

 25%|██▌       | 25/100 [20:12<1:00:38, 48.51s/it]

Error: artifact in tool message didn't validate as list[Evidence]. Attempting to load from message.content. Error message: 1 validation error for RootModel[list[Evidence]]
  Input should be a valid list [type=list_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.11/v/list_type
Error: message.content didn't parse as json to a valid list[Evidence]. Attempting to load from message.content. Error message: 1 validation error for RootModel[list[Evidence]]
  Invalid JSON: expected value at line 1 column 1 [type=json_invalid, input_value="Error: 1 validation erro...ease fix your mistakes.", input_type=str]
    For further information visit https://errors.pydantic.dev/2.11/v/json_invalid
Error: artifact in tool message didn't validate as list[Evidence]. Attempting to load from message.content. Error message: 1 validation error for RootModel[list[Evidence]]
  Input should be a valid list [type=list_type, input_value=None, input_type=Non

 37%|███▋      | 37/100 [31:13<58:23, 55.61s/it]  

Error: artifact in tool message didn't validate as list[Evidence]. Attempting to load from message.content. Error message: 1 validation error for RootModel[list[Evidence]]
  Input should be a valid list [type=list_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.11/v/list_type
Error: message.content didn't parse as json to a valid list[Evidence]. Attempting to load from message.content. Error message: 1 validation error for RootModel[list[Evidence]]
  Invalid JSON: expected value at line 1 column 1 [type=json_invalid, input_value="Error: 1 validation erro...ease fix your mistakes.", input_type=str]
    For further information visit https://errors.pydantic.dev/2.11/v/json_invalid


 70%|███████   | 70/100 [57:26<18:19, 36.64s/it]  

Error: artifact in tool message didn't validate as list[Evidence]. Attempting to load from message.content. Error message: 1 validation error for RootModel[list[Evidence]]
  Input should be a valid list [type=list_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.11/v/list_type
Error: message.content didn't parse as json to a valid list[Evidence]. Attempting to load from message.content. Error message: 1 validation error for RootModel[list[Evidence]]
  Invalid JSON: expected value at line 1 column 1 [type=json_invalid, input_value="Error: 1 validation erro...ease fix your mistakes.", input_type=str]
    For further information visit https://errors.pydantic.dev/2.11/v/json_invalid
Error: artifact in tool message didn't validate as list[Evidence]. Attempting to load from message.content. Error message: 1 validation error for RootModel[list[Evidence]]
  Input should be a valid list [type=list_type, input_value=None, input_type=Non

100%|██████████| 100/100 [1:17:20<00:00, 46.40s/it]

F1 score: 0.5333





In [10]:
# Show the unrounded macro-F1 returned by run_evaluation (the function itself prints it to 4 decimals).
print(mini_f1)

0.5332563951979677


Write out results

In [11]:
# Persist the headline score as a one-line text file.
f1_report_line = f"F1: {mini_f1:.4f}\n"
with open('f1.txt', 'w') as f:
    f.write(f1_report_line)

# Persist the per-claim results (including raw agent responses) as JSON Lines,
# one record per row.
mini_results.to_json(
    'mini_results_o3mini.jsonl',
    orient='records',
    lines=True,
)

In [23]:
from sklearn.metrics import precision_recall_fscore_support

# Macro-averaged precision / recall / F-score computed directly on the string labels.
precision, recall, fscore, support = precision_recall_fscore_support(
    mini_results['label'],
    mini_results['predicted_label'],
    average='macro',
)

In [None]:
# Report all three macro-averaged metrics. The F-score was computed alongside
# precision and recall but never shown; printing it matches the three-line
# report used for the other results file later in this notebook.
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1: {fscore:.4f}")

None
Precision: 0.5390891952397786
Recall: 0.548611111111111


In [25]:
# Reload results saved to 'mini_results.jsonl' and score them the same way.
# Note: this rebinds precision / recall / fscore / support from the cell above.
mistral_results = pd.read_json('mini_results.jsonl', lines=True)

mistral_gold = mistral_results['label']
mistral_predicted = mistral_results['predicted_label']
precision, recall, fscore, support = precision_recall_fscore_support(
    mistral_gold,
    mistral_predicted,
    average='macro',
)

print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1: {fscore:.4f}")

Precision: 0.4437
Recall: 0.4225
F1: 0.4177
