# Fever 2.0 Evaluation Code

This notebook assumes the dataset is located at `datasets/fever2-fixers-dev.jsonl`, relative to the notebook's directory.



In [1]:
import pandas as pd
from sklearn.metrics import f1_score

# Load the FEVER 2.0 dev split; `lines=True` makes pandas parse the file as
# JSON Lines (one JSON record per line).
dataset = pd.read_json('datasets/fever2-fixers-dev.jsonl', lines=True)
# Preview the first rows and the total size as a quick sanity check.
print(dataset.head(3))
print(f"Total rows: {len(dataset)}")

# Normalize label column to always uppercase
# (the `.str` accessor yields NaN for non-string values, which `check_claim` below reports).
dataset['label'] = dataset['label'].str.upper()

def check_claim(row):
    """Validate a single dataset row.

    A row is valid when its ``claim`` is a string and its ``label`` is a string
    equal to one of the three Fever2 labels. Every offending value is printed
    as it is found, so a non-string label is printed twice (once for the type
    check, once for the membership check).

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with ``claim`` and
            ``label`` entries.

    Returns:
        True if every check passed, False otherwise.
    """
    allowed_labels = ('SUPPORTS', 'REFUTES', 'NOT ENOUGH INFO')
    failures = 0

    claim_value = row['claim']
    if not isinstance(claim_value, str):
        print(claim_value)
        failures += 1

    label_value = row['label']
    if not isinstance(label_value, str):
        print(label_value)
        failures += 1
    if label_value not in allowed_labels:
        print(label_value)
        failures += 1

    return failures == 0


# Check that every row has a valid claim (str) and label ("SUPPORTS", "REFUTES", "NOT ENOUGH INFO").
# `check_claim` prints each offending value as a side effect; the result is one boolean per row.
valid_rows = dataset.apply(check_claim, axis=1)

# Summing the booleans counts the rows that passed; compare with "Total rows" printed above.
print(f"Number of valid rows: {valid_rows.sum()}")

       id            label                                              claim  \
0  500000  NOT ENOUGH INFO  There is a convicted statutory rapist called C...   
1  500001         SUPPORTS  There exists a producer and an actor called Si...   
2  500002          REFUTES  Exotic Birds rejected to be an opening band fo...   

                                            evidence  original_id  \
0                     [[[269158, None, None, None]]]     225798.0   
1                [[[141141, 156349, Simon_Pegg, 0]]]     120126.0   
2  [[[25977, 31918, Exotic_Birds, 2], [25977, 319...          NaN   

     transformation                 attack  \
0  label_preserving      there.is.a.called   
1  label_preserving  there.exists.a.called   
2               NaN       word replacement   

                                          annotation  
0                                                N/A  
1                                                N/A  
2  OK - Claim is grammatical and label supported

In [2]:
from tqdm import tqdm
from core.processing import process_query


def translate_label(label: str) -> str:
    """Map a newsagent verdict onto the corresponding Fever2 label.

    Args:
        label: Verdict produced by the agent. Only the exact strings
            ``'true'`` and ``'false'`` are recognised.

    Returns:
        ``'SUPPORTS'`` for ``'true'``, ``'REFUTES'`` for ``'false'`` and
        ``'NOT ENOUGH INFO'`` for anything else.
    """
    if label == 'true':
        return 'SUPPORTS'
    if label == 'false':
        return 'REFUTES'
    # Every other verdict (e.g. 'unknown') falls through to the neutral label.
    return 'NOT ENOUGH INFO'


async def evaluate_df(df: pd.DataFrame, builtin_tools: "list[str] | None" = None, user_tool_kwargs=None):
    """Run the agent on every claim in ``df`` and compare with the gold label.

    Args:
        df: Frame with a ``claim`` column (text sent to the agent) and a
            ``label`` column (gold Fever2 label).
        builtin_tools: Names of built-in tools forwarded to ``process_query``.
            ``None`` (the default) is treated as an empty list, i.e. no tools.
        user_tool_kwargs: User tool configuration forwarded unchanged to
            ``process_query``. ``None`` (the default) is treated as an empty list.

    Returns:
        A tuple of three lists aligned with the rows of ``df``: the raw agent
        responses, the predicted Fever2 labels, and booleans telling whether
        each prediction equals the gold label.
    """
    # Create fresh lists on every call rather than relying on mutable default
    # arguments: a default list is a single object shared by all calls, so any
    # mutation by downstream code would leak into later evaluations.
    if builtin_tools is None:
        builtin_tools = []
    if user_tool_kwargs is None:
        user_tool_kwargs = []

    agent_responses, predicted_labels, correct_prediction = [], [], []
    # Queries are awaited one at a time, in row order.
    for _, row in tqdm(df.iterrows(), total=len(df)):
        claim = row['claim']
        label = row['label']

        agent_response = await process_query(text=claim, builtin_tools=builtin_tools, user_tool_kwargs=user_tool_kwargs)
        predicted_fever_label = translate_label(agent_response['final_label'])

        # Compile results
        agent_responses.append(agent_response)
        predicted_labels.append(predicted_fever_label)
        correct_prediction.append(predicted_fever_label == label)
    return agent_responses, predicted_labels, correct_prediction

# nest_asyncio patches asyncio so that an already-running event loop can be re-entered.
# NOTE(review): presumably needed because the notebook kernel already runs a loop and
# code behind `process_query` starts its own — confirm it is still required.
import nest_asyncio
nest_asyncio.apply()



In [3]:
# Test run
mini_df = dataset.iloc[:1].copy()
agent_responses, preds, is_correct_list = await evaluate_df(mini_df)
mini_df['agent_response'] = agent_responses
mini_df['predicted_label'] = preds
mini_df['is_correct'] = is_correct_list

100%|██████████| 1/1 [00:11<00:00, 11.02s/it]


In [4]:
# Compute F1 score for the 3 labels
# Convert "REFUTES" to 0, "SUPPORTS" to 1, and "NOT ENOUGH INFO" to 2
label_mapping = {'REFUTES': 0, 'SUPPORTS': 1, 'NOT ENOUGH INFO': 2}
# NOTE(review): these two assignments add columns to `mini_df` in place, so any
# later cell that receives `mini_df` already sees `label_int` / `predicted_label_int`.
mini_df['label_int'] = mini_df['label'].map(label_mapping)
mini_df['predicted_label_int'] = mini_df['predicted_label'].map(label_mapping)

print(mini_df)
# Macro averaging takes the unweighted mean of the per-class F1 scores.
f1 = f1_score(mini_df['label_int'],
              mini_df['predicted_label_int'], average='macro')
print(f"F1 score: {f1:.4f}")

       id            label                                              claim  \
0  500000  NOT ENOUGH INFO  There is a convicted statutory rapist called C...   

                         evidence  original_id    transformation  \
0  [[[269158, None, None, None]]]     225798.0  label_preserving   

              attack annotation  \
0  there.is.a.called        N/A   

                                      agent_response  predicted_label  \
0  {'final_label': 'unknown', 'final_justificatio...  NOT ENOUGH INFO   

   is_correct  label_int  predicted_label_int  
0        True          2                    2  
F1 score: 1.0000


In [5]:
async def run_evaluation(dataset, builtin_tools, user_tool_kwargs):
    """Evaluate the agent on ``dataset`` and report the macro-averaged F1 score.

    Args:
        dataset: Frame with ``claim`` and ``label`` columns. It is not modified;
            all result columns are added to a copy.
        builtin_tools: Names of built-in tools forwarded to ``evaluate_df``.
        user_tool_kwargs: User tool configuration forwarded to ``evaluate_df``.

    Returns:
        A tuple ``(df, f1)`` where ``df`` is the copy of ``dataset`` extended
        with ``agent_response``, ``predicted_label``, ``is_correct``,
        ``label_int`` and ``predicted_label_int`` columns, and ``f1`` is the
        macro-averaged F1 score over the integer-encoded labels.
    """
    # Evaluate the entire dataset on a copy so the caller's frame stays untouched.
    df = dataset.copy()
    agent_responses, preds, is_correct_list = await evaluate_df(df, builtin_tools=builtin_tools, user_tool_kwargs=user_tool_kwargs)
    df['agent_response'] = agent_responses
    df['predicted_label'] = preds
    df['is_correct'] = is_correct_list

    # Compute F1 score for the 3 labels
    # Convert "REFUTES" to 0, "SUPPORTS" to 1, and "NOT ENOUGH INFO" to 2
    label_mapping = {'REFUTES': 0, 'SUPPORTS': 1, 'NOT ENOUGH INFO': 2}
    df['label_int'] = df['label'].map(label_mapping)
    df['predicted_label_int'] = df['predicted_label'].map(label_mapping)

    # Score the columns just computed on `df`. The input `dataset` either lacks
    # them (KeyError) or carries stale values from an earlier run.
    f1 = f1_score(df['label_int'],
                  df['predicted_label_int'], average='macro')
    print(f"F1 score: {f1:.4f}")
    return df, f1



In [None]:
# Re-run the single-row smoke test through run_evaluation with no tools.
# NOTE(review): `mini_df` already carries the result columns added by the earlier cells.
mini_results, mini_f1 = await run_evaluation(dataset=mini_df, builtin_tools=[], user_tool_kwargs=[])
print(f"F1: {mini_f1}")

100%|██████████| 1/1 [00:10<00:00, 10.89s/it]

F1 score: 1.0000
       id            label                                              claim  \
0  500000  NOT ENOUGH INFO  There is a convicted statutory rapist called C...   

                         evidence  original_id    transformation  \
0  [[[269158, None, None, None]]]     225798.0  label_preserving   

              attack annotation  \
0  there.is.a.called        N/A   

                                      agent_response  predicted_label  \
0  {'final_label': 'unknown', 'final_justificatio...  NOT ENOUGH INFO   

   is_correct  label_int  predicted_label_int  
0        True          2                    2  
1.0





In [7]:
# Show the F1 value returned by the single-row smoke test.
print(mini_f1)

1.0


## Evaluate using no tools

In [None]:
# Baseline: evaluate the full dataset with no built-in tools.
# Long-running: evaluate_df awaits one query per row, sequentially.
no_tools_results, no_tools_f1 = await run_evaluation(dataset=dataset, builtin_tools=[], user_tool_kwargs=[])
print(f"F1: {no_tools_f1}")

 20%|█▉        | 229/1174 [1:10:41<5:05:54, 19.42s/it]

## Evaluate using Wikipedia API

In [None]:
# Full-dataset run with only the 'wikipedia' built-in tool enabled (run_evaluation prints the F1 score).
wiki_results, wiki_f1 = await run_evaluation(dataset=dataset, builtin_tools=['wikipedia'], user_tool_kwargs=[])

## Evaluate using Web Search

In [None]:
# Full-dataset run with only the 'web_search' built-in tool enabled (run_evaluation prints the F1 score).
web_results, web_f1 = await run_evaluation(dataset=dataset, builtin_tools=['web_search'], user_tool_kwargs=[])

## Evaluate using Web & Wikipedia

In [None]:
wiki_web_results, wiki_web_f1 = await run_evaluation(dataset=dataset, builtin_tools=['web_search'], user_tool_kwargs=[])