# FEVER 2.0 Evaluation Code

This notebook assumes the dataset is located at `datasets/fever2-fixers-dev.jsonl`, relative to the notebook's directory.



In [1]:
import pandas as pd
from sklearn.metrics import f1_score

# Load the evaluation set; lines=True reads the file as JSON Lines (one record per line).
dataset = pd.read_json('datasets/fever2-fixers-dev.jsonl', lines=True)
# Preview the first three rows and report the dataset size as a quick sanity check.
print(dataset.head(3))
print(f"Total rows: {len(dataset)}")

# Normalize the label column to uppercase so it can be compared against the
# uppercase label names used by check_claim and translate_label. This runs after
# the preview above, so the printed rows show the labels as stored in the file.
dataset['label'] = dataset['label'].str.upper()

def check_claim(row):
    """Validate one dataset row and report whether it is usable for evaluation.

    A row is valid when its ``claim`` is a string and its ``label`` is one of
    'SUPPORTS', 'REFUTES' or 'NOT ENOUGH INFO'. Each offending value is
    printed exactly once so the output can be scanned for bad rows.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with ``claim`` and
            ``label`` entries.

    Returns:
        True if both fields are valid, False otherwise.
    """
    valid_labels = ('SUPPORTS', 'REFUTES', 'NOT ENOUGH INFO')
    claim, label = row['claim'], row['label']

    claim_ok = isinstance(claim, str)
    if not claim_ok:
        print(claim)

    # Check the type first and only test membership for strings: a non-string
    # label can never be a valid label, and reporting it once is enough
    # (previously it was printed by both the type check and the membership check).
    label_ok = isinstance(label, str) and label in valid_labels
    if not label_ok:
        print(label)

    return claim_ok and label_ok


# Run check_claim on every row (axis=1); it returns one boolean per row.
valid_rows = dataset.apply(check_claim, axis=1)

# Every row should have a valid claim (str) and label ("SUPPORTS", "REFUTES", "NOT ENOUGH INFO"),
# so this count is expected to equal the total row count printed above.
print(f"Number of valid rows: {valid_rows.sum()}")

       id            label                                              claim  \
0  500000  NOT ENOUGH INFO  There is a convicted statutory rapist called C...   
1  500001         SUPPORTS  There exists a producer and an actor called Si...   
2  500002          REFUTES  Exotic Birds rejected to be an opening band fo...   

                                            evidence  original_id  \
0                     [[[269158, None, None, None]]]     225798.0   
1                [[[141141, 156349, Simon_Pegg, 0]]]     120126.0   
2  [[[25977, 31918, Exotic_Birds, 2], [25977, 319...          NaN   

     transformation                 attack  \
0  label_preserving      there.is.a.called   
1  label_preserving  there.exists.a.called   
2               NaN       word replacement   

                                          annotation  
0                                                N/A  
1                                                N/A  
2  OK - Claim is grammatical and label supported

In [2]:
from tqdm import tqdm
from core.processing import process_query


def translate_label(label: str) -> str:
    """Map a newsagent verdict onto the corresponding Fever2 label.

    'true' becomes 'SUPPORTS' and 'false' becomes 'REFUTES'; every other
    value is reported as 'NOT ENOUGH INFO'.
    """
    for agent_verdict, fever_label in (('true', 'SUPPORTS'), ('false', 'REFUTES')):
        if label == agent_verdict:
            return fever_label
    return 'NOT ENOUGH INFO'


async def evaluate_df(df: pd.DataFrame):
    """Send every claim in ``df`` through process_query and score the verdicts.

    Claims are processed one at a time, in row order; each query is awaited
    before the next one starts.

    Returns:
        Three lists aligned with the rows of ``df``: the raw agent responses,
        the predicted labels translated with translate_label, and booleans
        telling whether each translated prediction equals the row's label.
    """
    agent_responses = []
    predicted_labels = []
    correct_prediction = []

    # Walk the two needed columns in lockstep; total= keeps the progress bar
    # sized to the number of rows since zip() has no length.
    claim_label_pairs = zip(df['claim'], df['label'])
    for claim, label in tqdm(claim_label_pairs, total=len(df)):
        agent_response = await process_query(text=claim, selected_sources=None)
        predicted_fever_label = translate_label(agent_response['final_label'])

        # Compile results
        agent_responses.append(agent_response)
        predicted_labels.append(predicted_fever_label)
        correct_prediction.append(predicted_fever_label == label)

    return agent_responses, predicted_labels, correct_prediction

import nest_asyncio
# Patch asyncio to allow nested event loops. Presumably needed because the
# notebook kernel already runs a loop while the agent code drives its own
# (TODO confirm against core.processing).
nest_asyncio.apply()



In [3]:
# TODO: figure out why this example makes the verdict agent return invalid JSON.
dataset.iloc[0]

# TODO: figure out why this example causes newsagent to hang.
# dataset.iloc[2]

id                                                           500000
label                                               NOT ENOUGH INFO
claim             There is a convicted statutory rapist called C...
evidence                             [[[269158, None, None, None]]]
original_id                                                225798.0
transformation                                     label_preserving
attack                                            there.is.a.called
annotation                                                      N/A
Name: 0, dtype: object

In [4]:
# Test run on the first row only.
# .copy() gives an independent frame, so the columns added below do not touch `dataset`.
mini_df = dataset.iloc[:1].copy()
# Top-level await: relies on the notebook's support for awaiting inside a cell.
agent_responses, preds, is_correct_list = await evaluate_df(mini_df)
# Attach the per-row results next to the gold labels.
mini_df['agent_response'] = agent_responses
mini_df['predicted_label'] = preds
mini_df['is_correct'] = is_correct_list

  0%|          | 0/1 [00:00<?, ?it/s]

Using first search result Chinatown (1974 film)!


100%|██████████| 1/1 [00:26<00:00, 26.14s/it]

JSON decode error: Expecting value: line 1 column 1 (char 0)
Raw response content: 
MESSAGES:

You are an expert analyst. The user will give you some text. Your task is to identify and extract all claims made in the text.

### Instructions: Read the entire document carefully and list each individual factual claim. Each claim should be a complete statement that can be fact-checked.

- A claim is any statement that can be objectively verified as true or false, regardless of whether it is actually true.
- IGNORE opinions, subjective interpretations, or ambiguous statements.
- IGNORE existential or trivial statements that are nearly always true (e.g. 'some people disagree with the policy')
- Remember to remain objective and avoid inserting any personal interpretation or bias when extracting claims.

Output Format:
- Return your results as a JSON array of strings.
- Each string in the array must contain one claim.
- Do not include any additional text, explanations, or commentary—only the JS




In [None]:
# Prepare integer-encoded labels for computing the F1 score over the 3 labels


# Convert "REFUTES" to 0, "SUPPORTS" to 1, and "NOT ENOUGH INFO" to 2
label_mapping = {'REFUTES': 0, 'SUPPORTS': 1, 'NOT ENOUGH INFO': 2}
# Series.map yields NaN for any value missing from label_mapping, so an
# unexpected label shows up as NaN in the *_int columns printed below.
mini_df['label_int'] = mini_df['label'].map(label_mapping)
mini_df['predicted_label_int'] = mini_df['predicted_label'].map(label_mapping)

print(mini_df)


In [None]:
# Macro-averaged F1 on the test-run subset (mini_df): the unweighted mean of
# the per-label F1 scores, so every label counts equally regardless of frequency.
f1 = f1_score(mini_df['label_int'], mini_df['predicted_label_int'], average='macro')
print(f"F1 score: {f1:.4f}")