# FEVER 2.0 Evaluation Code

Assumes the dataset is located at `datasets/fever2-fixers-dev.jsonl`, relative to this notebook's directory.



In [1]:
import pandas as pd
from sklearn.metrics import f1_score

# Load the FEVER 2.0 dev split; the file holds one JSON record per line.
dataset = pd.read_json(
    'datasets/fever2-fixers-dev.jsonl',
    lines=True,
)

# Preview the first rows and report how many rows were loaded.
print(dataset.head(3))
print(f"Total rows: {len(dataset)}")

# Upper-case every label so the validation and scoring below compare against
# a single spelling ("SUPPORTS", "REFUTES", "NOT ENOUGH INFO").
dataset = dataset.assign(label=dataset['label'].str.upper())

def check_claim(row):
    """Validate one dataset row, printing each value that fails a check.

    A row passes when its ``claim`` is a string and its ``label`` is a string
    equal to one of ``SUPPORTS``, ``REFUTES`` or ``NOT ENOUGH INFO``.

    Args:
        row: Object indexable by ``'claim'`` and ``'label'`` (a DataFrame row
            when used through ``DataFrame.apply(..., axis=1)``).

    Returns:
        bool: ``True`` when every check passes, ``False`` otherwise.
    """
    allowed_labels = ('SUPPORTS', 'REFUTES', 'NOT ENOUGH INFO')

    # (field, predicate) pairs, evaluated in order. A non-string label fails
    # both label checks and is therefore printed twice.
    checks = (
        ('claim', lambda value: isinstance(value, str)),
        ('label', lambda value: isinstance(value, str)),
        ('label', lambda value: value in allowed_labels),
    )

    all_passed = True
    for field, passes in checks:
        value = row[field]
        if passes(value):
            continue
        print(value)
        all_passed = False
    return all_passed


# Run the validator over every row: an entry is True when the row has a
# string claim and a label in ("SUPPORTS", "REFUTES", "NOT ENOUGH INFO").
valid_rows = dataset.apply(check_claim, axis=1)

# Report how many rows passed validation.
valid_count = valid_rows.sum()
print(f"Number of valid rows: {valid_count}")

       id            label                                              claim  \
0  500000  NOT ENOUGH INFO  There is a convicted statutory rapist called C...   
1  500001         SUPPORTS  There exists a producer and an actor called Si...   
2  500002          REFUTES  Exotic Birds rejected to be an opening band fo...   

                                            evidence  original_id  \
0                     [[[269158, None, None, None]]]     225798.0   
1                [[[141141, 156349, Simon_Pegg, 0]]]     120126.0   
2  [[[25977, 31918, Exotic_Birds, 2], [25977, 319...          NaN   

     transformation                 attack  \
0  label_preserving      there.is.a.called   
1  label_preserving  there.exists.a.called   
2               NaN       word replacement   

                                          annotation  
0                                                N/A  
1                                                N/A  
2  OK - Claim is grammatical and label supported

In [2]:
from tqdm import tqdm
from core.processing import process_query


def translate_label(label: str) -> str:
    """Map a newsagent verdict onto the corresponding Fever2 label.

    Args:
        label: Verdict produced by the agent.

    Returns:
        ``'SUPPORTS'`` for ``'true'``, ``'REFUTES'`` for ``'false'``, and
        ``'NOT ENOUGH INFO'`` for any other value.
    """
    known_verdicts = (('true', 'SUPPORTS'), ('false', 'REFUTES'))
    for agent_verdict, fever_label in known_verdicts:
        if label == agent_verdict:
            return fever_label
    # Anything that is not an exact 'true'/'false' match is treated as undecided.
    return 'NOT ENOUGH INFO'


async def evaluate_df(df: pd.DataFrame, builtin_tools: 'list[str] | None' = None, user_tool_kwargs=None):
    """Query the agent for every claim in ``df`` and compare with the gold label.

    Rows are processed sequentially: each claim is awaited through
    ``process_query`` before the next one starts.

    Args:
        df: Frame with a ``claim`` column (text sent to the agent) and a
            ``label`` column (gold Fever2 label).
        builtin_tools: Forwarded to ``process_query`` as ``builtin_tools``.
            ``None`` (the default) is replaced by a new empty list.
        user_tool_kwargs: Forwarded to ``process_query`` as
            ``user_tool_kwargs``. ``None`` (the default) is replaced by a new
            empty list.

    Returns:
        A 3-tuple of lists, each with one entry per row of ``df`` in row order:
        the raw agent responses, the predicted Fever2 labels, and booleans
        telling whether the prediction equals the gold label.
    """
    # A `[]` default is created once at definition time and shared by every
    # call; use None as the sentinel and build a fresh list per call so that
    # nothing downstream can leak state between evaluations.
    if builtin_tools is None:
        builtin_tools = []
    if user_tool_kwargs is None:
        user_tool_kwargs = []

    agent_responses, predicted_labels, correct_prediction = [], [], []

    # Iterate the two needed columns directly instead of materialising a row
    # Series per step; `total` keeps the progress bar length.
    for claim, label in tqdm(zip(df['claim'], df['label']), total=len(df)):
        agent_response = await process_query(
            text=claim,
            builtin_tools=builtin_tools,
            user_tool_kwargs=user_tool_kwargs,
        )
        # The response is indexed by 'final_label'; its other contents are not
        # inspected here and are stored as-is.
        predicted_fever_label = translate_label(agent_response['final_label'])

        # Compile results
        agent_responses.append(agent_response)
        predicted_labels.append(predicted_fever_label)
        correct_prediction.append(predicted_fever_label == label)
    return agent_responses, predicted_labels, correct_prediction

# nest_asyncio patches asyncio so that an event loop can be re-entered while it
# is already running. Presumably needed because the agent code starts its own
# loop inside the notebook's loop — TODO confirm against core.processing.
import nest_asyncio
nest_asyncio.apply()



In [3]:
async def run_evaluation(dataset, builtin_tools, user_tool_kwargs):
    """Evaluate the agent on ``dataset`` and print the macro-averaged F1 score.

    Args:
        dataset: Frame with ``claim`` and ``label`` columns; it is copied, so
            the caller's frame is not modified.
        builtin_tools: Forwarded unchanged to ``evaluate_df``.
        user_tool_kwargs: Forwarded unchanged to ``evaluate_df``.

    Returns:
        A ``(results, f1)`` pair: the copied frame extended with the columns
        ``agent_response``, ``predicted_label``, ``is_correct``, ``label_int``
        and ``predicted_label_int``, and the macro F1 score.
    """
    # Integer codes for the three Fever2 classes, used for scoring.
    class_ids = {'REFUTES': 0, 'SUPPORTS': 1, 'NOT ENOUGH INFO': 2}

    # Evaluate every row of a private copy of the input.
    results = dataset.copy()
    responses, predictions, correctness = await evaluate_df(
        results,
        builtin_tools=builtin_tools,
        user_tool_kwargs=user_tool_kwargs,
    )
    per_row_outputs = (
        ('agent_response', responses),
        ('predicted_label', predictions),
        ('is_correct', correctness),
    )
    for column, values in per_row_outputs:
        results[column] = values

    # Encode gold and predicted labels with the same integer codes.
    for source, target in (('label', 'label_int'),
                           ('predicted_label', 'predicted_label_int')):
        results[target] = results[source].map(class_ids)

    f1 = f1_score(
        results['label_int'],
        results['predicted_label_int'],
        average='macro',
    )
    print(f"F1 score: {f1:.4f}")
    return results, f1



In [4]:
# # Compute F1 score for the 3 labels
# # Convert "REFUTES" to 0, "SUPPORTS" to 1, and "NOT ENOUGH INFO" to 2
# label_mapping = {'REFUTES': 0, 'SUPPORTS': 1, 'NOT ENOUGH INFO': 2}
# mini_df['label_int'] = mini_df['label'].map(label_mapping)
# mini_df['predicted_label_int'] = mini_df['predicted_label'].map(label_mapping)

# print(mini_df)
# f1 = f1_score(mini_df['label_int'],
#               mini_df['predicted_label_int'], average='macro')
# print(f"F1 score: {f1:.4f}")

Run the evaluation on the first 100 rows with the calculator, Wikipedia, and web search tools enabled.

In [5]:
# Independent copy of the first 100 rows, used as a small evaluation subset.
mini_df = dataset.head(100).copy()

In [6]:
mini_results, mini_f1 = await run_evaluation(dataset=mini_df, builtin_tools=['calculator', 'web_search', 'wikipedia'], user_tool_kwargs=[])

  0%|          | 0/100 [00:00<?, ?it/s]

Using first search result Chinatown (1974 film)!


  1%|          | 1/100 [00:25<41:23, 25.09s/it]

Using first search result Simon Pegg!
Using first search result Simon Pegg!


  2%|▏         | 2/100 [01:08<58:53, 36.06s/it]

Using first search result Exotic Birds!


  3%|▎         | 3/100 [01:24<43:31, 26.92s/it]

Using first search result The Nice Guys!
Using first search result Neo-noir!
Using first search result Shane Black!


  4%|▍         | 4/100 [02:40<1:14:07, 46.32s/it]

Using first search result Succession of Rupert Murdoch!


  5%|▌         | 5/100 [03:11<1:04:05, 40.48s/it]

Using first search result Taxi (TV series)!


  6%|▌         | 6/100 [03:38<56:34, 36.11s/it]  

Using first search result Omar Khadr!
Using first search result Omar Khadr!


  7%|▋         | 7/100 [04:37<1:07:42, 43.68s/it]

Using first search result Kris Jenner!
Using first search result Kris Jenner!
Using first search result Kim Kardashian, Superstar!


  8%|▊         | 8/100 [05:55<1:23:23, 54.39s/it]

Using first search result Antoine Berjon!


  9%|▉         | 9/100 [06:16<1:06:44, 44.01s/it]

Using first search result Moscovium!


 10%|█         | 10/100 [06:50<1:01:28, 40.98s/it]

Using first search result Fabian Nicieza!
Using first search result Fabian Nicieza!


 11%|█         | 11/100 [07:46<1:07:22, 45.42s/it]

Using first search result Paul von Hindenburg!


 12%|█▏        | 12/100 [08:13<58:29, 39.88s/it]  

Using first search result Intel iAPX 432!


 13%|█▎        | 13/100 [08:38<51:32, 35.55s/it]

Using first search result Eurotas!


 15%|█▌        | 15/100 [09:47<51:18, 36.22s/it]

Using first search result 76th Primetime Emmy Awards!


 16%|█▌        | 16/100 [10:14<46:39, 33.33s/it]

Using first search result Sausage Party!


 17%|█▋        | 17/100 [10:40<43:10, 31.21s/it]

Using first search result Watertown, Massachusetts!


 18%|█▊        | 18/100 [11:08<41:13, 30.17s/it]

Using first search result Artpop!


 19%|█▉        | 19/100 [11:43<42:34, 31.53s/it]

Using first search result Scandal (Japanese band)!


 20%|██        | 20/100 [12:11<40:46, 30.58s/it]

Using first search result Thunderstorm!


 21%|██        | 21/100 [12:40<39:45, 30.20s/it]

Using first search result List of films set on Mars!
Using first search result Probability theory!


 23%|██▎       | 23/100 [13:34<35:08, 27.39s/it]

Using first search result For Your Eyes Only (film)!
Using first search result A View to a Kill!


 24%|██▍       | 24/100 [14:29<45:08, 35.64s/it]

Using first search result Hardee's!


 25%|██▌       | 25/100 [14:46<37:30, 30.01s/it]

Using first search result Portrayal of James Bond in film!
Using first search result Mrs. Doubtfire!


 26%|██▌       | 26/100 [15:41<46:18, 37.54s/it]

Using first search result Aristotle!
Using first search result Aristotle!


 27%|██▋       | 27/100 [16:30<49:57, 41.07s/it]

Using first search result Denis Thatcher!


 28%|██▊       | 28/100 [17:00<45:13, 37.69s/it]

Using first search result Villa Park!
Using first search result FA Community Shield!


 29%|██▉       | 29/100 [17:47<47:49, 40.42s/it]

Using first search result Dawood Ibrahim!


 30%|███       | 30/100 [18:17<43:25, 37.22s/it]

Using first search result Ted Cruz!


 32%|███▏      | 32/100 [18:58<31:51, 28.11s/it]

Using first search result The Times (band)!


 33%|███▎      | 33/100 [19:27<31:38, 28.33s/it]

Using first search result Metro Atlanta!
Using first search result Metro Atlanta!


 35%|███▌      | 35/100 [20:52<36:30, 33.70s/it]

Using first search result QW missile!


 36%|███▌      | 36/100 [21:09<30:44, 28.82s/it]

Using first search result Construction of Mount Rushmore!
Using first search result Bust of Abraham Lincoln (Borglum)!
Using first search result Nimitz-class aircraft carrier!
Using first search result United States Capitol crypt!


 37%|███▋      | 37/100 [22:48<52:13, 49.73s/it]

Using first search result Good Old Days (Macklemore song)!


 38%|███▊      | 38/100 [23:10<42:59, 41.60s/it]

Using first search result Baja 1000!
Using first search result Serena Williams!


 40%|████      | 40/100 [24:16<35:15, 35.25s/it]

Using first search result Richard Fortus!
Using first search result Richard Fortus!


 41%|████      | 41/100 [24:43<32:25, 32.98s/it]

Using first search result Joshua Ferraro!
Using first search result Crown corporation!


 42%|████▏     | 42/100 [25:29<35:37, 36.85s/it]

Using first search result Globalism!


 43%|████▎     | 43/100 [25:50<30:33, 32.17s/it]

Using first search result The Good German!
Using first search result The Good German!


 44%|████▍     | 44/100 [26:25<30:42, 32.89s/it]

Using first search result Augustus Prew!
Using first search result Augustus Prew!


 45%|████▌     | 45/100 [26:50<28:00, 30.56s/it]

Using first search result The Wonder Years!


 46%|████▌     | 46/100 [27:22<27:53, 30.99s/it]

Using first search result Times Higher Education!


 47%|████▋     | 47/100 [27:44<25:05, 28.40s/it]

Using first search result Julia Stiles!
Using first search result Bourne (franchise)!


 49%|████▉     | 49/100 [29:00<26:14, 30.88s/it]

Using first search result Duff McKagan!
Using first search result Steve Jones (musician)!


 50%|█████     | 50/100 [29:51<30:52, 37.06s/it]

Using first search result MS-13!


 51%|█████     | 51/100 [30:25<29:30, 36.12s/it]

Using first search result Buffy the Vampire Slayer!
Using first search result Buffy the Vampire Slayer!


 52%|█████▏    | 52/100 [31:23<34:06, 42.63s/it]

Using first search result Victor Kiriakis!
Using first search result Friends!


 53%|█████▎    | 53/100 [32:12<34:51, 44.49s/it]

Using first search result Demographics of Europe!


 54%|█████▍    | 54/100 [32:42<30:47, 40.16s/it]

Using first search result Congressional Space Medal of Honor!
Using first search result Congressional Space Medal of Honor!


 55%|█████▌    | 55/100 [33:22<30:06, 40.15s/it]

Using first search result Erotic thriller!
Using first search result Paul Schrader!


 56%|█████▌    | 56/100 [34:16<32:32, 44.37s/it]

Using first search result Director Pink!


 57%|█████▋    | 57/100 [34:30<25:20, 35.37s/it]

Using first search result Tony Tarantino!
Using first search result Jack Falahee!


 58%|█████▊    | 58/100 [35:13<26:23, 37.70s/it]

Using first search result Edward de Bono!


 59%|█████▉    | 59/100 [35:45<24:31, 35.90s/it]

Using first search result Richard Dawson!


 60%|██████    | 60/100 [36:14<22:31, 33.78s/it]

Using first search result Prowler (Marvel Comics)!
Using first search result Spider-Man 2 (2023 video game)!


 61%|██████    | 61/100 [37:15<27:17, 41.98s/it]

Using first search result Group of 15!


 62%|██████▏   | 62/100 [37:35<22:18, 35.22s/it]

Using first search result Glee club!


 63%|██████▎   | 63/100 [38:26<24:43, 40.10s/it]

Using first search result Issa Rae!
Using first search result The Witcher (TV series)!


 64%|██████▍   | 64/100 [39:37<29:39, 49.43s/it]

Using first search result Vincent D'Onofrio!


 65%|██████▌   | 65/100 [40:06<25:17, 43.35s/it]

Using first search result Ted Cruz!


 66%|██████▌   | 66/100 [40:37<22:19, 39.40s/it]

Using first search result Odd, West Virginia!
Using first search result West Virginia!


 68%|██████▊   | 68/100 [41:25<16:16, 30.51s/it]

Using first search result Lisa Murkowski!
Using first search result Lisa Murkowski!


 70%|███████   | 70/100 [42:34<15:20, 30.68s/it]

Using first search result The Last Song (2010 film)!


 71%|███████   | 71/100 [43:08<15:15, 31.58s/it]

Using first search result Kendall Jenner!
Using first search result Kendall Jenner!


 72%|███████▏  | 72/100 [44:24<20:55, 44.82s/it]

Using first search result The Last Song (2010 film)!
Using first search result Chatham County, Georgia!


 73%|███████▎  | 73/100 [45:17<21:16, 47.28s/it]

Using first search result Honeymoon (Lana Del Rey album)!


 74%|███████▍  | 74/100 [45:43<17:42, 40.88s/it]

Using first search result Mike Friedrich!


 75%|███████▌  | 75/100 [46:08<15:02, 36.10s/it]

Using first search result Prague!


 76%|███████▌  | 76/100 [46:41<14:07, 35.31s/it]

Using first search result The Exponents!


 77%|███████▋  | 77/100 [47:14<13:14, 34.53s/it]

Using first search result Coeliac disease!
Using first search result Coeliac disease!


 78%|███████▊  | 78/100 [48:00<13:54, 37.92s/it]

Using first search result Blue Jasmine!
Using first search result Andrew Upton!


 79%|███████▉  | 79/100 [48:39<13:25, 38.35s/it]

Using first search result An Education!


 81%|████████  | 81/100 [49:06<07:50, 24.78s/it]

Using first search result List of library associations!
Using first search result American Library Association!


 82%|████████▏ | 82/100 [50:17<11:37, 38.76s/it]

Using first search result Sleepers (film)!


 83%|████████▎ | 83/100 [50:42<09:48, 34.59s/it]

Using first search result Sausage Party!


 84%|████████▍ | 84/100 [51:09<08:36, 32.28s/it]

Using first search result Eric Bana!


 85%|████████▌ | 85/100 [51:41<08:03, 32.24s/it]

Using first search result William Henry Johnson (valet)!
Using first search result Gettysburg Address!


 86%|████████▌ | 86/100 [52:31<08:45, 37.50s/it]

Using first search result The 100 (TV series)!
Using first search result Adolescence!


 87%|████████▋ | 87/100 [53:25<09:12, 42.46s/it]

Using first search result French Indochina!


 88%|████████▊ | 88/100 [53:56<07:47, 38.97s/it]

Using first search result Yugoslavia!


 89%|████████▉ | 89/100 [54:21<06:25, 35.06s/it]

Using first search result Portlandia!


 90%|█████████ | 90/100 [54:41<05:03, 30.33s/it]

Using first search result Delino DeShields Jr.!
Using first search result Delino DeShields Jr.!


 91%|█████████ | 91/100 [55:16<04:47, 31.94s/it]

Using first search result Tangled (franchise)!


 92%|█████████▏| 92/100 [55:36<03:45, 28.23s/it]

Using first search result Sean Gunn!


 93%|█████████▎| 93/100 [56:04<03:17, 28.26s/it]

Using first search result Temple Grandin (film)!


 94%|█████████▍| 94/100 [56:31<02:46, 27.80s/it]

Using first search result The Catcher in the Rye!


 95%|█████████▌| 95/100 [56:57<02:16, 27.37s/it]

Using first search result Giver (TV series)!


 96%|█████████▌| 96/100 [57:19<01:42, 25.70s/it]

Using first search result Charles Manson!


 97%|█████████▋| 97/100 [57:47<01:19, 26.47s/it]

Using first search result Lady Gaga discography!


 98%|█████████▊| 98/100 [58:09<00:50, 25.07s/it]

Using first search result Death Note!


 99%|█████████▉| 99/100 [58:37<00:25, 25.79s/it]

Using first search result Hezbollah!
Using first search result List of countries and territories by number of land borders!
Using first search result List of Mediterranean countries!


100%|██████████| 100/100 [59:26<00:00, 35.66s/it]

F1 score: 0.4177





In [7]:
# Show the macro-F1 at full precision (run_evaluation prints it rounded to four decimals).
print(mini_f1)

0.41769659282290456


Write the F1 score and the per-row results to disk.

In [8]:
# Persist the per-row results (one JSON record per line) ...
mini_results.to_json(
    'mini_results.jsonl',
    orient='records',
    lines=True,
)

# ... and the headline score, rounded to four decimals.
with open('f1.txt', 'w') as f:
    f.write(f"F1: {mini_f1:.4f}\n")