# Politifact Evaluation Code

Assumes the dataset is at `datasets/politifact_factcheck_data.jsonl`, relative to this notebook.

NOTE: the file downloads with a `.json` extension even though its contents are in JSON Lines format, so rename it to `politifact_factcheck_data.jsonl`.



Politifact dataset has 6 `verdict` labels:
* true
* mostly-true
* half-true
* false
* mostly-false
* pants-fire

We will map Politifact labels as follows:
1. `true, mostly-true` to `true`
1. `false, mostly-false, pants-fire` to `false`
1. `half-true` to `mixed`

We will map the NewsAgent label `unknown` to `mixed`, and keep the NewsAgent labels `true` and `false` as-is.

### Checking dataset

1. `statement` value must be a string
1. `verdict` must be one of the 6 Politifact label strings

In [1]:
import pandas as pd
from sklearn.metrics import f1_score
import os
from dotenv import load_dotenv
# Load environment variables from the core .env file; override=True lets values
# from the file replace variables that are already set in this process.
load_dotenv('../../core/.env', override=True)
# Print the model names read from the environment so the run records its configuration.
print(f"Claim Decomposer model: {os.getenv('CLAIM_DECOMPOSER_MODEL')}")
print(f"Research agent model: {os.getenv('RESEARCH_AGENT_MODEL')}")
print(f"Reasoning agent model: {os.getenv('REASONING_AGENT_MODEL')}")
print(f"Verdict agent model: {os.getenv('VERDICT_AGENT_MODEL')}")

# The dataset is stored as JSON Lines (one record per line), hence lines=True.
dataset = pd.read_json('datasets/politifact_factcheck_data.jsonl', lines=True)
print(dataset.head(3))
print(f"Total rows: {len(dataset)}")
# The six verdict strings that a valid Politifact row may carry.
politifact_labels = ['true', 'mostly-true', 'half-true', 'mostly-false', 'false', 'pants-fire']


def check_claim(row):
    """Validate one dataset row, printing one diagnostic per invalid field.

    Args:
        row: Mapping-like object (for example a DataFrame row) that can be
            indexed with 'statement' and 'verdict'.

    Returns:
        True when 'statement' is a str and 'verdict' is one of
        `politifact_labels`; False otherwise.
    """
    ok = True

    statement = row['statement']
    if not isinstance(statement, str):
        print(f"Invalid statement (expected str): {statement!r}")
        ok = False

    verdict = row['verdict']
    # One combined check: a non-string verdict is necessarily outside the label
    # list, so testing the two conditions separately reported the same bad
    # value twice.
    if not isinstance(verdict, str) or verdict not in politifact_labels:
        print(f"Invalid verdict (expected one of {politifact_labels}): {verdict!r}")
        ok = False

    return ok


# Run the row validator on every row (axis=1), giving one boolean per row.
valid_rows = dataset.apply(check_claim, axis=1)

# Count the rows that have a string statement and a verdict that is one of the 6 Politifact labels
print(f"Number of valid rows: {valid_rows.sum()}")

Claim Decomposer model: o3-mini
Research agent model: o3-mini
Reasoning agent model: o3-mini
Verdict agent model: o3-mini
       verdict statement_originator  \
0         true         Barack Obama   
1        false           Matt Gaetz   
2  mostly-true         Kelly Ayotte   

                                           statement statement_date  \
0  John McCain opposed bankruptcy protections for...      6/11/2008   
1  "Bennie Thompson actively cheer-led riots in t...       6/7/2022   
2  Says Maggie Hassan was "out of state on 30 day...      5/18/2016   

  statement_source        factchecker factcheck_date  \
0           speech  Adriel Bettelheim      6/16/2008   
1       television        Yacob Reyes      6/13/2022   
2             news     Clay Wirestone      5/27/2016   

                             factcheck_analysis_link  
0  https://www.politifact.com/factchecks/2008/jun...  
1  https://www.politifact.com/factchecks/2022/jun...  
2  https://www.politifact.com/factchecks/2016/

In [2]:
from tqdm import tqdm
from core.processing import process_query


def translate_label(label: str) -> str:
    """Map a NewsAgent label onto the label set used for evaluation.

    'unknown' becomes 'mixed'; every other label is returned unchanged.
    """
    if label == 'unknown':
        return 'mixed'
    return label
# newsagent_label_mapping = {
#     'true': 'true',
#     'unknown': 'mixed',
#     'mixed': 'mixed',
#     'false': 'false',
# }

# Collapse the six Politifact verdicts into three evaluation labels. The verdicts
# are grouped by their target label; key order matches the verdict scale
# (true ... pants-fire).
politifact_label_mapping = {
    **dict.fromkeys(('true', 'mostly-true'), 'true'),
    'half-true': 'mixed',
    **dict.fromkeys(('mostly-false', 'false', 'pants-fire'), 'false'),
}
# Add the collapsed label as a new column derived from the original verdict.
dataset['label'] = dataset['verdict'].map(politifact_label_mapping)
dataset.head(3)

Unnamed: 0,verdict,statement_originator,statement,statement_date,statement_source,factchecker,factcheck_date,factcheck_analysis_link,label
0,true,Barack Obama,John McCain opposed bankruptcy protections for...,6/11/2008,speech,Adriel Bettelheim,6/16/2008,https://www.politifact.com/factchecks/2008/jun...,True
1,false,Matt Gaetz,"""Bennie Thompson actively cheer-led riots in t...",6/7/2022,television,Yacob Reyes,6/13/2022,https://www.politifact.com/factchecks/2022/jun...,False
2,mostly-true,Kelly Ayotte,"Says Maggie Hassan was ""out of state on 30 day...",5/18/2016,news,Clay Wirestone,5/27/2016,https://www.politifact.com/factchecks/2016/may...,True


In [3]:
async def evaluate_df(df: pd.DataFrame, builtin_tools: "list[str] | None" = None, user_tool_kwargs=None):
    """Run the agent on every row of `df`, one claim at a time, and score each prediction.

    Args:
        df: Frame with a 'statement' column (the claim text sent to the agent)
            and a 'label' column (the label each prediction is compared to).
        builtin_tools: Passed through to `process_query`. Defaults to an empty list.
        user_tool_kwargs: Passed through to `process_query`. Defaults to an empty list.

    Returns:
        A tuple of three lists aligned with the rows of `df`: the raw agent
        responses, the predicted labels after `translate_label`, and booleans
        telling whether each predicted label equals the row's 'label'.
    """
    # Build fresh lists per call rather than sharing mutable default arguments,
    # which would leak any mutation made downstream into later calls.
    builtin_tools = [] if builtin_tools is None else builtin_tools
    user_tool_kwargs = [] if user_tool_kwargs is None else user_tool_kwargs

    agent_responses, predicted_labels, correct_prediction = [], [], []
    # total= is required because zip() has no length for tqdm to read.
    for claim, label in tqdm(zip(df['statement'], df['label']), total=len(df)):
        agent_response = await process_query(text=claim, builtin_tools=builtin_tools, user_tool_kwargs=user_tool_kwargs)
        # Translate the agent's label so it is comparable with the 'label' column.
        predicted_politifact_label = translate_label(agent_response['final_label'])

        # Compile results
        agent_responses.append(agent_response)
        predicted_labels.append(predicted_politifact_label)
        correct_prediction.append(predicted_politifact_label == label)
    return agent_responses, predicted_labels, correct_prediction

# nest_asyncio patches asyncio so an event loop can be re-entered while it is
# already running. NOTE(review): presumably needed because code reached through
# process_query starts its own loop inside the notebook's running loop — confirm.
import nest_asyncio
nest_asyncio.apply()



In [4]:
async def run_evaluation(dataset, builtin_tools, user_tool_kwargs):
    """
    Evaluate the agent on `dataset` and report the macro-averaged F1 score.

    Expects `dataset` to already carry the mapped 'label' column. Returns a
    copy of the input extended with 'agent_response', 'predicted_label' and
    'is_correct' columns, together with the F1 score (which is also printed).
    """
    # Work on a copy so the caller's frame is left untouched.
    scored = dataset.copy()
    responses, predictions, hits = await evaluate_df(
        scored,
        builtin_tools=builtin_tools,
        user_tool_kwargs=user_tool_kwargs,
    )

    # Attach the per-row results in a fixed column order.
    new_columns = (
        ('agent_response', responses),
        ('predicted_label', predictions),
        ('is_correct', hits),
    )
    for column_name, values in new_columns:
        scored[column_name] = values

    # Macro average: unweighted mean of the per-label F1 scores.
    macro_f1 = f1_score(y_true=scored['label'], y_pred=scored['predicted_label'], average='macro')
    print(f"F1 score: {macro_f1:.4f}")
    return scored, macro_f1



Run with calculator, Wikipedia, and web search

In [5]:
# Evaluate on the first 100 rows only; .copy() detaches the subset from `dataset`.
mini_df = dataset.iloc[:100].copy()

In [6]:
# Long-running cell: the recorded run below took about 1h20m for the 100 claims.
mini_results, mini_f1 = await run_evaluation(dataset=mini_df, builtin_tools=['calculator', 'web_search', 'wikipedia_tool'], user_tool_kwargs=[])

 11%|█         | 11/100 [08:36<1:05:59, 44.49s/it]

Error: artifact in tool message didn't validate as list[Evidence]. Attempting to load from message.content. Error message: 1 validation error for RootModel[list[Evidence]]
  Input should be a valid list [type=list_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.11/v/list_type
Error: message.content didn't parse as json to a valid list[Evidence]. Attempting to load from message.content. Error message: 1 validation error for RootModel[list[Evidence]]
  Invalid JSON: expected value at line 1 column 1 [type=json_invalid, input_value='Error: 1 validation erro...ease fix your mistakes.', input_type=str]
    For further information visit https://errors.pydantic.dev/2.11/v/json_invalid


 12%|█▏        | 12/100 [09:25<1:07:10, 45.80s/it]

Error: artifact in tool message didn't validate as list[Evidence]. Attempting to load from message.content. Error message: 1 validation error for RootModel[list[Evidence]]
  Input should be a valid list [type=list_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.11/v/list_type
Error: message.content didn't parse as json to a valid list[Evidence]. Attempting to load from message.content. Error message: 1 validation error for RootModel[list[Evidence]]
  Invalid JSON: expected value at line 1 column 1 [type=json_invalid, input_value="Error: 1 validation erro...ease fix your mistakes.", input_type=str]
    For further information visit https://errors.pydantic.dev/2.11/v/json_invalid


 14%|█▍        | 14/100 [10:38<57:37, 40.20s/it]  

Error: artifact in tool message didn't validate as list[Evidence]. Attempting to load from message.content. Error message: 1 validation error for RootModel[list[Evidence]]
  Input should be a valid list [type=list_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.11/v/list_type
Error: message.content didn't parse as json to a valid list[Evidence]. Attempting to load from message.content. Error message: 1 validation error for RootModel[list[Evidence]]
  Invalid JSON: expected value at line 1 column 1 [type=json_invalid, input_value='Error: 1 validation erro...ease fix your mistakes.', input_type=str]
    For further information visit https://errors.pydantic.dev/2.11/v/json_invalid


 19%|█▉        | 19/100 [15:36<1:14:59, 55.56s/it]

Error during Tavily search: 502 Server Error: Bad Gateway for url: https://api.tavily.com/search


 48%|████▊     | 48/100 [37:41<30:16, 34.93s/it]  

Error: artifact in tool message didn't validate as list[Evidence]. Attempting to load from message.content. Error message: 1 validation error for RootModel[list[Evidence]]
  Input should be a valid list [type=list_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.11/v/list_type
Error: message.content didn't parse as json to a valid list[Evidence]. Attempting to load from message.content. Error message: 1 validation error for RootModel[list[Evidence]]
  Invalid JSON: expected value at line 1 column 1 [type=json_invalid, input_value='Error: 1 validation erro...ease fix your mistakes.', input_type=str]
    For further information visit https://errors.pydantic.dev/2.11/v/json_invalid


100%|██████████| 100/100 [1:19:44<00:00, 47.84s/it]

F1 score: 0.3610





In [7]:
# Show the macro F1 at full precision (run_evaluation prints it rounded to 4 decimals).
print(mini_f1)

0.36103896103896105


Write out results

In [8]:
# Persist the headline score; mode 'w' overwrites any existing file.
with open('politifact_f1.txt', 'w') as f:
    f.write(f"F1: {mini_f1:.4f}\n")

# Save the per-claim results as JSON Lines, one record per row.
mini_results.to_json('politifact_results_o3mini.jsonl', lines=True, orient='records')

In [9]:
from sklearn.metrics import precision_recall_fscore_support

# Macro-averaged precision/recall/F1 over the labels. With `average` set,
# scikit-learn returns None for `support`, so only the first three are printed.
precision, recall, fscore, support = precision_recall_fscore_support(mini_results['label'], mini_results['predicted_label'], average='macro')
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1: {fscore:.4f}")

Precision: 0.3655
Recall: 0.3834
F1: 0.3610


In [11]:
# Load a previously saved results file and compute the same macro metrics for it.
# NOTE(review): the variable name suggests this file holds a Mistral run — confirm.
mistral_results = pd.read_json('politifact_results.jsonl', lines=True)
# This rebinds precision/recall/fscore/support, replacing the values computed
# for mini_results in the previous cell.
precision, recall, fscore, support = precision_recall_fscore_support(mistral_results['label'], mistral_results['predicted_label'], average='macro')
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1: {fscore:.4f}")

Precision: 0.3958
Recall: 0.3366
F1: 0.3330
