## Create a sample set to generate a dataset for fine tuning.

First, load the FACTors data.

In [35]:
import pandas as pd

# Read the FACTors fact-check dataset from disk
factors_df = pd.read_csv("Data/FACTors.csv")

# Tally the number of rows per article_id, then split the tally into
# ids that occur exactly once and ids that occur more than once
article_counts = factors_df['article_id'].value_counts()
unique_article_ids = article_counts.loc[article_counts.eq(1)].index
duplicate_article_ids = article_counts.loc[article_counts.gt(1)]

# Retain only the rows whose article_id appears a single time
clean_factors_df = factors_df.loc[factors_df['article_id'].isin(unique_article_ids)]

# Report the effect of the filter
print(f"Original rows: {len(factors_df)}")
print(f"Articles with multiple claims: {len(duplicate_article_ids)}")
print(f"Rows after removing duplicates: {len(clean_factors_df)}")

Original rows: 118112
Articles with multiple claims: 12
Rows after removing duplicates: 117981


In [17]:
# Show how many rows carry each normalised rating after the article_id filter
clean_factors_df['normalised_rating'].value_counts()

normalised_rating
false             77641
partially true    18796
misleading        10165
true               9222
other              1089
unverifiable       1068
Name: count, dtype: int64

## Build a dataset with claims and factchecked answers
First, retrieve a sample of 1,000 claims together with their fact-check articles, divided equally across the four verdicts (250 each).

In [61]:
# Fact-checking organisations whose rows are eligible for sampling
orgs = ["PolitiFact", "AFP Fact Check", "Snopes", "WebQoof", "FactCheck.org"]
# Ratings to sample from; "other" and "unverifiable" also occur in the data but are not listed here
labels = ["true", "false", "partially true", "misleading"]

def get_sample(df, label, n=250, organisations=None, random_state=23):
    """Draw a reproducible random sample of rows with a given rating.

    Args:
        df: DataFrame with ``normalised_rating`` and ``organisation`` columns.
        label: Rating value that sampled rows must have.
        n: Number of rows to draw (without replacement).
        organisations: Iterable of organisation names to restrict to. When
            omitted, the notebook-level ``orgs`` list is used.
        random_state: Seed passed to ``DataFrame.sample`` so repeated calls
            return the same rows.

    Returns:
        A DataFrame of exactly ``n`` rows matching the rating and organisations.

    Raises:
        ValueError: If fewer than ``n`` matching rows are available.
    """
    if organisations is None:
        # Fall back to the notebook-level list so existing calls keep working.
        organisations = orgs

    matches = (df["normalised_rating"] == label) & (df["organisation"].isin(organisations))
    subset = df[matches]

    # Fail with an actionable message instead of pandas' generic sampling error.
    if len(subset) < n:
        raise ValueError(
            f"Only {len(subset)} rows with rating {label!r} for the selected "
            f"organisations; cannot sample {n} without replacement."
        )
    return subset.sample(n, random_state=random_state)

# Draw one sample per rating, then stack them into a single frame with a fresh 0..N-1 index
samples = [get_sample(clean_factors_df, rating) for rating in labels]
sampled_clean_factors_df = pd.concat(objs=samples, axis=0, ignore_index=True)

In [40]:
# Sanity check: each sampled rating should appear 250 times (the default n of get_sample)
sampled_clean_factors_df['normalised_rating'].value_counts()

normalised_rating
true              250
false             250
partially true    250
misleading        250
Name: count, dtype: int64

Retrieve the full articles from their URLs.

In [62]:
from newspaper import Article

def fetch_full_article(url) -> str:
    """
    Download an article and return its full body text.

    Args:
        url: Address of the article to download.

    Returns:
        The parsed article text. If downloading or parsing raises, a
        placeholder string beginning with
        "[Failed to fetch article content from" is returned instead, so that
        later cells can detect the failure by substring match.
    """
    try:
        article = Article(url)
        article.download()
        article.parse()

        return article.text
    except Exception:
        # Best effort: one bad URL must not abort the whole batch.
        return f"[Failed to fetch article content from {url}]"

# Fetch the article text for every URL. Mapping over the 'url' column passes each
# URL straight to the fetcher, instead of building a one-element Series per row.
sampled_clean_factors_df['article'] = sampled_clean_factors_df['url'].map(fetch_full_article)

In [63]:
# Keep only the columns used downstream, then preview the first ten rows
sampled_factchecks_df = sampled_clean_factors_df.loc[
    :, ['claim', 'article', 'url', 'normalised_rating']
]
sampled_factchecks_df.head(n=10)

Unnamed: 0,claim,article,url,normalised_rating
0,A physical book detailing the contents of Hunt...,Claim: A physical book detailing the contents ...,https://www.snopes.com/fact-check/hunter-biden...,True
1,A video taken at a George Floyd protest in Den...,Claim: A video taken at a George Floyd protest...,https://www.snopes.com/fact-check/dpd-car-preg...,True
2,"In September 2020, U.S. President Donald Trump...","Claim: In September 2020, U.S. President Donal...",https://www.snopes.com/fact-check/trump-execut...,True
3,A photograph shows Air Force One during U.S. P...,Claim: A photograph shows Air Force One during...,https://www.snopes.com/fact-check/air-force-on...,True
4,Photographs show the results of a car vs,Claim:\n\nClaim: Photographs show the results ...,https://www.snopes.com/fact-check/moose-story/,True
5,PlayStation and Xbox announced that refunds wo...,Claim: PlayStation and Xbox announced that ref...,https://www.snopes.com/fact-check/cyberpunk-20...,True
6,On-line coupon can be redeemed for a free smoo...,Claim:\n\nClaim: On-line coupon can be redeeme...,https://www.snopes.com/fact-check/jamba-juice-...,True
7,U.S. President Joe Biden wore a hard hat backw...,Claim: U.S. President Joe Biden wore a hard ha...,https://www.snopes.com/fact-check/biden-wear-h...,True
8,"""Nearly 2,000 high schools - roughly 12 percen...",U.S. Rep Bobby Scott cited a staggering statis...,https://www.politifact.com/factchecks/2011/may...,True
9,"""Our state has fewer science, technology, engi...",Is West Virginia trailing its neighbors in sci...,https://www.politifact.com/factchecks/2019/apr...,True


In [64]:
# Check how many articles failed to fetch (rows still holding the failure placeholder)
no_article_df = sampled_factchecks_df[
    sampled_factchecks_df['article'].str.contains("Failed to fetch article content ", na=False)
]
# len() counts rows; .size would return rows x columns and overstate the failures
len(no_article_df)

0

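Generate short justifications for the original verdict, based on the article and the given normalised rating (verdict).
Use GPT-5, often regarded as the best model for various tasks (note: the code cell below currently instantiates `llama3.2` through a local Ollama endpoint instead):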
- https://artificialanalysis.ai/leaderboards/models
- https://www.vellum.ai/llm-leaderboard?utm_source=google&utm_medium=organic
- https://www.shakudo.io/blog/top-9-large-language-models

In [None]:
# Minimal LLM verdict+explanation pipeline ------------------------------------
import pandas as pd
from pydantic import BaseModel, Field
from langchain_core.messages import SystemMessage, HumanMessage
from typing import Literal
from langchain_ollama import ChatOllama

# Low temperature (0.2) for more factual, less varied answers.
# The client targets the llama3.2 model served at http://localhost:11434.
llm = ChatOllama(model="llama3.2", temperature=0.2, base_url="http://localhost:11434")

# System prompt: the model justifies an already-given verdict from the article text only.
# The verdict describes the CLAIM, so the prompt asks why the claim (not the article) earns it.
SYS = """You are a careful fact-checking assistant.
Write ONE or TWO concise sentences (≤50 words) that justify the given VERDICT using only evidence in the ARTICLE.
No outside facts, speculation, or bullet lists, focus on why the claim is TRUE, FALSE, PARTIALLY TRUE or MISLEADING.
You don't have to mention the verdict in your explanation, since this is already given.
If the article does not provide enough information to justify the verdict, say so."""

def infer_verdict_and_expl(claim: str, article: str, normalised_rating: str, max_article_chars: int = 8000):
    """Ask the LLM for a short justification of an already-known verdict.

    Despite the name, no verdict is inferred: ``normalised_rating`` is sent to
    the model as the given verdict and only the explanation is generated.

    Args:
        claim: The claim text.
        article: Full text of the fact-check article.
        normalised_rating: The verdict to justify.
        max_article_chars: Only this many leading characters of ``article``
            are included in the prompt.

    Returns:
        The model's reply with every run of whitespace collapsed to a single
        space, or ``None`` when the article is not a string, is blank, holds
        the failed-fetch placeholder, or the LLM call raises.
    """
    import logging

    # Skip rows with no usable article text instead of prompting the model with nothing.
    if (
        not isinstance(article, str)
        or not article.strip()
        or "Failed to fetch article content" in article
    ):
        return None
    msgs = [
        SystemMessage(content=SYS),
        HumanMessage(content=f'CLAIM: {claim}\nVERDICT: {normalised_rating}\n\nARTICLE:\n""" {article[:max_article_chars]} """')
    ]
    try:
        resp = llm.invoke(msgs)
        text = getattr(resp, "content", str(resp)).strip()
        return " ".join(text.split())  # collapse whitespace
    except Exception as exc:
        # Still best-effort (the row gets None), but leave a trace of what went wrong.
        logging.getLogger(__name__).warning("LLM call failed for claim %r: %s", claim, exc)
        return None

# Work on a copy so the sampled frame itself is left untouched
# (requires columns 'claim', 'article' and 'normalised_rating')
factchecks_df = sampled_factchecks_df.copy()

# Generate one explanation per row, in row order
factchecks_df["short_explanation"] = [
    infer_verdict_and_expl(claim_text, article_text, rating)
    for claim_text, article_text, rating in zip(
        factchecks_df["claim"],
        factchecks_df["article"],
        factchecks_df["normalised_rating"],
    )
]

factchecks_df[["claim", "normalised_rating", "short_explanation"]].head(10)

In [68]:
# Preview the generated explanations next to their claims and verdicts
factchecks_df[["claim", "normalised_rating", "short_explanation"]].head(10)

Unnamed: 0,claim,normalised_rating,short_explanation
0,A physical book detailing the contents of Hunt...,True,The article confirms that a physical copy of t...
1,A video taken at a George Floyd protest in Den...,True,The article confirms that police officers fire...
2,"In September 2020, U.S. President Donald Trump...",True,The claim is partially true because Trump did ...
3,A photograph shows Air Force One during U.S. P...,True,This claim is TRUE because the article states ...
4,Photographs show the results of a car vs,True,The article provides photographs of the accide...
5,PlayStation and Xbox announced that refunds wo...,True,The claim is true because Sony PlayStation and...
6,On-line coupon can be redeemed for a free smoo...,True,The article confirms the existence of a legiti...
7,U.S. President Joe Biden wore a hard hat backw...,True,The claim that U.S. President Joe Biden wore a...
8,"""Nearly 2,000 high schools - roughly 12 percen...",True,The article cites a 2007 report by Robert Balf...
9,"""Our state has fewer science, technology, engi...",True,The article states that West Virginia conferre...


In [None]:
# Save factchecks_df, including the generated short_explanation column, to CSV without the index
factchecks_df.to_csv("Data/factchecks_with_verdicts.csv", index=False)

## Create JSONL messages for finetuning
Next, create messages containing a claim, a verdict, and an explanation, then add Socratic questions to encourage critical thinking and reflection.

In [None]:
# Minimal LLM verdict+explanation pipeline ------------------------------------
import pandas as pd
from pydantic import BaseModel, Field
from langchain_core.messages import SystemMessage, HumanMessage
from typing import Literal
from langchain_ollama import ChatOllama

# NOTE(review): this client is already created with identical arguments in an earlier cell
# of this notebook; confirm whether the repetition is intended.
# Low temperature (0.2) for more factual, less varied answers.
llm = ChatOllama(model="llama3.2", temperature=0.2, base_url="http://localhost:11434")

# NOTE(review): this prompt is also defined in an earlier cell of this notebook; keep the
# two in sync or drop one.
# The verdict describes the CLAIM, so the prompt asks why the claim (not the article) earns it.
SYS = """You are a careful fact-checking assistant.
Write ONE or TWO concise sentences (≤50 words) that justify the given VERDICT using only evidence in the ARTICLE.
No outside facts, speculation, or bullet lists, focus on why the claim is TRUE, FALSE, PARTIALLY TRUE or MISLEADING.
You don't have to mention the verdict in your explanation, since this is already given.
If the article does not provide enough information to justify the verdict, say so."""

def infer_verdict_and_expl(claim: str, article: str, normalised_rating: str, max_article_chars: int = 8000):
    """Ask the LLM for a short justification of an already-known verdict.

    NOTE(review): a function with this name is also defined in an earlier cell
    of this notebook; this later definition shadows it once the cell runs.

    Despite the name, no verdict is inferred: ``normalised_rating`` is sent to
    the model as the given verdict and only the explanation is generated.

    Args:
        claim: The claim text.
        article: Full text of the fact-check article.
        normalised_rating: The verdict to justify.
        max_article_chars: Only this many leading characters of ``article``
            are included in the prompt.

    Returns:
        The model's reply with every run of whitespace collapsed to a single
        space, or ``None`` when the article is not a string, is blank, holds
        the failed-fetch placeholder, or the LLM call raises.
    """
    import logging

    # Skip rows with no usable article text instead of prompting the model with nothing.
    if (
        not isinstance(article, str)
        or not article.strip()
        or "Failed to fetch article content" in article
    ):
        return None
    msgs = [
        SystemMessage(content=SYS),
        HumanMessage(content=f'CLAIM: {claim}\nVERDICT: {normalised_rating}\n\nARTICLE:\n""" {article[:max_article_chars]} """')
    ]
    try:
        resp = llm.invoke(msgs)
        text = getattr(resp, "content", str(resp)).strip()
        return " ".join(text.split())  # collapse whitespace
    except Exception as exc:
        # Still best-effort (the row gets None), but leave a trace of what went wrong.
        logging.getLogger(__name__).warning("LLM call failed for claim %r: %s", claim, exc)
        return None

# NOTE(review): this repeats the "Apply to DataFrame" step from an earlier cell verbatim. It
# rebuilds factchecks_df and calls the LLM for every row again, after the CSV has already
# been written, so the in-memory frame may no longer match the saved file. Confirm whether
# this cell is intended under the JSONL heading.
# Apply to DataFrame (expects columns: 'claim', 'article', 'normalised_rating')
factchecks_df = sampled_factchecks_df.copy()
factchecks_df.loc[:, "short_explanation"] = factchecks_df.apply(
    lambda r: infer_verdict_and_expl(r["claim"], r["article"], r["normalised_rating"]),axis=1
)

factchecks_df[["claim", "normalised_rating", "short_explanation"]].head(10)