## Create a sample set to generate a dataset for fine-tuning

First, load the FACTors data.

In [6]:
import pandas as pd

# Read the raw FACTors export
factors_df = pd.read_csv("Data/FACTors.csv")

# Tally how many rows each article_id contributes, then split the tally into
# ids that appear more than once and ids that appear exactly once
article_counts = factors_df['article_id'].value_counts()
seen_once = article_counts == 1
duplicate_article_ids = article_counts[article_counts > 1]
unique_article_ids = article_counts[seen_once].index

# Keep only the rows whose article_id appears exactly once
keep_row = factors_df['article_id'].isin(unique_article_ids)
clean_factors_df = factors_df[keep_row]

# Report the effect of the filter (one message per line)
print(
    f"Original rows: {len(factors_df)}",
    f"Articles with multiple claims: {len(duplicate_article_ids)}",
    f"Rows after removing duplicates: {len(clean_factors_df)}",
    sep="\n",
)

Original rows: 118112
Articles with multiple claims: 12
Rows after removing duplicates: 117981


In [7]:
# Show how many rows carry each normalised verdict label in the cleaned frame
clean_factors_df['normalised_rating'].value_counts()

normalised_rating
false             77641
partially true    18796
misleading        10165
true               9222
other              1089
unverifiable       1068
Name: count, dtype: int64

## Build a dataset with claims and factchecked answers
First retrieve a sample of 1,000 claims with their fact-check articles, balanced equally across the four verdict labels (250 per label).

In [8]:
# Fact-checking organisations whose rows are eligible for sampling
orgs = ["PolitiFact", "AFP Fact Check", "Snopes", "WebQoof", "FactCheck.org"]
# Verdict labels to sample; "other" and "unverifiable" are not listed here
labels = ["true", "false", "partially true", "misleading"]

def get_sample(df, label, n=250, organisations=None, random_state=23):
    """Draw a reproducible random sample of rows that carry one verdict label.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns ``normalised_rating`` and ``organisation``.
    label : str
        Value of ``normalised_rating`` to select.
    n : int, default 250
        Number of rows to draw (without replacement).
    organisations : list-like of str, optional
        Organisations whose rows are eligible. When omitted, the
        module-level ``orgs`` list is used.
    random_state : int, default 23
        Seed passed to ``DataFrame.sample`` so the draw is repeatable.

    Returns
    -------
    pandas.DataFrame
        ``n`` rows of ``df`` matching the label and organisation filter.

    Raises
    ------
    ValueError
        If fewer than ``n`` rows match the filter.
    """
    allowed = orgs if organisations is None else list(organisations)
    subset = df[(df["normalised_rating"] == label) & (df["organisation"].isin(allowed))]
    # Fail with a message that names the label instead of pandas' generic
    # "larger sample than population" error.
    if len(subset) < n:
        raise ValueError(
            f"Cannot sample {n} rows for label {label!r}: only {len(subset)} rows match"
        )
    return subset.sample(n, random_state=random_state)

# Draw one sample per verdict label, then stack them into a single frame
samples = list(map(lambda label: get_sample(clean_factors_df, label), labels))
sampled_clean_factors_df = pd.concat(samples, ignore_index=True)

In [9]:
# Check the per-label row counts of the combined sample
sampled_clean_factors_df['normalised_rating'].value_counts()

normalised_rating
true              250
false             250
partially true    250
misleading        250
Name: count, dtype: int64

Retrieve the full articles from the URL.

In [10]:
from newspaper import Article

def fetch_full_article(url) -> str:
    """
    Download the page at ``url`` and return the parsed article text.

    Args:
        url: Address of the article to download.

    Returns:
        The article's text. If downloading or parsing raises, or the parsed
        text is empty, a placeholder string of the form
        "[Failed to fetch article content from <url>]" is returned instead,
        so callers can detect failures by searching for
        "Failed to fetch article content".
    """
    failure_marker = f"[Failed to fetch article content from {url}]"
    try:
        article = Article(url)
        article.download()
        article.parse()
        text = article.text
    except Exception:
        # Best effort: one unreachable page must not abort the whole batch.
        return failure_marker

    # An empty body is as unusable as a failed download; flag it the same way
    # so the failure count downstream reflects it.
    if not isinstance(text, str) or not text.strip():
        return failure_marker
    return text

# Fetch the article text for every URL in the sample.
# Mapping over the 'url' column yields a plain Series of strings; the previous
# row-wise apply wrapped each result in a pd.Series and produced a one-column
# DataFrame instead.
sampled_clean_factors_df['article'] = sampled_clean_factors_df['url'].map(fetch_full_article)

In [22]:
# Keep only the columns needed for the fact-check dataset and preview them
factcheck_columns = ['claim', 'article', 'url', 'normalised_rating']
sampled_factchecks_df = sampled_clean_factors_df[factcheck_columns]
sampled_factchecks_df.head(10)

Unnamed: 0,claim,article,url,normalised_rating
0,A physical book detailing the contents of Hunt...,Claim: A physical book detailing the contents ...,https://www.snopes.com/fact-check/hunter-biden...,True
1,A video taken at a George Floyd protest in Den...,Claim: A video taken at a George Floyd protest...,https://www.snopes.com/fact-check/dpd-car-preg...,True
2,"In September 2020, U.S. President Donald Trump...","Claim: In September 2020, U.S. President Donal...",https://www.snopes.com/fact-check/trump-execut...,True
3,A photograph shows Air Force One during U.S. P...,Claim: A photograph shows Air Force One during...,https://www.snopes.com/fact-check/air-force-on...,True
4,Photographs show the results of a car vs,Claim:\n\nClaim: Photographs show the results ...,https://www.snopes.com/fact-check/moose-story/,True
5,PlayStation and Xbox announced that refunds wo...,Claim: PlayStation and Xbox announced that ref...,https://www.snopes.com/fact-check/cyberpunk-20...,True
6,On-line coupon can be redeemed for a free smoo...,Claim:\n\nClaim: On-line coupon can be redeeme...,https://www.snopes.com/fact-check/jamba-juice-...,True
7,U.S. President Joe Biden wore a hard hat backw...,Claim: U.S. President Joe Biden wore a hard ha...,https://www.snopes.com/fact-check/biden-wear-h...,True
8,"""Nearly 2,000 high schools - roughly 12 percen...",U.S. Rep Bobby Scott cited a staggering statis...,https://www.politifact.com/factchecks/2011/may...,True
9,"""Our state has fewer science, technology, engi...",Is West Virginia trailing its neighbors in sci...,https://www.politifact.com/factchecks/2019/apr...,True


In [14]:
# Check how many articles failed to fetch.
no_article_df = sampled_factchecks_df[sampled_factchecks_df['article'].str.contains("Failed to fetch article content ", na=False)]
# len() counts rows; DataFrame.size would return rows * columns.
len(no_article_df)

0

In [21]:
# Persist the sampled fact-checks (without the index column) for the next step
sampled_factchecks_df.to_csv("Data/sample_factchecks.csv", index=False)

### Generating short justifications and connecting the verdict as in the European Fact Checking Project
Generate short justifications for the original verdict, based on the article and the given normalised rating (verdict).
Use GPT-5, often regarded as the best model for various tasks (note: the code cell below currently instantiates `llama-3.3-70b-versatile` through Groq):
- https://artificialanalysis.ai/leaderboards/models
- https://www.vellum.ai/llm-leaderboard?utm_source=google&utm_medium=organic
- https://www.shakudo.io/blog/top-9-large-language-models

In [3]:
import pandas as pd
from langchain_core.messages import SystemMessage, HumanMessage
from langchain_groq import ChatGroq
import tqdm as notebook_tqdm
from dotenv import load_dotenv

# Load environment variables from the local .env file, overriding any values already set
load_dotenv(dotenv_path=".env", override=True)

# Low temperature for more factual answers
llm = ChatGroq(model_name="llama-3.3-70b-versatile", temperature=0.2 )

# System prompt for the justification step: asks for one or two sentences that
# justify the given verdict using only the article text.
# NOTE(review): the prompt says "why the article is TRUE, FALSE, ..." - presumably
# the claim is meant; confirm before changing the wording.
# NOTE(review): the rules ask the answer to START with the verdict, while the
# examples END with "→ VERDICT" - confirm which format is intended.
SYS = """You are a careful fact-checking assistant.
Write ONE or TWO concise sentences (≤50 words) that justify the given VERDICT using only evidence in the ARTICLE.
No outside facts, speculation, or bullet lists, focus on why the article is TRUE, FALSE, MOSTLY TRUE, MOSTLY FALSE, or UNCHECKABLE.
Start with the verdict in capitals, start with "TRUE because", "FALSE because", "MOSTLY TRUE because", "MOSTLY FALSE because", or "UNCHECKABLE because".
If the Normalised rating is TRUE or FALSE, the verdict must be TRUE or FALSE.
If the Normalised rating is PARTIALLY TRUE or MISLEADING, the verdict must be MOSTLY TRUE or MOSTLY FALSE.
If the article does not provide enough information to justify the verdict, say so.

 Examples:
    “The Imperial College London study estimated Omicron's reinfection risk is 5.4 times higher than Delta's, indicating faster spread. 
    South Africa's data showed higher reinfection risk with Omicron. However, the evidence is preliminary and not peer-reviewed, 
    so it's mostly true.” → MOSTLY TRUE
    “The World Bank does not have a single 'climate fund', and Oxfam's Climate Finance Unchecked report (2017-2023) found no missing funds 
    but rather a lack of tracking for project components labeled 'climate finance' during approval. 
    The claim originated from a social media post by Brian Tamaki, which was debunked by AAP FactCheck using Oxfam's report data.
    [0] Charity report did not find World Bank climate change fund was missing $41 billion” → FALSE"""

def infer_verdict_and_expl(claim: str, article: str, normalised_rating: str):
    """Ask the LLM for a short justification of a verdict, based on the article.

    Args:
        claim: The fact-checked claim.
        article: Full text of the fact-check article.
        normalised_rating: The verdict label passed to the model as VERDICT.

    Returns:
        The model's answer with all runs of whitespace collapsed to single
        spaces, or None when there is no usable article (not a string, blank,
        or a "Failed to fetch article content" placeholder) or when the LLM
        call raises.
    """
    import logging  # local import so this cell does not depend on another import cell

    # Skip rows without usable article text: NaN after a CSV round-trip,
    # blank text, or the placeholder written when fetching failed.
    if (
        not isinstance(article, str)
        or not article.strip()
        or "Failed to fetch article content" in article
    ):
        return None
    msgs = [
        SystemMessage(content=SYS),
        HumanMessage(content=f'CLAIM: {claim}\nVERDICT: {normalised_rating}\n\nARTICLE:\n""" {article} """')
    ]
    try:
        resp = llm.invoke(msgs)
        text = getattr(resp, "content", str(resp)).strip()
        return " ".join(text.split())  # collapse whitespace
    except Exception as exc:
        # Best effort: keep processing the remaining rows, but leave a trace
        # of why this row has no explanation instead of failing silently.
        logging.getLogger(__name__).warning(
            "Justification request failed for claim %r: %s", claim, exc
        )
        return None

# Load the saved sample (needs the columns 'claim', 'article', 'normalised_rating')
factchecks_df = pd.read_csv("Data/sample_factchecks.csv")


def _explain_row(row):
    """Run infer_verdict_and_expl on the fields of a single DataFrame row."""
    return infer_verdict_and_expl(row["claim"], row["article"], row["normalised_rating"])


# One LLM call per row; the result is stored in a new 'short_explanation' column
factchecks_df.loc[:, "short_explanation"] = factchecks_df.apply(_explain_row, axis=1)

preview_columns = ["claim", "normalised_rating", "short_explanation"]
factchecks_df[preview_columns].head(10)

Unnamed: 0,claim,normalised_rating,short_explanation
0,A physical book detailing the contents of Hunt...,True,TRUE because the nonprofit group Marco Polo cr...
1,A video taken at a George Floyd protest in Den...,True,TRUE because the article confirms a video show...
2,"In September 2020, U.S. President Donald Trump...",True,"TRUE because Trump said at a rally, ""Maybe I'l..."
3,A photograph shows Air Force One during U.S. P...,True,FALSE because the photograph was taken in 2004...
4,Photographs show the results of a car vs,True,"TRUE because the article provides evidence, in..."
5,PlayStation and Xbox announced that refunds wo...,True,TRUE because Sony PlayStation and Microsoft Xb...
6,On-line coupon can be redeemed for a free smoo...,True,TRUE because a Jamba Juice Company Customer Se...
7,U.S. President Joe Biden wore a hard hat backw...,True,TRUE because the hard hat's suspension was con...
8,"""Nearly 2,000 high schools - roughly 12 percen...",True,TRUE because the claim is supported by a 2007 ...
9,"""Our state has fewer science, technology, engi...",True,TRUE because West Virginia conferred the small...


In [4]:
# Save the fact-checks together with their generated short explanations to a CSV file

factchecks_df.to_csv("Data/factchecks_with_verdicts.csv", index=False)

In [None]:
# Print the full article and its generated justification for one example row
example_row = 2
print(factchecks_df.loc[example_row, "article"])
print(factchecks_df.loc[example_row, "short_explanation"])

## Create JSONL messages for finetuning
Next, create messages containing a claim, a verdict, and an explanation, then add Socratic questions to encourage critical thinking and reflection.

In [2]:
from pathlib import Path
import pandas as pd
import re
import json
from langchain_core.messages import SystemMessage, HumanMessage
from langchain_groq import ChatGroq
import tqdm as notebook_tqdm
from dotenv import load_dotenv

# Load environment variables from the local .env file, overriding any values already set
load_dotenv(dotenv_path=".env", override=True)

# Low temperature for more factual answers
llm = ChatGroq(model_name="llama-3.3-70b-versatile", temperature=0.2 )

# System prompt for the Socratic-question step: the model receives a CLAIM and
# its SHORT_EXPLANATION and must return one JSON object with five questions.
# The "Viewpoints & Perspectives" category is written as a heading followed by
# bulleted questions, like every other category in the list.
SYS = """You are given a fact-check CLAIM and its justification as SHORT_EXPLANATION explaining why it is labeled as true, false, mostly true, 
mostly false, or uncheckable. Your task is to generate five Socratic questions that probe the justification and verdict. The goal is to 
challenge the reasoning, surface blind spots, and encourage deeper reflection, not to accept the explanation at face value. Since the output
 will be used to finetune an LLM that critiques the reasoning of a fact-checking model, ensure that your questions reflect the following principles:
- Factuality – Do the claims rely on verifiable evidence? Could missing or weak evidence be questioned?
- Objectivity – Is the reasoning neutral, or does it show bias? How could the framing be challenged?
- Fairness – Are multiple perspectives considered? Is the reasoning applied consistently?
- Transparency – Is the explanation clear about its sources and reasoning steps? What is hidden or assumed?
- Hallucinations – Does the explanation risk introducing unsupported or invented information?
- Strategies & Alternatives – Are there other ways to frame, investigate, or reason about the claim?

When writing questions, draw from the following categories of Socratic questioning. Use them as inspiration to diversify your five questions 
(do not stick to just one category):

Purpose – probe the aim or agenda.
- What is your purpose right now?
- Why are you writing this?
- What do you want to persuade them of?
- What is our central aim or task in this line of thought?

Questions – probe the underlying questions.
- I am not sure exactly what question you are raising. Could you explain it?
- Is this question the best one to focus on, or is there a more pressing one?
- What questions might we be failing to ask that we should be asking?

Information – probe the evidence or data.
- On what information are you basing that comment?
- How do we know this information is accurate? How could we verify it?
- Have we failed to consider any information or data we need to consider?

Inferences & Conclusions – probe how the conclusion was drawn.
- How did you reach that conclusion?
- Could you explain your reasoning?
- Is there an alternative plausible conclusion?

Concepts & Ideas – probe key ideas being applied.
- What is the main idea you are using in your reasoning?
- Are we using the appropriate concept, or do we need to reconceptualize the problem?
- Do we need more facts, or do we need to rethink how we are labeling the facts?

Assumptions – probe what is taken for granted.
- What exactly are you taking for granted here?
- Why are you assuming that? Shouldn’t we rather assume that…?
- What alternative assumptions might we make?

Implications & Consequences – probe what follows.
- What are you implying when you say…?
- If we do this, what is likely to happen as a result?
- Have you considered the implications of this reasoning?

Viewpoints & Perspectives – probe alternative frames.
- From what point of view are you looking at this?
- Is there another point of view we should consider?
- Which of these possible viewpoints makes the most sense given the situation?

Instructions:
- Do not repeat the justification.
- Do not state whether the verdict is correct.
- Ask probing questions that challenge the reasoning, highlight blind spots, and open space for reconsideration.
- Ensure the five questions you generate come from different categories where possible

Output format (JSONL):
{
  "claim": "the original claim",
  "short_explanation": "the original short explanation",
  "verdict": "The verdict as written in the explanation: true, false, mostly true, mostly false or uncheckable",
  "questions": [
    "What is our central aim or task in this line of thought?",
    "What is the underlying question that this explanation is really trying to address?",
    "How do we know this information is accurate, and how could we verify it?",
    "Is there an alternative plausible conclusion based on the same reasoning?",
    "Is there another point of view we should consider when evaluating this claim?"
  ]
}
"""

def add_questions(claim: str, short_explanation: str):
    """Ask the LLM for Socratic questions about a claim's short explanation.

    Args:
        claim: The fact-checked claim.
        short_explanation: The justification to be probed.

    Returns:
        The raw model answer collapsed onto a single line (all runs of
        whitespace replaced by one space), or None when there is no
        explanation to probe or when the LLM call raises.
    """
    import logging  # local import so this cell does not depend on another import cell

    # Rows without an explanation arrive as None, or as NaN after a CSV
    # round-trip; sending them on would make the model critique "nan".
    if not isinstance(short_explanation, str) or not short_explanation.strip():
        return None
    msgs = [
        SystemMessage(content=SYS),
        HumanMessage(content=f'CLAIM: {claim}\nSHORT_EXPLANATION: {short_explanation}')
    ]
    try:
        resp = llm.invoke(msgs)
        text = getattr(resp, "content", str(resp)).strip()
        one_line = " ".join(text.split())

        return one_line
    except Exception as exc:
        # Best effort: keep processing the remaining rows, but leave a trace
        # of the failure instead of dropping the row silently.
        logging.getLogger(__name__).warning(
            "Question generation failed for claim %r: %s", claim, exc
        )
        return None

def _parse_model_json(text):
    """Return the JSON value contained in ``text``, or None if none can be parsed.

    Tries the whole string first; if that fails, retries on the span from the
    first "{" to the last "}" so that answers wrapped in markdown fences or
    surrounded by extra prose are still usable.
    """
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        pass
    match = re.search(r"\{.*\}", text, flags=re.DOTALL)
    if match is None:
        return None
    try:
        return json.loads(match.group(0))
    except json.JSONDecodeError:
        return None


# --- Load the fact-checks with their precomputed short_explanation column ---
factchecks_df = pd.read_csv("Data/factchecks_with_verdicts.csv")

# --- Generate JSONL lines and write them to a single file ---
output_path = Path("Data/socratic_questions.jsonl")
output_path.parent.mkdir(parents=True, exist_ok=True)

valid_lines = []
skipped_rows = 0  # rows that produced no usable model output

for _, row in factchecks_df.iterrows():
    # A missing explanation is NaN after the CSV round-trip; there is nothing to probe.
    if not isinstance(row["short_explanation"], str):
        skipped_rows += 1
        continue

    line = add_questions(row["claim"], row["short_explanation"])
    obj = _parse_model_json(line) if line else None

    # Skip anything that is not a JSON object carrying a verdict and a list of
    # questions, instead of letting a KeyError abort the run before any
    # result has been written.
    if (
        not isinstance(obj, dict)
        or "verdict" not in obj
        or not isinstance(obj.get("questions"), list)
    ):
        skipped_rows += 1
        continue

    # Expand into one object per question; fall back to the source row when
    # the model did not echo the claim or explanation.
    for q in obj["questions"]:
        new_obj = {
            "claim": obj.get("claim", row["claim"]),
            "short_explanation": obj.get("short_explanation", row["short_explanation"]),
            "verdict": obj["verdict"],
            "question": q,
        }
        valid_lines.append(new_obj)

with output_path.open("w", encoding="utf-8") as f:
    for obj in valid_lines:
        f.write(json.dumps(obj, ensure_ascii=False) + "\n")

print(f"Wrote {len(valid_lines)} JSON objects to {output_path}")
print(f"Skipped {skipped_rows} of {len(factchecks_df)} rows without usable model output")

Wrote 4995 JSON objects to Data\socratic_questions.jsonl
