## Create a sample set to generate a dataset for fine tuning.

First, load the FACTors data and drop every article that is linked to more than one claim.

In [1]:
import pandas as pd

# Read the raw FACTors dataset.
factors_df = pd.read_csv("Data/FACTors.csv")

# Count the rows per article_id, then separate the ids that appear more than
# once from the ids that appear exactly once.
article_counts = factors_df["article_id"].value_counts()
duplicate_article_ids = article_counts.loc[article_counts.gt(1)]
unique_article_ids = article_counts.loc[article_counts.eq(1)].index

# Keep only the rows whose article_id appears exactly once.
clean_factors_df = factors_df.loc[factors_df["article_id"].isin(unique_article_ids)]

# Report the effect of the filter.
print(
    f"Original rows: {len(factors_df)}",
    f"Articles with multiple claims: {len(duplicate_article_ids)}",
    f"Rows after removing duplicates: {len(clean_factors_df)}",
    sep="\n",
)

Original rows: 118112
Articles with multiple claims: 12
Rows after removing duplicates: 117981


## Build a dataset with claims and factchecked answers
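Retrieve first a sample of 3000 claims and fact-checked articles from the largest fact-checking organisations. TODO: the sample below is drawn uniformly at random with a fixed seed; it is not stratified, so the verdicts are not yet guaranteed to be divided equally.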

In [2]:
# Restrict the cleaned data to a subset of the largest fact-checking
# organisations, then draw a reproducible random sample (fixed random_state).
factors_sub_df = clean_factors_df.loc[
    clean_factors_df["organisation"].isin(
        ["PolitiFact", "AFP Fact Check", "Snopes", "WebQoof", "FactCheck.org"]
    )
]
factors_sample_df = factors_sub_df.sample(n=3000, random_state=12)


Keep only the columns needed to retrieve the full articles from the URL.

In [3]:
# Keep only the three columns used downstream and preview the first rows.
factors_sample_df = factors_sample_df.loc[:, ["claim", "date_published", "url"]]
factors_sample_df.head(10)

Unnamed: 0,claim,date_published,url
81368,"""Arizona officials caught changing ballots, ha...",2024-11-12T00:00:00,https://www.politifact.com/factchecks/2024/nov...
90708,The Yeti snow monster from Disneyland's iconic...,2020-11-30T11:45:24,https://www.snopes.com/fact-check/disney-yeti/
67021,"""I can tell you that the enhanced interrogatio...",2016-05-24T00:00:00,https://www.politifact.com/factchecks/2016/may...
10591,Nigerian election tribunal witness goes on the...,2023-07-10T11:38:00,https://factcheck.afp.com/doc.afp.com.33NE7Y8
75871,"""We essentially repealed Obamacare because we ...",2017-12-21T00:00:00,https://www.politifact.com/factchecks/2017/dec...
71583,"President Obama plans to ""impose a tax of at l...",2011-11-22T00:00:00,https://www.politifact.com/factchecks/2011/nov...
75185,"""Almost half a million people are still eligib...",2016-08-22T00:00:00,https://www.politifact.com/factchecks/2016/aug...
91346,Two 'racist' Black teenagers shot and killed a...,2020-07-07T08:28:45,https://www.snopes.com/fact-check/thugs-shoot-...
71546,"Says Barack Obama had ""huge majorities"" in Con...",2011-12-04T00:00:00,https://www.politifact.com/factchecks/2011/dec...
91280,"Walter ""Blackie"" Wetzel, a former leader of th...",2020-07-16T14:47:57,https://www.snopes.com/fact-check/walter-wetze...


## Generate a summary and questions
Retrieve information and create a summary, as done in the original workflow of the assistant, for these 3000 claims.

GPT-5 and GPT-OSS 120B were compared. Since GPT-OSS was much faster (53 seconds versus 24.5 minutes) and produced good results, it was chosen.
- https://artificialanalysis.ai/models/gpt-oss-120b/providers

In [4]:
# Prompt template for the claim-only information step.
# Placeholders filled with str.format: {claim} and {date_published}.
# Literal JSON braces in the examples are escaped as {{ }}.
# The example alerts only list missing or problematic items, in line with the
# rule in step 6 that information which is present must not be reported as an alert.
get_information_prompt = """
### Role
You are a neutral, guiding assistant that helps students through the fact-checking process step by step.
In this step you are tasked with extracting detailed information about a claim to determine its checkability.

### Claim
{claim}

The claim has already been fact-checked and the outcome was published on this date:
### Date published
{date_published}

### Steps
1. Identify the subject.
2. Determine if the claim is *quantitative*.
3. Assess precision: "precise", "vague", or "absolute (100%)".
4. Identify what the claim is *based on* (e.g., "survey …", "official statistics").
5. Identify the geography and time period mentioned in the claim, if provided. You may assume that the date_published occurs shortly after the claim was made.
6. Identify *alerts/warnings*: unclear subject, qualitative claim, vague quantitative claim, geography missing, time period missing, methodological details absent.
Don't mention an alert when the information is present.
7. Summarize concisely what is currently known about the claim.
   - Include: the information found in the first 5 steps such as subject, type (quantitative/qualitative), precision, basis, and uncertainties.
   - Mention any active alerts or missing information.

Keep your tone neutral and analytical.

### Output Format
Return a single JSON object with exactly these fields:

- "alerts": array of strings. Each alert as a short string; use [] if none.
- "summary": string. A concise summary of the claim and its checkability status.

The response must be valid JSON and contain **only** this JSON object, with no extra text before or after it.

### Examples
Example A (qualitative):
{{
  "alerts": ["qualitative claim", "methodological details absent"],
  "summary": "A qualitative claim about a specific legal event; methodology implied but not fully detailed."
}}

Example B (quantitative but vague):
{{
  "alerts": ["vague quantitative claim", "time period missing", "source/methodology missing"],
  "summary": "A quantitative claim lacking precision and methodological details; several key elements are missing for checkability."
}}
"""

In [5]:
# Prompt template for the article-backed information step.
# Placeholders filled with str.format: {claim}, {date_published} and {article_text}.
# Literal JSON braces in the examples are escaped as {{ }}.
# The example alerts only list missing or problematic items, in line with the
# rule in step 6 that information which is present must not be reported as an alert.
get_information_prompt_url = """
### Role
You are a neutral, guiding assistant that helps students through the fact-checking process step by step.
In this step you are tasked with extracting detailed information about a claim to determine its checkability.

### Claim
{claim}

The claim has already been fact-checked, below the article and date published:
### Fact-checked article
Date published: {date_published}

<Article>
{article_text}
</Article>

### Steps
1. Identify the subject.
2. Determine if the claim is *quantitative*.
3. Assess precision: "precise", "vague", or "absolute (100%)".
4. Identify what the claim is *based on* (e.g., "survey …", "official statistics").
5. Identify the geography and time period mentioned in the claim, if provided. You may assume that the date_published occurs shortly after the claim was made.
6. Identify *alerts/warnings*: unclear subject, qualitative claim, vague quantitative claim, geography missing, time period missing, methodological details absent.
Don't mention an alert when the information is present.
7. Summarize concisely what is currently known about the claim, take into account the information in the *Article*.
   - Include: the information found in the first 5 steps such as subject, type (quantitative/qualitative), precision, basis, and uncertainties.
   - Check the content of the *Article* for missing information, but don't mention a *verdict* (e.g. True or False) in the summary.
   - Mention any active alerts or missing information, that could not be found in *Article*.

Keep your tone neutral and analytical.

### Output Format
Return a single JSON object with exactly these fields:

- "alerts": array of strings. Each alert as a short string; use [] if none.
- "summary": string. A concise summary of the claim and its checkability status.

The response must be valid JSON and contain **only** this JSON object, with no extra text before or after it.

### Examples
Example A (qualitative):
{{
  "alerts": ["qualitative claim", "methodological details absent"],
  "summary": "A qualitative claim about a specific legal event; methodology implied but not fully detailed."
}}

Example B (quantitative but vague):
{{
  "alerts": ["vague quantitative claim", "time period missing", "source/methodology missing"],
  "summary": "A quantitative claim lacking precision and methodological details; several key elements are missing for checkability."
}}
"""

In [6]:
# Prompt template for generating Socratic questions about a claim summary.
# Placeholders filled with str.format: {claim}, {summary} and {alerts}
# ({claim} and {summary} are repeated in the output-format illustration).
# The requested number of questions is 4 throughout the template.
socratic_questions_prompt = """You are given a fact-check claim and a summary of what is known about this claim.
Your task is to generate 4 Socratic questions that probe the summary up until now. The goal is to challenge the reasoning,
surface blind spots, and encourage deeper reflection, not to accept the summary at face value.

### Claim
{claim}

### Summary
{summary}

<Alerts>
{alerts}
</Alerts>


Since the output will be used to finetune an LLM that critiques the reasoning of a fact-checking model, ensure that your questions reflect the following principles:
- Factuality – Do the claims rely on verifiable evidence? Could missing or weak evidence be questioned?
- Objectivity – Is the reasoning neutral, or does it show bias? How could the framing be challenged?
- Fairness – Are multiple perspectives considered? Is the reasoning applied consistently?
- Transparency – Is the summary clear about its sources and reasoning steps? What is hidden or assumed?
- Hallucinations – Does the summary risk introducing unsupported or invented information?
- Strategies & Alternatives – Are there other ways to frame, investigate, or reason about the claim?

When writing questions, draw from the following categories of Socratic questioning. Use them as inspiration to diversify your 4 questions
(do not stick to just one category):

Purpose – probe the aim or agenda.
- What is your purpose right now?
- Why are you writing this?
- What do you want to persuade them of?
- What is our central aim or task in this line of thought?

Questions – probe the underlying questions.
- I am not sure exactly what question you are raising. Could you explain it?
- Is this question the best one to focus on, or is there a more pressing one?
- What questions might we be failing to ask that we should be asking?

Information – probe the evidence or data.
- On what information are you basing that comment?
- How do we know this information is accurate? How could we verify it?
- Have we failed to consider any information or data we need to consider?

Inferences & Conclusions – probe how the conclusion was drawn.
- How did you reach that conclusion?
- Could you explain your reasoning?
- Is there an alternative plausible conclusion?

Concepts & Ideas – probe key ideas being applied.
- What is the main idea you are using in your reasoning?
- Are we using the appropriate concept, or do we need to reconceptualize the problem?
- Do we need more facts, or do we need to rethink how we are labeling the facts?

Assumptions – probe what is taken for granted.
- What exactly are you taking for granted here?
- Why are you assuming that? Shouldn’t we rather assume that…?
- What alternative assumptions might we make?

Implications & Consequences – probe what follows.
- What are you implying when you say…?
- If we do this, what is likely to happen as a result?
- Have you considered the implications of this reasoning?

Viewpoints & Perspectives – probe alternative frames.
- From what point of view are you looking at this?
- Is there another point of view we should consider?
- Which of these possible viewpoints makes the most sense given the situation?

Instructions:
- Do not repeat the justification.
- Do not state whether the verdict is correct.
- Ask probing questions that challenge the reasoning, highlight blind spots, and open space for reconsideration.
- Ensure the four questions you generate come from different categories where possible

Output format (JSONL):
{{
  "claim": {claim},
  "summary": {summary},
  "questions": [
    "...4 questions..."
  ]
}}
"""

In [7]:
import pandas as pd
from langchain_core.messages import SystemMessage, HumanMessage
import tqdm as notebook_tqdm
from dotenv import load_dotenv
from pydantic import BaseModel, Field
from langchain_openai import ChatOpenAI
from langchain_groq import ChatGroq
from typing_extensions import List

# Load settings from the local ".env" file; override=True lets values from the
# file replace variables that are already set in the environment.
load_dotenv(dotenv_path=".env", override=True)

class MoreInfoResult(BaseModel):
    # Structured result of the information-gathering prompts.

    # Alerts or warnings raised about the claim; defaults to an empty list.
    alerts: List[str] = Field(default=[], description="Any alerts or warnings about the claim")
    # Concise summary of the claim; defaults to an empty string.
    summary: str = Field(default="", description="A concise summary of the claim")

class SocraticQuestionsResult(BaseModel):
    # Structured result of the Socratic-question prompt.

    # The claim the questions refer to.
    claim: str
    # The summary the questions probe.
    summary: str
    # The generated questions. The prompt asks for 4 and the JSONL builder
    # splits them 2 + 2, so the description says four (it previously said five).
    questions: List[str] = Field([], description="Four socratic questions")

# Chat model used by the helper functions below: GPT-OSS 120B through ChatGroq.
# A low temperature is used for more factual answers; tool_choice "none" is
# forwarded through model_kwargs.
llmGPTOSS = ChatGroq(model_name="openai/gpt-oss-120b", model_kwargs={"tool_choice": "none"}, temperature=0.1)
# Alternative model used for the side-by-side comparison (disabled).
#llmGPT5 = ChatOpenAI(model="gpt-5", temperature=0.1)

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
import json
from langchain_community.document_loaders import WebBaseLoader

def retrieve_info(claim: str, date_published: str) -> dict:
    """Summarise a claim and list its checkability alerts from the claim text alone.

    Args:
        claim: Text of the claim.
        date_published: Publication date of the fact-check, inserted into the prompt.

    Returns:
        The parsed ``MoreInfoResult`` converted to a plain dict
        (keys "alerts" and "summary").
    """
    # Fill the claim-only prompt template.
    filled_prompt = get_information_prompt.format(
        claim=claim,
        date_published=date_published,
    )

    # JSON-mode structured output parses the reply into a MoreInfoResult.
    # (GPT-5 alternative: llmGPT5.with_structured_output(MoreInfoResult, method="json_mode"))
    info_model = llmGPTOSS.with_structured_output(MoreInfoResult, method="json_mode")
    parsed = info_model.invoke(filled_prompt)

    # Hand back a plain dict rather than the Pydantic object.
    return parsed.model_dump()


def retrieve_info_url(claim: str, date_published: str, url: str) -> dict:
    """Summarise a claim and list its alerts, using the fact-check article as context.

    The page at ``url`` is loaded with ``WebBaseLoader`` and the text of the
    first loaded document is inserted into the prompt as the article.

    Args:
        claim: Text of the claim.
        date_published: Publication date of the fact-check, inserted into the prompt.
        url: Address of the fact-check article to load.

    Returns:
        The parsed ``MoreInfoResult`` converted to a plain dict
        (keys "alerts" and "summary").
    """
    # Fetch the article and keep the text of the first document.
    article_text = WebBaseLoader(url).load()[0].page_content

    # JSON-mode structured output parses the reply into a MoreInfoResult.
    # (GPT-5 alternative: llmGPT5.with_structured_output(MoreInfoResult, method="json_mode"))
    info_model = llmGPTOSS.with_structured_output(MoreInfoResult, method="json_mode")

    # Fill the article-backed prompt template.
    filled_prompt = get_information_prompt_url.format(
        claim=claim,
        date_published=date_published,
        article_text=article_text,
    )

    # Hand back a plain dict rather than the Pydantic object.
    return info_model.invoke(filled_prompt).model_dump()


def generate_socratic_questions(claim: str, summary: str, alerts: List) -> SocraticQuestionsResult:
    """Ask the LLM for Socratic questions that probe a claim summary.

    Args:
        claim: Text of the claim.
        summary: Summary of what is known about the claim.
        alerts: Alerts raised for the claim; inserted into the prompt as given.

    Returns:
        The parsed ``SocraticQuestionsResult`` (claim, summary and the list of
        questions requested by ``socratic_questions_prompt``).
    """
    # Fill the Socratic-question prompt template.
    filled_prompt = socratic_questions_prompt.format(
        claim=claim,
        summary=summary,
        alerts=alerts,
    )

    # JSON-mode structured output parses the reply into a SocraticQuestionsResult.
    # (GPT-5 alternative: llmGPT5.with_structured_output(SocraticQuestionsResult, method="json_mode"))
    question_model = llmGPTOSS.with_structured_output(SocraticQuestionsResult, method="json_mode")

    return question_model.invoke(filled_prompt)

def retrieve_info_and_socratic_jsonl(claim: str, date_published: str, url: str) -> list[str]:
    """Build the JSONL training lines for one claim.

    Two summaries are produced (claim only, then claim plus article), and
    Socratic questions are generated for each. For every summary the first two
    questions each become one JSON line, and the remaining questions are
    attached to both lines as "history".

    Args:
        claim: Text of the claim.
        date_published: Publication date of the fact-check.
        url: Address of the fact-check article.

    Returns:
        JSON-encoded strings, claim-only lines first, article-backed lines after.
    """
    # Both summaries are requested before any questions are generated.
    summaries = (
        (retrieve_info(claim, date_published), False),           # claim only
        (retrieve_info_url(claim, date_published, url), True),   # claim + article
    )

    # One question set per summary, in the same order.
    question_sets = [
        generate_socratic_questions(
            claim=claim,
            summary=info.get("summary"),
            alerts=info.get("alerts"),
        )
        for info, _ in summaries
    ]

    jsonl_lines = []
    for (info, url_used), socratic in zip(summaries, question_sets):
        history = socratic.questions[2:]         # everything after the first two
        for question in socratic.questions[:2]:  # first two -> one line each
            record = {
                "claim": claim,
                "summary": info.get("summary"),
                "alerts": info.get("alerts"),
                "url_used": url_used,
                "question": question,
                "history": history,
            }
            jsonl_lines.append(json.dumps(record, ensure_ascii=False))

    return jsonl_lines


def claims_to_jsonl_file(claims, dates, urls, output_path: str, skip_errors: bool = False):
    """Generate the JSONL lines for every claim and write them to a file.

    The output file is opened in write mode, so an existing file is replaced.
    Lines are flushed after each claim so finished work reaches the file while
    the (long-running) loop is still in progress.

    Args:
        claims: Iterable of claim texts.
        dates: Iterable of publication dates, aligned with ``claims``.
        urls: Iterable of article URLs, aligned with ``claims``.
        output_path: Destination of the JSONL file.
        skip_errors: When False (default) any exception raised while processing
            a claim propagates, as before. When True the failing claim is
            reported on stdout and skipped, and processing continues with the
            next claim.

    Raises:
        Exception: Whatever ``retrieve_info_and_socratic_jsonl`` raises, unless
            ``skip_errors`` is True.
    """
    with open(output_path, "w", encoding="utf-8") as f:
        for claim, date_published, url in zip(claims, dates, urls):
            try:
                lines = retrieve_info_and_socratic_jsonl(claim, date_published, url)
            except Exception as exc:
                if not skip_errors:
                    raise
                # Explicit, opt-in best effort: report the failure and move on.
                print(f"Skipping claim after {type(exc).__name__}: {exc} | claim: {claim!r}")
                continue

            f.writelines(line + "\n" for line in lines)
            # Push this claim's lines to the file right away.
            f.flush()

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [None]:
from pathlib import Path

# Destination JSONL file; create its folder when it does not exist yet.
output_path = Path("Data") / "socratic_questions_GPTOSS3000.jsonl"
output_path.parent.mkdir(parents=True, exist_ok=True)

# Generate the lines for every sampled claim and write them to the file.
claims_to_jsonl_file(
    claims=factors_sample_df["claim"],
    dates=factors_sample_df["date_published"],
    urls=factors_sample_df["url"],
    output_path=output_path,
)


## Compare data for 10 claims

First, the summaries for GPT-5 and GPT-OSS 120B are compared side by side. In the next step, ChatGPT was asked to analyse the Excel file and add a column with the differences. The differences were minimal. Since GPT-OSS 120B is about 25 times faster and about 10 times cheaper, it was chosen as the model to generate the synthetic dataset for fine-tuning.

In [None]:
# Disabled comparison script, kept for reference. It is wrapped in a bare
# triple-quoted string, so nothing inside it executes; the cell only echoes
# the string. When enabled, the script reads the GPT-OSS and GPT-5 JSONL
# files, aligns their summaries on claim + url_used, and writes the two
# summary columns to an Excel file.
"""
import json
import pandas as pd

# paths to your uploaded jsonl files
path_gptoss = "Data/socratic_questions_GPTOSS.jsonl"
path_gpt5   = "Data/socratic_questions_GPT5.jsonl"

def read_jsonl(path):
    rows = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                rows.append(json.loads(line))
            except json.JSONDecodeError:
                # skip bad lines (or collect them if you want)
                continue
    return rows

# load
rows_oss = read_jsonl(path_gptoss)
rows_5   = read_jsonl(path_gpt5)

# to DataFrames, keep only needed cols, remove dupes
df_oss = pd.DataFrame(rows_oss)[["claim", "url_used", "summary"]].drop_duplicates()
df_5   = pd.DataFrame(rows_5)[["claim", "url_used", "summary"]].drop_duplicates()

# align by claim + url_used
merged = pd.merge(
    df_oss, df_5,
    on=["claim", "url_used"],
    how="inner",
    suffixes=("_GPTOSS", "_GPT5")
).drop_duplicates()

# keep ONLY the two summary columns
summaries_only = merged[["summary_GPTOSS", "summary_GPT5"]]

# view
print(summaries_only.head())

# save to Excel
excel_path = "Data/summaries_only_comparison.xlsx"
summaries_only.to_excel(excel_path, index=False)

print("Saved to:", excel_path)
"""

'\nimport json\nimport pandas as pd\n\n# paths to your uploaded jsonl files\npath_gptoss = "Data/socratic_questions_GPTOSS.jsonl"\npath_gpt5   = "Data/socratic_questions_GPT5.jsonl"\n\ndef read_jsonl(path):\n    rows = []\n    with open(path, "r", encoding="utf-8") as f:\n        for line in f:\n            line = line.strip()\n            if not line:\n                continue\n            try:\n                rows.append(json.loads(line))\n            except json.JSONDecodeError:\n                # skip bad lines (or collect them if you want)\n                continue\n    return rows\n\n# load\nrows_oss = read_jsonl(path_gptoss)\nrows_5   = read_jsonl(path_gpt5)\n\n# to DataFrames, keep only needed cols, remove dupes\ndf_oss = pd.DataFrame(rows_oss)[["claim", "url_used", "summary"]].drop_duplicates()\ndf_5   = pd.DataFrame(rows_5)[["claim", "url_used", "summary"]].drop_duplicates()\n\n# align by claim + url_used\nmerged = pd.merge(\n    df_oss, df_5,\n    on=["claim", "url_used"],\

Below is a comparison of the questions.

In [None]:
# Disabled comparison script, kept for reference. It is wrapped in a bare
# triple-quoted string, so nothing inside it executes; the cell only echoes
# the string. When enabled, the script reads the GPT-OSS and GPT-5 JSONL
# files, numbers the questions within each claim + url_used group, merges the
# two sets side by side, and writes the two question columns to an Excel file.
"""
import json
import pandas as pd

path_gptoss = "Data/socratic_questions_GPTOSS.jsonl"
path_gpt5   = "Data/socratic_questions_GPT5.jsonl"

def read_jsonl(path):
    rows = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                rows.append(json.loads(line))
            except json.JSONDecodeError:
                continue
    return rows

def add_q_index(rows):

    df = pd.DataFrame(rows)

    # make sure url_used exists
    if "url_used" not in df.columns:
        df["url_used"] = False

    # stable order inside each group
    df["_orig_order"] = range(len(df))
    df = df.sort_values(["claim", "url_used", "_orig_order"], kind="stable")
    df["q_idx"] = df.groupby(["claim", "url_used"]).cumcount() + 1

    return df.drop(columns=["_orig_order"])

# load
rows_oss = read_jsonl(path_gptoss)
rows_5   = read_jsonl(path_gpt5)

# add q_idx for alignment
df_oss = add_q_index(rows_oss)
df_5   = add_q_index(rows_5)

# keep needed cols + dedupe
df_oss_q = df_oss[["claim", "url_used", "q_idx", "question"]].drop_duplicates()
df_5_q   = df_5[["claim", "url_used", "q_idx", "question"]].drop_duplicates()

# merge side-by-side
merged_q = pd.merge(
    df_oss_q, df_5_q,
    on=["claim", "url_used", "q_idx"],
    how="outer",
    suffixes=("_GPTOSS", "_GPT5")
).sort_values(["claim", "url_used", "q_idx"], kind="stable")

# if you ONLY want two columns:
questions_only = merged_q[["question_GPTOSS", "question_GPT5"]]

print(questions_only.head(10))

# optional: save to Excel
out_path = "Data/questions_side_by_side.xlsx"
questions_only.to_excel(out_path, index=False)
print("Saved to:", out_path)
"""


'\nimport json\nimport pandas as pd\n\npath_gptoss = "Data/socratic_questions_GPTOSS.jsonl"\npath_gpt5   = "Data/socratic_questions_GPT5.jsonl"\n\ndef read_jsonl(path):\n    rows = []\n    with open(path, "r", encoding="utf-8") as f:\n        for line in f:\n            line = line.strip()\n            if not line:\n                continue\n            try:\n                rows.append(json.loads(line))\n            except json.JSONDecodeError:\n                continue\n    return rows\n\ndef add_q_index(rows):\n\n    df = pd.DataFrame(rows)\n\n    # make sure url_used exists\n    if "url_used" not in df.columns:\n        df["url_used"] = False\n\n    # stable order inside each group\n    df["_orig_order"] = range(len(df))\n    df = df.sort_values(["claim", "url_used", "_orig_order"], kind="stable")\n    df["q_idx"] = df.groupby(["claim", "url_used"]).cumcount() + 1\n\n    return df.drop(columns=["_orig_order"])\n\n# load\nrows_oss = read_jsonl(path_gptoss)\nrows_5   = read_jsonl(pat