In [None]:
# ---------------------------------------------
# ✅ 1. Install and Import
# ---------------------------------------------
!pip install transformers --quiet

from transformers import pipeline
import pandas as pd
import random
from tqdm import tqdm

# Load reasoning model
model_name = "MBZUAI/LaMini-Flan-T5-783M"
lamini_pipe = pipeline("text2text-generation", model=model_name)

Device set to use cpu
The model 'T5ForConditionalGeneration' is not supported for text-generation. Supported models are ['AriaTextForCausalLM', 'BambaForCausalLM', 'BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'Cohere2ForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'DiffLlamaForCausalLM', 'ElectraForCausalLM', 'Emu3ForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FalconMambaForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'Gemma2ForCausalLM', 'Gemma3ForConditionalGeneration', 'Gemma3ForCausalLM', 'GitForCausalLM', 'GlmForCausalLM', 'GotOcr2ForConditionalGeneration', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeo

We install and import the required libraries as our first step

In [None]:
# ---------------------------------------------
# ✅ 2. Define Topics for Claim Generation
# ---------------------------------------------
topics = [
    "vaccination", "heart disease", "hydration", "mental health", "gut health",
    "diabetes", "nutrition", "antibiotics", "fertility", "cancer prevention"
]


Then we defined a list of topics that will be used to generate healthcare claims.

1. **Topic List**:
   - The `topics` list contains common healthcare topics, such as `"vaccination"`, `"heart disease"`, `"hydration"`, and others. These topics are intended to serve as the basis for generating healthcare-related claims.

2. **Purpose**:
   - These topics will be used in further steps to generate claims for testing or training the reasoning model. Each claim will be related to one of these topics, and the model will generate explanations based on the input claim and its credibility score.


In [None]:
# ---------------------------------------------
# ✅ 3. Generate Claims for Each Topic
# ---------------------------------------------
def generate_claims(topic):
    prompt = f"Generate 3 healthcare claims about {topic}, some true and some false."
    output = lamini_pipe(prompt, max_length=128, do_sample=True, temperature=0.9)[0]["generated_text"]
    claims = [c.strip("•- ") for c in output.split("\n") if c.strip()]
    return claims

all_claims = []
for topic in tqdm(topics):
    all_claims.extend(generate_claims(topic))

print(f"Generated {len(all_claims)} total claims.")

100%|██████████| 10/10 [01:30<00:00,  9.05s/it]

Generated 10 total claims.





After defining the list, we generated healthcare claims for each topic by prompting the reasoning model to produce a mix of true and false claims. The `generate_claims` function generates three claims per topic, which are cleaned and stored in a list. The code loops through a predefined set of topics, collecting all generated claims in one list and printing the total count of claims created.


In [4]:
# ---------------------------------------------
# ✅ 4. Assign Credibility + Generate Explanations
# ---------------------------------------------
def assign_score():
    # Evenly sample across bins: 10%, 30%, 50%, 70%, 90%
    return random.choice([10, 30, 50, 70, 90])

synthetic_data = []

for claim in tqdm(all_claims):
    score = assign_score()
    reasoning_type = "accurate" if score > 70 else "misinformation"
    prompt = f"Claim: {claim}\nCredibility: {score}%\nExplain why this claim is likely {reasoning_type}."
    explanation = lamini_pipe(prompt, max_length=100, do_sample=False)[0]["generated_text"]

    synthetic_data.append({
        "claim": claim,
        "credibility": score,
        "explanation": explanation.strip()
    })

100%|██████████| 10/10 [01:18<00:00,  7.80s/it]


Our next step is assigning a synthetic credibility score to each claim and generates an explanation based on the score.

1. **Assigning Credibility**:
   - The `assign_score` function randomly selects a credibility score from predefined values: 10%, 30%, 50%, 70%, or 90%. This simulates different levels of credibility for the claims.

2. **Generating Explanations**:
   - For each claim, a `reasoning_type` is determined based on the score (accurate if the score is above 70%, misinformation otherwise). A prompt is created for the reasoning model to generate an explanation for why the claim is considered accurate or misinformation.

3. **Collecting Synthetic Data**:
   - The generated claims, their assigned credibility scores, and the corresponding explanations are stored in a list called `synthetic_data`.

This process allows for the creation of a dataset with claims, credibility scores, and explanations for training or evaluation.


In [5]:
# ---------------------------------------------
# ✅ 5. Save Synthetic Data to CSV
# ---------------------------------------------
df = pd.DataFrame(synthetic_data)
df.to_csv("synthetic_claim_explanations.csv", index=False)

print("Saved synthetic_claim_explanations.csv with shape:", df.shape)
df.head()

Saved synthetic_claim_explanations.csv with shape: (10, 3)


Unnamed: 0,claim,credibility,explanation
0,True: Vaccination has been proven to save mill...,50,This claim is likely misinformation because va...
1,1. True: Heart disease is a common health issu...,50,This claim is likely misinformation because he...
2,1. True: Drinking enough water can improve ove...,90,The claim is likely accurate because it is a w...
3,1. Mental health is a complex issue that can b...,30,This claim is likely misinformation because it...
4,True: There is evidence to suggest that gut he...,30,This claim is likely misinformation because it...


In [2]:
# 📘 1. Install and Import Libraries

!pip install transformers tqdm --quiet

from transformers import pipeline
from tqdm import tqdm
import pandas as pd
import random
import re


In [18]:
# 📘 2. Load Reasoning Model
# Load LaMini-Flan reasoning model
model_name = "MBZUAI/LaMini-Flan-T5-248M"
reasoning_pipeline = pipeline("text2text-generation", model=model_name, device=-1)


Device set to use cpu


In [19]:
# 📘 3. Generate 100 Prompts
# Create 100 prompts for the model to expand on
base_prompts = [
    "The claim is that exercise helps prevent heart disease.",
    "The claim is that vaccines are effective at preventing illness.",
    "The claim is that meditation reduces anxiety.",
    "The claim is that regular sleep improves brain function.",
    "The claim is that sunscreen prevents skin cancer.",
    "The claim is that fiber helps with digestion.",
    "The claim is that drinking water supports kidney health.",
    "The claim is that low sodium intake benefits blood pressure.",
    "The claim is that probiotics support gut health.",
    "The claim is that dental hygiene impacts heart health.",
    # Repeat and randomly vary structure
]
# Pad out to 100 with variations
while len(base_prompts) < 100:
    health_topic = random.choice([
        "exercise", "hydration", "mental health", "nutrition",
        "disease prevention", "vaccination", "chronic illness", "cancer prevention"
    ])
    action = random.choice([
        "helps with", "is important for", "is linked to", "is known to reduce", "supports"
    ])
    outcome = random.choice([
        "heart health", "reduced stress", "stronger immunity", "lower cancer risk",
        "lower blood pressure", "improved sleep", "gut health"
    ])
    prompt = f"The claim is that {health_topic} {action} {outcome}."
    base_prompts.append(prompt)


After installing and importing the required packages and libraries, we load the model and we generated a list of 100 prompts for the reasoning model to expand upon.

1. **Initial Set of Base Prompts**:
   - A list of 10 predefined health-related claims is created, each representing a claim about the effects of various health practices (e.g., exercise, vaccines, meditation, etc.).

2. **Expanding the List to 100 Prompts**:
   - The `while` loop continues to generate new prompts until the total count reaches 100. For each new prompt, random health topics, actions, and outcomes are chosen from predefined lists to vary the structure of the claims.

3. **Result**:
   - The final list `base_prompts` contains 100 unique prompts that can be used to generate explanations and assess the reasoning model's performance.


In [20]:
# 📘 4. Generate Explanations
# Generate explanations
claims = []
explanations = []

print("🧠 Generating explanations for synthetic claims...")
for claim_text in tqdm(base_prompts):
    # Strip down to clean claim for CSV
    clean_claim = claim_text.replace("The claim is that ", "").strip().rstrip(".")

    # Make the prompt explicit for explanation
    prompt = f"Claim: {clean_claim}\nCredibility: 90%\nExplain why this claim is likely accurate."

    output = reasoning_pipeline(prompt, max_length=150, num_return_sequences=1)[0]["generated_text"]

    claims.append(clean_claim)
    explanations.append(output.strip())


🧠 Generating explanations for synthetic claims...


100%|██████████| 100/100 [04:12<00:00,  2.52s/it]


The next step is to generate explanations for a set of synthetic claims using the reasoning model.

1. **Generating Explanations**:
   - A loop iterates through each claim in `base_prompts`, cleaning the claim by removing the prefix "The claim is that" and stripping any trailing punctuation.

2. **Creating Prompts**:
   - For each cleaned claim, a prompt is created with a fixed credibility of 90% and the instruction to explain why the claim is likely accurate.

3. **Model Inference**:
   - The `reasoning_pipeline` is used to generate an explanation for each claim. The explanation is returned as the model's output and is appended to the `explanations` list.

4. **Storing Results**:
   - Both the cleaned claims and their corresponding explanations are stored in separate lists, `claims` and `explanations`, which can be used for further evaluation or saving to a file.


In [21]:
# 📘 5. Assign Scores and Clean Claim Text
# Assign credibility scores and clean prompts
clean_claims = [c.replace("The claim is that ", "").strip().rstrip(".") for c in claims]
scores = [random.randint(71, 95) for _ in range(len(clean_claims))]

Then we clean the claim texts by removing the prefix "The claim is that" and trimming any trailing punctuation. It also assigns a random credibility score between 71% and 95% to each claim. The cleaned claims and their associated scores are stored in the `clean_claims` and `scores` lists, respectively. These scores simulate varying levels of credibility for each claim.


In [22]:
# 📘 6. Combine into DataFrame
df = pd.DataFrame({
    "claim": clean_claims,
    "score": scores,
    "explanation": explanations
})


Combined the cleaned claims, assigned scores, and generated explanations into a single DataFrame for easy access and analysis.

In [23]:
# 📘 7. Save to CSV
df.to_csv("synthetic_claim_explanations.csv", index=False)
print("✅ Saved 100+ synthetic claims to 'synthetic_claim_explanations.csv'")


✅ Saved 100+ synthetic claims to 'synthetic_claim_explanations.csv'
