In [None]:
# ---------------------------------------------
# ✅ 1. Install and Import
# ---------------------------------------------
!pip install transformers --quiet

from transformers import pipeline
import pandas as pd
import random
from tqdm import tqdm

# Load reasoning model
model_name = "MBZUAI/LaMini-Flan-T5-783M"
lamini_pipe = pipeline("text2text-generation", model=model_name)

Device set to use cpu
The model 'T5ForConditionalGeneration' is not supported for text-generation. Supported models are ['AriaTextForCausalLM', 'BambaForCausalLM', 'BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'Cohere2ForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'DiffLlamaForCausalLM', 'ElectraForCausalLM', 'Emu3ForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FalconMambaForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'Gemma2ForCausalLM', 'Gemma3ForConditionalGeneration', 'Gemma3ForCausalLM', 'GitForCausalLM', 'GlmForCausalLM', 'GotOcr2ForConditionalGeneration', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeo

In [None]:
# ---------------------------------------------
# ✅ 2. Define Topics for Claim Generation
# ---------------------------------------------
topics = [
    "vaccination", "heart disease", "hydration", "mental health", "gut health",
    "diabetes", "nutrition", "antibiotics", "fertility", "cancer prevention"
]


In [None]:
# ---------------------------------------------
# ✅ 3. Generate Claims for Each Topic
# ---------------------------------------------
def generate_claims(topic):
    prompt = f"Generate 3 healthcare claims about {topic}, some true and some false."
    output = lamini_pipe(prompt, max_length=128, do_sample=True, temperature=0.9)[0]["generated_text"]
    claims = [c.strip("•- ") for c in output.split("\n") if c.strip()]
    return claims

all_claims = []
for topic in tqdm(topics):
    all_claims.extend(generate_claims(topic))

print(f"Generated {len(all_claims)} total claims.")

100%|██████████| 10/10 [01:30<00:00,  9.05s/it]

Generated 10 total claims.





In [4]:
# ---------------------------------------------
# ✅ 4. Assign Credibility + Generate Explanations
# ---------------------------------------------
def assign_score():
    # Evenly sample across bins: 10%, 30%, 50%, 70%, 90%
    return random.choice([10, 30, 50, 70, 90])

synthetic_data = []

for claim in tqdm(all_claims):
    score = assign_score()
    reasoning_type = "accurate" if score > 70 else "misinformation"
    prompt = f"Claim: {claim}\nCredibility: {score}%\nExplain why this claim is likely {reasoning_type}."
    explanation = lamini_pipe(prompt, max_length=100, do_sample=False)[0]["generated_text"]

    synthetic_data.append({
        "claim": claim,
        "credibility": score,
        "explanation": explanation.strip()
    })

100%|██████████| 10/10 [01:18<00:00,  7.80s/it]


In [5]:
# ---------------------------------------------
# ✅ 5. Save Synthetic Data to CSV
# ---------------------------------------------
df = pd.DataFrame(synthetic_data)
df.to_csv("synthetic_claim_explanations.csv", index=False)

print("Saved synthetic_claim_explanations.csv with shape:", df.shape)
df.head()

Saved synthetic_claim_explanations.csv with shape: (10, 3)


Unnamed: 0,claim,credibility,explanation
0,True: Vaccination has been proven to save mill...,50,This claim is likely misinformation because va...
1,1. True: Heart disease is a common health issu...,50,This claim is likely misinformation because he...
2,1. True: Drinking enough water can improve ove...,90,The claim is likely accurate because it is a w...
3,1. Mental health is a complex issue that can b...,30,This claim is likely misinformation because it...
4,True: There is evidence to suggest that gut he...,30,This claim is likely misinformation because it...


In [2]:
# 📘 1. Install and Import Libraries

!pip install transformers tqdm --quiet

from transformers import pipeline
from tqdm import tqdm
import pandas as pd
import random
import re


In [18]:
# 📘 2. Load Reasoning Model
# Load LaMini-Flan reasoning model
model_name = "MBZUAI/LaMini-Flan-T5-248M"
reasoning_pipeline = pipeline("text2text-generation", model=model_name, device=-1)


Device set to use cpu


In [19]:
# 📘 3. Generate 100 Prompts
# Create 100 prompts for the model to expand on
base_prompts = [
    "The claim is that exercise helps prevent heart disease.",
    "The claim is that vaccines are effective at preventing illness.",
    "The claim is that meditation reduces anxiety.",
    "The claim is that regular sleep improves brain function.",
    "The claim is that sunscreen prevents skin cancer.",
    "The claim is that fiber helps with digestion.",
    "The claim is that drinking water supports kidney health.",
    "The claim is that low sodium intake benefits blood pressure.",
    "The claim is that probiotics support gut health.",
    "The claim is that dental hygiene impacts heart health.",
    # Repeat and randomly vary structure
]
# Pad out to 100 with variations
while len(base_prompts) < 100:
    health_topic = random.choice([
        "exercise", "hydration", "mental health", "nutrition",
        "disease prevention", "vaccination", "chronic illness", "cancer prevention"
    ])
    action = random.choice([
        "helps with", "is important for", "is linked to", "is known to reduce", "supports"
    ])
    outcome = random.choice([
        "heart health", "reduced stress", "stronger immunity", "lower cancer risk",
        "lower blood pressure", "improved sleep", "gut health"
    ])
    prompt = f"The claim is that {health_topic} {action} {outcome}."
    base_prompts.append(prompt)


In [20]:
# 📘 4. Generate Explanations
# Generate explanations
claims = []
explanations = []

print("🧠 Generating explanations for synthetic claims...")
for claim_text in tqdm(base_prompts):
    # Strip down to clean claim for CSV
    clean_claim = claim_text.replace("The claim is that ", "").strip().rstrip(".")

    # Make the prompt explicit for explanation
    prompt = f"Claim: {clean_claim}\nCredibility: 90%\nExplain why this claim is likely accurate."

    output = reasoning_pipeline(prompt, max_length=150, num_return_sequences=1)[0]["generated_text"]

    claims.append(clean_claim)
    explanations.append(output.strip())


🧠 Generating explanations for synthetic claims...


100%|██████████| 100/100 [04:12<00:00,  2.52s/it]


In [21]:
# 📘 5. Assign Scores and Clean Claim Text
# Assign credibility scores and clean prompts
clean_claims = [c.replace("The claim is that ", "").strip().rstrip(".") for c in claims]
scores = [random.randint(71, 95) for _ in range(len(clean_claims))]

In [22]:
# 📘 6. Combine into DataFrame
df = pd.DataFrame({
    "claim": clean_claims,
    "score": scores,
    "explanation": explanations
})


In [23]:
# 📘 7. Save to CSV
df.to_csv("synthetic_claim_explanations.csv", index=False)
print("✅ Saved 100+ synthetic claims to 'synthetic_claim_explanations.csv'")


✅ Saved 100+ synthetic claims to 'synthetic_claim_explanations.csv'
