In [1]:
import sys
import os
from dataclasses import dataclass
from dotenv import load_dotenv
from openai import OpenAI
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading

sys.path.insert(0, "../src")

from extract_valid_studies import main
from models import ValidStudy, PrimaryOutcome

load_dotenv()

True

In [2]:
MODEL = "gpt-5-mini"
RAW_STUDIES_DIR = "../data/raw_studies"
STUDIES_PROCESSED_OUTPUT_DIR= "../data/studies_processed"
CARDS_OUTPUT_DIR = "../data/cards"
MAX_WORKERS = 100
NUM_STUDIES = 750

In [3]:
from pydantic import BaseModel, Field, model_validator

class LLMResponse(BaseModel):
    question: str = Field(..., description="The final question in the specified format with the appropriate placeholders filled in verbatim with the other fields.")
    intervention_fragment: str = Field(..., description="The main intervention being tested, in layperson's terms. This should be directly pluggable into the question template.")
    intervention_group_fragment: str = Field(..., description="The purpose of the clinical trial, in layperson's terms. This should be directly pluggable into the question template.")
    outcome_fragment: str = Field(..., description="The primary outcome being measured, in layperson's terms. This should be directly pluggable into the question template.")
    comparator_group_fragment: str = Field(..., description="The comparator or control condition, in layperson's terms. This should be directly pluggable into the question template.")
    timeframe_fragment: str = Field(..., description="The timeframe of the outcome measurement, in layperson's terms. This should be directly pluggable into the question template.")
    intervention_group_description: str = Field(..., description="A brief description of the intervention group.")
    comparator_group_description: str = Field(..., description="A brief description of the comparator/control group.")
    understandability_score: int = Field(..., description="A 1-10 score representing the understandability of the question by a lay reader.")

    @model_validator(mode="after")
    def ensure_question_valid(self):
        expected_question = f"Did {self.intervention_fragment} improve {self.outcome_fragment} in {self.intervention_group_fragment} compared to {self.comparator_group_fragment} after {self.timeframe_fragment}?"
        if self.question != expected_question:
            raise ValueError(f"Question does not match the expected format. Got: {self.question}, Expected: {expected_question}")
        return self


@dataclass
class ProcessingInformation:
    study: ValidStudy
    outcome_id: str
    llm_response: LLMResponse

    def to_dict(self):
        def recursive_asdict(obj):
            if isinstance(obj, list):
                return [recursive_asdict(item) for item in obj]
            elif isinstance(obj, dict):
                return {key: recursive_asdict(value) for key, value in obj.items()}
            elif hasattr(obj, "__dict__"):
                return {key: recursive_asdict(value) for key, value in obj.__dict__.items()}
            else:
                return obj
        return {
            "study": recursive_asdict(self.study),
            "outcome_id": self.outcome_id,
            "llm_response": self.llm_response.model_dump(),
        }

In [4]:
@dataclass
class UsageTracker:
    total_api_calls: int = 0
    total_input_tokens: int = 0
    total_output_tokens: int = 0


    def cost(self) -> float:
        c_i = {
            "gpt-5": 1.25,
            "gpt-5-mini": 0.25,
        }
        c_o = {
            "gpt-5": 10,
            "gpt-5-mini": 2,
        }
        input_cost = (self.total_input_tokens / 1_000_000) * c_i[MODEL]
        output_cost = (self.total_output_tokens / 1_000_000) * c_o[MODEL]

        return input_cost + output_cost

    def summary(self):
        print(f"Total API calls: {self.total_api_calls}")
        print(f"Total input tokens: {self.total_input_tokens}")
        print(f"Total output tokens: {self.total_output_tokens}")
        print(f"Estimated cost: ${self.cost():.4f}")

tracker = UsageTracker()

In [5]:
def mk_prompt(v: ValidStudy, o: PrimaryOutcome) -> str:
    groups_info = []
    for g in o.groups:
        groups_info.append("\n".join([
            f"Group title: {g.title}",
            f"Description: {g.description}",
            f"Interventions: {', '.join([f"{i.name}: {i.description}" for i in g.interventions]) if g.interventions else '(uncertain)'}",
        ]))


    return f"""
We are creating flashcard summaries for a game where laypeople predict the outcomes of clinical trials (behavioral interventive).

The final question for the flashcard must be of format:
"Did [intervention_fragment] improve [outcome_fragment] in [intervention_group_fragment] compared to [comparator_group_fragment] after [timeframe_fragment]?"

Keep the questions as short as possible. Use acronyms if needed, as long as they are understandable. If there is a name given to the intervention (e.g. "The Jolly Flower Telephone Protocol for Healthy Ageing"), instead of using the name, simply describe the intervention in layperson's terms (e.g. "calling other elderly people").

Remember to keep the question short. Do not include examples. Do not include any additional text or explanation.

Recall that the question should be reconstructable by verbatim plugging in the other fields.


Please ensure that the answers are concise and easily understandable by someone without a medical background. Avoid technical jargon and use simple language. Where something is technical, give a lay description and then in parentheses the technical term. 

Please create a question based on the following clinical trial information:
• Trial Title: {v.title}
• Trial Description: {v.description}
• Measure: {o.title}
• Measure Description: {o.description}
• Timeframe: {o.timeframe}

The groups are as follows (the first is the intervention group):
{'\n\n'.join(groups_info)}


If there is missing intervention or comparator information, please either match to these interventions (if you can tell from the group title/description), or say "Control" if it is a no-treatment or standard care control group, or "Unknown" if you cannot tell.


Here are negative examples of questions:

EXAMPLE 1
Did using a web-based lung cancer screening decision aid improve decisional conflict (uncertainty about the screening decision) in Veterans who used the decision tool compared to Veterans given general prevention info (not about lung cancer) after 1 month?

Understandability score: 7/10 (confusing phrasing)

Better version:
Did using a web-based lung cancer screening decision aid reduce uncertainty in deciding whether to screen in Veterans who used the decision tool compared to Veterans given general prevention info (not about lung cancer) after 1 month?

Understandability score: 9/10


EXAMPLE 2
Did upregulating the left amygdala with real-time fMRI neurofeedback (thinking of positive memories) improve depressive symptoms (MADRS score) in MDD patients receiving left amygdala neurofeedback compared to MDD patients receiving HIPS (non-emotional region) neurofeedback after 2 weeks?

Understandability score: 3/10 (unclear what MDD is, what HIPS is)

Problems:
• Intervention seems to be "thinking of positive memories", the rest seems extraneous)
• MDD is not explained


EXAMPLE 3
Did wearing a UV dosimeter sticker and receiving daily personalized text messages based on sensor readings improve acceptability of wearing the UV sensor and receiving texts (system usability score, 6–42; higher better) in melanoma survivors in Cohort Study 1 Arm 1 (n=31) compared to melanoma survivors in Cohort Study 1 Arm 2 (n=29; daily texts + unstructured goal responses) after 21 days?


Understandability score: 5/10 (what is a dosimeter?)

Better version:
Did wearing a UV sensor sticker (UV dosimeter) and receiving daily personalized text messages improve acceptability of wearing a UV sensor and receiving related texts in melanoma survivors compared to melanoma survivors receiving daily texts + unstructured goal responses after 21 days?


EXAMPLE 4
Did family- and home-based behavioral support for ADHD (CASH‑AA) improve ADHD symptoms and related problems (delinquency, substance use, internalizing/externalizing symptoms) in adolescents with ADHD in the behavioral-only group compared to adolescents receiving the behavioral program plus medication integration (MIP) after one year?

Understandability score: 9/10

Better version (less non-core details):
Did family- and home-based behavioral support for ADHD (CASH‑AA) improve ADHD symptoms and related problems in adolescents with ADHD in the behavioral-only group compared to adolescents receiving the behavioral program plus medication integration (MIP) after one year?

Understandability score: 9/10 (no improvement, but more terse)


EXAMPLE 5
Did onsite collaborative care with a care manager (CC) improve treatment engagement (completed baseline and >2 OUD treatment visits) in pregnant and postpartum women with opioid use disorder compared to remote ECHO video mentorship (tele-support for providers) after 30 days from baseline?

Understandability score: 4/10 (what is ECHO, OUD?)

Problems:
• OUD is not explained
• ECHO is not explained


EXAMPLE 6
Did using an AF decision-support tool to recommend antithrombotic therapy improve discordant antithrombotic therapy (patients on treatment that disagreed with the tool's recommendation) in adults with non-valvular AF in primary care compared to educational intervention only (educational conference series) after one year?

Understandability score: 3/10 (what is AF?)

Problems:
• AF is not explained
• Antithrombotic is an unexplained technical term
• The explanation of "patients on treatment that disagreed with the tool's recommendation" is wordy and confusing.

---------------


Here are positive examples of questions:
EXAMPLE 1
Did practicing Tai Chi improve knee pain (WOMAC pain score) in people with knee osteoarthritis compared to wellness education and stretching after 12 weeks?

Understandability score: 10/10

Reason:
Concise, easy to understand, technical term in parenthesis (understanding is not gated by needing to know the technical term).


EXAMPLE 2
Did abstinence-only sex education improve abstinence (not having sex) in young African-American adolescents compared to health-promotion control after 24 months?

Understandability score: 10/10

Reason:
Easy to understand, clear treatment and control groups, short.


EXAMPLE 3
Did video counseling for quitting smoking improve smoking abstinence (verified by salivary cotinine) in women with HIV compared to women with HIV with telephone counseling instead after 6 months?

Understandability score: 10/10

Reason:
Good details, clear
    """


In [6]:
valid_studies = main(RAW_STUDIES_DIR)

100%|██████████| 34562/34562 [00:08<00:00, 3863.11it/s]


Loaded 5858 raw studies with results
2033 out of 5858 (34.70%) studies have p-values reported in primary outcomes analyses.


In [7]:
def process_single_outcome(study: ValidStudy, o: PrimaryOutcome, tracker_lock: threading.Lock) -> ProcessingInformation:
    """Process a single study and return the result or raise an exception."""
    MAX_TRIES = 3
    success = False
    n_tries = 0

    client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
    while not success and n_tries < MAX_TRIES:
        n_tries += 1

        try:
            response = client.responses.parse(
                model=MODEL,
                input=[
                    {
                        "role": "system",
                        "content": "You are an expert clinical trial analyst."
                    },
                    {"role": "user", "content": mk_prompt(study, o)},
                ],
                text_format=LLMResponse,
            )

            # Track token usage (thread-safe)
            with tracker_lock:
                tracker.total_api_calls += 1
                tracker.total_input_tokens += response.usage.input_tokens
                tracker.total_output_tokens += response.usage.output_tokens

            # Parse response
            llm_response = response.output_parsed

            return ProcessingInformation(
                study=study,
                outcome_id=o.id,
                llm_response=llm_response
            )
        except Exception as e:
            # Re-raise with study info for better error handling
            if n_tries >= MAX_TRIES - 1:
                raise Exception(f"Failed processing {study.nct_id}: {str(e)}")


def process_studies_multithreaded(studies: list[ValidStudy], max_workers: int = MAX_WORKERS) -> tuple[list[ProcessingInformation], list[tuple[str, str]]]:
    """Process studies using multiple threads."""
    results = []
    failures = []
    tracker_lock = threading.Lock()
    
    # Use ThreadPoolExecutor for concurrent API calls
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_study = {}
        # Submit all tasks
        n = 0
        for study in studies:
            for outcome in study.primary_outcomes:
                n += 1
                future_to_study[executor.submit(process_single_outcome, study, outcome, tracker_lock)] = study
       
        # Process completed tasks with progress bar
        with tqdm(total=n, desc="Processing study outcomes") as pbar:
            for future in as_completed(future_to_study):
                study = future_to_study[future]
                try:
                    result = future.result()
                    results.append(result)
                except Exception as e:
                    failures.append((study.nct_id, str(e)))
                    pbar.set_postfix_str(f"Failures: {len(failures)}")
                
                pbar.update(1)
    
    return results, failures

In [None]:
# Process studies using multithreading (adjust max_workers based on API rate limits)
results, failures = process_studies_multithreaded(valid_studies[:NUM_CARDS], max_workers=MAX_WORKERS)

tracker.summary()

print(f"Successfully processed: {len(results)} outcomes")
print(f"Failed: {len(failures)} outcomes")
if failures:
    print("Failures:")
    for nct_id, error in failures[:5]:  # Show first 5 failures
        print(f"  {nct_id}: {error}")

Processing study outcomes: 100%|██████████| 2606/2606 [14:12<00:00,  3.06it/s, Failures: 405]

Total API calls: 2201
Total input tokens: 5517189
Total output tokens: 3115576
Estimated cost: $7.6104
Successfully processed: 2201 studies
Failed: 405 studies
Failures:
  NCT03605368: Failed processing NCT03605368: 1 validation error for LLMResponse
  Value error, Question does not match the expected format. Got: Did wearing an immersive VR headset to practice police interactions improve police interaction skills (Police Interaction Assessment score) in adolescents and adults with autism compared to BE SAFE video-based instruction after 2 weeks?, Expected: Did wearing an immersive VR headset to practice police interactions improve police interaction skills (Police Interaction Assessment score) in adolescents and adults with autism compared to BE SAFE video-based instruction (video modeling) after 2 weeks? [type=value_error, input_value={'question': 'Did wearing...standability_score': 10}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.11/v/value_error




In [9]:
import ujson as json
for result in results:
    with open(os.path.join(STUDIES_PROCESSED_OUTPUT_DIR, f"{result.study.nct_id}_{result.outcome_id}_llm_response.json"), "w") as f:
        json.dump(result.to_dict(), f, indent=4)

In [10]:
def get_num_participants(groups) -> int:
    n = 0
    for g in groups:
        num = g.num_participants
        if isinstance(num, int):
            n += num
        elif isinstance(num, str):
            new_num = ''
            for c in num:
                if c.isdigit():
                    new_num += c
            n += int(new_num)

    return n

def mk_card(r: ProcessingInformation) -> dict:
    """Create a card dictionary from the processing information."""
    o = next(o for o in r.study.primary_outcomes if o.id == r.outcome_id)
    p_value = f"{o.p_value.comparator}{o.p_value.value}"
    if p_value.startswith("="):
        p_value = p_value[1:]

    success = (o.p_value.value < 0.05 and o.p_value.comparator != ">") or p_value == "<0.05"

    return {
        "study": {
            "nct_id": r.study.nct_id,
            "title": r.study.title,
            "brief_description": r.study.brief_description,
        },
        "card_id": r.outcome_id,
        "front_details": {
            "question": r.llm_response.question,
            "intervention_fragment": r.llm_response.intervention_fragment,
            "intervention_group_fragment": r.llm_response.intervention_group_fragment,
            "outcome_fragment": r.llm_response.outcome_fragment,
            "comparator_group_fragment": r.llm_response.comparator_group_fragment,
            "timeframe_fragment": r.llm_response.timeframe_fragment,
        },
        "p_value": p_value,
        "num_participants": get_num_participants(o.groups),
        "success": success,
        "conditions": r.study.conditions,
        "keywords": r.study.keywords,
        "decks": r.study.decks,
        "understandability_score": r.llm_response.understandability_score,
    }

cards = [mk_card(r) for r in results]
with open(os.path.join(CARDS_OUTPUT_DIR, "cards.json"), "w") as f:
    json.dump(cards, f, indent=4)

with open(os.path.join(CARDS_OUTPUT_DIR, "questions.txt"), "w") as f:
    for card in cards:
        f.write(card["front_details"]["question"] + "\n")

ValueError: invalid literal for int() with base 10: ''

In [None]:
from pprint import pprint
for r in results:
    o = next(o for o in r.study.primary_outcomes if o.id == r.outcome_id)
    succ = o.p_value.value < 0.05 and o.p_value.comparator is not ">"
    print(f"Study {r.study.nct_id} {r.study.title}:")
    pprint(f"  Question: {r.llm_response.question}")
    print(f"  Answer: {'YES' if succ else 'NO'}")
    print('-'*60)
    
o

Study NCT00362453 Tai Chi Mind-Body Therapy for Knee Osteoarthritis:
('  Question: Did practicing Tai Chi improve knee pain (WOMAC pain score) in '
 'people with knee osteoarthritis compared to wellness education and '
 'stretching after 12 weeks?')
  Answer: YES
------------------------------------------------------------
Study NCT03605368 VR Intervention to Improve Police Safety:
('  Question: Did remote at-home immersive VR police-safety training improve '
 'police interaction skills (change in Police Interaction Assessment score) in '
 'adolescents and adults with autism (ASD) compared to treatment-as-usual (no '
 'intervention) after ~2 weeks?')
  Answer: NO
------------------------------------------------------------
Study NCT01130103 Combination Treatment for Posttraumatic Stress Disorder (PTSD) After the World Trade Center (WTC) Attack:
('  Question: Did paroxetine (an SSRI) plus prolonged exposure therapy improve '
 'PTSD severity (Clinician-Administered PTSD Scale, CAPS) in s

PrimaryOutcome(nct_id='NCT03810534', id='NCT03810534_po_1', title="Preparedness for Caregiving Scale Score 7 Days After Patient's Skill Nursing Facility Discharge", description="The caregiver's preparedness for caregiving will be measured by the Preparedness for Caregiving Scale (PCS), which includes 8 items on a five-point Likert scale (0-4). The PCS measures self-reported readiness for caregiving. Range = 0-32, with higher scores associated with less anxiety.", population_description='', timeframe='7 Days After Patient SNF Discharge', groups=[Group(id='OG000', title='Connect-Home', description="Connect-Home intervention at the skilled nursing facility and at the subject's home. Participants will have data collected at 7, 30, and 60 days post SNF discharge.", num_participants='140', interventions=[Intervention(name='Connect-Home', type='BEHAVIORAL', description="Connect-Home will introduce organizational structure to support delivery of transitional care processes. New elements of str