In [1]:
import sys
import os
from dataclasses import dataclass
from dotenv import load_dotenv
from openai import OpenAI
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading

sys.path.insert(0, "../src")

from extract_valid_studies import main
from models import ValidStudy, PrimaryOutcome

load_dotenv()

True

In [None]:
MODEL = "gpt-5-mini"
RAW_STUDIES_DIR = "../data/raw_studies"
STUDIES_PROCESSED_OUTPUT_DIR= "../data/studies_processed"
CARDS_OUTPUT_DIR = "../data/cards"
MAX_WORKERS = 100
NUM_CARDS = 100

In [3]:
from pydantic import BaseModel, Field, model_validator

class LLMResponse(BaseModel):
    question: str = Field(..., description="The final question in the specified format with the appropriate placeholders filled in verbatim with the other fields.")
    intervention_fragment: str = Field(..., description="The main intervention being tested, in layperson's terms. This should be directly pluggable into the question template.")
    intervention_group_fragment: str = Field(..., description="The purpose of the clinical trial, in layperson's terms. This should be directly pluggable into the question template.")
    outcome_fragment: str = Field(..., description="The primary outcome being measured, in layperson's terms. This should be directly pluggable into the question template.")
    comparator_group_fragment: str = Field(..., description="The comparator or control condition, in layperson's terms. This should be directly pluggable into the question template.")
    timeframe_fragment: str = Field(..., description="The timeframe of the outcome measurement, in layperson's terms. This should be directly pluggable into the question template.")
    intervention_group_description: str = Field(..., description="A brief description of the intervention group.")
    comparator_group_description: str = Field(..., description="A brief description of the comparator/control group.")

    @model_validator(mode="after")
    def ensure_question_valid(self):
        expected_question = f"Did {self.intervention_fragment} improve {self.outcome_fragment} in {self.intervention_group_fragment} compared to {self.comparator_group_fragment} after {self.timeframe_fragment}?"
        if self.question != expected_question:
            raise ValueError(f"Question does not match the expected format. Got: {self.question}, Expected: {expected_question}")
        return self


@dataclass
class ProcessingInformation:
    study: ValidStudy
    outcome_id: str
    llm_response: LLMResponse

    def to_dict(self):
        def recursive_asdict(obj):
            if isinstance(obj, list):
                return [recursive_asdict(item) for item in obj]
            elif isinstance(obj, dict):
                return {key: recursive_asdict(value) for key, value in obj.items()}
            elif hasattr(obj, "__dict__"):
                return {key: recursive_asdict(value) for key, value in obj.__dict__.items()}
            else:
                return obj
        return {
            "study": recursive_asdict(self.study),
            "outcome_id": self.outcome_id,
            "llm_response": self.llm_response.model_dump(),
        }

In [4]:
@dataclass
class UsageTracker:
    total_api_calls: int = 0
    total_input_tokens: int = 0
    total_output_tokens: int = 0


    def cost(self) -> float:
        c_i = {
            "gpt-5": 1.25,
            "gpt-5-mini": 0.25,
        }
        c_o = {
            "gpt-5": 10,
            "gpt-5-mini": 2,
        }
        input_cost = (self.total_input_tokens / 1_000_000) * c_i[MODEL]
        output_cost = (self.total_output_tokens / 1_000_000) * c_o[MODEL]

        return input_cost + output_cost

    def summary(self):
        print(f"Total API calls: {self.total_api_calls}")
        print(f"Total input tokens: {self.total_input_tokens}")
        print(f"Total output tokens: {self.total_output_tokens}")
        print(f"Estimated cost: ${self.cost():.4f}")

tracker = UsageTracker()

In [5]:
def mk_prompt(v: ValidStudy, o: PrimaryOutcome) -> str:
    groups_info = []
    for g in o.groups:
        groups_info.append("\n".join([
            f"Group title: {g.title}",
            f"Description: {g.description}",
            f"Interventions: {', '.join([f"{i.name}: {i.description}" for i in g.interventions]) if g.interventions else '(uncertain)'}",
        ]))


    return f"""
We are creating flashcard summaries for a game where laypeople predict the outcomes of clinical trials (behavioral interventive).

The final question for the flashcard must be of format:
"Did [intervention_fragment] improve [outcome_fragment] in [intervention_group_fragment] compared to [comparator_group_fragment] after [timeframe_fragment]?"

Keep the questions as short as possible. Use acronyms if needed, as long as they are understandable. If there is a name given to the intervention (e.g. "The Jolly Flower Telephone Protocol for Healthy Ageing"), instead of using the name, simply describe the intervention in layperson's terms (e.g. "calling other elderly people").

Remember to keep the question short. Do not include examples. Do not include any additional text or explanation.

Recall that the question should be reconstructable by verbatim plugging in the other fields.


Please ensure that the answers are concise and easily understandable by someone without a medical background. Avoid technical jargon and use simple language. Where something is technical, give a lay description and then in parentheses the technical term. 

Please create a question based on the following clinical trial information:
• Trial Title: {v.title}
• Trial Description: {v.description}
• Measure: {o.title}
• Measure Description: {o.description}
• Timeframe: {o.timeframe}

The groups are as follows (the first is the intervention group):
{'\n\n'.join(groups_info)}


If there is missing intervention or comparator information, please either match to these interventions (if you can tell from the group title/description), or say "Control" if it is a no-treatment or standard care control group, or "Unknown" if you cannot tell.
    """


In [6]:
valid_studies = main(RAW_STUDIES_DIR, NUM_CARDS)

  1%|          | 298/34562 [00:00<00:07, 4325.43it/s]

Loaded 50 raw studies with results
17 out of 50 (34.00%) studies have p-values reported in primary outcomes analyses.





In [7]:
def process_single_outcome(study: ValidStudy, o: PrimaryOutcome, tracker_lock: threading.Lock) -> ProcessingInformation:
    """Process a single study and return the result or raise an exception."""
    MAX_TRIES = 3
    success = False
    n_tries = 0

    client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
    while not success and n_tries < MAX_TRIES:
        n_tries += 1

        try:
            response = client.responses.parse(
                model=MODEL,
                input=[
                    {
                        "role": "system",
                        "content": "You are an expert clinical trial analyst."
                    },
                    {"role": "user", "content": mk_prompt(study, o)},
                ],
                text_format=LLMResponse,
            )

            # Track token usage (thread-safe)
            with tracker_lock:
                tracker.total_api_calls += 1
                tracker.total_input_tokens += response.usage.input_tokens
                tracker.total_output_tokens += response.usage.output_tokens

            # Parse response
            llm_response = response.output_parsed

            return ProcessingInformation(
                study=study,
                outcome_id=o.id,
                llm_response=llm_response
            )
        except Exception as e:
            # Re-raise with study info for better error handling
            if n_tries >= MAX_TRIES - 1:
                raise Exception(f"Failed processing {study.nct_id}: {str(e)}")


def process_studies_multithreaded(studies: list[ValidStudy], max_workers: int = MAX_WORKERS) -> tuple[list[ProcessingInformation], list[tuple[str, str]]]:
    """Process studies using multiple threads."""
    results = []
    failures = []
    tracker_lock = threading.Lock()
    
    # Use ThreadPoolExecutor for concurrent API calls
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Submit all tasks
        future_to_study = {
            executor.submit(process_single_outcome, study, study.primary_outcomes[0], tracker_lock): study
            for study in studies
        }
        
        # Process completed tasks with progress bar
        with tqdm(total=len(studies), desc="Processing studies") as pbar:
            for future in as_completed(future_to_study):
                study = future_to_study[future]
                try:
                    result = future.result()
                    results.append(result)
                except Exception as e:
                    failures.append((study.nct_id, str(e)))
                    pbar.set_postfix_str(f"Failures: {len(failures)}")
                
                pbar.update(1)
    
    return results, failures

In [10]:
# Process studies using multithreading (adjust max_workers based on API rate limits)
results, failures = process_studies_multithreaded(valid_studies[:NUM_CARDS], max_workers=MAX_WORKERS)

tracker.summary()

print(f"Successfully processed: {len(results)} studies")
print(f"Failed: {len(failures)} studies")
if failures:
    print("Failures:")
    for nct_id, error in failures[:5]:  # Show first 5 failures
        print(f"  {nct_id}: {error}")

Processing studies: 100%|██████████| 5/5 [00:46<00:00,  9.39s/it, Failures: 2]

Total API calls: 8
Total input tokens: 11163
Total output tokens: 11153
Estimated cost: $0.0251
Successfully processed: 3 studies
Failed: 2 studies
Failures:
  NCT01130103: Failed processing NCT01130103: 1 validation error for LLMResponse
  Value error, Question does not match the expected format. Got: Did paroxetine + prolonged exposure (PE) improve PTSD severity (CAPS score) in WTC attack survivors with PTSD compared to placebo + PE after 10 weeks?, Expected: Did paroxetine + prolonged exposure (PE) improve PTSD severity (CAPS score) in WTC attack survivors with PTSD compared to placebo pill + prolonged exposure (PE) after 10 weeks? [type=value_error, input_value={'question': 'Did paroxet... therapy for 10 weeks.'}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.11/v/value_error
  NCT05053360: Failed processing NCT05053360: 1 validation error for LLMResponse
  Value error, Question does not match the expected format. Got: Did a Healing Touch session 




In [13]:
import ujson as json
for result in results:
    with open(os.path.join(STUDIES_PROCESSED_OUTPUT_DIR, f"{result.study.nct_id}_{result.outcome_id}_llm_response.json"), "w") as f:
        json.dump(result.to_dict(), f, indent=4)

In [None]:
r = results[1]
o = next(o for o in r.study.primary_outcomes if o.id == r.outcome_id)
p_value = f"{o.p_value.comparator}{o.p_value.value}"
if p_value.startswith("="):
    p_value = p_value[1:]

success = (o.p_value.value < 0.05 and o.p_value.comparator != ">") or p_value == "<0.05"

card = {
    "study": {
        "nct_id": r.study.nct_id,
        "title": r.study.title,
    },
    "card_id": r.outcome_id,
    "front_details": {
        "question": r.llm_response.question,
        "intervention_fragment": r.llm_response.intervention_fragment,
        "intervention_group_fragment": r.llm_response.intervention_group_fragment,
        "outcome_fragment": r.llm_response.outcome_fragment,
        "comparator_group_fragment": r.llm_response.comparator_group_fragment,
        "timeframe_fragment": r.llm_response.timeframe_fragment,
    },
    "p_value": p_value,
    "num_participants": sum(int(g.num_participants) for g in o.groups),
    "success": success,
    "conditions": r.study.conditions,
    "keywords": r.study.keywords,
}
card

{'study': {'nct_id': 'NCT00802204',
  'title': 'Dopamine and Insulin Resistance'},
 'card_id': 'NCT00802204_po_0',
 'front_details': {'question': 'Did a very low-calorie diet (VLCD) improve striatal dopamine D2 receptor binding (DRD2 BP) in obese participants after the diet compared to the same obese participants at baseline after 8–10 days?',
  'intervention_fragment': 'a very low-calorie diet (VLCD)',
  'intervention_group_fragment': 'obese participants after the diet',
  'outcome_fragment': 'striatal dopamine D2 receptor binding (DRD2 BP)',
  'comparator_group_fragment': 'the same obese participants at baseline',
  'timeframe_fragment': '8–10 days'},
 'p_value': '<0.05',
 'num_participants': 33,
 'success': True,
 'conditions': ['Obesity'],
 'keywords': ['Obesity',
  'Insulin Resistance',
  'Neuroendocrine regulation',
  'Eating behaviors',
  'Dopamine signaling']}

In [None]:
from pprint import pprint
for r in results:
    o = next(o for o in r.study.primary_outcomes if o.id == r.outcome_id)
    succ = o.p_value.value < 0.05 and o.p_value.comparator is not ">"
    print(f"Study {r.study.nct_id} {r.study.title}:")
    pprint(f"  Question: {r.llm_response.question}")
    print(f"  Answer: {'YES' if succ else 'NO'}")
    print('-'*60)
    
o

Study NCT01860651 Web-based Monitoring in Children and Adolescents With Inflammatory Bowel Disease:
('  Question: Did web-based monitoring (symptoms + stool calprotectin) improve '
 'medication adherence (MARS score) in kids/teens with IBD on home meds '
 'compared to routine outpatient care (4 visits/year) after up to 2 years '
 '(quarterly checks)?')
  Answer: NO
------------------------------------------------------------
Study NCT01356277 Intervention to Improve Adherence in Teen Kidney Transplant:
('  Question: Did coach-led education, problem-solving with if-then action '
 'plans plus dose reminders improve medication-taking adherence (percent of '
 'days with 100% doses taken) in teen kidney transplant recipients compared to '
 'attention control (non-specific support without adherence help) after 12 '
 'months?')
  Answer: YES
------------------------------------------------------------
Study NCT00362453 Tai Chi Mind-Body Therapy for Knee Osteoarthritis:
('  Question: Did doing

  succ = o.p_value.value < 0.05 and o.p_value.comparator is not ">"


PrimaryOutcome(nct_id='NCT03978871', id='NCT03978871_po_0', title='Mean Change From Pre to Post Mindset Manipulation on Emotion Mindset Scale (Measures Beliefs About Whether Emotions Are Fixed or Malleable)', description='Participants completed a six-item fixed emotion mindset measure (EMS-fixed) measure, prior to (pre-induction EMS) and following the induction lessons (post-induction EMS). Items were drawn from the Implicit Theories of Emotion Scale (Tamir et al., 2007) and the Emotion Mindset Scale (EMS; Livingstone, 2013). Participants rated each item on a 6-point scale (1 = Strongly Disagree to 6 =Strongly Agree). Means were computed for these items, with higher scores representing a higher fixed emotion mindset mindset. For this analysis, we compared mean change scores across the two conditions (mindset and control). Minimum=1 Maximum=6 High= more fixed emotion mindset, worse outcome', population_description='', timeframe='Pre vs. Post Mindset Manipulation: baseline', groups=[Grou