In [10]:
import pandas as pd
import ujson as json
import os


from glob import glob
from tqdm import tqdm

In [11]:
RAW_STUDIES_DIR = "../raw_studies"

In [12]:
paths = glob(os.path.join(RAW_STUDIES_DIR, "*.json"))
len(paths)

50000

In [22]:
raw_studies = []

for path in tqdm(paths):
    with open(path, "r") as f:
        s = json.load(f)["api_response"]
        if s["hasResults"]:
            raw_studies.append(s)

print(f"Loaded {len(raw_studies)} raw studies with results")

100%|██████████| 50000/50000 [00:10<00:00, 4649.83it/s]

Loaded 5374 raw studies with results





In [None]:
sample = raw_studies[1]
sample["resultsSection"].keys()

id_mod = sample["protocolSection"]["identificationModule"]
print("Chosen sample study:", id_mod["briefTitle"])
print("https://clinicaltrials.gov/study/" + id_mod["nctId"])


outcomes = sample["resultsSection"]["outcomeMeasuresModule"]["outcomeMeasures"]
primary_outcomes_results = [o for o in outcomes if o["type"] == "PRIMARY"]

Chosen sample study: Stepping Stones and Creating Futures Intervention Trial
https://clinicaltrials.gov/study/NCT03022370


[{'groupIds': ['OG000', 'OG002'],
  'nonInferiorityType': 'SUPERIORITY',
  'pValue': '=0.686',
  'statisticalMethod': 'Regression, Logistic',
  'paramType': 'Odds Ratio (OR)',
  'paramValue': '0.92',
  'ciPctValue': '95',
  'ciNumSides': 'TWO_SIDED',
  'ciLowerLimit': '0.62',
  'ciUpperLimit': '1.37'},
 {'groupIds': ['OG001', 'OG003'],
  'nonInferiorityType': 'SUPERIORITY',
  'pValue': '=0.032',
  'statisticalMethod': 'Regression, Logistic',
  'paramType': 'Odds Ratio (OR)',
  'paramValue': '0.71',
  'ciPctValue': '95',
  'ciNumSides': 'TWO_SIDED',
  'ciLowerLimit': '0.51',
  'ciUpperLimit': '0.97'}]

In [None]:
# descriptions

d = sample["protocolSection"]["descriptionModule"]
print(d.keys())
d

dict_keys(['briefSummary', 'detailedDescription'])


{'briefSummary': 'The investigators propose a single-blind randomized clinical trial to determine if seniors show improved mobility (walking speed) and cognition following motor imagery (imagined walking) training. They hypothesize that imagined walking can be used as a rehabilitative tool for improving walking speed and cognition in the elderly, because it engages and strengthens similar neural systems as actual walking and cognition.',
 'detailedDescription': 'The proposed research aims to establish the efficacy of an imagined gait protocol for improving gait and cognition in the elderly. This imagined gait protocol involves imagined gait in single (imagined walking; iW) and dual-task (imagined walking while talking; iWWT) situations. A single-blind randomized clinical trial of 58 cognitively-healthy elderly with pre-post measures of gait, cognition, and functional Magnetic Resonance Imaging (fMRI) during imagined gait is proposed. The overall hypothesis is that imagined gait can be 

In [None]:
# intended measures

d = sample["protocolSection"]["outcomesModule"]
d.keys()
d["primaryOutcomes"]

[{'measure': 'Change in Walking Speed During Single and Dual-task Walking Conditions',
  'description': 'Change in gait speed (centimeters per second) measured during normal pace walking and walking while talking conditions using an instrumented pathway. A positive mean value is indicative of an improvement (increase) in gait speed post-intervention, whereas a negative mean value is indicative of a decrease in gait speed post-intervention.',
  'timeFrame': 'Baseline and 3 months'}]

In [None]:
# actual results

outcomes = sample["resultsSection"]["outcomeMeasuresModule"]["outcomeMeasures"]
primary_outcomes_results = [o for o in outcomes if o["type"] == "PRIMARY"]
primary_outcomes_results[0]

{'type': 'PRIMARY',
 'title': 'Change in Walking Speed During Single and Dual-task Walking Conditions',
 'description': 'Change in gait speed (centimeters per second) measured during normal pace walking and walking while talking conditions using an instrumented pathway. A positive mean value is indicative of an improvement (increase) in gait speed post-intervention, whereas a negative mean value is indicative of a decrease in gait speed post-intervention.',
 'populationDescription': 'Overall number of participants analyzed includes all randomized participants with baseline measures. Data was unable to be analyzed for 14 participants in the Visual Imagery group due to a protocol deviation (incorrect intervention procedure).',
 'reportingStatus': 'POSTED',
 'paramType': 'MEAN',
 'dispersionType': 'Standard Deviation',
 'unitOfMeasure': 'centimeters per second',
 'timeFrame': 'Baseline and 3 months',
 'groups': [{'id': 'OG000',
   'title': 'Imagined Gait Intervention',
   'description': '

In [67]:
for s in raw_studies[:3]:
    id_mod = s["protocolSection"]["identificationModule"]
    print("Study:", id_mod["briefTitle"])
    print("https://clinicaltrials.gov/study/" + id_mod["nctId"])
    outcomes = s["resultsSection"]["outcomeMeasuresModule"]["outcomeMeasures"]
    primary_outcomes_results = [o for o in outcomes if o["type"] == "PRIMARY"]
    if len(primary_outcomes_results) == 0:
        print("No primary outcomes results")
        continue
    for i, o in enumerate(primary_outcomes_results):
        print(f"Primary outcome result {i+1}:")
        print("  Title:", o["title"])
        if "description" in o and o["description"]:
            print("  Description:", o["description"])
        if "timeFrame" in o and o["timeFrame"]:
            print("  Time frame:", o["timeFrame"])
        if "resultGroups" in o:
            for rg in o["resultGroups"]:
                print("  Result group:", rg.get("groupTitle", "N/A"))
                if "measures" in rg:
                    for m in rg["measures"]:
                        measure_title = m.get("measureTitle", "N/A")
                        value = m.get("value", "N/A")
                        units = m.get("units", "")
                        if units:
                            print(f"    Measure: {measure_title}, Value: {value} {units}")
                        else:
                            print(f"    Measure: {measure_title}, Value: {value}")
        print()
    print("-" * 40)


Study: Motor Imagery Intervention for Improving Gait and Cognition in the Elderly
https://clinicaltrials.gov/study/NCT02762604
Primary outcome result 1:
  Title: Change in Walking Speed During Single and Dual-task Walking Conditions
  Description: Change in gait speed (centimeters per second) measured during normal pace walking and walking while talking conditions using an instrumented pathway. A positive mean value is indicative of an improvement (increase) in gait speed post-intervention, whereas a negative mean value is indicative of a decrease in gait speed post-intervention.
  Time frame: Baseline and 3 months

----------------------------------------
Study: Stepping Stones and Creating Futures Intervention Trial
https://clinicaltrials.gov/study/NCT03022370
Primary outcome result 1:
  Title: Any Past Year Physical Intimate Partner Violence Perpetration (Men) and Experience (Women)
  Description: Physical intimate partner violence is assessed using five items based on the WHO VAW s

In [101]:
raw_studies_p = []
for s in raw_studies:
    has_p_value = False

    outcomes = s["resultsSection"]["outcomeMeasuresModule"]["outcomeMeasures"]
    primary_outcomes_results = [o for o in outcomes if o["type"] == "PRIMARY"]
    for o in primary_outcomes_results:
        if has_p_value:
            break

        if "analyses" in o:
            for analysis in o["analyses"]:
                if "pValue" in analysis:
                    # print(f"Study: {id_mod['nctId']}, Primary Outcome: {o['title']}, p-value: {analysis['pValue']}")
                    has_p_value = True
                    raw_studies_p.append(s)
                    break
frac_p_val = len(raw_studies_p) / len(raw_studies)
print(f"{len(raw_studies_p)} out of {len(raw_studies)} ({frac_p_val*100:<.2f}%) studies have p-values reported in primary outcomes analyses.")

1801 out of 5374 (33.51%) studies have p-values reported in primary outcomes analyses.


In [104]:
from dataclasses import dataclass

ParsedPValue = tuple[str | None, float | None] # (comparison, value)
def parse_p_value(p: str) -> ParsedPValue:
    """
    Parse a p-value string and return a tuple of (comparison, value).


    Args:
        p (str): A p-value string that may contain comparison operators like '<', '>', '=' 
                 or just a numeric value
                 
    Returns:
        tuple: A tuple containing (comparison_operator, numeric_value) where:
               - comparison_operator is one of '<', '>', '=' or None if parsing failed
               - numeric_value is the float value or None if parsing failed
               
    Examples:
        parse_p_value('<0.05') -> ('<', 0.05)
        parse_p_value('0.032') -> ('=', 0.032)
        parse_p_value('>0.1') -> ('>', 0.1)
        parse_p_value('invalid') -> (None, None)
    """

    # remove whitespace that sometimes occurs (e.g., "< 0.05" --> "<0.05")
    p = ''.join(c for c in p if c != ' ')
    if p.startswith("<"):
        try:
            val = float(p[1:].strip())
            return ("<", val)
        except ValueError:
            return (None, None)
    elif p.startswith(">"):
        try:
            val = float(p[1:].strip())
            return (">", val)
        except ValueError:
            return (None, None)
    elif p.startswith("="):
        try:
            val = float(p[1:].strip())
            return ("=", val)
        except ValueError:
            return (None, None)
    else:
        try:
            val = float(p)
            return ("=", val)
        except ValueError:
            return (None, None)

parsed_p_values = [parse_p_value(p) for p in p_values]


@dataclass
class PValueStatistics:
    n_0_05: int # num p < 0.05
    n_0_01: int # num p < 0.01
    n_0_001: int # num p < 0.001

    f_0_05: float # frac p < 0.05
    f_0_01: float # frac p < 0.01
    f_0_001: float # frac p < 0.001

    n_success: int # num p < 0.05
    n_fail: int # num p >= 0.05

    f_success: float # frac p < 0.05
    f_fail: float # frac p >= 0.05

    n_total: int


def compute_p_value_statistics(parsed_p_values: list[ParsedPValue]):
    """
    Compute statistics from a list of parsed p-values.
    """

    n_total = len(parsed_p_values)
    n_0_05 = sum(1 for cmp, val in parsed_p_values if cmp in ("=", "<") and val is not None and val < 0.05)
    n_0_01 = sum(1 for cmp, val in parsed_p_values if cmp in ("=", "<") and val is not None and val < 0.01)
    n_0_001 = sum(1 for cmp, val in parsed_p_values if cmp in ("=", "<") and val is not None and val < 0.001)

    n_success = n_0_05
    n_fail = n_total - n_success

    f_0_05 = n_0_05 / n_total if n_total > 0 else 0
    f_0_01 = n_0_01 / n_total if n_total > 0 else 0
    f_0_001 = n_0_001 / n_total if n_total > 0 else 0

    f_success = f_0_05
    f_fail = 1 - f_success

    return PValueStatistics(
        n_0_05=n_0_05,
        n_0_01=n_0_01,
        n_0_001=n_0_001,

        f_0_05=f_0_05,
        f_0_01=f_0_01,
        f_0_001=f_0_001,

        n_success=n_success,
        n_fail=n_fail,

        f_success=f_success,
        f_fail=f_fail,

        n_total=n_total
    )

In [None]:
for s in raw_studies_p[:3]:
    id_mod = s["protocolSection"]["identificationModule"]
    outcomes = s["resultsSection"]["outcomeMeasuresModule"]["outcomeMeasures"]
    primary_outcomes_results = [o for o in outcomes if o["type"] == "PRIMARY"]
    print("Study:", id_mod["briefTitle"])
    print("https://clinicaltrials.gov/study/" + id_mod["nctId"])
    if len(primary_outcomes_results) == 0:
        print("No primary outcomes results")
        continue
    for i, o in enumerate(primary_outcomes_results):
        # note: g["description"] is also useful!

        print(f"Primary outcome result {i+1}:")
        print("  Title:", o["title"])
        if "description" in o and o["description"]:
            print("  Description:", o["description"])
        if "timeFrame" in o and o["timeFrame"]:
            print("  Time frame:", o["timeFrame"])


        if "analyses" in o:
            for analysis in o["analyses"]:
                group_id_to_title = {g["id"]: g["title"] for g in o["groups"]}
                group_id_to_count = {}
                for denom in o["denoms"]:
                    if denom["units"].lower() != "participants":
                         continue
                    for c in denom["counts"]:
                        group_id_to_count[c["groupId"]] = c["value"]
                   
                if "pValue" in analysis:
                    groups = [f"{group_id_to_title[gid]} ({group_id_to_count[gid]})" for gid in analysis["groupIds"]]
                    print(f"    Analysis comparing groups: {', '.join(groups)}")
                    print(f"      p-value: {analysis['pValue']}")
                    print(f"      type: {analysis['nonInferiorityType']}")
       
    print("-" * 40)


Study: Stepping Stones and Creating Futures Intervention Trial
https://clinicaltrials.gov/study/NCT03022370
Primary outcome result 1:
  Title: Any Past Year Physical Intimate Partner Violence Perpetration (Men) and Experience (Women)
  Description: Physical intimate partner violence is assessed using five items based on the WHO VAW scale. A positive response to any item leads to a person being classified as perpetrating (men) and experiencing (women) in the past year. With 0=none, 1=yes.
  Time frame: 24 months post baseline
    Analysis comparing groups: Women - Intervention (260), Women - Control (285)
      p-value: =0.686
      type: SUPERIORITY
    Analysis comparing groups: Men - Intervention (237), Men - Control (268)
      p-value: =0.032
      type: SUPERIORITY
Primary outcome result 2:
  Title: Any Past Year Sexual Intimate Partner Violence Perpetration (Men) and Experience (Women)
  Description: Sexual intimate partner violence is assessed using three items based on the WHO 

In [None]:
# Only 1 group for p-value test


xs = []
groupses = []
for s in raw_studies_p[:300]:
    id_mod = s["protocolSection"]["identificationModule"]
    outcomes = s["resultsSection"]["outcomeMeasuresModule"]["outcomeMeasures"]
    primary_outcomes_results = [o for o in outcomes if o["type"] == "PRIMARY"]
    # print("Study:", id_mod["briefTitle"])
    # print("https://clinicaltrials.gov/study/" + id_mod["nctId"])
    if len(primary_outcomes_results) == 0:
        print("No primary outcomes results")
        continue
    for i, o in enumerate(primary_outcomes_results):

        if "analyses" in o:
            for analysis in o["analyses"]:
                if "pValue" in analysis:
                    group_id_to_title = {g["id"]: g["title"] for g in o["groups"]}
                    groups = [group_id_to_title[gid] for gid in analysis["groupIds"]]
                    groupses.append(tuple(groups))
                    if len(groups) == 1:
                        print("Study:", id_mod["briefTitle"])
                        print(groups)
                        print(f"    Groups: {', '.join(groups)}")
                        print(f"      p-value: {analysis['pValue']}")
                        print(f"      type: {analysis['nonInferiorityType']}")
                        xs.append((s, o))

Study: Emergency Department Healthcare Education Assessment and Response for Teen Relationships: A Pilot Feasibility Study
['ED-HEART (Intervention Arm)']
    Analysis comparing groups: ED-HEART (Intervention Arm)
      p-value: .622
      type: OTHER
Study: Emergency Department Healthcare Education Assessment and Response for Teen Relationships: A Pilot Feasibility Study
['ED-HEART (Intervention Arm)']
    Analysis comparing groups: ED-HEART (Intervention Arm)
      p-value: .038
      type: OTHER
Study: Peer Intervention to Link Overdose Survivors to Treatment (PILOT)
['Treatment as Usual (TAU) Arm']
    Analysis comparing groups: Treatment as Usual (TAU) Arm
      p-value: 0.399
      type: SUPERIORITY
Study: Multisite RCT of STEP-Home: A Transdiagnostic Skill-based Community Reintegration Workshop
['STEP-Home']
    Analysis comparing groups: STEP-Home
      p-value: <.0001
      type: SUPERIORITY
Study: Multisite RCT of STEP-Home: A Transdiagnostic Skill-based Community Reintegrati

In [158]:
s, o = xs[1]
o

{'type': 'PRIMARY',
 'title': 'Participant Acceptability Ratings for ED-based Relationship Intervention in ED-Heart Intervention Arm',
 'description': '1 survey item (investigator-developed) on acceptability of emergency department based healthy/unhealthy relationships intervention, rated using 5-point Likert scale (strongly disagree/1 to strongly agree/5)',
 'populationDescription': 'Intervention participants answering follow up survey acceptability of ED-based adolescent relationship intervention',
 'reportingStatus': 'POSTED',
 'paramType': 'COUNT_OF_PARTICIPANTS',
 'unitOfMeasure': 'Participants',
 'timeFrame': '12 weeks',
 'groups': [{'id': 'OG000',
   'title': 'ED-HEART (Intervention Arm)',
   'description': 'All adolescents take a baseline survey in the Emergency Department (ED), receive Emergency Department Healthcare Education Assessment and Response for Teen Relationships (ED HEART) by a trained health educator, complete an exit survey while in the ED, complete a 6-week check

In [121]:
from collections import Counter
ni_types = []

for s in raw_studies_p:
    # id_mod = s["protocolSection"]["identificationModule"]
    # outcomes = s["resultsSection"]["outcomeMeasuresModule"]["outcomeMeasures"]
    # primary_outcomes_results = [o for o in outcomes if o["type"] == "PRIMARY"]
    # print("Study:", id_mod["briefTitle"])
    # print("https://clinicaltrials.gov/study/" + id_mod["nctId"])
    # if len(primary_outcomes_results) == 0:
    #     print("No primary outcomes results")
    #     continue
    for i, o in enumerate(primary_outcomes_results):
        # print(f"Primary outcome result {i+1}:")
        # print("  Title:", o["title"])
        # if "description" in o and o["description"]:
        #     print("  Description:", o["description"])
        # if "timeFrame" in o and o["timeFrame"]:
        #     print("  Time frame:", o["timeFrame"])
        if "analyses" in o:
            for analysis in o["analyses"]:
                if "pValue" in analysis:
                    # print(f"    p-value: {analysis['pValue']}")
                    # print(f"    p-value: {analysis['pValue']}")
                    ni_types.append(analysis["nonInferiorityType"])
       
    # print("-" * 40)

Counter(ni_types)

Counter({'SUPERIORITY': 9005, 'OTHER': 3602})

In [116]:
for group in s["resultsSection"]["participantFlowModule"]["groups"]:
    print(group["title"])
    display(group)
    

All Groups


{'id': 'FG000',
 'title': 'All Groups',
 'description': 'Since participants\' data are only analyzed if they completed all assignments, and the order of assignment was immaterial to the trial, as it was a basic experiment studying humans, there was no reason to separate out the participants based on the order of interventions. Hence, the milestones listed below, other than "Completed," do not represent the actual sequential order for all participants.'}

In [159]:
primary_outcomes_results[0]['groups']
# primary_outcomes_results[0]

[{'id': 'OG000',
  'title': 'TMS to dlPFC, Without a Concurrent Task (Passive)',
  'description': "TMS (intermittent theta burst stimulation) will be applied to the dlPFC, when subjects are in a resting state\n\nTMS: Intermittent theta burst stimulation TMS applied to the cortex to excite cerebral cortex\n\nn-back working memory task: Subjects perform an executive function task, in which they view the serial presentation of letters and decide whether or not a letter matches a letter presented 'n' letters back (2 letters or 1 letter)"},
 {'id': 'OG001',
  'title': 'TMS to Vertex, Without Concurrent Task (Control)',
  'description': "TMS (intermittent theta burst stimulation) will be applied to the cerebral vertex, when subjects are in a resting state\n\nTMS: Intermittent theta burst stimulation TMS applied to the cortex to excite cerebral cortex\n\nn-back working memory task: Subjects perform an executive function task, in which they view the serial presentation of letters and decide wh

In [167]:
s["protocolSection"]["descriptionModule"]

{'briefSummary': 'This study will develop an integrated treatment for adolescents who are depressed and suicidal and their parents who are depressed and have a history of suicidality.',
 'detailedDescription': 'Depression, like many psychiatric disorders, has a genetic component that makes it more likely that members of the same family will have the disorder. Depression in parents, particularly mothers, may put the children at greater risk for depression. When an adolescent whose parent is depressed develops depression himself or herself, treating both the parent and the adolescent may be more effective than treating only the adolescent. This study will test a depression treatment that targets depressed suicidal adolescents with a parent or primary caretaker who is also depressed and has a history of suicidality.\n\nParticipation in this study will last 6 months. Participants will be randomly assigned to receive either concurrent parent and adolescent treatment or adolescent only treat

In [245]:
from typing import Any

@dataclass
class PValue:
    comparator: str
    value: float



@dataclass
class Intervention:
    name: str
    type: str
    description: str
    arm_group_labels: list[str]

@dataclass
class Group:
    id: str
    title: str
    description: str
    num_participants: int
    interventions: list[Intervention]

@dataclass
class PrimaryOutcome:
    nct_id: str
    title: str
    description: str
    population_description: str
    timeframe: str
    groups: list[Group]
    p_value: PValue



@dataclass
class ValidStudy:
    nct_id: str
    title: str
    description: str
    conditions: list[str]
    keywords: list[str]
    brief_description: str
    primary_outcomes: list[PrimaryOutcome]
    interventions: list[Intervention]





valid_studies: list[ValidStudy] = []

def extract_interventions(protocol_section: dict[str, Any]) -> list[Intervention]:
    """Extract intervention information"""

    arms_module = protocol_section.get('armsInterventionsModule', {})
    interventions_data = arms_module.get('interventions', [])
    
    interventions = []
    for i, intervention_data in enumerate(interventions_data):
        intervention = Intervention(
            name=intervention_data.get('name', f'Intervention {i+1}'),
            type=intervention_data.get('type', 'OTHER'),
            description=intervention_data.get('description', ''),
            arm_group_labels=intervention_data.get('armGroupLabels', [])
        )
        interventions.append(intervention)
    
    return interventions


for s in raw_studies_p:
    primary_outcomes = []
    nct_id = s["protocolSection"]["identificationModule"]["nctId"]
    outcomes = s["resultsSection"]["outcomeMeasuresModule"]["outcomeMeasures"]
    interventions = extract_interventions(s["protocolSection"])
    arm_group_labels_to_intervention: dict[str, list[Intervention]] = {}

    for intervention in interventions:
        for label in intervention.arm_group_labels:
            label = label.lower()
            if label not in arm_group_labels_to_intervention:
                arm_group_labels_to_intervention[label] = []
            arm_group_labels_to_intervention[label].append(intervention)

    pos = [o for o in outcomes if o["type"] == "PRIMARY"]
    for o in pos:
        if "analyses" in o:
            for analysis in o["analyses"]:
                group_id_to_title = {g["id"]: g["title"] for g in o["groups"]}
                group_id_to_count = {}
                for denom in o["denoms"]:
                    if denom["units"].lower() != "participants":
                         continue
                    for c in denom["counts"]:
                        group_id_to_count[c["groupId"]] = c["value"]
                   
                if "pValue" in analysis:
                    groups = [f"{group_id_to_title[gid]} ({group_id_to_count[gid]})" for gid in analysis["groupIds"]]

                    pv = parse_p_value(analysis["pValue"])
                    primary_outcomes.append(PrimaryOutcome(
                        nct_id=nct_id,
                        title=o["title"],
                        description=o.get("description", ""),
                        population_description=o.get("populationDescription", ""),
                        timeframe=o.get("timeFrame", ""),
                        groups=[
                            Group(
                                id=g["id"],
                                title=g["title"],
                                description=g.get("description", ""),
                                interventions=arm_group_labels_to_intervention.get(g["title"].lower(), []),
                                num_participants=group_id_to_count.get(g["id"], 0)
                            ) for g in o["groups"] if g["id"] in analysis["groupIds"]
                        ],
                        p_value=PValue(
                            comparator=pv[0],
                            value=pv[1],
                        )
                    ))
    if primary_outcomes:
        dmod = s["protocolSection"]["descriptionModule"]
        valid_studies.append(ValidStudy(
            nct_id=nct_id,
            title=s["protocolSection"]["identificationModule"]["briefTitle"],
            description=dmod.get("detailedDescription", ""),
            brief_description=dmod.get("briefSummary", ""),
            primary_outcomes=primary_outcomes,
            interventions=interventions,
            conditions=s["protocolSection"]["conditionsModule"].get("conditions", []),
            keywords=s["protocolSection"]["conditionsModule"].get("keywords", []),
        ))


v = valid_studies[1]
# pprint(valid_studies[0])
pprint(v.primary_outcomes[0].groups)
pprint(v.interventions)

[Group(id='OG000',
       title='Peer-mediated Pivotal Response Intervention',
       description='Facilitators hold four 15-minute training sessions with '
                   'the peer group to engage the target child. The facilitator '
                   'explicitly identifies the "buddy" and asks the children to '
                   'use four strategies to play with the child during center '
                   'time: (a) offer your buddy some play options; (b) show and '
                   'talk about how to play with your buddy; (c) compliment '
                   'your buddy; and (d) show your buddy how to take turns. At '
                   'each training session the facilitator will describe one '
                   'strategy, provide examples, and practice the strategy '
                   'through role play. After trainings, the facilitator '
                   'provides ongoing support during center time to implement '
                   'the strategies over 12 weeks during m

In [253]:
!pip install pyperclip

Collecting pyperclip
  Downloading pyperclip-1.11.0-py3-none-any.whl.metadata (2.4 kB)
Downloading pyperclip-1.11.0-py3-none-any.whl (11 kB)
Installing collected packages: pyperclip
Successfully installed pyperclip-1.11.0


In [None]:
import pyperclip

v = valid_studies[0]
o = v.primary_outcomes[0]
v.keywords

groups_info = []
for g in o.groups:
    groups_info.append("\n".join([
        f"Group title: {g.title}",
        f"Description: {g.description}",
        f"Interventions: {', '.join([f"{i.name}: {i.description}" for i in g.interventions]) if g.interventions else '(uncertain)'}",
    ]))

prompt = f"""
We are creating flashcard summaries for a game where laypeople predict the outcomes of clinical trials (behavioral interventive).

The final question for the flashcard must be of format:
Did [intervention] improve [outcome] in [treatment_group] compared to [comparison_group]?

Keep the questions as short as possible. Use acronyms if needed, as long as they are understandable. If there is a name given to the intervention (e.g. "The Jolly Flower Telephone Protocol for Healthy Ageing"), instead of using the name, simply describe the intervention in layperson's terms (e.g. "calling other elderly people").

Please answer in JSON format with the following fields:
• question: The final question in the specified format with the appropriate placeholders filled in verbatim with the other fields ("Did [intervention] improve [outcome] in [treatment_group] compared to [comparison_group]?")
• purpose: The purpose of the clinical trial, in layperson's terms. This should be directly pluggable into the question template.
• intervention: The main intervention being tested, in layperson's terms. This should be directly pluggable into the question template.
• outcome: The primary outcome being measured, in layperson's terms. This should be directly pluggable into the question template.
• comparison_group: The comparator or control condition, in layperson's terms. This should be directly pluggable into the question template.
• intervention_group_description: A brief description of the intervention group.
• comparator_group_description: A brief description of the comparator/control group.

Please ensure that the answers are concise and easily understandable by someone without a medical background. Avoid technical jargon and use simple language. Where something is technical, give a lay description and then in parentheses the technical term. 

Please create a question based on the following clinical trial information:
• Trial Title: {v.title}
• Trial Description: {v.description}
• Measure: {o.title}
• Measure Description: {o.description}

The groups are as follows (the first is the intervention group):
{'\n\n'.join(groups_info)}


If there is missing intervention or comparator information, please either match to these interventions (if you can tell from the group title/description), or say "Control" if it is a no-treatment or standard care control group, or "Unknown" if you cannot tell.
"""

pyperclip.copy(prompt)

In [213]:
for v in valid_studies[:5]:
    print(f"Study: {v.nct_id} - {v.title}")
    print(f"Brief Description: {v.brief_description[:200]}...")
    print("Primary Outcomes:")
    for outcome in v.primary_outcomes[:2]:
        print(f"  - Title: {outcome.title}")
        print(f"    Description: {outcome.description}")
        print(f"    Population Description: {outcome.population_description}")
        print(f"    Timeframe: {outcome.timeframe}")
        print(f"    P-Value: {outcome.p_value.comparator}{outcome.p_value.value}")
        print(f"    SUCCESS: {outcome.p_value.comparator != '>' and outcome.p_value.value < 0.05}")
        print("    Groups:")
        for group in outcome.groups:
            print(f"      * ID: {group.id}")
            print(f"        Title: {group.title}")
            print(f"        Description: {group.description}")
            print(f"        Number of Participants: {group.num_participants}")
    print("-" * 80)

Study: NCT03022370 - Stepping Stones and Creating Futures Intervention Trial
Brief Description: This study evaluates whether the behavioural/structural interventions of Stepping Stones and Creating Futures can reduce the incidence of intimate partner violence in urban informal settlements amongs...
Primary Outcomes:
  - Title: Any Past Year Physical Intimate Partner Violence Perpetration (Men) and Experience (Women)
    Description: Physical intimate partner violence is assessed using five items based on the WHO VAW scale. A positive response to any item leads to a person being classified as perpetrating (men) and experiencing (women) in the past year. With 0=none, 1=yes.
    Population Description: 
    Timeframe: 24 months post baseline
    P-Value: =0.686
    SUCCESS: False
    Groups:
      * ID: OG000
        Title: Women - Intervention
        Description: Women who received the Stepping Stones and Creating Futures intervention
        Number of Participants: 260
      * ID: OG00

In [202]:

s["resultsSection"]

def find_paths_to_key(data, key_prefix, current_path=""):
    """
    Recursively find all paths to keys that start with the given prefix.
    
    Args:
        data: The data structure to search (dict, list, or other)
        key_prefix: The prefix to search for in dictionary keys
        current_path: The current path (used for recursion)
    
    Returns:
        List of paths where keys starting with key_prefix are found
    """
    paths = []
    
    if isinstance(data, dict):
        for key, value in data.items():
            new_path = f"{current_path}.{key}" if current_path else key
            
            # Check if this key starts with our prefix
            if key.startswith(key_prefix):
                paths.append(new_path)
            
            # Recursively search the value
            paths.extend(find_paths_to_key(value, key_prefix, new_path))
    
    elif isinstance(data, list):
        for i, item in enumerate(data):
            new_path = f"{current_path}[{i}]" if current_path else f"[{i}]"
            paths.extend(find_paths_to_key(item, key_prefix, new_path))
    
    return paths

# Find all paths to keys starting with "armGroup"
armgroup_paths = find_paths_to_key(s, "armGroup")
print("Paths to keys starting with 'armGroup':")
for path in armgroup_paths:
    print(f"  {path}")

Paths to keys starting with 'armGroup':
  protocolSection.armsInterventionsModule.armGroups
  protocolSection.armsInterventionsModule.interventions[0].armGroupLabels
  protocolSection.armsInterventionsModule.interventions[1].armGroupLabels


In [216]:
s["resultsSection"]['participantFlowModule'].keys()

dict_keys(['groups', 'periods'])