In [1]:
"""
ClinicalTrials.gov API v2 client for fetching study data.
"""
import json
import os
import time
import urllib
import urllib.parse
import urllib.request
from dataclasses import dataclass
from typing import Dict, Generator

from tqdm import tqdm

In [2]:
# Defaults for fetch_studies: value sent as `pageSize`, and the hard cap on
# the number of pages requested in one run.
PAGE_SIZE = 1000
MAX_PAGES = 1000
# Directory that receives one JSON file per fetched study.
OUTPUT_DIR = "data"


def fetch_studies(page_size: int = PAGE_SIZE, max_pages: int = MAX_PAGES, sleep_sec: float = 0.1) -> Generator[Dict, None, None]:
    """
    Fetch behavioral intervention studies from ClinicalTrials.gov API

    Pages through the v2 ``/studies`` endpoint, filtering on
    ``AREA[InterventionType]BEHAVIORAL AND AREA[OverallStatus]COMPLETED``,
    and yields each study dict from the decoded JSON response one at a time.

    Args:
        page_size: Value sent as the ``pageSize`` query parameter.
        max_pages: Upper bound on the number of page requests; iteration
            stops silently once this many pages have been requested, even
            if the API reports more.
        sleep_sec: Pause, in seconds, between consecutive page requests.
            Values <= 0 disable the pause.

    Yields:
        One dict per study, restricted to the modules listed in ``fields``.

    Notes:
        This is best-effort: any error while requesting or decoding a page
        is printed and ends the iteration instead of raising, so callers
        may receive a partial result set.
    """
    BASE = "https://clinicaltrials.gov/api/v2/studies"

    fields = [
        "protocolSection.identificationModule",
        "protocolSection.descriptionModule",
        "protocolSection.statusModule",
        "protocolSection.sponsorCollaboratorsModule",
        "protocolSection.designModule",
        "protocolSection.conditionsModule",
        "protocolSection.eligibilityModule",
        "protocolSection.armsInterventionsModule",
        "protocolSection.outcomesModule",

        "resultsSection.participantFlowModule",
        "resultsSection.baselineCharacteristicsModule",
        "resultsSection.outcomeMeasuresModule",

        "hasResults"
    ]

    query_terms = [
        "AREA[InterventionType]BEHAVIORAL",
        "AREA[OverallStatus]COMPLETED",
    ]

    # Everything except pageToken is identical for every page, so build it once.
    base_params = {
        "pageSize": page_size,
        "fields": ",".join(fields),
        "query.term": " AND ".join(query_terms),
    }
    headers = {
        'User-Agent': 'Python-urllib/3.11',
        'Accept': 'application/json',
    }

    next_page_token = None
    for page in range(max_pages):
        params = dict(base_params)
        if next_page_token:
            params["pageToken"] = next_page_token

        req = urllib.request.Request(
            f"{BASE}?{urllib.parse.urlencode(params)}",
            headers=headers,
        )

        try:
            with urllib.request.urlopen(req, timeout=30) as response:
                if response.status != 200:
                    print(f"Error fetching page {page}: HTTP {response.status}")
                    break

                data = json.loads(response.read().decode())
                studies = data.get('studies', [])
                next_page_token = data.get('nextPageToken')
        except Exception as e:
            print(f"Error fetching page {page}: {e}")
            break

        if not studies:
            break

        # Yield only after the response has been closed, so the HTTP
        # connection is not held open while the consumer processes the page.
        yield from studies

        # The absence of nextPageToken is what marks the last page. A page
        # shorter than page_size is not treated as final: stopping on page
        # length would silently drop results whenever the server returns
        # fewer studies than requested but still supplies a token.
        if not next_page_token:
            break

        if sleep_sec > 0:
            time.sleep(sleep_sec)


# Persist every fetched study as <nctId>.json inside OUTPUT_DIR.
# Create the directory first: open(..., 'w') does not create missing parent
# directories, so a fresh checkout would otherwise fail on the first write.
os.makedirs(OUTPUT_DIR, exist_ok=True)

for study in tqdm(fetch_studies()):
    nct_id = study['protocolSection']['identificationModule']['nctId']
    with open(os.path.join(OUTPUT_DIR, f"{nct_id}.json"), 'w', encoding='utf-8') as f:
        json.dump(study, f, indent=4)

0it [00:00, ?it/s]

34562it [02:12, 261.23it/s]


In [3]:
# Count the entries currently in OUTPUT_DIR; the bare last expression is
# what the cell displays as its output.
saved_entries = os.listdir(OUTPUT_DIR)
len(saved_entries)

34562