# Proof of Concept

In [122]:
import requests
import json
from math import ceil
from tqdm import tqdm

import time

## Total Number of Studies

In [139]:
size = requests.get("https://clinicaltrials.gov/api/v2/stats/size")
total_studies = json.loads(size.content)['totalStudies']

## Gather All Studies POC

In [3]:
aggregated_studies = []
nextPage = None

request_delay = 1 / 3  #  <= 3 Requests per second

for _ in tqdm(range(ceil(total_studies/1000))):
    try:
        payload = {'format': 'json', 'pageSize': '1000', 'pageToken' : f'{nextPage}'} if nextPage else {'format': 'json', 'pageSize': '1000'}
        data = requests.get("https://clinicaltrials.gov/api/v2/studies", params=payload, timeout=5.0)
        studies = data.json()

        aggregated_studies.extend(studies['studies'])

        # The last page does not have a nextPageToken field
        if 'nextPageToken' not in studies:
            break

        nextPage = studies['nextPageToken']

        time.sleep(request_delay)
    except:
        print('Page Token:', nextPage)
        print('Payload:', payload)
        print('# Aggregated Studies:', len(aggregated_studies))

<class 'dict'>


  1%|▏         | 7/473 [00:06<07:45,  1.00it/s]

<class 'dict'>


  2%|▏         | 8/473 [00:07<07:16,  1.07it/s]

<class 'dict'>


  2%|▏         | 9/473 [00:08<07:25,  1.04it/s]

<class 'dict'>


  2%|▏         | 10/473 [00:09<07:02,  1.10it/s]

<class 'dict'>


  2%|▏         | 11/473 [00:10<07:15,  1.06it/s]

<class 'dict'>


  3%|▎         | 12/473 [00:10<06:52,  1.12it/s]

<class 'dict'>


  3%|▎         | 13/473 [00:11<06:32,  1.17it/s]

<class 'dict'>


  3%|▎         | 14/473 [00:12<07:03,  1.08it/s]

<class 'dict'>


  3%|▎         | 15/473 [00:13<06:51,  1.11it/s]

In [None]:
print('Expected Studies:', total_studies)
print('Gathered Studies:', len(aggregated_studies))

## Version / Timestamp

In [124]:
version = requests.get("https://clinicaltrials.gov/api/v2/version").json()
print('Data Time Stamp:', version['dataTimestamp'].split('T')[0])

Data Time Stamp: 2023-11-08


## Dumper POC

Formatting code similar to how it would look in a dumper (HTTPDumper)

In [98]:
API_PAGE = "https://clinicaltrials.gov/api/v2/studies"
PAGE_SIZE=1000

ids = []
pageTokens = []

total_pages = (total_studies + PAGE_SIZE - 1) // PAGE_SIZE  # Calculate total pages

nextPage = None
for p in tqdm(range(1, total_pages + 1)):
    if nextPage:
        doc = requests.get(API_PAGE + f"?fields=NCTId&pageSize={PAGE_SIZE}&pageToken={nextPage}").json()
    else:
        doc = requests.get(API_PAGE + f"?fields=NCTId&pageSize={PAGE_SIZE}").json()

    if 'nextPageToken' in doc:
        nextPage = doc['nextPageToken']
        pageTokens.append(nextPage)

    for study in doc['studies']:
        ids.append(study['protocolSection']['identificationModule']['nctId'])

100%|██████████| 473/473 [04:29<00:00,  1.75it/s]


## Parser / Uploader POC

In [134]:
API_PAGE = "https://clinicaltrials.gov/api/v2/studies"
PAGE_SIZE=1000

ids = []
pageTokens = []

total_pages = (total_studies + PAGE_SIZE - 1) // PAGE_SIZE  # Calculate total pages

nextPage = None
for p in tqdm(range(1, total_pages + 1)):
    if nextPage:
        res = requests.get(API_PAGE + f"?fields=NCTId&pageSize={PAGE_SIZE}&pageToken={nextPage}", stream=True)
    else:
        res = requests.get(API_PAGE + f"?fields=NCTId&pageSize={PAGE_SIZE}", stream=True)

    doc = res.json()

    if 'nextPageToken' in doc:
        nextPage = doc['nextPageToken']
        pageTokens.append(nextPage)

    fout = open(f"json_data/{p}.json", "wb")
    for chunk in res.iter_content(chunk_size=512 * 1024):
        if chunk:
            fout.write(chunk)
    fout.close()

100%|██████████| 473/473 [02:22<00:00,  3.32it/s]


In [135]:
import os
import glob
from biothings.utils.dataload import unlist, dict_sweep

data_folder = "json_data"
for infile in tqdm(glob.glob(os.path.join(data_folder,"*.json"))):
    doc = json.load(open(infile))

    studies = doc["studies"]

    for study in studies:
        print(study[])

100%|██████████| 473/473 [00:01<00:00, 361.26it/s]
