In [1]:
# Install required packages
!pip install pandas requests tqdm psutil

import pandas as pd
import requests
from tqdm import tqdm
import os
import logging
import psutil
import sys
import json
import threading
import concurrent.futures

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# File to save the API data (JSON format)
DATA_FILE = "clinical_drug_trials_all_records.json"
# Lock for thread-safe JSON writing
json_lock = threading.Lock()

# Display the JSON file download location
json_path = os.path.abspath(DATA_FILE)
logging.info(f"JSON file will be downloaded/saved at: {json_path}")

# Function to check available RAM
def check_available_ram():
    memory = psutil.virtual_memory()
    available_ram = memory.available / (1024 ** 3)  # Convert to GB
    logging.info(f"Available RAM: {available_ram:.2f} GB")
    return available_ram

# Step 1: Fetch Total Number of Clinical Drug Trials with Progress Bar
def fetch_page_for_count(page_token=None):
    base_url = "https://clinicaltrials.gov/api/v2/studies"
    params = {
        "query.intr": "Drug",
        "filter.advanced": "AREA[StudyType]INTERVENTIONAL",
        "pageSize": 100,
        "pageToken": page_token
    }
    try:
        response = requests.get(base_url, params=params)
        if response.status_code == 200:
            data = response.json()
            studies = data.get('studies', [])
            next_page_token = data.get('nextPageToken')
            return len(studies), next_page_token
        else:
            logging.error(f"Error fetching page for count: {response.status_code} - {response.text}")
            return 0, None
    except Exception as e:
        logging.error(f"Exception during page fetch for count: {e}")
        return 0, None

def get_total_study_count(max_workers=1):
    try:
        # First, try the /stats/size endpoint with filters
        base_url = "https://clinicaltrials.gov/api/v2/stats/size"
        params = {
            "query.intr": "Drug",
            "filter.advanced": "AREA[StudyType]INTERVENTIONAL"
        }
        response = requests.get(base_url, params=params)
        if response.status_code == 200:
            data = response.json()
            total_studies = data.get('studies', 0)
            if total_studies > 0:
                logging.info(f"Total clinical drug trials from /stats/size: {total_studies}")
                return total_studies

        # If /stats/size fails or returns 0, estimate by fetching pages in parallel with a progress bar
        logging.warning("Failed to get total study count from /stats/size. Estimating via parallel fetch...")
        total_count = 0
        next_page_token = None
        page_tokens = [None]
        max_pages_to_estimate = 10  # Limit estimation to 10 pages to avoid infinite loop

        with tqdm(total=max_pages_to_estimate * 100, desc="Estimating total studies", unit="studies") as pbar:
            pages_fetched = 0
            while page_tokens and pages_fetched < max_pages_to_estimate:
                with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
                    future_to_token = {executor.submit(fetch_page_for_count, token): token for token in page_tokens}
                    page_tokens = []

                    for future in concurrent.futures.as_completed(future_to_token):
                        page_count, next_token = future.result()
                        total_count += page_count
                        pbar.update(page_count)
                        if next_token:
                            page_tokens.append(next_token)
                pages_fetched += 1

        # Estimate total based on pages fetched
        if total_count > 0:
            pages_fetched = total_count // 100
            estimated_total = total_count * (max_pages_to_estimate / pages_fetched) if pages_fetched > 0 else total_count * 10
            logging.info(f"Estimated total clinical drug trials: {int(estimated_total)}")
            return int(estimated_total)
        else:
            logging.warning("Could not estimate total study count. Using default large number for progress.")
            return 100000  # Default large number for progress bar
    except Exception as e:
        logging.error(f"Exception occurred while fetching total study count: {e}")
        return 100000  # Default large number for progress bar

# Step 1.1: Check if JSON File Exists and Estimate Record Count
def check_existing_json():
    if not os.path.exists(DATA_FILE):
        return 0  # File doesn't exist, need to download

    try:
        total_records = 0
        file_size = os.path.getsize(DATA_FILE) // (1024 ** 2)  # File size in MB
        with open(DATA_FILE, 'r', encoding='utf-8') as f:
            # Read the file as a JSON array
            f.seek(0)
            if f.read(1) != '[':  # Check if file starts with a JSON array
                return 0  # Invalid JSON format, assume empty
            f.seek(1)  # Move to the start of the first record
            with tqdm(total=file_size, desc="Counting JSON records", unit="MB") as pbar:
                while True:
                    try:
                        # Read one JSON object at a time
                        obj = json.loads(f.read(1) + f.read(f.read().find('}') + 1))
                        total_records += 1
                        # Update progress bar based on current position in file
                        current_pos = f.tell() // (1024 ** 2)  # Current position in MB
                        pbar.n = min(current_pos, file_size)  # Update progress
                        pbar.refresh()
                        # Skip comma and whitespace
                        while f.read(1) in [',', ' ', '\n']:
                            pass
                        f.seek(f.tell() - 1)
                    except json.JSONDecodeError:
                        break  # End of array or invalid JSON
        logging.info(f"Existing JSON file contains {total_records} records.")
        return total_records
    except Exception as e:
        logging.error(f"Error reading existing JSON file: {e}. Will re-download data.")
        return 0

# Step 1.2: Fetch a Single Page of Data
def fetch_page(page_token=None):
    base_url = "https://clinicaltrials.gov/api/v2/studies"
    params = {
        "query.intr": "Drug",
        "filter.advanced": "AREA[StudyType]INTERVENTIONAL",
        "pageSize": 100,
        "pageToken": page_token
    }
    try:
        response = requests.get(base_url, params=params)
        if response.status_code == 200:
            data = response.json()
            studies = data.get('studies', [])
            next_page_token = data.get('nextPageToken')
            return studies, next_page_token
        else:
            logging.error(f"Error fetching page with token {page_token}: {response.status_code} - {response.text}")
            return None, None
    except Exception as e:
        logging.error(f"Exception occurred during API request for page with token {page_token}: {e}")
        return None, None

# Step 1.3: Save Fetched Studies to JSON (Thread-Safe)
def save_to_json(studies, first_chunk=True):
    if not studies:
        return

    with json_lock:
        mode = 'a' if not first_chunk else 'w'
        with open(DATA_FILE, mode, encoding='utf-8') as f:
            if first_chunk:
                # Start the JSON array
                f.write('[')
            else:
                # Add a comma to separate from previous entries
                f.write(',')
            # Write each study as a JSON object
            for i, study in enumerate(studies):
                json.dump(study, f, ensure_ascii=False)
                if i < len(studies) - 1:
                    f.write(',')
            # If this is the last chunk, close the array (handled in fetch_clinical_trials)

# Step 1.4: Fetch All Clinical Drug Trials with Parallel Downloads and Save as JSON
def fetch_clinical_trials(max_workers=1):
    check_available_ram()
    total_studies = get_total_study_count(max_workers=max_workers)
    if total_studies == 0:
        logging.warning("API returned 0 studies. Attempting to fetch at least one page...")
        studies, next_page_token = fetch_page()
        if studies:
            total_studies = max(len(studies) * 100, 100000)  # Rough estimate
        else:
            logging.error("No studies fetched. Using default large number for progress.")
            total_studies = 100000

    existing_records = check_existing_json()
    if existing_records >= total_studies > 0:
        logging.info("JSON file already contains all records. Skipping download.")
        # Ensure the JSON file is properly closed with a closing bracket
        with open(DATA_FILE, 'rb+') as f:
            f.seek(-1, os.SEEK_END)
            last_char = f.read(1).decode('utf-8')
            if last_char != ']':
                f.seek(-1, os.SEEK_END)
                f.write(b']')
        return

    if os.path.exists(DATA_FILE):
        os.remove(DATA_FILE)

    studies_fetched = 0
    next_page_token = None
    page_tokens = [None]
    first_chunk = True

    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        with tqdm(total=total_studies, desc="Fetching clinical drug trials", unit="studies") as pbar:
            while page_tokens:
                check_available_ram()
                if psutil.virtual_memory().available / (1024 ** 3) < 0.5:  # Less than 0.5 GB available
                    logging.warning("Low RAM available. Pausing fetch to free memory...")
                    sys.stdout.flush()
                    break
                future_to_token = {executor.submit(fetch_page, token): token for token in page_tokens}
                page_tokens = []

                for future in concurrent.futures.as_completed(future_to_token):
                    studies, next_token = future.result()
                    if studies:
                        save_to_json(studies, first_chunk=first_chunk)
                        first_chunk = False
                        studies_fetched += len(studies)
                        pbar.update(len(studies))
                    if next_token:
                        page_tokens.append(next_token)

    # Close the JSON array
    with open(DATA_FILE, 'a', encoding='utf-8') as f:
        f.write(']')

    logging.info(f"Fetched {studies_fetched} clinical drug trials and saved to {json_path}.")

# Main Workflow
def main():
    # Fetch all clinical drug trials and save as JSON
    fetch_clinical_trials()
    logging.info("Data fetching complete. You can now access the data in the JSON file for your research.")

if __name__ == "__main__":
    main()

Defaulting to user installation because normal site-packages is not writeable


2025-06-05 23:59:26,587 - INFO - JSON file will be downloaded/saved at: d:\IIT Patna\Leveraging-ai-clinical-trials\clinical_drug_trials_all_records.json
2025-06-05 23:59:26,594 - INFO - Available RAM: 1.79 GB
Estimating total studies: 100%|██████████| 1000/1000 [00:14<00:00, 70.30studies/s]
2025-06-05 23:59:41,266 - INFO - Estimated total clinical drug trials: 1000
Fetching clinical drug trials:   0%|          | 0/1000 [00:00<?, ?studies/s]2025-06-05 23:59:41,266 - INFO - Available RAM: 1.92 GB
Fetching clinical drug trials:  10%|█         | 100/1000 [00:01<00:16, 55.03studies/s]2025-06-05 23:59:43,083 - INFO - Available RAM: 1.88 GB
Fetching clinical drug trials:  20%|██        | 200/1000 [00:03<00:14, 54.81studies/s]2025-06-05 23:59:44,920 - INFO - Available RAM: 1.86 GB
Fetching clinical drug trials:  30%|███       | 300/1000 [00:04<00:11, 63.42studies/s]2025-06-05 23:59:46,195 - INFO - Available RAM: 1.87 GB
Fetching clinical drug trials:  40%|████      | 400/1000 [00:06<00:09, 63.

In [None]:
# Install required packages (if not already installed)
!pip install pandas tqdm psutil ijson

import pandas as pd
from tqdm import tqdm
import os
import logging
import psutil
import sys
import json
import gc
import ijson  # For streaming JSON parsing

# Set up logging (consistent with the previous program)
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# File containing the API data (JSON format)
DATA_FILE = "clinical_drug_trials_all_records.json"

# Display the JSON file location
json_path = os.path.abspath(DATA_FILE)
logging.info(f"JSON file location: {json_path}")

# Function to check available RAM
def check_available_ram():
    memory = psutil.virtual_memory()
    available_ram = memory.available / (1024 ** 3)  # Convert to GB
    logging.info(f"Available RAM: {available_ram:.2f} GB")
    return available_ram

# Step 1: Extract and Print the Header of the JSON File
def print_json_header(num_records_to_sample=1):
    if not os.path.exists(DATA_FILE):
        logging.error(f"JSON file not found at: {json_path}")
        return

    try:
        check_available_ram()
        # Stream the JSON file to load a small sample of records
        records = []
        file_size = os.path.getsize(DATA_FILE) // (1024 ** 2)  # File size in MB
        logging.info("Streaming JSON data to load sample records for header extraction...")
        with open(DATA_FILE, 'r', encoding='utf-8') as f:
            # Use ijson to parse the JSON array incrementally
            parser = ijson.items(f, 'item')
            with tqdm(total=num_records_to_sample, desc="Loading sample records", unit="records") as pbar:
                for i, record in enumerate(parser):
                    if i >= num_records_to_sample:
                        break
                    records.append(record)
                    pbar.update(1)

        if not records:
            logging.info("No records found in the JSON file.")
            return

        # Flatten the JSON structure into a DataFrame to determine the header
        logging.info("Flattening JSON data to extract header...")
        df = pd.json_normalize(records)
        header = df.columns.tolist()
        logging.info(f"Total number of columns in the header: {len(header)}")

        # Print the header in a readable format
        logging.info(f"\nHeader (Column Names) of the JSON File:\n{'-'*50}")
        for i, col in enumerate(header, 1):
            print(f"{i}. {col}")
        print('-'*50)

        # Clean up memory
        del df
        del records
        gc.collect()
        check_available_ram()

    except Exception as e:
        logging.error(f"Error extracting header from JSON file: {e}")

# Main Workflow
def main():
    # Extract and print the header
    print_json_header(num_records_to_sample=1)
    logging.info("Header extraction complete. You can now use the JSON file for further analytics.")

if __name__ == "__main__":
    main()