In [None]:
import requests
import concurrent.futures
import json
import os
from tqdm import tqdm
import sys
from json import JSONDecodeError
import sqlite3
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from queue import Queue
from threading import Lock

In [22]:
BASE_URL = "https://clinicaltrials.gov/api/v2/studies"
PAGE_SIZE = 100  # API max is 100
OUTPUT_FILE = "clinical_trials_full.json"
SLEEP_BETWEEN_REQUESTS = 0.5  # seconds

def fetch_all_studies():
    """Fetch all studies from ClinicalTrials.gov API v2 and write to a JSON file."""
    print("🚀 Starting download of all clinical trials...")
    next_token = None
    total_fetched = 0
    first_record = True

    # Open output file and write opening bracket for JSON array
    with open(OUTPUT_FILE, "w", encoding="utf-8") as outfile:
        outfile.write("[\n")

        with tqdm(desc="Downloading pages ", unit=" page") as pbar:
            while True:
                params = {"pageSize": PAGE_SIZE}
                if next_token:
                    params["pageToken"] = next_token

                try:
                    response = requests.get(BASE_URL, params=params, timeout=30)
                    response.raise_for_status()
                    data = response.json()
                except Exception as e:
                    print(f"\n⚠️ Error fetching page: {e}")
                    print("Retrying in 5 seconds...")
                    time.sleep(5)
                    continue

                studies = data.get("studies", [])
                if not studies:
                    break

                for study in studies:
                    # Write comma before every record except the first
                    if not first_record:
                        outfile.write(",\n")
                    json.dump(study, outfile, ensure_ascii=False)
                    first_record = False
                    total_fetched += 1

                next_token = data.get("nextPageToken")
                pbar.update(1)

                if not next_token:
                    break

                time.sleep(SLEEP_BETWEEN_REQUESTS)

        # Close JSON array
        outfile.write("\n]")

    print(f"\n✅ Download complete. Total studies saved: {total_fetched}")
    print(f"Output file: {OUTPUT_FILE}")

    # Show the first two records for verification
    try:
        with open(OUTPUT_FILE, "r", encoding="utf-8") as f:
            all_data = json.load(f)
            print(f"\nFirst two records:")
            print(json.dumps(all_data[:2], indent=2))
    except Exception as e:
        print(f"Could not load output file for verification: {e}")

if __name__ == "__main__":
    fetch_all_studies()


🚀 Starting download of all clinical trials...


Downloading pages : 5415page [2:55:34,  1.95s/page]



✅ Download complete. Total studies saved: 541402
Output file: clinical_trials_full.json
Could not load output file for verification: 


In [None]:
import os
import json

filepath = "clinical_trials_full.json"

# Step 1: File existence
if not os.path.isfile(filepath):
    print(f"❌ File not found: {filepath}")
    exit()

# Step 2: File permissions
try:
    with open(filepath, 'r', encoding='utf-8') as f:
        f.read(1)
    print("✅ File can be opened for reading.")
except Exception as e:
    print(f"❌ File cannot be opened: {e}")
    exit()

# Step 3: Try JSON validation
try:
    with open(filepath, 'r', encoding='utf-8') as f:
        json.load(f)
    print(f"✅ {filepath} is valid JSON.")
except json.JSONDecodeError as e:
    print(f"❌ JSON syntax error in {filepath}:")
    print(f"   Line {e.lineno}, Column {e.colno}: {e.msg}")
except Exception as e:
    print(f"❌ Could not open or parse {filepath}: {e}")

import ijson

filepath = "clinical_trials_full.json"
record_count = 0
error_count = 0

with open(filepath, 'rb') as f:
    try:
        for record in ijson.items(f, 'item'):
            record_count += 1
            # Optionally, add further validation here
    except Exception as e:
        print(f"❌ Error while parsing record {record_count+1}: {e}")
        error_count += 1

print(f"\n✅ Finished streaming validation.")
print(f"Total records parsed: {record_count}")
print(f"Total errors encountered: {error_count}")


✅ File can be opened for reading.
❌ Could not open or parse clinical_trials_full.json: 


In [30]:
import json

def validate_json_file(filepath):
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            json.load(f)
        print(f"✅ {filepath} is valid JSON.")
    except json.JSONDecodeError as e:
        print(f"❌ JSON syntax error in {filepath}:")
        print(f"   Line {e.lineno}, Column {e.colno}: {e.msg}")
    except Exception as e:
        print(f"❌ Could not open or parse {filepath}: {e}")

if __name__ == "__main__":
    validate_json_file("clinical_trials_full.json")


❌ Could not open or parse clinical_trials_full.json: 


In [23]:
import os
print(os.path.exists("clinical_trials_full.json"))

import os
print("File size:", os.path.getsize("clinical_trials_full.json"))


True
File size: 10149045454


In [None]:
## Not to be ran ...

import json
import os
from json import JSONDecodeError
from tqdm import tqdm

def clean_large_json(input_file, output_file):
    if not os.path.exists(input_file):
        raise FileNotFoundError(f"Input file '{input_file}' not found")

    file_size = os.path.getsize(input_file)
    valid_records = 0
    total_errors = 0

    with open(input_file, 'r', encoding='utf-8') as infile, \
         open(output_file, 'w', encoding='utf-8') as outfile, \
         tqdm(total=file_size, unit='B', unit_scale=True, desc="Cleaning data") as pbar:

        outfile.write('[\n')
        buffer = ''
        first_record = True

        while True:
            chunk = infile.read(4096)  # 4KB chunks
            if not chunk:
                break
            
            buffer += chunk
            pbar.update(len(chunk))
            
            # Process complete JSON objects in buffer
            while True:
                # Find potential JSON object start/end
                start = buffer.find('{')
                end = buffer.find('}', start) + 1  # Include closing brace

                if start == -1 or end == 0:
                    break  # No complete objects found

                json_str = buffer[start:end]
                buffer = buffer[end:]  # Remove processed part

                try:
                    obj = json.loads(json_str)
                    if isinstance(obj, dict) and 'protocolSection' in obj:
                        if not first_record:
                            outfile.write(',\n')
                        json.dump(obj, outfile, ensure_ascii=False)
                        first_record = False
                        valid_records += 1
                    else:
                        total_errors += 1
                except JSONDecodeError:
                    total_errors += 1

        outfile.write('\n]')

    print(f"\nCleaning complete. Valid records: {valid_records}, Errors: {total_errors}")
    return output_file

if __name__ == "__main__":
    input_filename = "clinical_trials_full.json"
    output_filename = "cleaned_data.json"

    try:
        print("Starting data cleaning process...")
        result_file = clean_large_json(input_filename, output_filename)
        print(f"\nCleaned data saved to {result_file}")

        # Verify and print first two records
        if os.path.getsize(output_filename) > 2:
            with open(output_filename, 'r') as f:
                data = json.load(f)
                print(f"\nTotal cleaned records: {len(data)}")
                print("\nFirst two valid records:")
                print(json.dumps(data[:2], indent=2))
        else:
            print("\nNo valid records found in output file")

    except Exception as e:
        print(f"\nError: {str(e)}")


In [26]:
import json
from tqdm import tqdm
import os

def count_records_with_progress(input_file):
    file_size = os.path.getsize(input_file)
    count = 0
    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
        for _ in tqdm(data, desc='Counting records'):
            count += 1
    return count

input_filename = 'clinical_trials_full.json'
record_count = count_records_with_progress(input_filename)
print(f"\nTotal records in {input_filename}: {record_count}")



MemoryError: 