In [14]:
import os
import json

json_file = "clinical_trials_full.json"

if not os.path.exists(json_file):
    print(f"❌ Error: {json_file} not found!")
    print("Run the data fetching script first to create this file")
else:
    print(f"✅ Found {json_file} ({os.path.getsize(json_file)/1024/1024:.2f} MB)")


✅ Found clinical_trials_full.json (9678.88 MB)


In [15]:
def read_first_record(file_path):
    """Safely reads first JSON object from large array file"""
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            # Skip array opening bracket
            while f.read(1) != '[':
                pass
            
            # Read until first closing brace
            buffer = ''
            brace_count = 0
            while True:
                c = f.read(1)
                if not c:
                    break
                
                if c == '{':
                    brace_count += 1
                if c == '}':
                    brace_count -= 1
                    if brace_count == 0:
                        buffer += c
                        break
                
                buffer += c
                
            return json.loads(buffer)
    
    except Exception as e:
        print(f"Error reading file: {e}")
        return None

# Usage
first_record = read_first_record(json_file)
if first_record:
    print("First record structure:")
    print(json.dumps(first_record, indent=2))
else:
    print("Could not read first record")


First record structure:
{
  "protocolSection": {
    "identificationModule": {
      "nctId": "NCT00072579",
      "orgStudyIdInfo": {
        "id": "CCCWFU-23102"
      },
      "secondaryIdInfos": [
        {
          "id": "CDR0000340983",
          "type": "REGISTRY",
          "domain": "PDQ (Physician Data Query)"
        },
        {
          "id": "BRLX-02153"
        },
        {
          "id": "NCI-7350"
        }
      ],
      "organization": {
        "fullName": "Wake Forest University Health Sciences",
        "class": "OTHER"
      },
      "briefTitle": "Sargramostim in Treating Patients With Chronic Phase Chronic Myelogenous Leukemia Who Are Not in Complete Cytogenetic Remission Following Initial Treatment",
      "officialTitle": "Phase II Study of GM-CSF in Patients With Chronic Phase Chronic Myeloid Leukemia (CP-CML) Who Are Not in Complete Cytogenetic Remission After Initial Therapy"
    },
    "statusModule": {
      "statusVerifiedDate": "2013-06",
      "ove

In [16]:
def print_record_structure(record, indent=0):
    """Recursively prints the structure of a JSON record"""
    if isinstance(record, dict):
        for key, value in record.items():
            print(' ' * indent + f"├─ {key} ({type(value).__name__})")
            if isinstance(value, (dict, list)):
                print_record_structure(value, indent + 4)
    elif isinstance(record, list) and len(record) > 0:
        print_record_structure(record[0], indent)

if first_record:
    print("\nRecord field structure:")
    print_record_structure(first_record)



Record field structure:
├─ protocolSection (dict)
    ├─ identificationModule (dict)
        ├─ nctId (str)
        ├─ orgStudyIdInfo (dict)
            ├─ id (str)
        ├─ secondaryIdInfos (list)
            ├─ id (str)
            ├─ type (str)
            ├─ domain (str)
        ├─ organization (dict)
            ├─ fullName (str)
            ├─ class (str)
        ├─ briefTitle (str)
        ├─ officialTitle (str)
    ├─ statusModule (dict)
        ├─ statusVerifiedDate (str)
        ├─ overallStatus (str)
        ├─ expandedAccessInfo (dict)
            ├─ hasExpandedAccess (bool)
        ├─ startDateStruct (dict)
            ├─ date (str)
        ├─ primaryCompletionDateStruct (dict)
            ├─ date (str)
            ├─ type (str)
        ├─ completionDateStruct (dict)
            ├─ date (str)
            ├─ type (str)
        ├─ studyFirstSubmitDate (str)
        ├─ studyFirstSubmitQcDate (str)
        ├─ studyFirstPostDateStruct (dict)
            ├─ date (str)
       

In [17]:
import json
import os
from decimal import Decimal
from tqdm import tqdm
import ijson

INPUT_FILE = "clinical_trials_full.json"
OUTPUT_FILE = "clinical_trials_cleaned.json"

def convert_decimals(obj):
    """Recursively convert Decimals to floats"""
    if isinstance(obj, Decimal):
        return float(obj)
    if isinstance(obj, dict):
        return {k: convert_decimals(v) for k, v in obj.items()}
    if isinstance(obj, list):
        return [convert_decimals(v) for v in obj]
    return obj

def clean_record(record):
    """Cleans a single study record"""
    try:
        nct_id = record.get('protocolSection', {}).get('identificationModule', {}).get('nctId')
        brief_title = record.get('protocolSection', {}).get('identificationModule', {}).get('briefTitle')
        return convert_decimals(record) if nct_id and brief_title else None
    except Exception as e:
        print(f"Error cleaning record: {str(e)}")
        return None

def process_records():
    if not os.path.exists(INPUT_FILE):
        print(f"❌ Error: Input file '{INPUT_FILE}' not found!")
        return

    file_size = os.path.getsize(INPUT_FILE)
    
    with open(INPUT_FILE, 'rb') as infile, \
         open(OUTPUT_FILE, 'w', encoding='utf-8') as outfile:

        outfile.write('[\n')
        first_record = True
        stack = []
        current = None

        with tqdm(total=file_size, unit='B', unit_scale=True, desc="Processing") as pbar:
            try:
                for prefix, event, value in ijson.parse(infile):
                    pbar.update(infile.tell() - pbar.n)
                    
                    if event == 'start_map':
                        new_obj = {}
                        if stack:
                            key = stack[-1][1]
                            parent = stack[-1][0]
                            parent[key] = new_obj
                        stack.append((new_obj, None))
                        current = new_obj
                    elif event == 'map_key':
                        if stack:
                            stack[-1] = (stack[-1][0], value)
                    elif event == 'end_map':
                        if stack:
                            obj, _ = stack.pop()
                            if not stack:  # Root object
                                cleaned = clean_record(obj)
                                if cleaned:
                                    if not first_record:
                                        outfile.write(',\n')
                                    json.dump(cleaned, outfile, ensure_ascii=False)
                                    first_record = False
                    elif event in ['string', 'number', 'boolean', 'null']:
                        if stack and stack[-1][1] is not None:
                            key = stack[-1][1]
                            stack[-1][0][key] = value
                            stack[-1] = (stack[-1][0], None)

            except ijson.common.JSONError as e:
                print(f"\n⚠️ JSON error: {str(e)}")

        outfile.write('\n]')

if __name__ == "__main__":
    process_records()


Processing: 100%|█████████████████████████████████████████████████████████████████| 10.1G/10.1G [41:01<00:00, 4.12MB/s]


In [18]:
def print_record_structure(record, indent=0):
    """Recursively prints the structure of a JSON record"""
    if isinstance(record, dict):
        for key, value in record.items():
            print(' ' * indent + f"├─ {key} ({type(value).__name__})")
            if isinstance(value, (dict, list)):
                print_record_structure(value, indent + 4)
    elif isinstance(record, list) and len(record) > 0:
        print_record_structure(record[0], indent)

json_file = "clinical_trials_cleaned.json"
cleaned_records = read_first_record(json_file)
if cleaned_records:
    print("\nRecord field structure:")
    print_record_structure(cleaned_records)



Record field structure:
├─ protocolSection (dict)
    ├─ identificationModule (dict)
        ├─ nctId (str)
        ├─ orgStudyIdInfo (dict)
            ├─ id (str)
        ├─ secondaryIdInfos (dict)
            ├─ id (str)
        ├─ organization (dict)
            ├─ fullName (str)
            ├─ class (str)
        ├─ briefTitle (str)
        ├─ officialTitle (str)
    ├─ statusModule (dict)
        ├─ statusVerifiedDate (str)
        ├─ overallStatus (str)
        ├─ expandedAccessInfo (dict)
            ├─ hasExpandedAccess (bool)
        ├─ startDateStruct (dict)
            ├─ date (str)
        ├─ primaryCompletionDateStruct (dict)
            ├─ date (str)
            ├─ type (str)
        ├─ completionDateStruct (dict)
            ├─ date (str)
            ├─ type (str)
        ├─ studyFirstSubmitDate (str)
        ├─ studyFirstSubmitQcDate (str)
        ├─ studyFirstPostDateStruct (dict)
            ├─ date (str)
            ├─ type (str)
        ├─ lastUpdateSubmitDate (st

In [19]:
import ijson
from tqdm import tqdm
import os

INPUT_FILE = "clinical_trials_cleaned.json"

def count_records(filepath):
    file_size = os.path.getsize(filepath)
    count = 0
    with open(filepath, 'rb') as f, tqdm(total=file_size, unit='B', unit_scale=True, desc="Counting records") as pbar:
        for record in ijson.items(f, 'item'):
            count += 1
            # Update progress bar based on file position
            pbar.update(f.tell() - pbar.n)
    return count

if __name__ == "__main__":
    total = count_records(INPUT_FILE)
    print(f"\nTotal records in {INPUT_FILE}: {total}")


Counting records: 100%|███████████████████████████████████████████████████████████| 4.49G/4.49G [00:54<00:00, 82.0MB/s]


Total records in clinical_trials_cleaned.json: 541402





In [21]:
# Install required packages
!pip install pandas numpy scikit-learn xgboost lightgbm transformers torch
!pip install matplotlib seaborn plotly ijson tqdm stable-baselines3

# Import libraries
import pandas as pd
import numpy as np
import json
import ijson
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report
from sklearn.neural_network import MLPClassifier
import xgboost as xgb
import lightgbm as lgb
from transformers import AutoTokenizer, AutoModel
import torch
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

print("✅ All packages installed and imported successfully")


Defaulting to user installation because normal site-packages is not writeable
Collecting lightgbm
  Downloading lightgbm-4.6.0-py3-none-win_amd64.whl.metadata (17 kB)
Downloading lightgbm-4.6.0-py3-none-win_amd64.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ------- -------------------------------- 0.3/1.5 MB ? eta -:--:--
   -------------- ------------------------- 0.5/1.5 MB 1.9 MB/s eta 0:00:01
   ---------------------------- ----------- 1.0/1.5 MB 2.0 MB/s eta 0:00:01
   ---------------------------------------- 1.5/1.5 MB 2.1 MB/s eta 0:00:00
Installing collected packages: lightgbm
Successfully installed lightgbm-4.6.0
Defaulting to user installation because normal site-packages is not writeable
Collecting seaborn
  Downloading seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Collecting plotly
  Downloading plotly-6.1.2-py3-none-any.whl.metadata (6.9 kB)
Collecting narwhals>=1.15.1 (from plotly)
  Downloading narwhals-1.42.1-py3-none-any.whl.