In [None]:
# Install required packages
!pip install stable-baselines3 torch transformers xgboost scikit-learn pandas requests tenacity gymnasium tqdm imbalanced-learn psutil

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import f1_score, roc_auc_score, precision_recall_curve, auc
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
from stable_baselines3 import PPO
import pandas as pd
import requests
from transformers import BertTokenizer, BertModel
from tenacity import retry, stop_after_attempt, wait_fixed
import gymnasium as gym
from gymnasium import spaces
from tqdm import tqdm
import concurrent.futures
import threading
import os
import csv
import multiprocessing as mp
import logging
from imblearn.over_sampling import SMOTE
import psutil
import sys

# Set up logging
# Configures the root logger at INFO level with timestamped records; the
# functions in this file report progress and errors through `logging`.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# File to save the API data
# (written by save_to_csv, read back by fetch_clinical_trials)
DATA_FILE = "clinical_trials_data.csv"
# Lock for thread-safe CSV writing
# (held by save_to_csv while it touches DATA_FILE)
csv_lock = threading.Lock()
# Temporary file for intermediate results
# (main() removes it if it exists)
TEMP_FILE = "temp_data.npz"

# Function to check available RAM
def check_available_ram():
    """Log and return the amount of RAM currently available.

    Returns:
        float: Available memory as reported by ``psutil.virtual_memory()``,
        divided by 1024**3 (labelled "GB" in the log line).
    """
    bytes_per_gb = 1024 ** 3
    available_ram = psutil.virtual_memory().available / bytes_per_gb
    logging.info(f"Available RAM: {available_ram:.2f} GB")
    return available_ram

# Step 1: Fetch Total Number of Studies with Parallel Estimation and Progress Bar
def fetch_page_for_count(page_token=None):
    """Fetch one page of studies and report how many records it contains.

    Args:
        page_token: Pagination token from a previous response, or None to
            request the first page.

    Returns:
        tuple: ``(number_of_studies_on_page, next_page_token)``. On any HTTP
        or network failure the result is ``(0, None)``; the error is logged.
    """
    base_url = "https://clinicaltrials.gov/api/v2/studies"
    params = {"pageSize": 100}
    if page_token:
        params["pageToken"] = page_token
    try:
        # Without a timeout a stalled connection would block the caller forever.
        response = requests.get(base_url, params=params, timeout=30)
        if response.status_code != 200:
            logging.error(f"Error fetching page for count: {response.status_code} - {response.text}")
            return 0, None
        data = response.json()
        # `or []` guards against an explicit null in the payload.
        studies = data.get('studies') or []
        return len(studies), data.get('nextPageToken')
    except Exception as e:
        logging.error(f"Exception during page fetch for count: {e}")
        return 0, None

def get_total_study_count(max_workers=1):  # Further reduced workers to manage RAM
    """Return the number of studies available from the ClinicalTrials.gov API.

    The ``/stats/size`` endpoint is tried first. If it does not yield a
    positive count, up to ``max_pages_to_estimate`` pages of ``/studies`` are
    walked with ``fetch_page_for_count``:

    * if the last page is reached, the exact number of studies seen is returned;
    * otherwise the sample is only a lower bound, so the larger of the sample
      and the default (100000) is returned. The caller uses this value to
      decide whether an existing CSV is complete, so a too-small figure would
      wrongly skip the download.

    Args:
        max_workers: Thread-pool size used for the page walk.

    Returns:
        int: A positive study count (exact, or the default fallback).
    """
    default_total = 100000  # Default large number for progress bar
    max_pages_to_estimate = 10  # Limit estimation to 10 pages to avoid infinite loop
    try:
        # First, try the /stats/size endpoint
        response = requests.get("https://clinicaltrials.gov/api/v2/stats/size", timeout=30)
        if response.status_code == 200:
            data = response.json()
            # NOTE(review): the v2 payload appears to publish the count as
            # 'totalStudies'; the old 'studies' key is kept as a fallback.
            total_studies = data.get('totalStudies') or data.get('studies') or 0
            if isinstance(total_studies, (int, float)) and total_studies > 0:
                total_studies = int(total_studies)
                logging.info(f"Total studies from /stats/size: {total_studies}")
                return total_studies

        # If /stats/size fails or returns 0, estimate by fetching pages with a progress bar
        logging.warning("Failed to get total study count from /stats/size. Estimating via parallel fetch...")
        total_count = 0
        reached_last_page = False
        page_tokens = [None]  # Start with the first page
        pages_fetched = 0

        # One executor for the whole walk instead of one per round.
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
            with tqdm(total=max_pages_to_estimate * 100, desc="Estimating total studies", unit="studies") as pbar:
                while page_tokens and pages_fetched < max_pages_to_estimate:
                    futures = [executor.submit(fetch_page_for_count, token) for token in page_tokens]
                    page_tokens = []
                    for future in concurrent.futures.as_completed(futures):
                        page_count, next_token = future.result()
                        total_count += page_count
                        pbar.update(page_count)
                        if next_token:
                            page_tokens.append(next_token)
                        elif page_count > 0:
                            # A non-empty page without a token is the real end;
                            # (0, None) is what a failed fetch looks like.
                            reached_last_page = True
                    pages_fetched += 1

        if total_count == 0:
            logging.warning("Could not estimate total study count. Using default large number for progress.")
            return default_total
        if reached_last_page and not page_tokens:
            logging.info(f"Estimated total studies: {total_count}")
            return total_count

        logging.warning(
            f"Sampled {total_count} studies without reaching the last page. "
            "Using default large number for progress."
        )
        return max(total_count, default_total)
    except Exception as e:
        logging.error(f"Exception occurred while fetching total study count: {e}")
        return default_total

# Step 1.1: Count Records in CSV with Progress Bar (Memory-Efficient)
def check_existing_csv():
    """Count the data records (rows excluding the header) stored in DATA_FILE.

    The file is streamed through ``csv.reader`` so quoted fields containing
    newlines are counted as a single record and memory use stays flat.

    Returns:
        int: Number of data records; 0 when the file is missing, empty, or
        cannot be parsed (the error is logged so the caller re-downloads).
    """
    if not os.path.exists(DATA_FILE):
        return 0  # File doesn't exist, need to download

    import io  # Local import: only this function needs it

    bytes_per_mb = 1024 ** 2
    # Normalized study fields can exceed the csv module's default field limit;
    # raise it for the duration of the scan and restore it afterwards.
    previous_limit = csv.field_size_limit(2 ** 31 - 1)
    try:
        total_rows = 0
        file_size = os.path.getsize(DATA_FILE) // bytes_per_mb  # File size in MB
        # Progress is read from the underlying binary stream: tell() on a text
        # file raises OSError while it is being iterated (as csv.reader does).
        with open(DATA_FILE, 'rb') as raw:
            with io.TextIOWrapper(raw, encoding='utf-8', newline='') as text:
                reader = csv.reader(text)
                with tqdm(total=file_size, desc="Counting CSV records", unit="MB") as pbar:
                    for _ in reader:
                        total_rows += 1
                        # Refreshing on every row is wasteful; sample the position.
                        if total_rows % 1000 == 0:
                            pbar.n = min(raw.tell() // bytes_per_mb, file_size)
                            pbar.refresh()
                    pbar.n = file_size
                    pbar.refresh()
        # Subtract 1 for the header row (an empty file has no header to subtract)
        total_records = max(total_rows - 1, 0)
        logging.info(f"Existing CSV contains {total_records} records.")
        return total_records
    except Exception as e:
        logging.error(f"Error reading existing CSV: {e}. Will re-download data.")
        return 0
    finally:
        csv.field_size_limit(previous_limit)

# Step 1.2: Fetch a Single Page of Data
def fetch_page(page_token=None):
    """Fetch one page (up to 100 studies) from the ClinicalTrials.gov API.

    The request is attempted up to three times, two seconds apart, because the
    caller stops paginating as soon as a page comes back without a token; a
    single transient failure would otherwise silently truncate the download.

    Args:
        page_token: Pagination token from a previous response, or None to
            request the first page.

    Returns:
        tuple: ``(studies, next_page_token)`` where ``studies`` is a list of
        study dicts. ``(None, None)`` when every attempt failed; the error is
        logged.
    """
    base_url = "https://clinicaltrials.gov/api/v2/studies"
    params = {"pageSize": 100}
    if page_token:
        params["pageToken"] = page_token

    @retry(stop=stop_after_attempt(3), wait=wait_fixed(2), reraise=True)
    def _request_page():
        # The timeout keeps a stalled connection from hanging the worker thread.
        response = requests.get(base_url, params=params, timeout=30)
        if response.status_code != 200:
            raise RuntimeError(f"{response.status_code} - {response.text}")
        return response.json()

    try:
        data = _request_page()
        studies = data.get('studies', [])
        next_page_token = data.get('nextPageToken')
        return studies, next_page_token
    except RuntimeError as e:
        # Non-200 status on the final attempt.
        logging.error(f"Error fetching page with token {page_token}: {e}")
        return None, None
    except Exception as e:
        logging.error(f"Exception occurred during API request for page with token {page_token}: {e}")
        return None, None

# Step 1.3: Save Fetched Studies to CSV (Thread-Safe)
def save_to_csv(studies):
    """Append one page of studies to DATA_FILE, keeping columns aligned.

    ``pd.json_normalize`` produces a column set that depends on which fields
    the studies on this page happen to contain. Appending such frames without
    a header would put values under the wrong columns, so every page is
    re-indexed to the header already on disk. When a page introduces columns
    the file does not have yet, the file is rewritten once with the widened
    header before the page is appended.

    Args:
        studies: List of study dicts as returned by the API; empty/None is a
            no-op.
    """
    if not studies:
        return

    df_page = pd.json_normalize(studies)
    with csv_lock:
        if not os.path.exists(DATA_FILE) or os.path.getsize(DATA_FILE) == 0:
            df_page.to_csv(DATA_FILE, mode='w', header=True, index=False, quoting=csv.QUOTE_ALL)
            return

        # nrows=0 reads only the header line.
        columns = list(pd.read_csv(DATA_FILE, nrows=0).columns)
        known = set(columns)
        new_columns = [col for col in df_page.columns if col not in known]
        if new_columns:
            columns = columns + new_columns
            _rewrite_csv_with_columns(columns)

        df_page.reindex(columns=columns).to_csv(
            DATA_FILE, mode='a', header=False, index=False, quoting=csv.QUOTE_ALL
        )


def _rewrite_csv_with_columns(columns):
    """Rewrite DATA_FILE in place so its header becomes `columns` (caller holds csv_lock)."""
    tmp_path = DATA_FILE + ".tmp"
    wrote_header = False
    # Everything is read as text so existing values are copied verbatim; the
    # file is streamed in chunks so it never has to fit in memory.
    for chunk in pd.read_csv(DATA_FILE, chunksize=1000, dtype=str, keep_default_na=False):
        chunk.reindex(columns=columns).to_csv(
            tmp_path,
            mode='a' if wrote_header else 'w',
            header=not wrote_header,
            index=False,
            quoting=csv.QUOTE_ALL,
        )
        wrote_header = True
    if not wrote_header:
        # Header-only file: still emit the widened header.
        pd.DataFrame(columns=columns).to_csv(tmp_path, index=False, quoting=csv.QUOTE_ALL)
    os.replace(tmp_path, DATA_FILE)

# Step 1.4: Fetch All Data from ClinicalTrials.gov API with Parallel Downloads
def fetch_clinical_trials(max_workers=1):  # Further reduced workers to manage RAM
    """Download all studies into DATA_FILE (unless already complete) and load them.

    Steps:
        1. Ask ``get_total_study_count`` for the expected number of studies.
        2. If the existing CSV already holds at least that many records, reuse
           it; otherwise delete it and page through the API, appending each
           page with ``save_to_csv``.
        3. Read DATA_FILE back into a single DataFrame.

    The download stops early when less than 0.5 GB of RAM is available or when
    a page cannot be fetched; both cases are logged as an incomplete download.

    Args:
        max_workers: Thread-pool size used for page requests.

    Returns:
        pandas.DataFrame: All rows read from DATA_FILE, or an empty frame if
        nothing could be fetched.
    """
    check_available_ram()
    total_studies = get_total_study_count(max_workers=max_workers)
    if total_studies == 0:
        logging.warning("API returned 0 studies. Attempting to fetch at least one page...")
        studies, _ = fetch_page()
        if studies:
            total_studies = max(len(studies) * 100, 100000)  # Rough estimate
        else:
            logging.error("No studies fetched. Using default large number for progress.")
            total_studies = 100000

    existing_records = check_existing_csv()
    if existing_records >= total_studies > 0:
        logging.info("CSV file already contains all records. Skipping download.")
    else:
        if os.path.exists(DATA_FILE):
            os.remove(DATA_FILE)

        studies_fetched = 0
        stopped_early = False
        page_tokens = [None]

        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
            with tqdm(total=total_studies, desc="Fetching studies", unit="studies") as pbar:
                while page_tokens:
                    # check_available_ram() already logs and returns the value;
                    # no need to query psutil a second time.
                    if check_available_ram() < 0.5:  # Less than 0.5 GB available
                        logging.warning("Low RAM available. Stopping fetch early; remaining pages are skipped.")
                        sys.stdout.flush()
                        stopped_early = True
                        break
                    futures = [executor.submit(fetch_page, token) for token in page_tokens]
                    page_tokens = []

                    for future in concurrent.futures.as_completed(futures):
                        studies, next_token = future.result()
                        if studies is None:
                            # fetch_page signals failure with (None, None); the
                            # pagination chain cannot continue past it.
                            stopped_early = True
                        if studies:
                            save_to_csv(studies)
                            studies_fetched += len(studies)
                            pbar.update(len(studies))
                        if next_token:
                            page_tokens.append(next_token)

        if stopped_early:
            logging.warning(f"Download is incomplete: only {studies_fetched} studies were saved.")

    if not os.path.exists(DATA_FILE):
        logging.error("No data was fetched. Returning empty DataFrame.")
        return pd.DataFrame()

    logging.info(f"\nReading fetched data from {DATA_FILE}...")
    df_chunks = pd.read_csv(DATA_FILE, chunksize=250, quoting=csv.QUOTE_ALL, on_bad_lines='skip')  # Further reduced chunk size
    df = pd.concat(df_chunks, ignore_index=True)
    logging.info(f"Fetched {len(df)} studies from ClinicalTrials.gov.")
    return df

# Step 1.5: Analyze the Fetched Data
def analyze_data(df):
    """Log summary statistics about the fetched studies.

    Reports the number of studies, how many have any results data, the
    study-type and phase distributions, the range of results-posted dates and
    how many studies list interventions / outcome measures. Every section is
    skipped when its column is absent.

    Args:
        df: DataFrame of studies with dotted column names such as
            ``protocolSection.designModule.studyType``.

    Returns:
        None. All output goes to the log.
    """
    if df.empty:
        logging.info("No data to analyze.")
        return

    logging.info("\nDataset Analysis:")
    logging.info(f"Total number of studies: {len(df)}")

    # Flattened frames carry 'resultsSection.*' columns rather than a single
    # 'resultsSection' column, so indexing the bare name would raise KeyError.
    results_columns = [
        col for col in df.columns
        if str(col) == 'resultsSection' or str(col).startswith('resultsSection.')
    ]
    if results_columns:
        studies_with_results = df[df[results_columns].notna().any(axis=1)]
    else:
        studies_with_results = df.iloc[0:0]
    logging.info(f"Number of studies with results: {len(studies_with_results)}")

    if 'protocolSection.designModule.studyType' in df:
        study_types = df['protocolSection.designModule.studyType'].value_counts()
        logging.info("\nStudy Types Distribution:")
        logging.info(study_types.to_string())

    if 'protocolSection.designModule.phases' in df:
        # After a CSV round-trip the lists are strings like "['PHASE1', 'PHASE2']";
        # parse them back so explode() can count individual phases.
        phases = df['protocolSection.designModule.phases'].map(_parse_list_cell).explode().value_counts()
        logging.info("\nPhases Distribution:")
        logging.info(phases.to_string())

    if 'resultsSection.resultsFirstPostDate' in df:
        # errors='coerce' turns unparseable values into NaT instead of raising.
        results_dates = pd.to_datetime(
            df['resultsSection.resultsFirstPostDate'].dropna(), errors='coerce'
        ).dropna()
        if not results_dates.empty:
            logging.info("\nResults First Posted Date Range:")
            logging.info(f"Earliest: {results_dates.min()}")
            logging.info(f"Latest: {results_dates.max()}")

    if 'protocolSection.armsInterventionsModule.interventions' in df:
        interventions_present = df['protocolSection.armsInterventionsModule.interventions'].notna().sum()
        logging.info(f"\nNumber of studies with interventions: {interventions_present}")

    if 'resultsSection.outcomeMeasuresModule.outcomeMeasures' in df:
        outcomes_present = df['resultsSection.outcomeMeasuresModule.outcomeMeasures'].notna().sum()
        logging.info(f"Number of studies with outcome measures: {outcomes_present}")


def _parse_list_cell(value):
    """Turn a stringified list cell back into a list; return anything else unchanged."""
    import ast  # Local import: only this helper needs it

    if isinstance(value, str) and value.lstrip().startswith('['):
        try:
            parsed = ast.literal_eval(value)
        except (ValueError, SyntaxError, MemoryError, RecursionError):
            return value
        if isinstance(parsed, list):
            return parsed
    return value

# Step 1.6: Fallback Simulated Data if API Returns Insufficient Results
def simulate_clinical_data(n_samples=1000):
    """Generate purely random stand-in data for when the API yields too little.

    Args:
        n_samples: Number of synthetic records to create.

    Returns:
        tuple: ``(X_num, X_text, y)`` where ``X_num`` is an ``(n_samples, 3)``
        array of uniform values in [0, 1) (columns: age, sex, dosage),
        ``X_text`` is a list of ``n_samples`` placeholder summaries and ``y``
        is an array of random 0/1 labels (ADE occurrence).
    """
    logging.info("Simulating clinical trial data as fallback...")
    feature_matrix = np.random.rand(n_samples, 3)
    summaries = ["Simulated summary" for _ in range(n_samples)]
    ade_labels = np.random.randint(0, 2, size=n_samples)
    return feature_matrix, summaries, ade_labels

# Step 2: Preprocess API Data and Simulate Patient-Level Data (A) with Temporary File Saving
def preprocess_clinical_data(df, min_samples=1000, chunk_size=100, max_patients_per_study=None):
    """Expand study-level rows into simulated patient-level samples.

    For every study with a positive enrollment count, that many synthetic
    patients are drawn: a random age within the study's age bracket, a random
    sex (0/1), the study's dosage, the study summary as text, and a random ADE
    label whose probability is 0.5 when an outcome mentions safety/adverse
    events and 0.3 otherwise. If fewer than ``min_samples`` patients result,
    the remainder is filled with ``simulate_clinical_data``.

    Cells are read defensively because a CSV round-trip turns missing values
    into NaN and lists of dicts into their string representation.

    Args:
        df: DataFrame of studies with dotted column names.
        min_samples: Minimum number of samples to return.
        chunk_size: Retained for backward compatibility; no longer used.
            Patients are generated per study in vectorized form, and the
            former per-chunk writes to TEMP_FILE were dropped: they appended
            independent .npz archives to one file, which cannot be read back.
        max_patients_per_study: Optional cap on simulated patients per study;
            None (default) keeps one sample per enrolled participant.

    Returns:
        tuple: ``(numerical_features, textual_data, labels)``. The first is an
        ``(n, 3)`` float array of [age, sex, dosage], the second a list of
        ``n`` strings, the third a length-``n`` array of 0/1 labels.
    """
    if df.empty or len(df) < 5:
        logging.info("Insufficient data from API. Using simulated data.")
        return simulate_clinical_data(n_samples=min_samples)

    numerical_parts = []
    textual_data = []
    label_parts = []

    with tqdm(total=len(df), desc="Preprocessing data", unit="rows") as pbar:
        for _, row in df.iterrows():
            pbar.update(1)  # One tick per study row, matching total=len(df)

            n_patients = _enrollment_count(row)
            if max_patients_per_study is not None:
                n_patients = min(n_patients, int(max_patients_per_study))
            if n_patients <= 0:
                continue

            # NOTE(review): 'minimumAge' may hold values such as "18 Years"
            # rather than ADULT/OLDER_ADULT tokens (those look like they live
            # under 'stdAges') - confirm which column is intended.
            age_low, age_high = _age_bounds(
                row.get('protocolSection.eligibilityModule.minimumAge', 'ADULT, OLDER_ADULT')
            )

            summary = row.get('protocolSection.descriptionModule.briefSummary', "No summary available")
            if not isinstance(summary, str):
                summary = "No summary available"

            ade_prob = 0.3
            outcomes = _as_record_list(row.get('resultsSection.outcomeMeasuresModule.outcomeMeasures'))
            for outcome in outcomes:
                title = str(outcome.get('title') or '').lower()
                description = str(outcome.get('description') or '').lower()
                if 'adverse event' in title or 'safety' in title or 'adverse event' in description or 'safety' in description:
                    ade_prob = 0.5
                    break

            dosage = 1.0
            interventions = _as_record_list(row.get('protocolSection.armsInterventionsModule.interventions'))
            for intervention in interventions:
                name = str(intervention.get('name') or '').lower()
                if '60mg/m2' in name:
                    dosage = 60.0
                elif '80mg/m2' in name:
                    dosage = 80.0
                elif '100mg/m2' in name:
                    dosage = 100.0

            # Draw all patients of the study at once instead of one by one.
            ages = np.random.randint(age_low, age_high, size=n_patients)
            sexes = np.random.choice([0, 1], size=n_patients)
            numerical_parts.append(np.column_stack([ages, sexes, np.full(n_patients, dosage)]))
            textual_data.extend([summary] * n_patients)
            label_parts.append((np.random.random(n_patients) < ade_prob).astype(int))

    check_available_ram()

    if numerical_parts:
        numerical_features = np.vstack(numerical_parts)
        labels = np.concatenate(label_parts)
    else:
        numerical_features = np.array([])
        labels = np.array([])

    if len(numerical_features) < min_samples:
        logging.info(f"Only {len(numerical_features)} samples generated. Supplementing with simulated data.")
        X_num_sim, X_text_sim, y_sim = simulate_clinical_data(n_samples=min_samples - len(numerical_features))
        numerical_features = np.vstack([numerical_features, X_num_sim]) if len(numerical_features) > 0 else X_num_sim
        textual_data.extend(X_text_sim)
        labels = np.concatenate([labels, y_sim]) if len(labels) > 0 else y_sim

    return numerical_features, textual_data, labels


def _enrollment_count(row):
    """Return the study's enrollment as a non-negative int (0 when missing or invalid)."""
    candidate_keys = (
        'protocolSection.eligibilityModule.enrollmentCount',
        # NOTE(review): the v2 API appears to publish enrollment here - confirm.
        'protocolSection.designModule.enrollmentInfo.count',
    )
    for key in candidate_keys:
        try:
            count = int(float(row.get(key)))
        except (TypeError, ValueError, OverflowError):
            # None, NaN, inf or non-numeric text: try the next key.
            continue
        if count > 0:
            return count
    return 0


def _age_bounds(age_group):
    """Map an age-group cell to the (low, high) bounds passed to np.random.randint."""
    if not isinstance(age_group, str):
        age_group = 'ADULT, OLDER_ADULT'
    # Compare whole tokens: a substring test would find 'ADULT' inside 'OLDER_ADULT'.
    tokens = {token.strip(" []'\"") for token in age_group.split(',')}
    if 'ADULT' in tokens and 'OLDER_ADULT' in tokens:
        return 18, 100
    if 'ADULT' in tokens:
        return 18, 65
    return 65, 100


def _as_record_list(value):
    """Return a cell as a list of dicts, accepting real lists, stringified lists or missing values."""
    import ast  # Local import: only this helper needs it

    if isinstance(value, str):
        try:
            value = ast.literal_eval(value)
        except (ValueError, SyntaxError, MemoryError, RecursionError):
            return []
    if isinstance(value, dict):
        return [value]
    if isinstance(value, (list, tuple)):
        return [item for item in value if isinstance(item, dict)]
    return []

# Step 3: Extract Textual Embeddings Using BERT with Chunked Processing
def get_bert_embeddings(texts, chunk_size=50):
    """Embed texts with ``bert-base-uncased`` using attention-masked mean pooling.

    Each distinct text is encoded once and the result is shared by all of its
    occurrences, so repeated inputs cost nothing extra. Pooling averages only
    over real tokens (per the attention mask), so a text's embedding does not
    depend on how much padding its batch required.

    Args:
        texts: Sequence of texts; non-string items are converted with ``str``.
        chunk_size: Number of distinct texts encoded per forward pass.

    Returns:
        numpy.ndarray: Array of shape ``(len(texts), 768)``. If loading or
        running the model fails, the error is logged and random values of the
        same shape are returned instead.
    """
    texts = [text if isinstance(text, str) else str(text) for text in texts]
    if not texts:
        return np.empty((0, 768), dtype=np.float32)

    try:
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        model = BertModel.from_pretrained('bert-base-uncased')
        model.eval()

        # dict.fromkeys de-duplicates while preserving first-seen order.
        unique_texts = list(dict.fromkeys(texts))
        position_of = {text: index for index, text in enumerate(unique_texts)}

        pooled_chunks = []
        with tqdm(total=len(unique_texts), desc="Extracting BERT embeddings", unit="texts") as pbar:
            for start in range(0, len(unique_texts), chunk_size):
                chunk = unique_texts[start:start + chunk_size]
                inputs = tokenizer(chunk, return_tensors='pt', truncation=True, padding=True, max_length=128, return_attention_mask=True)
                with torch.no_grad():
                    outputs = model(**inputs)
                hidden = outputs.last_hidden_state
                # Zero out padding positions, then divide by the real token count.
                mask = inputs['attention_mask'].unsqueeze(-1).to(hidden.dtype)
                token_counts = mask.sum(dim=1).clamp(min=1)
                pooled_chunks.append(((hidden * mask).sum(dim=1) / token_counts).numpy())
                pbar.update(len(chunk))
                check_available_ram()

        unique_embeddings = np.vstack(pooled_chunks)
        # Fan the unique embeddings back out to the original order.
        return unique_embeddings[[position_of[text] for text in texts]]
    except Exception as e:
        logging.error(f"Error in BERT embeddings: {e}. Using random embeddings as fallback.")
        return np.random.rand(len(texts), 768)

# Step 4: Define DNN for ADE Prediction (B --> G: LLMs) with Improved Architecture
class DNN(nn.Module):
    """Feed-forward binary classifier: input -> 64 -> 32 -> 16 -> 1.

    Each hidden layer is followed by ReLU and Dropout(0.3); the single output
    unit passes through a sigmoid, so ``forward`` returns values in [0, 1].
    """

    def __init__(self, input_dim):
        super().__init__()
        widths = (input_dim, 64, 32, 16)
        modules = []
        # Hidden blocks: Linear -> ReLU -> Dropout for each consecutive width pair.
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            modules.append(nn.Linear(fan_in, fan_out))
            modules.append(nn.ReLU())
            modules.append(nn.Dropout(0.3))
        # Output head: one unit squashed to a probability.
        modules.append(nn.Linear(widths[-1], 1))
        modules.append(nn.Sigmoid())
        self.layers = nn.Sequential(*modules)

    def forward(self, x):
        """Run `x` through the layer stack and return the sigmoid outputs."""
        return self.layers(x)

# Step 5: Train DNN with Improved Hyperparameters
def train_dnn(X, y, epochs=200):
    """Train the DNN on the full dataset and return it with its predictions.

    Training is full-batch with Adam (lr=0.0005, weight_decay=1e-3) and binary
    cross-entropy. The returned predictions come from a separate forward pass
    in eval mode after the last optimizer step, so dropout noise is not baked
    into them and they reflect the final weights.

    Args:
        X: 2-D feature array of shape ``(n_samples, n_features)``.
        y: 0/1 labels of length ``n_samples``.
        epochs: Number of full-batch gradient steps.

    Returns:
        tuple: ``(model, predictions)`` where ``predictions`` is a 1-D array of
        probabilities. If training fails, the error is logged and
        ``(None, random_predictions)`` is returned.
    """
    try:
        model = DNN(input_dim=X.shape[1])
        criterion = nn.BCELoss()
        optimizer = optim.Adam(model.parameters(), lr=0.0005, weight_decay=1e-3)
        X_tensor = torch.as_tensor(np.asarray(X), dtype=torch.float32)
        y_tensor = torch.as_tensor(np.asarray(y), dtype=torch.float32).view(-1, 1)

        model.train()
        for _ in range(epochs):
            optimizer.zero_grad()
            loss = criterion(model(X_tensor), y_tensor)
            loss.backward()
            optimizer.step()

        # Score with dropout disabled and with the final weights.
        model.eval()
        with torch.no_grad():
            predictions = model(X_tensor).numpy().flatten()
        return model, predictions
    except Exception as e:
        logging.error(f"Error in DNN training: {e}. Returning random predictions.")
        return None, np.random.rand(len(y))

# Step 6: Gradient Boosting with XGBoost (B --> H: AI Models) with Improved Hyperparameters
def train_gbm(X, y):
    """Fit an XGBoost classifier on SMOTE-balanced data and score the inputs.

    Features are standardized, the classes are balanced with SMOTE, the model
    is fitted on the resampled set, and probabilities are then computed for
    the original (scaled, non-resampled) rows.

    Args:
        X: 2-D feature array.
        y: 0/1 labels, one per row of ``X``.

    Returns:
        numpy.ndarray: Predicted probability of the positive class for each
        row of ``X``. If any step fails, the error is logged and random
        values of length ``len(y)`` are returned.
    """
    try:
        X_scaled = StandardScaler().fit_transform(X)
        X_resampled, y_resampled = SMOTE(random_state=42).fit_resample(X_scaled, y)

        hyperparameters = dict(max_depth=5, learning_rate=0.05, n_estimators=200, random_state=42)
        model = xgb.XGBClassifier(**hyperparameters)
        model.fit(X_resampled, y_resampled)

        class_probabilities = model.predict_proba(X_scaled)
        return class_probabilities[:, 1]
    except Exception as e:
        logging.error(f"Error in GBM training: {e}. Returning random predictions.")
        return np.random.rand(len(y))

# Step 7: Ensemble Prediction (C: ADE Prediction & Prevention)
def ensemble_predict(y_dnn, y_gbm, w=0.6):
    """Blend the two models' predictions as a weighted average.

    Args:
        y_dnn: DNN predictions (scalar or array).
        y_gbm: Gradient-boosting predictions, same shape as ``y_dnn``.
        w: Weight given to the DNN; the GBM receives ``1 - w``.

    Returns:
        ``w * y_dnn + (1 - w) * y_gbm``.
    """
    dnn_part = w * y_dnn
    gbm_part = (1 - w) * y_gbm
    return dnn_part + gbm_part

# Step 8: Reinforcement Learning for Treatment Adaptation (C --> J)
class TrialEnv(gym.Env):
    """Toy environment in which an agent nudges a scalar treatment protocol.

    Observation: the current protocol value, shape ``(1,)``, float32.
    Action: a change in [-1, 1] that is added to the protocol.
    Reward: a random value in (-1, 0] for a positive change, otherwise -1.
    An episode ends after ``max_steps`` steps.
    """

    def __init__(self):
        super().__init__()
        self.max_steps = 10
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(1,), dtype=np.float32)
        self.action_space = spaces.Box(low=-1.0, high=1.0, shape=(1,), dtype=np.float32)
        self.protocol = 1.0
        self.step_count = 0
        # Keep `state` an array from the start so it always matches the space.
        self.state = self._observation()

    def _observation(self):
        """Return the current protocol as an observation array."""
        return np.array([self.protocol], dtype=np.float32)

    def step(self, action):
        """Apply `action` to the protocol and return the Gymnasium 5-tuple."""
        self.step_count += 1
        delta = float(np.asarray(action, dtype=np.float32).reshape(-1)[0])
        self.protocol += delta
        # Use the environment's own generator so reset(seed=...) makes
        # rewards reproducible.
        reward = -float(self.np_random.random()) if delta > 0 else -1.0
        self.state = self._observation()
        # NOTE(review): the episode ends purely on a step limit, which Gymnasium
        # would usually report as `truncated`; kept as `terminated` to preserve
        # the existing training behavior.
        done = self.step_count >= self.max_steps
        truncated = False
        return self.state, reward, done, truncated, {}

    def reset(self, seed=None, options=None):
        """Restore the initial protocol and return ``(observation, info)``."""
        super().reset(seed=seed)
        self.step_count = 0
        self.protocol = 1.0
        # The observation is the protocol itself, so the first observation of
        # an episode must be 1.0, consistent with what step() reports.
        self.state = self._observation()
        return self.state, {}

def train_rl():
    """Train a PPO agent on TrialEnv for 2000 timesteps.

    Returns:
        tuple: ``(model, env)``. If anything fails, the error is logged and a
        placeholder object exposing only an ``env`` attribute is returned
        together with a fresh environment.
    """
    try:
        trial_env = TrialEnv()
        agent = PPO("MlpPolicy", trial_env, verbose=0, learning_rate=0.0003)
        agent.learn(total_timesteps=2000)
    except Exception as e:
        logging.error(f"Error in RL training: {e}. Returning dummy model.")
        fallback_env = TrialEnv()

        class DummyModel:
            # Stand-in that only carries the environment reference.
            def __init__(self):
                self.env = fallback_env

        return DummyModel(), fallback_env
    else:
        return agent, trial_env

# Step 9: Evaluate Model (Step 6: Metrics)
def evaluate_model(y_true, y_pred_prob):
    """Compute F1 (at a 0.5 threshold), ROC-AUC and PR-AUC.

    Args:
        y_true: Ground-truth 0/1 labels.
        y_pred_prob: Predicted probabilities as an array.

    Returns:
        tuple: ``(f1, roc_auc, pr_auc)``. If any metric cannot be computed,
        the error is logged and ``(0.0, 0.0, 0.0)`` is returned.
    """
    try:
        hard_labels = (y_pred_prob > 0.5).astype(int)
        scores = [
            f1_score(y_true, hard_labels),
            roc_auc_score(y_true, y_pred_prob),
        ]
        # Area under the precision-recall curve (recall on the x axis).
        precision, recall, _ = precision_recall_curve(y_true, y_pred_prob)
        scores.append(auc(recall, precision))
        return tuple(scores)
    except Exception as e:
        logging.error(f"Error in evaluation: {e}. Returning default metrics.")
        return 0.0, 0.0, 0.0

# Main Workflow
def main():
    """Run the whole pipeline: fetch, preprocess, embed, train, evaluate, report.

    The temporary file is removed in a ``finally`` block so it is cleaned up
    on the early-exit path and when any step raises, not only after a fully
    successful run.
    """
    try:
        df = fetch_clinical_trials()
        analyze_data(df)
        X_num, X_text, y = preprocess_clinical_data(df)

        if len(X_num) == 0:
            logging.error("No data available to process. Exiting.")
            return

        X_text_emb = get_bert_embeddings(X_text)
        X = np.hstack((X_num, X_text_emb))

        # X_num is known to be non-empty here; columns are [age, sex, dosage].
        eligibility_criteria = X_num[:, 0].mean()
        treatment_protocols = X_num[:, 2].sum()

        _, y_dnn = train_dnn(X, y)
        y_gbm = train_gbm(X, y)
        y_pred = ensemble_predict(y_dnn, y_gbm)

        f1, roc_auc, pr_auc = evaluate_model(y, y_pred)
        logging.info(f"Evaluation Metrics:\nF1 Score: {f1:.2f}\nROC-AUC: {roc_auc:.2f}\nPR-AUC: {pr_auc:.2f}\n")

        # Only the environment's final protocol value is reported; the trained
        # agent itself is not used further.
        _, env = train_rl()
        protocol_value = env.protocol
        personalized_medicine = f"Protocol adjusted to: {protocol_value:.2f}"
        reduced_costs = f"Trial costs reduced by optimizing with F1: {f1:.2f}"

        logging.info(f"Eligibility Criteria (Average Age): {eligibility_criteria:.2f}")
        logging.info(f"Treatment Protocols (Total Dosage): {treatment_protocols:.2f}")
        logging.info(f"Personalized Medicine: {personalized_medicine}")
        logging.info(f"Reduced Trial Costs: {reduced_costs}")
    finally:
        # Delete temporary file
        if os.path.exists(TEMP_FILE):
            os.remove(TEMP_FILE)
            logging.info(f"Deleted temporary file: {TEMP_FILE}")

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()