In [1]:
import os
import getpass
from dotenv import load_dotenv

In [2]:
load_dotenv()

True

In [3]:
# This is a critical check. We verify that our script can access the necessary API keys from the environment.
if "LANGCHAIN_API_KEY" not in os.environ or "ENTREZ_EMAIL" not in os.environ:
    # If the keys are missing, we print an error and halt, as the application cannot proceed.
    print("Required environment variables not set. Please set them in your .env file or environment.")
else:
    # This confirmation tells us our secrets have been loaded securely and are ready for use.
    print("Environment variables loaded successfully.")

Required environment variables not set. Please set them in your .env file or environment.


In [4]:


# We explicitly set the LangSmith project name. This is a best practice that ensures all traces

# generated by this project are automatically grouped together in the LangSmith user interface for easy analysis.
os.environ["LANGCHAIN_PROJECT"] = "AI_Clinical_Trials_Architect"

In [5]:
from langchain_community.chat_models import ChatOllama
from langchain_community.embeddings import OllamaEmbeddings

In [6]:


# This dictionary will act as our central registry, or "foundry," for all LLM and embedding model clients.
llm_config = {
    # For the 'planner', we use Llama 3.1 8B. It's a modern, highly capable model that excels at instruction-following.
    # We set `format='json'` to leverage Ollama's built-in JSON mode, ensuring reliable structured output for this critical task.
    "planner": ChatOllama(model="gpt-oss:20b-cloud", temperature=0.0, format='json'),
    
    # For the 'drafter' and 'sql_coder', we use Qwen2 7B. It's a nimble and fast model, perfect for
    # tasks like text generation and code completion where speed is valuable.
    "drafter": ChatOllama(model="deepseek-v3.1:671b-cloud", temperature=0.2),
    "sql_coder": ChatOllama(model="deepseek-v3.1:671b-cloud", temperature=0.0),
    
    # For the 'director', the highest-level strategic agent, we use the powerful Llama 3 70B model.
    # This high-stakes task of diagnosing performance and evolving the system's own procedures
    # justifies the use of a larger, more powerful model.
    "director": ChatOllama(model="gpt-oss:120b-cloud", temperature=0.0, format='json'),
    # For embeddings, we use 'nomic-embed-text', a top-tier, efficient open-source model.
    "embedding_model": OllamaEmbeddings(model="snowflake-arctic-embed:22m")
}

  "planner": ChatOllama(model="gpt-oss:20b-cloud", temperature=0.0, format='json'),
  "embedding_model": OllamaEmbeddings(model="snowflake-arctic-embed:22m")


In [7]:
# Print the configuration to confirm the clients are initialized and their parameters are set correctly.
print("LLM clients configured:")
print(f"Planner ({llm_config['planner'].model}): {llm_config['planner']}")
print(f"Drafter ({llm_config['drafter'].model}): {llm_config['drafter']}")
print(f"SQL Coder ({llm_config['sql_coder'].model}): {llm_config['sql_coder']}")
print(f"Director ({llm_config['director'].model}): {llm_config['director']}")
print(f"Embedding Model ({llm_config['embedding_model'].model}): {llm_config['embedding_model']}")

LLM clients configured:
Planner (gpt-oss:20b-cloud): model='gpt-oss:20b-cloud' temperature=0.0 format='json'
Drafter (deepseek-v3.1:671b-cloud): model='deepseek-v3.1:671b-cloud' temperature=0.2
SQL Coder (deepseek-v3.1:671b-cloud): model='deepseek-v3.1:671b-cloud' temperature=0.0
Director (gpt-oss:120b-cloud): model='gpt-oss:120b-cloud' temperature=0.0 format='json'
Embedding Model (snowflake-arctic-embed:22m): base_url='http://localhost:11434' model='snowflake-arctic-embed:22m' embed_instruction='passage: ' query_instruction='query: ' mirostat=None mirostat_eta=None mirostat_tau=None num_ctx=None num_gpu=None num_thread=None repeat_last_n=None repeat_penalty=None temperature=None stop=None tfs_z=None top_k=None top_p=None show_progress=False headers=None model_kwargs=None


In [8]:
import os

# A dictionary to hold the paths for our different data types. This keeps our file management clean and centralized.
data_paths = {
    "base": "./data",
    "pubmed": "./data/pubmed_articles",
    "fda": "./data/fda_guidelines",
    "ethics": "./data/ethical_guidelines",
    "mimic": "./data/mimic_db"
}
# This loop iterates through our defined paths and uses os.makedirs() to create any directories that don't already exist.
# This prevents errors in later steps when we try to save files to these locations.
for path in data_paths.values():
    if not os.path.exists(path):
        os.makedirs(path)
        print(f"Created directory: {path}")


In [9]:
from Bio import Entrez
from Bio import Medline

def download_pubmed_articles(query, max_articles=20):
    """Fetches abstracts from PubMed for a given query and saves them as text files."""
    # The NCBI API requires an email address for identification. We fetch it from our environment variables.
    Entrez.email = os.environ.get("ENTREZ_EMAIL")
    print(f"Fetching PubMed articles for query: {query}")
    
    # Step 1: Use Entrez.esearch to find the PubMed IDs (PMIDs) for articles matching our query.
    handle = Entrez.esearch(db="pubmed", term=query, retmax=max_articles, sort="relevance")
    record = Entrez.read(handle)
    id_list = record["IdList"]
    print(f"Found {len(id_list)} article IDs.")
    
    print("Downloading articles...")
    # Step 2: Use Entrez.efetch to retrieve the full records (in MEDLINE format) for the list of PMIDs.
    handle = Entrez.efetch(db="pubmed", id=id_list, rettype="medline", retmode="text")
    records = Medline.parse(handle)
    
    count = 0
    # Step 3: Iterate through the retrieved records, parse them, and save each abstract to a file.
    for i, record in enumerate(records):
        pmid = record.get("PMID", "")
        title = record.get("TI", "No Title")
        abstract = record.get("AB", "No Abstract")
        if pmid:
            # We name the file after the PMID for easy reference and to avoid duplicates.
            filepath = os.path.join(data_paths["pubmed"], f"{pmid}.txt")
            with open(filepath, "w") as f:
                f.write(f"Title: {title}\n\nAbstract: {abstract}")
            print(f"[{i+1}/{len(id_list)}] Fetching PMID: {pmid}... Saved to {filepath}")
            count += 1
    return count

In [10]:
# We define a specific, boolean query to find articles highly relevant to our trial concept.
pubmed_query = "(SGLT2 inhibitor) AND (type 2 diabetes) AND (renal impairment)"
num_downloaded = download_pubmed_articles(pubmed_query)
print(f"PubMed download complete. {num_downloaded} articles saved.")

Fetching PubMed articles for query: (SGLT2 inhibitor) AND (type 2 diabetes) AND (renal impairment)


            Email address is not specified.

            To make use of NCBI's E-utilities, NCBI requires you to specify your
            email address with each request.  As an example, if your email address
            is A.N.Other@example.com, you can specify it as follows:
               from Bio import Entrez
               Entrez.email = 'A.N.Other@example.com'
            In case of excessive usage of the E-utilities, NCBI will attempt to contact
            a user at the email address provided before blocking access to the
            E-utilities.


Found 20 article IDs.
Downloading articles...
[1/20] Fetching PMID: 36945734... Saved to ./data/pubmed_articles/36945734.txt
[2/20] Fetching PMID: 38914124... Saved to ./data/pubmed_articles/38914124.txt
[3/20] Fetching PMID: 30697905... Saved to ./data/pubmed_articles/30697905.txt
[4/20] Fetching PMID: 36335326... Saved to ./data/pubmed_articles/36335326.txt
[5/20] Fetching PMID: 36351458... Saved to ./data/pubmed_articles/36351458.txt
[6/20] Fetching PMID: 34619106... Saved to ./data/pubmed_articles/34619106.txt
[7/20] Fetching PMID: 40327845... Saved to ./data/pubmed_articles/40327845.txt
[8/20] Fetching PMID: 35113333... Saved to ./data/pubmed_articles/35113333.txt
[9/20] Fetching PMID: 33413348... Saved to ./data/pubmed_articles/33413348.txt
[10/20] Fetching PMID: 34272327... Saved to ./data/pubmed_articles/34272327.txt
[11/20] Fetching PMID: 34817311... Saved to ./data/pubmed_articles/34817311.txt
[12/20] Fetching PMID: 35145275... Saved to ./data/pubmed_articles/35145275.txt
[13

In [None]:
import requests
from pypdf import PdfReader
import io

def download_and_extract_text_from_pdf(url, output_path):
    """Downloads a PDF from a URL, saves it, and also extracts its text content to a separate .txt file."""
    print(f"Downloading FDA Guideline: {url}")
    try:
        # We use the 'requests' library to perform the HTTP GET request to download the file.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        # --- End of Change ---

        # Send a GET request to the URL with the new headers
        print(f"Requesting URL: {url}")
        # The `requests` library will automatically follow redirects.
        response = requests.get(url, stream=True, headers=headers, timeout=30)
        response.raise_for_status() # This is a good practice that will raise an error if the download fails (e.g., a 404 error).
        # We save the raw PDF file, which is useful for archival purposes.
        with open(output_path, 'wb') as f:
            f.write(response.content)
        print(f"Successfully downloaded and saved to {output_path}")
        return True
    except requests.exceptions.RequestException as e:
        print(f"Error downloading file: {e}")
        return False

In [22]:
# This URL points to a real FDA guidance document for developing drugs for diabetes.
fda_url = "https://www.fda.gov/media/71185/download"
fda_pdf_path = os.path.join(data_paths["fda"], "fda_diabetes_guidance.pdf")

In [23]:
fda_pdf_path

'./data/fda_guidelines/fda_diabetes_guidance.pdf'

In [24]:
download_and_extract_text_from_pdf(fda_url, fda_pdf_path)

Downloading FDA Guideline: https://www.fda.gov/media/71185/download
Requesting URL: https://www.fda.gov/media/71185/download
Error downloading file: 404 Client Error: Not Found for url: https://www.fda.gov/apology_objects/abuse-detection-apology.html


False

In [29]:
 # We then use pypdf to read the PDF content directly from the in-memory response.
reader = PdfReader(fda_pdf_path)
text = ""
# We loop through each page of the PDF and append its extracted text.
for page in reader.pages:
    print(page.get_contents())
    text += page.extract_text() + "\n\n"

{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}


In [26]:
# Finally, we save the clean, extracted text to a .txt file. This is the file our RAG system will actually use.
txt_output_path = os.path.splitext(fda_pdf_path)[0] + '.txt'
with open(txt_output_path, 'w') as f:
    f.write(text)


In [15]:
# This multi-line string contains a curated summary of the three core principles of the Belmont Report,
# which is the foundational document for ethics in human subject research in the United States.
ethics_content = """
Title: Summary of the Belmont Report Principles for Clinical Research
1. Respect for Persons: This principle requires that individuals be treated as autonomous agents and that persons with diminished autonomy are entitled to protection. This translates to robust informed consent processes. Inclusion/exclusion criteria must not unduly target or coerce vulnerable populations, such as economically disadvantaged individuals, prisoners, or those with severe cognitive impairments, unless the research is directly intended to benefit that population.
2. Beneficence: This principle involves two complementary rules: (1) do not harm and (2) maximize possible benefits and minimize possible harms. The criteria must be designed to select a population that is most likely to benefit and least likely to be harmed by the intervention. The risks to subjects must be reasonable in relation to anticipated benefits.
3. Justice: This principle concerns the fairness of distribution of the burdens and benefits of research. The selection of research subjects must be equitable. Criteria should not be designed to exclude certain groups without a sound scientific or safety-related justification. For example, excluding participants based on race, gender, or socioeconomic status is unjust unless there is a clear rationale related to the drug's mechanism or risk profile.
"""

# We define the path where our ethics document will be saved.
ethics_path = os.path.join(data_paths["ethics"], "belmont_summary.txt")

# We open the file in write mode and save the content.
with open(ethics_path, "w") as f:
    f.write(ethics_content)
print(f"Created ethics guideline file: {ethics_path}")

Created ethics guideline file: ./data/ethical_guidelines/belmont_summary.txt


In [18]:
import duckdb
import pandas as pd
import os


def load_real_mimic_data():
    """Loads real MIMIC-III CSVs into a persistent DuckDB database file, processing the massive LABEVENTS table efficiently."""
    print("Attempting to load real MIMIC-III data from local CSVs...")
    db_path = os.path.join(data_paths["mimic"], "mimic3_real.db")
    csv_dir = os.path.join(data_paths["mimic"], "mimiciii_csvs")
    
    # Define the paths to the required compressed CSV files.
    required_files = {
        "patients": os.path.join(csv_dir, "PATIENTS.csv.gz"),
        "diagnoses": os.path.join(csv_dir, "DIAGNOSES_ICD.csv.gz"),
        "labevents": os.path.join(csv_dir, "LABEVENTS.csv.gz"),
    }
    
    # Before starting, we check if all the necessary source files are present.
    missing_files = [path for path in required_files.values() if not os.path.exists(path)]
    if missing_files:
        print("ERROR: The following MIMIC-III files were not found:")
        for f in missing_files: print(f"- {f}")
        print("\nPlease download them as instructed and place them in the correct directory.")
        return None
    
    print("Required files found. Proceeding with database creation.")
    # Remove any old database file to ensure we are building from scratch.
    if os.path.exists(db_path):
        os.remove(db_path)
    # Connect to DuckDB. If the database file doesn't exist, it will be created.
    con = duckdb.connect(db_path)
    
    # Use DuckDB's powerful `read_csv_auto` to directly load data from the gzipped CSVs into SQL tables.
    print(f"Loading {required_files['patients']} into DuckDB...")
    con.execute(f"CREATE TABLE patients AS SELECT SUBJECT_ID, GENDER, DOB, DOD FROM read_csv_auto('{required_files['patients']}')")
    
    print(f"Loading {required_files['diagnoses']} into DuckDB...")
    con.execute(f"CREATE TABLE diagnoses_icd AS SELECT SUBJECT_ID, ICD9_CODE FROM read_csv_auto('{required_files['diagnoses']}')")
    
    # The LABEVENTS table is enormous. To handle it robustly, we use a two-stage process.
    print(f"Loading and processing {required_files['labevents']} (this may take several minutes)...")
    # 1. Load the data into a temporary 'staging' table, treating all columns as text (`all_varchar=True`).
    #    This prevents parsing errors with mixed data types. We also filter for only the lab item IDs we
    #    care about (50912 for Creatinine, 50852 for HbA1c) and use a regex to ensure VALUENUM is numeric.
    con.execute(f"""CREATE TABLE labevents_staging AS 
                   SELECT SUBJECT_ID, ITEMID, VALUENUM 
                   FROM read_csv_auto('{required_files['labevents']}', all_varchar=True) 
                   WHERE ITEMID IN ('50912', '50852') AND VALUENUM IS NOT NULL AND VALUENUM ~ '^[0-9]+(\\.[0-9]+)?$'
                """)
    # 2. Create the final, clean table by selecting from the staging table and casting the columns to their correct numeric types.
    con.execute("CREATE TABLE labevents AS SELECT SUBJECT_ID, CAST(ITEMID AS INTEGER) AS ITEMID, CAST(VALUENUM AS DOUBLE) AS VALUENUM FROM labevents_staging")
    # 3. Drop the temporary staging table to save space.
    con.execute("DROP TABLE labevents_staging")
    con.close()
    return db_path

In [20]:
# Execute the function to build the database.
db_path = load_real_mimic_data()

Attempting to load real MIMIC-III data from local CSVs...
ERROR: The following MIMIC-III files were not found:
- ./data/mimic_db/mimiciii_csvs/PATIENTS.csv.gz
- ./data/mimic_db/mimiciii_csvs/DIAGNOSES_ICD.csv.gz
- ./data/mimic_db/mimiciii_csvs/LABEVENTS.csv.gz

Please download them as instructed and place them in the correct directory.


In [None]:
# If the database was created successfully, connect to it and inspect the schema and some sample data.
if db_path:
    print(f"\nReal MIMIC-III database created at: {db_path}")
    print("\nTesting database connection and schema...")
    con = duckdb.connect(db_path)
    print(f"Tables in DB: {con.execute('SHOW TABLES').df()['name'].tolist()}")
    print("\nSample of 'patients' table:")
    print(con.execute("SELECT * FROM patients LIMIT 5").df())
    print("\nSample of 'diagnoses_icd' table:")
    print(con.execute("SELECT * FROM diagnoses_icd LIMIT 5").df())
    con.close()

Attempting to load real MIMIC-III data from local CSVs...
ERROR: The following MIMIC-III files were not found:
- ./data/mimic_db/mimiciii_csvs/PATIENTS.csv.gz
- ./data/mimic_db/mimiciii_csvs/DIAGNOSES_ICD.csv.gz
- ./data/mimic_db/mimiciii_csvs/LABEVENTS.csv.gz

Please download them as instructed and place them in the correct directory.
