# 01 - Ingest and Extract Study Elements

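This notebook takes the abstract of a clinical research paper (pasted as plain text), extracts key study elements (objectives, eligibility criteria, outcomes, and exposures) using an LLM, lets you review and correct the result, and saves it to the database. PDF ingestion is not implemented in this notebook.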

In [50]:
# Required third-party packages (install with pip): openai, sqlalchemy, psycopg2
# (json and time are part of the Python standard library and need no install)
# 02 - Imports
import getpass
import json
import os
import time
from urllib.parse import quote_plus

import openai
from sqlalchemy import create_engine, MetaData

In [24]:
# STEP 1: Prompt for OpenAI key
# The key is read with getpass instead of input(): input() echoes what was typed
# into the cell output, which is then saved (and shared) with the notebook.
# If the OPENAI_API_KEY environment variable is set it is used without prompting,
# so the notebook can be run top to bottom unattended.
OPENAI_API_KEY = (
    os.environ.get("OPENAI_API_KEY") or getpass.getpass("Enter your OpenAI API key: ")
).strip()

# Fail here rather than on the first API call if nothing was entered.
if not OPENAI_API_KEY:
    raise ValueError("An OpenAI API key is required.")

Enter your OpenAI API key:  [REDACTED]


In [51]:
# STEP 2: Define the database connection
# Credentials are not stored in the notebook. Either set DATABASE_URL to a full
# SQLAlchemy URL, or leave it unset and enter the password at the prompt.
# NOTE(review): a password was previously committed in this cell; it should be rotated.
database_url = os.environ.get("DATABASE_URL")
if not database_url:
    db_user = os.environ.get("DB_USER", "aimee")
    db_password = getpass.getpass(f"Database password for {db_user}: ")
    # quote_plus escapes characters such as '@', ':' or '/' that would otherwise
    # be parsed as URL structure rather than as part of the credentials.
    database_url = (
        f"postgresql+psycopg2://{quote_plus(db_user)}:{quote_plus(db_password)}"
        "@localhost:5432/dag_review_db"
    )

engine = create_engine(database_url)

# Reflect the existing tables so later cells can look them up by name
# through metadata.tables[...].
metadata = MetaData()
metadata.reflect(bind=engine)

In [52]:
# STEP 3: Prompt user to input abstract text
abstract = input("Paste the study abstract here:").strip()

# Stop before any LLM call is made if nothing was pasted; an empty abstract
# cannot produce a meaningful extraction.
if not abstract:
    raise ValueError("No abstract text was provided.")

Paste the study abstract here: Objective To identify the prevalence of stage B heart failure (SBHF) in patients with type 2 diabetes mellitus (T2DM) with no history of cardiovascular disease (CVD).  Design Observational study.  Setting A single-centre study in which eligible patients were recruited from T2DM clinic. Following consent, patients completed a questionnaire and underwent physical examinations. Patients had blood drawn for laboratory investigations and had a transthoracic echocardiography.  Participants A total of 305 patients who were not known to have CVD were recruited. Patients with deranged liver function tests and end stage renal failure were excluded.  Main outcome measures Echocardiographic parameters such as left ventricular ejection fraction, left ventricular mass index (LVMI), left ventricular hypertrophy, left atrial enlargement and diastolic function were examined.  Results A total of 305 patients predominantly females (65%), with mean body mass index of 27.5 kg

In [53]:
# STEP 4: Define the expected output schema
import os

# The schema path is resolved against the parent of the current working
# directory, so the notebook must be run from a direct subfolder of the project root.
schema_path = os.path.abspath(os.path.join(os.getcwd(), "..", "data", "schemas", "extraction_schema.json"))

# Read as UTF-8 explicitly: without an encoding argument open() uses the
# platform default, which can mis-decode non-ASCII characters in the schema.
with open(schema_path, "r", encoding="utf-8") as f:
    json_schema = json.load(f)

In [54]:
# STEP 5: Send prompt to OpenAI with error handling, retry, and fallback
# Each model in models_to_try is attempted up to max_attempts times; the first
# response that parses to a non-empty JSON object wins. On success this cell
# leaves `extracted` (parsed dict), `used_model` and `response` for later cells.

# Load prompts from external files (UTF-8 explicitly, not the platform default)
prompt_dir = os.path.abspath(os.path.join(os.getcwd(), "..", "src", "llm", "prompts"))
with open(os.path.join(prompt_dir, "abstract_extract_system.txt"), "r", encoding="utf-8") as f:
    system_template = f.read().strip()
with open(os.path.join(prompt_dir, "abstract_extract_user.txt"), "r", encoding="utf-8") as f:
    user_template = f.read().strip()

# Inject schema into system prompt. The template was stripped above, so a blank
# line is inserted to keep the schema from running on from its last word.
system_prompt = system_template + "\n\n" + json.dumps(json_schema, indent=2)

# Insert abstract into user prompt
dialog_user = user_template.replace("{{abstract}}", abstract)

# Build the API client from the key collected in STEP 1 so this cell works on a
# fresh kernel instead of relying on a `client` left over from an earlier session.
client = openai.OpenAI(api_key=OPENAI_API_KEY)

models_to_try = ["gpt-4", "gpt-3.5-turbo"]
max_attempts = 3
retry_delay_seconds = 5
extracted = None
used_model = None
last_error = None

for model in models_to_try:
    for attempt in range(max_attempts):
        try:
            print(f"Trying model: {model}, attempt {attempt + 1}")
            response = client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": dialog_user}
                ],
                temperature=0.2)
            candidate = json.loads(response.choices[0].message.content)
            # Valid JSON that is empty, null or not an object is unusable for the
            # database insert; raise so it is retried and recorded as last_error
            # instead of silently falling through to the next model.
            if not isinstance(candidate, dict) or not candidate:
                raise ValueError("model response was not a non-empty JSON object")
            extracted = candidate
            used_model = model
            break
        except Exception as e:
            last_error = e
            nothing_left = model == models_to_try[-1] and attempt == max_attempts - 1
            if nothing_left:
                # No further attempt will be made, so do not announce or wait for one.
                print(f"Issue with model {model} on attempt {attempt + 1}: {e}.")
            else:
                print(f"Issue with model {model} on attempt {attempt + 1}: {e}. Retrying in 5 seconds...")
                time.sleep(retry_delay_seconds)
    if extracted is not None:
        break

if extracted is None:
    raise RuntimeError(f"All model attempts failed. Last error: {last_error}")

print(json.dumps(extracted, indent=2))


Trying model: gpt-4, attempt 1
{
  "objectives": [
    "To identify the prevalence of stage B heart failure (SBHF) in patients with type 2 diabetes mellitus (T2DM) with no history of cardiovascular disease (CVD)."
  ],
  "eligibility": [
    {
      "criteria": "Patients with type 2 diabetes mellitus (T2DM) with no history of cardiovascular disease (CVD)",
      "inclusion": true
    },
    {
      "criteria": "Patients with deranged liver function tests and end stage renal failure",
      "inclusion": false
    }
  ],
  "outcomes": [
    {
      "type": "primary",
      "value": "Echocardiographic parameters such as left ventricular ejection fraction, left ventricular mass index (LVMI), left ventricular hypertrophy, left atrial enlargement and diastolic function"
    }
  ],
  "exposures": []
}


In [57]:
# STEP 5.5: Prompt user to review and optionally edit the extracted data
# Leaves `user_confirm` ('y' when the user chose to edit) and a validated
# `extracted` dict for the insert cell.
print("Review the extracted JSON:")
print(json.dumps(extracted, indent=2))

user_confirm = input("Do you want to edit the extracted data before saving to the database? (y/N): ").strip().lower()
if user_confirm == 'y':
    print("Paste corrected JSON below. Submit with an empty line when finished:")
    corrected_input = []
    # Collect lines until the first blank line (or end of input).
    while True:
        try:
            line = input()
        except EOFError:
            break
        if line.strip() == "":
            break
        corrected_input.append(line)
    try:
        extracted = json.loads("\n".join(corrected_input))
    except json.JSONDecodeError as e:
        # Chain the original error so the traceback shows where parsing failed.
        raise ValueError(f"Invalid JSON. Aborting save. Error: {e}") from e

# The insert cell indexes these four keys directly; check them here so a bad
# edit (or an incomplete LLM answer) is reported before any database work starts.
required_keys = ("objectives", "eligibility", "outcomes", "exposures")
if not isinstance(extracted, dict):
    raise ValueError("Extracted data must be a JSON object. Aborting save.")
missing_keys = [key for key in required_keys if key not in extracted]
if missing_keys:
    raise ValueError(f"Extracted data is missing required keys: {missing_keys}. Aborting save.")

Review the extracted JSON:
{
  "objectives": [
    "To identify the prevalence of stage B heart failure (SBHF) in patients with type 2 diabetes mellitus (T2DM) with no history of cardiovascular disease (CVD)."
  ],
  "eligibility": [
    {
      "criteria": "Patients with type 2 diabetes mellitus (T2DM) with no history of cardiovascular disease (CVD)",
      "inclusion": true
    },
    {
      "criteria": "Patients with deranged liver function tests and end stage renal failure",
      "inclusion": false
    }
  ],
  "outcomes": [
    {
      "type": "primary",
      "value": "Echocardiographic parameters such as left ventricular ejection fraction, left ventricular mass index (LVMI), left ventricular hypertrophy, left atrial enlargement and diastolic function"
    }
  ],
  "exposures": []
}


Do you want to edit the extracted data before saving to the database? (y/N):  y


Paste corrected JSON below. Submit with an empty line when finished:


 {   "objectives": [     "To identify the prevalence of stage B heart failure (SBHF) in patients with type 2 diabetes mellitus (T2DM) with no history of cardiovascular disease (CVD)."   ],   "eligibility": [     {       "criteria": "Patients with type 2 diabetes mellitus (T2DM) with no history of cardiovascular disease (CVD)",       "inclusion": true     }   ],   "outcomes": [     {       "type": "primary",       "value": "Echocardiographic parameters such as left ventricular ejection fraction, left ventricular mass index (LVMI), left ventricular hypertrophy, left atrial enlargement and diastolic function"     }   ],   "exposures": [] }
 


In [58]:
# STEP 6: Insert extracted data into the database
# Everything below runs inside a single engine.begin() block: one study row,
# then its objectives, eligibility criteria, outcomes and exposures, and
# finally the raw and reviewed JSON.
raw_llm_output = response.choices[0].message.content
user_edited = user_confirm == 'y'


def _insert_row(conn, table_name, **values):
    """Insert one row into the reflected table `table_name` and return the execute() result."""
    return conn.execute(metadata.tables[table_name].insert().values(**values))


# NOTE(review): these study fields are fixed literals (source='Mock Text') and do
# not come from the pasted abstract -- presumably placeholders; confirm and
# replace with the real citation before saving real studies.
study_fields = dict(
    title='Physical Activity and Cognitive Decline',
    authors='Smith et al.',
    doi='10.1234/example.doi',
    source='Mock Text',
    year=2024,
    llm_model_used=used_model,
)

with engine.begin() as conn:
    result = _insert_row(conn, 'studies', **study_fields)
    study_id = result.inserted_primary_key[0]

    # Every objective is stored as 'primary' with a fixed confidence of 1.0.
    for objective_text in extracted['objectives']:
        _insert_row(
            conn,
            'objectives',
            study_id=study_id,
            content=objective_text,
            type='primary',
            confidence_score=1.0,
        )

    for criterion in extracted['eligibility']:
        _insert_row(
            conn,
            'eligibility_criteria',
            study_id=study_id,
            criteria=criterion['criteria'],
            inclusion=criterion['inclusion'],
        )

    # Each outcome is written as its own row, wrapped in a one-element list.
    for outcome_item in extracted['outcomes']:
        _insert_row(
            conn,
            'outcomes',
            study_id=study_id,
            data=[outcome_item],
            validated=False,
        )

    for exposure_item in extracted['exposures']:
        _insert_row(
            conn,
            'exposures',
            study_id=study_id,
            exposure=exposure_item,
        )

    # Save LLM and user-reviewed JSON
    _insert_row(
        conn,
        'extraction_reviews',
        study_id=study_id,
        raw_llm_json=raw_llm_output,
        user_json=json.dumps(extracted),
        user_edited=user_edited,
    )

print(f"Inserted study with ID: {study_id}")

Inserted study with ID: 2
