In [1]:
import duckdb
import os
from dotenv import load_dotenv

load_dotenv()
con = duckdb.connect()

try:
    con.sql(f"""
        INSTALL postgres;
        LOAD postgres;
        ATTACH 'dbname={os.getenv("PGDATABASE")} user={os.getenv("PGUSER")} password={os.getenv("PGPASSWORD")} \
            host={os.getenv("PGHOST")} port={os.getenv("PGPORT")}' AS remote_mimic (TYPE POSTGRES);
    """)
    print("DuckDB attached to remote PostgreSQL successfully.")

except Exception as e:
    print(f"Error attaching PostgreSQL: {e}")

DuckDB attached to remote PostgreSQL successfully.


In [5]:
import pandas as pd

cohort_query = """
DROP TABLE IF EXISTS cohort_tab;
CREATE TABLE cohort_tab AS
WITH cohort AS (
    SELECT
        ic.subject_id,
        ic.hadm_id,
        ic.stay_id,
        ic.intime,
        ic.outtime,
        -- Calculate age precisely using anchor_year logic
        (pa.anchor_age + (date_part('year', ic.intime) - pa.anchor_year)) AS age,
        adm.hospital_expire_flag,
        adm.deathtime,
        -- Rank stays: First stay of the first admission
        ROW_NUMBER() OVER (PARTITION BY ic.subject_id ORDER BY ic.intime) AS stay_rank
    FROM
        remote_mimic.mimiciv_icu.icustays ic
    INNER JOIN 
        remote_mimic.mimiciv_hosp.patients pa ON ic.subject_id = pa.subject_id
    INNER JOIN 
        remote_mimic.mimiciv_hosp.admissions adm ON ic.hadm_id = adm.hadm_id
)
SELECT
    subject_id,
    hadm_id,
    stay_id,
    intime,
    outtime,
    age,
    hospital_expire_flag AS y_ihm
FROM
    cohort
WHERE
    stay_rank = 1               -- First ICU stay only
    AND age >= 18               -- Adults only
    -- EXCLUSION: Patient must stay at least 24h to have a full observation window
    AND date_diff('minute', intime, outtime) >= 1440 
    -- EXCLUSION: If they died, they must have died AFTER the 24h window
    -- (Prevents the model from 'seeing' the death process in the vitals)
    AND (deathtime IS NULL OR deathtime > (intime + INTERVAL '24 HOURS'))
ORDER BY
    subject_id;"""

con.sql(cohort_query)
df_cohort = con.sql("SELECT * FROM cohort_tab").df()
print(f"Cohort selection complete. Cohort size: {len(df_cohort):,} patients.")
df_cohort.head()

Cohort selection complete. Cohort size: 51,674 patients.


Unnamed: 0,subject_id,hadm_id,stay_id,intime,outtime,age,y_ihm
0,10000690,25860671,37081114,2150-11-02 19:37:00,2150-11-06 17:03:17,86,0
1,10001217,24597018,37067082,2157-11-20 19:18:02,2157-11-21 22:08:00,55,0
2,10001725,25563031,31205490,2110-04-11 15:52:22,2110-04-12 23:59:56,46,0
3,10001884,26184834,37510196,2131-01-11 04:20:05,2131-01-20 08:27:30,77,1
4,10002013,23581541,39060235,2160-05-18 10:00:53,2160-05-19 17:33:33,57,0


In [7]:
vitals_query = """
DROP TABLE IF EXISTS vitals_tab;
CREATE TABLE vitals_tab AS (
    SELECT 
        v.stay_id,
        FLOOR(date_diff('second', c.intime, v.charttime) / 3600) AS hr,
        v.heart_rate, v.sbp, v.spo2, v.temperature, v.resp_rate
    FROM cohort_tab c
    JOIN remote_mimic.mimiciv_derived.vitalsign v ON c.stay_id = v.stay_id
    WHERE v.charttime BETWEEN c.intime AND (c.intime + INTERVAL '24 HOURS')
);
"""

con.sql(vitals_query)
print("Hourly vitals extraction complete.")

labs_query = """
DROP TABLE IF EXISTS labs_tab;
CREATE TABLE labs_tab AS (
    SELECT 
        fdl.stay_id,
        fdb.lactate_max, fdl.bilirubin_total_max, fdl.creatinine_max, 
        fdl.wbc_max, fdl.glucose_max, fdl.bun_max
    FROM remote_mimic.mimiciv_derived.first_day_lab fdl
    JOIN remote_mimic.mimiciv_derived.first_day_bg_art fdb ON fdl.stay_id = fdb.stay_id
);
"""

con.sql(labs_query)
print("Lab values extraction complete.")

scores_query = """
DROP TABLE IF EXISTS scores_tab;
CREATE TABLE scores_tab AS (
    SELECT 
        s.stay_id, s.sapsii, o.oasis, so.sofa_24hours
    FROM remote_mimic.mimiciv_derived.sapsii s
    JOIN remote_mimic.mimiciv_derived.oasis o ON s.stay_id = o.stay_id
    JOIN remote_mimic.mimiciv_derived.sofa so ON s.stay_id = so.stay_id
);
"""

con.sql(scores_query)
print("Clinical scores extraction complete.")

Hourly vitals extraction complete.
Lab values extraction complete.
Clinical scores extraction complete.


In [4]:
features_query = """
DROP TABLE IF EXISTS features_tab;
CREATE TABLE features_tab AS (
    SELECT 
        v.*,
        l.lactate_max, l.creatinine_max,
        s.sapsii, s.sofa_24hours
    FROM vitals_tab v
    LEFT JOIN labs_tab l ON v.stay_id = l.stay_id
    LEFT JOIN scores_tab s ON v.stay_id = s.stay_id
);"""

con.sql(features_query)
df_features = con.sql("SELECT * FROM features_tab").df()
print(f"Extracted {len(df_features):,} features for cohort.")
df_features.head()

: 

In [None]:
import pandas as pd
from sqlalchemy import create_engine

# 1. Database Connection (Update with your credentials)
# Format: 'postgresql://username:password@host:port/database'
engine = create_engine('postgresql://user:pass@localhost:5432/mimiciv')

def extract_mortality_features():
    query = """
    SELECT 
        -- IDs and Labels
        ie.subject_id, ie.hadm_id, ie.stay_id,
        ie.hospital_expire_flag AS label,
        
        -- Demographics
        ie.admission_age AS age,
        CASE WHEN ie.gender = 'M' THEN 1 ELSE 0 END AS is_male,
        
        -- Derived Vitals (First 24h)
        v.heart_rate_mean, v.mbp_mean, v.resp_rate_mean, v.spo2_mean, v.tempc_mean,
        
        -- Derived Labs (First 24h)
        l.lactate_max, l.ph_min, l.bun_max, l.creatinine_max, l.glucose_max,
        l.hemoglobin_min, l.wbc_max, l.aniongap_max,
        
        -- Severity & Comorbidity
        c.charlson_comorbidity_index AS charlson_index,
        aps.apsiii
        
    FROM mimic_derived.icustay_detail ie
    LEFT JOIN mimic_derived.first_day_vitalsign v ON ie.stay_id = v.stay_id
    LEFT JOIN mimic_derived.first_day_lab l ON ie.stay_id = l.stay_id
    LEFT JOIN mimic_derived.charlson c ON ie.hadm_id = c.hadm_id
    LEFT JOIN mimic_derived.apsiii aps ON ie.stay_id = aps.stay_id
    
    WHERE ie.admission_age >= 18  -- Only adults
    AND ie.first_icu_stay = TRUE  -- Focus on first stay to avoid leakage
    """
    
    print("Extracting features from mimic_derived...")
    df = pd.read_sql_query(query, engine)
    
    # Basic Preprocessing for Deep Learning
    # 1. Handle missing values (simple median imputation for demonstration)
    df = df.fillna(df.median())
    
    # 2. Normalize continuous features
    cols_to_norm = ['age', 'heart_rate_mean', 'mbp_mean', 'resp_rate_mean', 
                    'spo2_mean', 'tempc_mean', 'lactate_max', 'ph_min', 
                    'bun_max', 'creatinine_max', 'glucose_max', 'aniongap_max',
                    'charlson_index', 'apsiii']
    
    df[cols_to_norm] = (df[cols_to_norm] - df[cols_to_norm].mean()) / df[cols_to_norm].std()
    
    return df

if __name__ == "__main__":
    data = extract_mortality_features()
    print(f"Feature matrix shape: {data.shape}")
    print(data.head())
    
    # Save for training
    # data.to_csv('mortality_features.csv', index=False)

┌────────────┬──────────┬──────────┬─────────┬────────────┬─────────────────────┬─────────────────────┬────────────────────┬────────────────────┬────────────────────────────────┬──────────────────────┬──────────────┬─────────────────┬─────────────────────┬─────────────────────┬─────────┬─────────────┬────────────────┐
│ subject_id │ hadm_id  │ stay_id  │ gender  │    dod     │      admittime      │      dischtime      │    los_hospital    │   admission_age    │              race              │ hospital_expire_flag │ hospstay_seq │ first_hosp_stay │     icu_intime      │     icu_outtime     │ los_icu │ icustay_seq │ first_icu_stay │
│   int32    │  int32   │  int32   │ varchar │    date    │      timestamp      │      timestamp      │       double       │       double       │            varchar             │        int16         │    int64     │     boolean     │      timestamp      │      timestamp      │ double  │    int64    │    boolean     │
├────────────┼──────────┼──────────┼────

In [2]:
# SQL Query targeting the derived tables in the remote_mimic database
query = """
SELECT 
    ie.subject_id, ie.hadm_id, ie.stay_id,
    ie.hospital_expire_flag AS label,
    ie.admission_age AS age,
    CASE WHEN ie.gender = 'M' THEN 1 ELSE 0 END AS is_male,
        
    -- Aggregated Vitals
    v.heart_rate_mean, v.mbp_mean, v.resp_rate_mean, v.temperature_mean,
        
    -- Critical Lab Markers
    fdb.lactate_max, l.bun_max, l.creatinine_max, l.aniongap_max,
        
    -- Scoring Systems
    c.charlson_comorbidity_index AS charlson_index,
    aps.apsiii
        
FROM remote_mimic.mimiciv_derived.icustay_detail ie
LEFT JOIN remote_mimic.mimiciv_derived.first_day_vitalsign v ON ie.stay_id = v.stay_id
LEFT JOIN remote_mimic.mimiciv_derived.first_day_bg_art fdb ON ie.stay_id = fdb.stay_id
LEFT JOIN remote_mimic.mimiciv_derived.first_day_lab l ON ie.stay_id = l.stay_id
LEFT JOIN remote_mimic.mimiciv_derived.charlson c ON ie.hadm_id = c.hadm_id
LEFT JOIN remote_mimic.mimiciv_derived.apsiii aps ON ie.stay_id = aps.stay_id
    
WHERE ie.admission_age >= 18 
AND ie.first_icu_stay = TRUE
"""
    
print("Executing high-performance join via DuckDB...")
# DuckDB returns a Relation object, .df() converts it to a Pandas DataFrame
df = con.execute(query).df()
    
# Pre-processing for Deep Learning - handling nulls (common in clinical data)
df = df.fillna(df.median(numeric_only=True))

print(f"Extracted {len(df)} rows.")
print(df.describe())

Executing high-performance join via DuckDB...
Extracted 85242 rows.
         subject_id       hadm_id       stay_id         label           age  \
count  8.524200e+04  8.524200e+04  8.524200e+04  85242.000000  85242.000000   
mean   1.500086e+07  2.498190e+07  3.499439e+07      0.111154     65.224730   
std    2.882597e+06  2.885086e+06  2.888970e+06      0.314325     16.871572   
min    1.000003e+07  2.000009e+07  3.000015e+07      0.000000     18.002527   
25%    1.251744e+07  2.248004e+07  3.249628e+07      0.000000     55.133382   
50%    1.499738e+07  2.497768e+07  3.499647e+07      0.000000     66.919088   
75%    1.751202e+07  2.746929e+07  3.748872e+07      0.000000     77.885681   
max    1.999999e+07  2.999983e+07  3.999986e+07      1.000000    103.823298   

            is_male  heart_rate_mean      mbp_mean  resp_rate_mean  \
count  85242.000000     85242.000000  85242.000000    85242.000000   
mean       0.557566        84.424331     79.365232       19.111884   
std       

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

def extract_and_preprocess_mimic():
    global df  # Use the DataFrame extracted above
    
    # 2. Identify Features and Target
    # We drop 'stay_id' as it's an identifier, and 'label' is our target
    X = df.drop(columns=['subject_id', 'hadm_id', 'stay_id', 'label'])
    y = df['label']

    # 3. Train-Test Split (Crucial for Deep Learning)
    # We split BEFORE scaling to prevent data leakage
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
        )
    # 4. Use Scikit-Learn StandardScaler
    scaler = StandardScaler()

    # Define numeric columns that need scaling (age, vitals, labs, scores)
    # Binary/Categorical columns like 'is_male' don't strictly require Z-score scaling, 
    # but for Neural Nets, it's often safer to scale everything.
    cols_to_scale = X_train.columns 

    # fit_transform on Training data: Calculates Mean and Std
    X_train_scaled = X_train.copy()
    X_train_scaled[cols_to_scale] = scaler.fit_transform(X_train[cols_to_scale])

    # transform on Test data: Uses the Mean and Std from Training (no new calculation)
    X_test_scaled = X_test.copy()
    X_test_scaled[cols_to_scale] = scaler.transform(X_test[cols_to_scale])

    return X_train_scaled, X_test_scaled, y_train, y_test, scaler

if __name__ == "__main__":
    X_train, X_test, y_train, y_test, fitted_scaler = extract_and_preprocess_mimic()
    
    print(f"Training set size: {X_train.shape}")
    print(f"Testing set size: {X_test.shape}")
    print("\nScaled Training Means (should be ~0):")
    print(X_train.mean().round(2))
    print("\nExtraction Complete.")
    print(f"Features for Training: {list(X_train.columns)}")
    print(f"Sample Scaled Row (Binary Gender included):\n{X_train.iloc[0]}")

Training set size: (68193, 12)
Testing set size: (17049, 12)

Scaled Training Means (should be ~0):
age                 0.0
is_male             0.0
heart_rate_mean    -0.0
mbp_mean           -0.0
resp_rate_mean     -0.0
temperature_mean    0.0
lactate_max         0.0
bun_max             0.0
creatinine_max      0.0
aniongap_max       -0.0
charlson_index      0.0
apsiii              0.0
dtype: float64

Extraction Complete.
Features for Training: ['age', 'is_male', 'heart_rate_mean', 'mbp_mean', 'resp_rate_mean', 'temperature_mean', 'lactate_max', 'bun_max', 'creatinine_max', 'aniongap_max', 'charlson_index', 'apsiii']
Sample Scaled Row (Binary Gender included):
age                -0.483959
is_male            -1.123328
heart_rate_mean    -0.348504
mbp_mean           -0.067863
resp_rate_mean     -0.826689
temperature_mean    0.196113
lactate_max        -0.174633
bun_max            -0.470604
creatinine_max     -0.540344
aniongap_max        0.578217
charlson_index     -0.648463
apsiii       

In [4]:
import tensorflow as tf
from tensorflow.keras import layers, models, callbacks

def build_and_train_model(X_train, X_test, y_train, y_test):
    # 1. Define the Architecture
    # We use a simple 3-layer dense network with Dropout to prevent overfitting
    model = models.Sequential([
        # Input layer size matches the number of features (e.g., 12)
        layers.Input(shape=(X_train.shape[1],)),
        
        layers.Dense(64, activation='relu'),
        layers.Dropout(0.2),  # Randomly shuts off 20% of neurons to improve generalization
        
        layers.Dense(32, activation='relu'),
        layers.Dropout(0.1),
        
        layers.Dense(16, activation='relu'),
        
        # Output layer: Sigmoid is used for binary classification (probability 0-1)
        layers.Dense(1, activation='sigmoid')
    ])

    # 2. Compile the Model
    # Binary Crossentropy is the standard loss function for mortality (0/1) prediction
    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy', tf.keras.metrics.AUC(name='auc')]
    )

    # 3. Early Stopping
    # This stops training when the model stops improving on the test set
    early_stop = callbacks.EarlyStopping(
        monitor='val_loss', 
        patience=5, 
        restore_best_weights=True
    )

    # 4. Train
    print("Starting training...")
    history = model.fit(
        X_train, y_train,
        validation_data=(X_test, y_test),
        epochs=50,
        batch_size=32,
        callbacks=[early_stop],
        verbose=1
    )

    return model, history

# Usage assuming X_train etc. were generated by the previous DuckDB/Scikit script
# model, history = build_and_train_model(X_train, X_test, y_train, y_test)

  if not hasattr(np, "object"):


In [None]:
cohort_sql = """
WITH FirstICUStay AS (
    -- Select the minimum ICUSTAY_ID for each SUBJECT_ID to enforce 'First ICU Stay'
    SELECT
        ic.subject_id,
        ic.hadm_id,
        ic.stay_id,
        ic.intime,
        ic.outtime,
        ic.los,
        ROW_NUMBER() OVER (PARTITION BY ic.subject_id ORDER BY ic.intime) AS rn
    FROM
        remote_mimic.mimiciv_icu.icustays ic
),
AdultPatients AS (
    -- Calculate age and filter for adult patients
    SELECT
        fs.*,
        pa.gender,
        (
            (CAST(STRFTIME(fs.intime, '%Y') AS INTEGER) - pa.anchor_year) + pa.anchor_age
        ) AS age_at_admission
    FROM
        FirstICUStay fs
    INNER JOIN 
        remote_mimic.mimiciv_hosp.patients pa ON fs.subject_id = pa.subject_id
    WHERE
        fs.rn = 1 -- Only the first ICU stay
)
SELECT
    ap.subject_id,
    ap.hadm_id,
    ap.stay_id,
    ap.intime,
    ap.outtime,
    ap.los,
    ap.gender,
    ap.age_at_admission AS age,
    -- Target Label: In-Hospital Mortality (IHM)
    adm.hospital_expire_flag AS y_ihm
FROM
    AdultPatients ap
INNER JOIN 
    remote_mimic.mimiciv_hosp.admissions adm ON ap.hadm_id = adm.hadm_id
WHERE
    ap.age_at_admission >= 18 -- Inclusion: Adult patients
AND ap.los * 24 >= 8 -- Exclusion: Minimum length of stay
ORDER BY
    ap.subject_id, ap.intime;
"""

# Execute and store the core cohort data
df_cohort = con.sql(cohort_sql).df()
print(f"Phase 1 Complete. Cohort size: {len(df_cohort):,} patients.")

Phase 1 Complete. Cohort size: 64,363 patients.


In [4]:
static_feat_sql = """
SELECT
    coh.stay_id,
    coh.subject_id,
    coh.age,
    coh.gender,
    coh.y_ihm,
    -- Comorbidities
    cci.charlson_comorbidity_index,
    -- Severity Scores (First 24 Hours - Static)
    o.oasis,
    saps.sapsii,
    -- First Day Vitals (min/max/mean)
    g.gcs_min, -- Example GCS feature
    fvl.heart_rate_min, -- Min Heart Rate in first 24h
    fvl.sbp_max,        -- Max Systolic BP in first 24h
    fvl.resp_rate_mean  -- Mean Respiratory Rate in first 24h
FROM
    df_cohort coh
LEFT JOIN
    remote_mimic.mimiciv_derived.charlson cci ON coh.hadm_id = cci.hadm_id
LEFT JOIN
    remote_mimic.mimiciv_derived.oasis o ON coh.stay_id = o.stay_id
LEFT JOIN
    remote_mimic.mimiciv_derived.sapsii saps ON coh.stay_id = saps.stay_id
LEFT JOIN
    remote_mimic.mimiciv_derived.first_day_gcs g ON coh.stay_id = g.stay_id
LEFT JOIN
    remote_mimic.mimiciv_derived.first_day_vitalsign fvl ON coh.stay_id = fvl.stay_id
LEFT JOIN
    remote_mimic.mimiciv_derived.first_day_lab fdl ON coh.stay_id = fdl.stay_id;
"""

df_static_feat = con.sql(static_feat_sql).df()
print(f"Extracted {len(df_static_feat):,} static, derived feature vectors.")

Extracted 64,363 static, derived feature vectors.


In [6]:
sofa_sql = """
SELECT
    s.stay_id,
    -- Time from ICU intime in hours (already calculated in the derived table)
    s.starttime,
    s.sofa_24hours, -- The total score for the 24 hour period
    s.respiration,
    s.coagulation,
    s.liver,
    s.cardiovascular,
    s.cns,
    s.renal
FROM
    remote_mimic.mimiciv_derived.sofa s
INNER JOIN
    df_cohort coh ON s.stay_id = coh.stay_id
-- Filter to ensure we only get scores calculated during the observation window
WHERE
    s.starttime < coh.intime + INTERVAL '24 hour'
ORDER BY
    s.stay_id, s.starttime;
"""

df_sofa = con.sql(sofa_sql).df()
print(f"Extracted {len(df_sofa):,} dynamic SOFA score entries.")

Extracted 1,475,981 dynamic SOFA score entries.
