In [2]:
# Standard imports for our CLABSI project
import pandas as pd
import numpy as np
from sqlalchemy import create_engine, text
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

# Database connection function
def connect_db():
    """Establish database connection"""
    try:
        engine = create_engine('postgresql://postgres:ramiel12@localhost:5432/mimiciv')
        connection = engine.connect()
        print("Connected to mimiciv successfully!")
        return engine
    except Exception as e:
        print(f"Connection failed: {e}")
        return None

# First establish connection
engine = connect_db()

# Load our existing CLABSI cohort for reference
def load_clabsi_cohort(engine):
    """Load the validated CLABSI 2.0 cohort for reference matching"""
    clabsi_query = text("""
    WITH clabsi_icd AS (
        SELECT DISTINCT hadm_id, MIN(icd_code) as icd_code
        FROM mimiciv_hosp.diagnoses_icd
        WHERE icd_code IN ('99931', '99932', 'T80211A')
        GROUP BY hadm_id
    ),
    validated_cases AS (
        SELECT DISTINCT
            i.subject_id,
            i.hadm_id,
            i.stay_id,
            i.intime as icu_admission,
            i.outtime as icu_discharge,
            l.starttime as line_start,
            l.endtime as line_end,
            l.line_type,
            pc.charttime as infection_time,
            pc.org_name,
            pc.cultures_of_org,
            pc.daily_org_cultures,
            CASE
                WHEN pc.org_name IN (
                    'STAPH AUREUS COAG +', 'ESCHERICHIA COLI',
                    'KLEBSIELLA PNEUMONIAE', 'PSEUDOMONAS AERUGINOSA',
                    'ENTEROCOCCUS FAECIUM', 'CANDIDA ALBICANS'
                ) OR (
                    pc.org_name IN (
                        'STAPHYLOCOCCUS, COAGULASE NEGATIVE',
                        'STAPHYLOCOCCUS EPIDERMIDIS'
                    ) AND pc.daily_org_cultures >= 2
                )
                THEN 'Confirmed CLABSI'
                WHEN pc.daily_org_cultures >= 2
                THEN 'Possible CLABSI'
                ELSE 'Requires Review'
            END as clabsi_status
        FROM mimiciv_icu.icustays i
        INNER JOIN clabsi_icd c ON i.hadm_id = c.hadm_id
        INNER JOIN mimiciv_derived.invasive_line l ON i.stay_id = l.stay_id
        INNER JOIN (
            SELECT 
                hadm_id,
                charttime,
                org_name,
                COUNT(*) OVER (
                    PARTITION BY hadm_id, org_name
                ) as cultures_of_org,
                COUNT(*) OVER (
                    PARTITION BY hadm_id, 
                    DATE_TRUNC('day', charttime),
                    org_name
                ) as daily_org_cultures
            FROM mimiciv_hosp.microbiologyevents
            WHERE spec_type_desc = 'BLOOD CULTURE'
            AND org_name IS NOT NULL
        ) pc ON i.hadm_id = pc.hadm_id
        WHERE l.line_type IN (
            'PICC', 'Multi Lumen', 'Dialysis', 'Triple Introducer',
            'Pre-Sep', 'Hickman', 'Portacath', 'Cordis/Introducer',
            'Continuous Cardiac Output PA', 'PA'
        )
        AND pc.charttime > l.starttime
        AND pc.charttime <= l.starttime + INTERVAL '14 days'
        AND EXTRACT(EPOCH FROM (l.endtime - l.starttime))/86400 >= 2
    )
    SELECT *
    FROM validated_cases
    WHERE clabsi_status = 'Confirmed CLABSI'
    ORDER BY stay_id, infection_time;
    """)
    
    clabsi_df = pd.read_sql(clabsi_query, engine)
    print("\nCLABSI Cohort Summary:")
    print("-" * 30)
    print(f"Total confirmed cases: {len(clabsi_df)}")
    print(f"Unique patients: {clabsi_df['subject_id'].nunique()}")
    print(f"Unique ICU stays: {clabsi_df['stay_id'].nunique()}")
    print("\nLine Type Distribution:")
    print(clabsi_df['line_type'].value_counts())
    
    return clabsi_df

if engine:
    # Load our CLABSI cohort for reference
    clabsi_cohort = load_clabsi_cohort(engine)

Connected to mimiciv successfully!

CLABSI Cohort Summary:
------------------------------
Total confirmed cases: 634
Unique patients: 141
Unique ICU stays: 158

Line Type Distribution:
line_type
Multi Lumen                     240
Dialysis                        183
PICC                            117
PA                               38
Cordis/Introducer                28
Hickman                          13
Portacath                        11
Continuous Cardiac Output PA      4
Name: count, dtype: int64


In [3]:
if engine:
    # Modified control selection query
    control_query = text("""
    WITH line_patients AS (
        -- Get all patients with qualifying central lines 
        SELECT DISTINCT
            i.subject_id,
            i.hadm_id,
            i.stay_id,
            i.intime as icu_admission,
            i.outtime as icu_discharge,
            l.starttime as line_start,
            l.endtime as line_end,
            l.line_type,
            EXTRACT(EPOCH FROM (l.endtime - l.starttime))/86400 as line_duration_days
        FROM mimiciv_icu.icustays i
        INNER JOIN mimiciv_derived.invasive_line l 
            ON i.stay_id = l.stay_id
        WHERE l.line_type IN (
            'PICC', 'Multi Lumen', 'Dialysis', 'Triple Introducer',
            'Pre-Sep', 'Hickman', 'Portacath', 'Cordis/Introducer',
            'Continuous Cardiac Output PA', 'PA'
        )
        AND EXTRACT(EPOCH FROM (l.endtime - l.starttime))/86400 >= 2
        AND EXISTS (
            SELECT 1
            FROM mimiciv_icu.chartevents ce
            WHERE ce.stay_id = i.stay_id
            AND ce.itemid IN (
                228137,  -- CHG Bath
                227293,  -- Multi Lumen Dressing
                227358,  -- PICC Dressing
                227357   -- Dialysis Cath Dressing
            )
        )
    ),
    excluded_patients AS (
        SELECT DISTINCT p.stay_id
        FROM line_patients p
        LEFT JOIN mimiciv_hosp.diagnoses_icd d 
            ON p.hadm_id = d.hadm_id
        LEFT JOIN mimiciv_hosp.microbiologyevents m
            ON p.hadm_id = m.hadm_id
            AND m.charttime BETWEEN p.line_start AND p.line_end
        WHERE d.icd_code IN ('99931', '99932', 'T80211A')
        OR (
            m.spec_type_desc = 'BLOOD CULTURE'
            AND m.org_name IS NOT NULL
        )
    ),
    ranked_controls AS (
        SELECT 
            p.*,
            -- Add demographic info
            pat.gender,
            pat.anchor_age + EXTRACT(EPOCH FROM adm.admittime - MAKE_TIMESTAMP(pat.anchor_year, 1, 1, 0, 0, 0))/31556908.8 AS admission_age,
            adm.race,
            adm.admission_type,
            ROW_NUMBER() OVER (
                PARTITION BY l.line_type 
                ORDER BY random()
            ) as type_rank,
            -- Adjusted target numbers (4:1 ratio)
            CASE 
                WHEN l.line_type = 'Multi Lumen' THEN 960   -- 4 x 240
                WHEN l.line_type = 'Dialysis' THEN 732      -- 4 x 183
                WHEN l.line_type = 'PICC' THEN 468          -- 4 x 117
                WHEN l.line_type = 'PA' THEN 152            -- 4 x 38
                WHEN l.line_type = 'Cordis/Introducer' THEN 112  -- 4 x 28
                WHEN l.line_type = 'Hickman' THEN 52        -- 4 x 13
                WHEN l.line_type = 'Portacath' THEN 44      -- 4 x 11
                ELSE 40  -- For remaining types
            END as target_n
        FROM line_patients p
        INNER JOIN mimiciv_hosp.admissions adm 
            ON p.hadm_id = adm.hadm_id
        INNER JOIN mimiciv_hosp.patients pat
            ON p.subject_id = pat.subject_id
        INNER JOIN mimiciv_derived.invasive_line l
            ON p.stay_id = l.stay_id
        WHERE NOT EXISTS (
            SELECT 1 
            FROM excluded_patients e 
            WHERE p.stay_id = e.stay_id
        )
    )
    SELECT *
    FROM ranked_controls
    WHERE type_rank <= target_n;
    """)
    
    # Execute query and load into dataframe
    controls_df = pd.read_sql(control_query, engine)
    
    print("\nControl Cohort Summary:")
    print("-" * 30)
    print(f"Total potential controls: {len(controls_df)}")
    print(f"Unique patients: {controls_df['subject_id'].nunique()}")
    print(f"Unique ICU stays: {controls_df['stay_id'].nunique()}")
    print("\nLine Type Distribution:")
    print(controls_df['line_type'].value_counts())



Control Cohort Summary:
------------------------------
Total potential controls: 3160
Unique patients: 1999
Unique ICU stays: 2128

Line Type Distribution:
line_type
Multi Lumen                     1259
PICC                             657
Dialysis                         576
Cordis/Introducer                288
PA                               159
Continuous Cardiac Output PA      90
Portacath                         53
Hickman                           42
Pre-Sep                           23
Triple Introducer                 13
Name: count, dtype: int64


In [4]:
# Add clinical features by incorporating our control selection directly
clinical_query = text("""
WITH line_patients AS (
    -- Get all patients with qualifying central lines 
    SELECT DISTINCT
        i.subject_id,
        i.hadm_id,
        i.stay_id,
        i.intime as icu_admission,
        i.outtime as icu_discharge,
        l.starttime as line_start,
        l.endtime as line_end,
        l.line_type,
        EXTRACT(EPOCH FROM (l.endtime - l.starttime))/86400 as line_duration_days
    FROM mimiciv_icu.icustays i
    INNER JOIN mimiciv_derived.invasive_line l 
        ON i.stay_id = l.stay_id
    WHERE l.line_type IN (
        'PICC', 'Multi Lumen', 'Dialysis', 'Triple Introducer',
        'Pre-Sep', 'Hickman', 'Portacath', 'Cordis/Introducer',
        'Continuous Cardiac Output PA', 'PA'
    )
    AND EXTRACT(EPOCH FROM (l.endtime - l.starttime))/86400 >= 2
),
excluded_patients AS (
    SELECT DISTINCT p.stay_id
    FROM line_patients p
    LEFT JOIN mimiciv_hosp.diagnoses_icd d 
        ON p.hadm_id = d.hadm_id
    LEFT JOIN mimiciv_hosp.microbiologyevents m
        ON p.hadm_id = m.hadm_id
        AND m.charttime BETWEEN p.line_start AND p.line_end
    WHERE d.icd_code IN ('99931', '99932', 'T80211A')
    OR (
        m.spec_type_desc = 'BLOOD CULTURE'
        AND m.org_name IS NOT NULL
    )
),
ranked_controls AS (
    SELECT 
        p.*,
        pat.gender,
        pat.anchor_age + EXTRACT(EPOCH FROM adm.admittime - MAKE_TIMESTAMP(pat.anchor_year, 1, 1, 0, 0, 0))/31556908.8 AS admission_age,
        adm.race,
        adm.admission_type,
        ROW_NUMBER() OVER (
            PARTITION BY p.line_type 
            ORDER BY random()
        ) as type_rank,
        CASE 
            WHEN p.line_type = 'Multi Lumen' THEN 960
            WHEN p.line_type = 'Dialysis' THEN 732
            WHEN p.line_type = 'PICC' THEN 468
            WHEN p.line_type = 'PA' THEN 152
            WHEN p.line_type = 'Cordis/Introducer' THEN 112
            WHEN p.line_type = 'Hickman' THEN 52
            WHEN p.line_type = 'Portacath' THEN 44
            ELSE 40
        END as target_n
    FROM line_patients p
    INNER JOIN mimiciv_hosp.admissions adm 
        ON p.hadm_id = adm.hadm_id
    INNER JOIN mimiciv_hosp.patients pat
        ON p.subject_id = pat.subject_id
    WHERE NOT EXISTS (
        SELECT 1 
        FROM excluded_patients e 
        WHERE p.stay_id = e.stay_id
    )
),
selected_controls AS (
    SELECT *
    FROM ranked_controls
    WHERE type_rank <= target_n
),
-- Now add clinical features
vitals AS (
    SELECT 
        sc.stay_id,
        MIN(vs.mbp) AS mbp_min,
        MAX(vs.mbp) AS mbp_max,
        MIN(vs.heart_rate) as heart_rate_min,
        MAX(vs.heart_rate) as heart_rate_max,
        MIN(vs.resp_rate) as resp_rate_min,
        MAX(vs.resp_rate) as resp_rate_max,
        MIN(vs.temperature) as temperature_min,
        MAX(vs.temperature) as temperature_max,
        MIN(vs.spo2) as spo2_min,
        MAX(vs.spo2) as spo2_max
    FROM selected_controls sc
    LEFT JOIN mimiciv_derived.vitalsign vs
        ON sc.stay_id = vs.stay_id
        AND vs.charttime BETWEEN sc.line_start AND sc.line_end
    GROUP BY sc.stay_id
),
labs AS (
    SELECT
        sc.stay_id,
        MAX(CASE WHEN itemid = 51300 THEN valuenum END) as wbc_max,
        MIN(CASE WHEN itemid = 51300 THEN valuenum END) as wbc_min,
        MAX(CASE WHEN itemid = 51265 THEN valuenum END) as platelet_max,
        MIN(CASE WHEN itemid = 51265 THEN valuenum END) as platelet_min,
        MAX(CASE WHEN itemid = 50862 THEN valuenum END) as albumin_max,
        MIN(CASE WHEN itemid = 50862 THEN valuenum END) as albumin_min,
        MAX(CASE WHEN itemid = 50912 THEN valuenum END) as creatinine_max,
        MIN(CASE WHEN itemid = 50912 THEN valuenum END) as creatinine_min
    FROM selected_controls sc
    LEFT JOIN mimiciv_hosp.labevents le
        ON sc.subject_id = le.subject_id
        AND le.charttime BETWEEN sc.line_start AND sc.line_end
    WHERE le.itemid IN (
        51300, -- WBC
        51265, -- Platelet Count
        50862, -- Albumin
        50912  -- Creatinine
    )
    GROUP BY sc.stay_id
)
SELECT 
    sc.*,
    v.mbp_min, v.mbp_max,
    v.heart_rate_min, v.heart_rate_max,
    v.resp_rate_min, v.resp_rate_max,
    v.temperature_min, v.temperature_max,
    v.spo2_min, v.spo2_max,
    l.wbc_min, l.wbc_max,
    l.platelet_min, l.platelet_max,
    l.albumin_min, l.albumin_max,
    l.creatinine_min, l.creatinine_max
FROM selected_controls sc
LEFT JOIN vitals v ON sc.stay_id = v.stay_id
LEFT JOIN labs l ON sc.stay_id = l.stay_id;
""")

# Execute and check results
controls_with_features = pd.read_sql(clinical_query, engine)

# Display summary of added features
print("\nClinical Features Summary:")
print("-" * 30)
print(f"Total controls with features: {len(controls_with_features)}")

numeric_features = ['heart_rate_max', 'temperature_max', 'wbc_max', 'platelet_min', 'creatinine_max']
print("\nKey Clinical Measurements:")
print(controls_with_features[numeric_features].describe().round(2))

# Check completeness
print("\nFeature Completeness:")
for col in numeric_features:
    pct_complete = (controls_with_features[col].notna().sum() / len(controls_with_features)) * 100
    print(f"{col}: {pct_complete:.1f}% complete")


Clinical Features Summary:
------------------------------
Total controls with features: 2615

Key Clinical Measurements:
       heart_rate_max  temperature_max  wbc_max  platelet_min  creatinine_max
count         2611.00          2543.00    20.00       2610.00         2610.00
mean           120.78            37.92     9.88        147.16            2.83
std             24.24             0.82     5.16        108.46            2.37
min             64.00            33.70     2.50          5.00            0.20
25%            104.00            37.28     6.08         70.00            1.00
50%            119.00            37.72     9.20        129.00            2.00
75%            134.00            38.44    14.68        195.00            4.00
max            250.00            41.44    19.60       1743.00           21.90

Feature Completeness:
heart_rate_max: 99.8% complete
temperature_max: 97.2% complete
wbc_max: 0.8% complete
platelet_min: 99.8% complete
creatinine_max: 99.8% complete


In [5]:
# Add line care features
care_query = text("""
WITH line_patients AS (
    SELECT DISTINCT
        i.subject_id,
        i.hadm_id,
        i.stay_id,
        i.intime as icu_admission,
        i.outtime as icu_discharge,
        l.starttime as line_start,
        l.endtime as line_end,
        l.line_type,
        EXTRACT(EPOCH FROM (l.endtime - l.starttime))/86400 as line_duration_days
    FROM mimiciv_icu.icustays i
    INNER JOIN mimiciv_derived.invasive_line l 
        ON i.stay_id = l.stay_id
    WHERE l.line_type IN (
        'PICC', 'Multi Lumen', 'Dialysis', 'Triple Introducer',
        'Pre-Sep', 'Hickman', 'Portacath', 'Cordis/Introducer',
        'Continuous Cardiac Output PA', 'PA'
    )
    AND EXTRACT(EPOCH FROM (l.endtime - l.starttime))/86400 >= 2
),
excluded_patients AS (
    SELECT DISTINCT p.stay_id
    FROM line_patients p
    LEFT JOIN mimiciv_hosp.diagnoses_icd d 
        ON p.hadm_id = d.hadm_id
    LEFT JOIN mimiciv_hosp.microbiologyevents m
        ON p.hadm_id = m.hadm_id
        AND m.charttime BETWEEN p.line_start AND p.line_end
    WHERE d.icd_code IN ('99931', '99932', 'T80211A')
    OR (
        m.spec_type_desc = 'BLOOD CULTURE'
        AND m.org_name IS NOT NULL
    )
),
ranked_controls AS (
    SELECT 
        p.*,
        pat.gender,
        pat.anchor_age + EXTRACT(EPOCH FROM adm.admittime - MAKE_TIMESTAMP(pat.anchor_year, 1, 1, 0, 0, 0))/31556908.8 AS admission_age,
        adm.race,
        adm.admission_type,
        ROW_NUMBER() OVER (
            PARTITION BY p.line_type 
            ORDER BY random()
        ) as type_rank,
        CASE 
            WHEN p.line_type = 'Multi Lumen' THEN 960
            WHEN p.line_type = 'Dialysis' THEN 732
            WHEN p.line_type = 'PICC' THEN 468
            WHEN p.line_type = 'PA' THEN 152
            WHEN p.line_type = 'Cordis/Introducer' THEN 112
            WHEN p.line_type = 'Hickman' THEN 52
            WHEN p.line_type = 'Portacath' THEN 44
            ELSE 40
        END as target_n
    FROM line_patients p
    INNER JOIN mimiciv_hosp.admissions adm 
        ON p.hadm_id = adm.hadm_id
    INNER JOIN mimiciv_hosp.patients pat
        ON p.subject_id = pat.subject_id
    WHERE NOT EXISTS (
        SELECT 1 
        FROM excluded_patients e 
        WHERE p.stay_id = e.stay_id
    )
),
selected_controls AS (
    SELECT *
    FROM ranked_controls
    WHERE type_rank <= target_n
),
care_events AS (
    SELECT 
        sc.stay_id,
        ce.itemid,
        ce.charttime,
        EXTRACT(EPOCH FROM (ce.charttime - LAG(ce.charttime) OVER 
            (PARTITION BY sc.stay_id, ce.itemid ORDER BY ce.charttime)))/86400.0 
            as days_since_last_care
    FROM selected_controls sc
    LEFT JOIN mimiciv_icu.chartevents ce
        ON sc.stay_id = ce.stay_id
        AND ce.charttime BETWEEN sc.line_start AND sc.line_end
        AND ce.itemid IN (
            228137,  -- CHG Bath
            227293,  -- Multi Lumen Dressing
            227358,  -- PICC Dressing
            227357,  -- Dialysis Cath Dressing
            224188,  -- PICC Site Assessment
            224289   -- Line Site Assessment
        )
),
line_care AS (
    SELECT 
        sc.stay_id,
        sc.line_type,
        -- CHG baths
        COUNT(CASE WHEN ce.itemid = 228137 THEN 1 END) as chg_bath_count,
        -- Dressing changes
        COUNT(CASE WHEN ce.itemid IN (227293, 227358, 227357) THEN 1 END) as dressing_changes,
        -- Line site assessments
        COUNT(CASE WHEN ce.itemid IN (224188, 224289) THEN 1 END) as site_assessments,
        -- Count distinct care days
        COUNT(DISTINCT DATE_TRUNC('day', ce.charttime)) as total_care_days
    FROM selected_controls sc
    LEFT JOIN mimiciv_icu.chartevents ce
        ON sc.stay_id = ce.stay_id
        AND ce.charttime BETWEEN sc.line_start AND sc.line_end
        AND ce.itemid IN (
            228137,  -- CHG Bath
            227293,  -- Multi Lumen Dressing
            227358,  -- PICC Dressing
            227357,  -- Dialysis Cath Dressing
            224188,  -- PICC Site Assessment
            224289   -- Line Site Assessment
        )
    GROUP BY sc.stay_id, sc.line_type
),
care_intervals AS (
    SELECT 
        stay_id,
        AVG(CASE WHEN days_since_last_care > 0 THEN days_since_last_care END) as avg_days_between_care
    FROM care_events
    GROUP BY stay_id
)
SELECT 
    sc.*,
    COALESCE(lc.chg_bath_count, 0) as chg_bath_count,
    COALESCE(lc.dressing_changes, 0) as dressing_changes,
    COALESCE(lc.site_assessments, 0) as site_assessments,
    COALESCE(ci.avg_days_between_care, 0) as avg_days_between_care,
    COALESCE(lc.total_care_days, 0) as total_care_days,
    -- Calculate care rates (per line day)
    CASE 
        WHEN sc.line_duration_days > 0 
        THEN COALESCE(lc.chg_bath_count, 0) / sc.line_duration_days 
        ELSE 0 
    END as chg_bath_rate,
    CASE 
        WHEN sc.line_duration_days > 0 
        THEN COALESCE(lc.dressing_changes, 0) / sc.line_duration_days 
        ELSE 0 
    END as dressing_change_rate
FROM selected_controls sc
LEFT JOIN line_care lc ON sc.stay_id = lc.stay_id
LEFT JOIN care_intervals ci ON sc.stay_id = ci.stay_id;
""")

# Execute and check results
controls_with_care = pd.read_sql(care_query, engine)

print("\nLine Care Summary:")
print("-" * 30)
print(f"Total controls analyzed: {len(controls_with_care)}")

care_features = ['chg_bath_count', 'dressing_changes', 'site_assessments', 
                'avg_days_between_care', 'total_care_days']

print("\nCare Metrics Summary:")
print(controls_with_care[care_features].describe().round(2))

# Calculate care rates by line type
print("\nMedian Care Events per Line Type:")
line_type_care = controls_with_care.groupby('line_type')[['chg_bath_count', 'dressing_changes', 'site_assessments']].median()
print(line_type_care.round(2))


Line Care Summary:
------------------------------
Total controls analyzed: 2815

Care Metrics Summary:
       chg_bath_count  dressing_changes  site_assessments  \
count         2815.00           2815.00           2815.00   
mean             0.43             21.05             26.97   
std              1.86             37.48             36.36   
min              0.00              0.00              0.00   
25%              0.00              0.00              6.00   
50%              0.00              9.00             16.00   
75%              0.00             28.00             34.00   
max             32.00            522.00            391.00   

       avg_days_between_care  total_care_days  
count                2815.00          2815.00  
mean                    0.22             6.29  
std                     0.12             5.57  
min                     0.00             0.00  
25%                     0.18             3.00  
50%                     0.21             5.00  
75%       

In [6]:
# Complete query with fixed SOFA columns
query = text("""
WITH line_patients AS (
    SELECT DISTINCT
        i.subject_id,
        i.hadm_id,
        i.stay_id,
        i.intime as icu_admission,
        i.outtime as icu_discharge,
        l.starttime as line_start,
        l.endtime as line_end,
        l.line_type,
        EXTRACT(EPOCH FROM (l.endtime - l.starttime))/86400 as line_duration_days
    FROM mimiciv_icu.icustays i
    INNER JOIN mimiciv_derived.invasive_line l 
        ON i.stay_id = l.stay_id
    WHERE l.line_type IN (
        'PICC', 'Multi Lumen', 'Dialysis', 'Triple Introducer',
        'Pre-Sep', 'Hickman', 'Portacath', 'Cordis/Introducer',
        'Continuous Cardiac Output PA', 'PA'
    )
    AND EXTRACT(EPOCH FROM (l.endtime - l.starttime))/86400 >= 2
),
excluded_patients AS (
    SELECT DISTINCT p.stay_id
    FROM line_patients p
    LEFT JOIN mimiciv_hosp.diagnoses_icd d 
        ON p.hadm_id = d.hadm_id
    LEFT JOIN mimiciv_hosp.microbiologyevents m
        ON p.hadm_id = m.hadm_id
        AND m.charttime BETWEEN p.line_start AND p.line_end
    WHERE d.icd_code IN ('99931', '99932', 'T80211A')
    OR (
        m.spec_type_desc = 'BLOOD CULTURE'
        AND m.org_name IS NOT NULL
    )
),
ranked_controls AS (
    SELECT 
        p.*,
        pat.gender,
        pat.anchor_age + EXTRACT(EPOCH FROM adm.admittime - MAKE_TIMESTAMP(pat.anchor_year, 1, 1, 0, 0, 0))/31556908.8 AS admission_age,
        adm.race,
        adm.admission_type,
        ROW_NUMBER() OVER (
            PARTITION BY p.line_type 
            ORDER BY random()
        ) as type_rank,
        CASE 
            WHEN p.line_type = 'Multi Lumen' THEN 960
            WHEN p.line_type = 'Dialysis' THEN 732
            WHEN p.line_type = 'PICC' THEN 468
            WHEN p.line_type = 'PA' THEN 152
            WHEN p.line_type = 'Cordis/Introducer' THEN 112
            WHEN p.line_type = 'Hickman' THEN 52
            WHEN p.line_type = 'Portacath' THEN 44
            ELSE 40
        END as target_n
    FROM line_patients p
    INNER JOIN mimiciv_hosp.admissions adm 
        ON p.hadm_id = adm.hadm_id
    INNER JOIN mimiciv_hosp.patients pat
        ON p.subject_id = pat.subject_id
    WHERE NOT EXISTS (
        SELECT 1 
        FROM excluded_patients e 
        WHERE p.stay_id = e.stay_id
    )
),
selected_controls AS (
    SELECT *
    FROM ranked_controls
    WHERE type_rank <= target_n
),
severity AS (
    SELECT 
        sc.stay_id,
        MAX(s.sofa_24hours) as max_sofa,
        AVG(s.sofa_24hours) as avg_sofa,
        MAX(s.respiration_24hours) as max_resp_score,
        MAX(s.cardiovascular_24hours) as max_cv_score,
        MAX(s.renal_24hours) as max_renal_score
    FROM selected_controls sc
    LEFT JOIN mimiciv_derived.sofa s
        ON sc.stay_id = s.stay_id
        AND s.starttime BETWEEN sc.line_start AND sc.line_end
    GROUP BY sc.stay_id
),
medications AS (
    SELECT 
        sc.stay_id,
        COUNT(DISTINCT a.antibiotic) as distinct_antibiotics,
        MAX(CASE WHEN 
            i.itemid IN (226089, 227690)  -- TPN, Lipids
            THEN 1 ELSE 0 END) as tpn_status,
        MAX(CASE WHEN 
            i.itemid IN (221662, 221653, 221668)  -- Vasopressors, Steroids, Immunosuppressants
            THEN 1 ELSE 0 END) as high_risk_med_status
    FROM selected_controls sc
    LEFT JOIN mimiciv_derived.antibiotic a
        ON sc.stay_id = a.stay_id
        AND a.starttime BETWEEN sc.line_start AND sc.line_end
    LEFT JOIN mimiciv_icu.inputevents i
        ON sc.stay_id = i.stay_id
        AND i.starttime BETWEEN sc.line_start AND sc.line_end
    GROUP BY sc.stay_id
),
care_events AS (
    SELECT 
        sc.stay_id,
        ce.itemid,
        ce.charttime,
        EXTRACT(EPOCH FROM (ce.charttime - LAG(ce.charttime) OVER 
            (PARTITION BY sc.stay_id, ce.itemid ORDER BY ce.charttime)))/86400.0 
            as days_since_last_care
    FROM selected_controls sc
    LEFT JOIN mimiciv_icu.chartevents ce
        ON sc.stay_id = ce.stay_id
        AND ce.charttime BETWEEN sc.line_start AND sc.line_end
        AND ce.itemid IN (
            228137,  -- CHG Bath
            227293,  -- Multi Lumen Dressing
            227358,  -- PICC Dressing
            227357,  -- Dialysis Cath Dressing
            224188,  -- PICC Site Assessment
            224289   -- Line Site Assessment
        )
),
line_care AS (
    SELECT 
        sc.stay_id,
        sc.line_type,
        COUNT(CASE WHEN ce.itemid = 228137 THEN 1 END) as chg_bath_count,
        COUNT(CASE WHEN ce.itemid IN (227293, 227358, 227357) THEN 1 END) as dressing_changes,
        COUNT(CASE WHEN ce.itemid IN (224188, 224289) THEN 1 END) as site_assessments,
        COUNT(DISTINCT DATE_TRUNC('day', ce.charttime)) as total_care_days
    FROM selected_controls sc
    LEFT JOIN mimiciv_icu.chartevents ce
        ON sc.stay_id = ce.stay_id
        AND ce.charttime BETWEEN sc.line_start AND sc.line_end
        AND ce.itemid IN (
            228137,  -- CHG Bath
            227293,  -- Multi Lumen Dressing
            227358,  -- PICC Dressing
            227357,  -- Dialysis Cath Dressing
            224188,  -- PICC Site Assessment
            224289   -- Line Site Assessment
        )
    GROUP BY sc.stay_id, sc.line_type
),
care_intervals AS (
    SELECT 
        stay_id,
        AVG(CASE WHEN days_since_last_care > 0 THEN days_since_last_care END) as avg_days_between_care
    FROM care_events
    GROUP BY stay_id
)
SELECT 
    sc.*,
    -- Severity scores
    COALESCE(s.max_sofa, 0) as max_sofa,
    COALESCE(s.avg_sofa, 0) as avg_sofa,
    COALESCE(s.max_resp_score, 0) as max_resp_score,
    COALESCE(s.max_cv_score, 0) as max_cv_score,
    COALESCE(s.max_renal_score, 0) as max_renal_score,
    -- Medications
    COALESCE(m.distinct_antibiotics, 0) as distinct_antibiotics,
    COALESCE(m.tpn_status, 0) as tpn_status,
    COALESCE(m.high_risk_med_status, 0) as high_risk_med_status,
    -- Line care
    COALESCE(lc.chg_bath_count, 0) as chg_bath_count,
    COALESCE(lc.dressing_changes, 0) as dressing_changes,
    COALESCE(lc.site_assessments, 0) as site_assessments,
    COALESCE(ci.avg_days_between_care, 0) as avg_days_between_care,
    COALESCE(lc.total_care_days, 0) as total_care_days,
    -- Care rates
    CASE 
        WHEN sc.line_duration_days > 0 
        THEN COALESCE(lc.chg_bath_count, 0) / sc.line_duration_days 
        ELSE 0 
    END as chg_bath_rate,
    CASE 
        WHEN sc.line_duration_days > 0 
        THEN COALESCE(lc.dressing_changes, 0) / sc.line_duration_days 
        ELSE 0 
    END as dressing_change_rate
FROM selected_controls sc
LEFT JOIN severity s ON sc.stay_id = s.stay_id
LEFT JOIN medications m ON sc.stay_id = m.stay_id
LEFT JOIN line_care lc ON sc.stay_id = lc.stay_id
LEFT JOIN care_intervals ci ON sc.stay_id = ci.stay_id;
""")

# Execute and check results
controls_with_features = pd.read_sql(query, engine)

# Display comprehensive summary
print("\nControl Cohort Feature Summary:")
print("-" * 30)
print(f"Total controls: {len(controls_with_features)}")

# Severity scores
severity_features = ['max_sofa', 'avg_sofa', 'max_resp_score', 'max_cv_score', 'max_renal_score']
print("\nSeverity Score Summary:")
print(controls_with_features[severity_features].describe().round(2))

# Medications
med_features = ['distinct_antibiotics', 'tpn_status', 'high_risk_med_status']
print("\nMedication Summary:")
print(controls_with_features[med_features].describe().round(2))

# Line care (as before)
care_features = ['chg_bath_count', 'dressing_changes', 'site_assessments', 
                'avg_days_between_care', 'total_care_days']
print("\nCare Metrics Summary:")
print(controls_with_features[care_features].describe().round(2))

# Show distribution of features by line type
print("\nFeature Distribution by Line Type:")
line_type_summary = controls_with_features.groupby('line_type').agg({
    'max_sofa': 'mean',
    'distinct_antibiotics': 'mean',
    'dressing_changes': 'mean',
    'site_assessments': 'mean',
    'tpn_status': 'mean'
}).round(2)
print(line_type_summary)


Control Cohort Feature Summary:
------------------------------
Total controls: 2844

Severity Score Summary:
       max_sofa  avg_sofa  max_resp_score  max_cv_score  max_renal_score
count   2844.00   2844.00         2844.00       2844.00          2844.00
mean       9.54      6.59            1.98          2.55             2.28
std        4.70      3.69            1.47          1.45             1.63
min        0.00      0.00            0.00          0.00             0.00
25%        6.00      3.79            0.00          1.00             1.00
50%        9.00      6.08            2.00          3.00             3.00
75%       13.00      8.66            3.00          4.00             4.00
max       23.00     19.52            4.00          4.00             4.00

Medication Summary:
       distinct_antibiotics  tpn_status  high_risk_med_status
count               2844.00     2844.00               2844.00
mean                   2.54        0.55                  0.37
std                    1.9