In [None]:
import pandas as pd
import numpy as np
from scipy import stats
from statsmodels.stats.multitest import multipletests
import matplotlib.pyplot as plt
import seaborn as sns
from sqlalchemy import create_engine, text
import warnings
warnings.filterwarnings('ignore')  # suppress every warning category for the rest of the session


# Database connection
def connect_db(db_url=None):
    """Establish database connection to MIMIC IV.

    Parameters
    ----------
    db_url : str, optional
        SQLAlchemy connection URL. When omitted, the value of the
        ``MIMIC_DB_URL`` environment variable is used so that credentials
        do not have to live in the notebook source.

    Returns
    -------
    sqlalchemy.engine.Engine or None
        A verified engine, or ``None`` when no URL is available or the
        database cannot be reached. The failure reason is printed.
    """
    import os

    db_url = db_url or os.environ.get("MIMIC_DB_URL")
    if not db_url:
        print("Connection failed: no database URL given and MIMIC_DB_URL is not set")
        return None

    try:
        engine = create_engine(db_url)
        # create_engine() is lazy and never opens a socket on its own, so run
        # a trivial query here; otherwise an unreachable database would still
        # be reported as a successful connection.
        with engine.connect() as conn:
            conn.execute(text("SELECT 1"))
    except Exception as e:
        # Best-effort boundary: callers test the return value for None.
        print(f"Connection failed: {e}")
        return None

    print("Successfully connected to MIMIC IV database!")
    return engine

# Initialize database connection. `engine` is None when connect_db() fails,
# which the cohort cell below checks before running any query.
print("Initializing CLABSI cohort analysis...")
engine = connect_db()



In [None]:
def get_clabsi_cohort(engine):
    """
    Get validated CLABSI cases with enhanced organism classification
    and culture validation criteria.

    ICU stays whose admission carries a CLABSI ICD code are joined to their
    qualifying invasive lines (allowed line type, in place for at least two
    days) and to the admission's positive blood cultures. Each culture is
    classified by organism and the row is labelled 'Confirmed CLABSI' or
    'Requires Review'.

    Culture counts are taken over distinct specimens. The raw
    ``microbiologyevents`` table repeats an organism once per antibiotic
    tested (see the MIMIC-IV table documentation), so counting raw rows
    would let a single tested specimen satisfy the ">= 2 cultures" rule for
    skin contaminants.

    Parameters
    ----------
    engine : SQLAlchemy engine
        Database connection engine

    Returns
    -------
    pd.DataFrame
        One row per distinct stay / line / culture time / organism
        combination with columns subject_id, hadm_id, stay_id,
        icu_admission, icu_discharge, line_start, line_end, line_type,
        line_duration_days, infection_time, org_name, cultures_of_org,
        daily_org_cultures, organism_classification and clabsi_status.

    Notes
    -----
    NOTE(review): cultures are tied to lines only through the admission.
    The query requires a non-null culture time but does not require that
    time to fall inside the line's dwell period; confirm whether a real
    time-window rule is intended.
    """
    clabsi_query = text("""
    WITH clabsi_icd AS (
        -- Admissions with a CLABSI diagnosis code (one row per admission)
        SELECT hadm_id, MIN(icd_code) AS icd_code
        FROM mimiciv_hosp.diagnoses_icd
        WHERE icd_code IN ('99931', '99932', 'T80211A')
        GROUP BY hadm_id
    ),
    positive_blood_cultures AS (
        -- Collapse the per-antibiotic result rows to one row per
        -- specimen and organism so that the counts below are cultures
        SELECT DISTINCT
            m.hadm_id,
            m.micro_specimen_id,
            m.charttime,
            m.org_name
        FROM mimiciv_hosp.microbiologyevents m
        WHERE m.spec_type_desc = 'BLOOD CULTURE'
        AND m.org_name IS NOT NULL
    ),
    culture_counts AS (
        SELECT
            b.hadm_id,
            b.charttime,
            b.org_name,
            COUNT(*) OVER (
                PARTITION BY b.hadm_id, b.org_name
            ) AS cultures_of_org,
            COUNT(*) OVER (
                PARTITION BY b.hadm_id,
                DATE_TRUNC('day', b.charttime),
                b.org_name
            ) AS daily_org_cultures
        FROM positive_blood_cultures b
    ),
    microbe_classifications AS (
        -- Organism classification with validation criteria
        SELECT
            cc.hadm_id,
            cc.charttime,
            cc.org_name,
            cc.cultures_of_org,
            cc.daily_org_cultures,
            CASE
                -- Primary pathogens (single culture sufficient)
                WHEN cc.org_name IN (
                    'STAPH AUREUS COAG +', 'ESCHERICHIA COLI',
                    'KLEBSIELLA PNEUMONIAE', 'PSEUDOMONAS AERUGINOSA',
                    'ENTEROCOCCUS FAECIUM', 'CANDIDA ALBICANS'
                ) THEN 'Primary Pathogen'

                -- Common skin contaminants (require two same-day cultures)
                WHEN cc.org_name IN (
                    'STAPHYLOCOCCUS, COAGULASE NEGATIVE',
                    'STAPHYLOCOCCUS EPIDERMIDIS'
                ) AND cc.daily_org_cultures >= 2
                THEN 'Validated Common Contaminant'

                -- Other recognized pathogens
                WHEN cc.org_name LIKE 'CANDIDA%' OR
                     cc.org_name IN (
                        'ENTEROCOCCUS FAECALIS',
                        'SERRATIA MARCESCENS',
                        'BACTEROIDES FRAGILIS GROUP'
                     ) THEN 'Other Pathogen'
                ELSE 'Requires Review'
            END AS organism_classification
        FROM culture_counts cc
    ),
    validated_cases AS (
        SELECT DISTINCT
            i.subject_id,
            i.hadm_id,
            i.stay_id,
            i.intime as icu_admission,
            i.outtime as icu_discharge,
            l.starttime as line_start,
            l.endtime as line_end,
            l.line_type,
            EXTRACT(EPOCH FROM (l.endtime - l.starttime))/86400 as line_duration_days,
            m.charttime as infection_time,
            m.org_name,
            m.cultures_of_org,
            m.daily_org_cultures,
            m.organism_classification
        FROM mimiciv_icu.icustays i
        INNER JOIN clabsi_icd c ON i.hadm_id = c.hadm_id
        INNER JOIN mimiciv_derived.invasive_line l ON i.stay_id = l.stay_id
        INNER JOIN microbe_classifications m ON i.hadm_id = m.hadm_id
        WHERE
            -- Line type validation
            l.line_type IN (
                'PICC', 'Multi Lumen', 'Dialysis', 'Triple Introducer',
                'Pre-Sep', 'Hickman', 'Portacath', 'Cordis/Introducer',
                'Continuous Cardiac Output PA', 'PA'
            )
            -- Culture must have a recorded time
            AND m.charttime IS NOT NULL
            -- Line duration requirement (at least two days in place)
            AND EXTRACT(EPOCH FROM (l.endtime - l.starttime))/86400 >= 2
            -- Organism validation
            AND m.organism_classification IN (
                'Primary Pathogen',
                'Validated Common Contaminant',
                'Other Pathogen'
            )
    )
    SELECT
        v.*,
        CASE
            WHEN v.organism_classification = 'Primary Pathogen' THEN 'Confirmed CLABSI'
            WHEN v.organism_classification = 'Validated Common Contaminant'
                 AND v.daily_org_cultures >= 2 THEN 'Confirmed CLABSI'
            WHEN v.organism_classification = 'Other Pathogen'
                 AND v.cultures_of_org >= 2 THEN 'Confirmed CLABSI'
            ELSE 'Requires Review'
        END as clabsi_status
    FROM validated_cases v
    ORDER BY v.stay_id, v.infection_time;
    """)

    # Execute query and get results
    clabsi_df = pd.read_sql(clabsi_query, engine)

    # Print basic summary (rows are stay/line/culture combinations, so the
    # unique-patient and unique-stay counts are the cohort sizes)
    print("\nCLABSI Cohort Summary:")
    print("-" * 30)
    print(f"Total validated cases: {len(clabsi_df)}")
    print(f"Unique patients: {clabsi_df['subject_id'].nunique()}")
    print(f"Unique ICU stays: {clabsi_df['stay_id'].nunique()}")
    print("\nLine Type Distribution:")
    print(clabsi_df['line_type'].value_counts())

    return clabsi_df

# Execute CLABSI cohort identification (skipped when no engine was created)
if engine is not None:
    clabsi_cohort = get_clabsi_cohort(engine)

In [None]:
def _build_control_query(line_targets):
    """Return (sql, params) for the stratified control draw.

    One pair of bind parameters is generated per line type found in
    ``line_targets`` (index = line type, value = number of controls wanted),
    so every stratum present in the case cohort receives a quota.
    """
    params = {}
    type_binds = []
    quota_cases = []
    for pos, (line_type, target_n) in enumerate(line_targets.items()):
        params[f'line_type_{pos}'] = str(line_type)
        params[f'target_n_{pos}'] = int(target_n)
        type_binds.append(f':line_type_{pos}')
        quota_cases.append(
            f'WHEN ec.line_type = :line_type_{pos} '
            f'THEN CAST(:target_n_{pos} AS INTEGER)'
        )

    type_list = ', '.join(type_binds)
    quota_sql = '\n        '.join(quota_cases)

    sql = f"""
    WITH excluded_stays AS (
        -- Stays whose admission carries a CLABSI diagnosis code
        SELECT i.stay_id
        FROM mimiciv_icu.icustays i
        INNER JOIN mimiciv_hosp.diagnoses_icd d
            ON i.hadm_id = d.hadm_id
        WHERE d.icd_code IN ('99931', '99932', 'T80211A')
        UNION
        -- Stays whose admission has any positive blood culture
        SELECT i.stay_id
        FROM mimiciv_icu.icustays i
        INNER JOIN mimiciv_hosp.microbiologyevents m
            ON i.hadm_id = m.hadm_id
        WHERE m.spec_type_desc = 'BLOOD CULTURE'
        AND m.org_name IS NOT NULL
    ),
    candidate_stays AS (
        -- One row per stay and line type so a stay with several lines of
        -- the same type cannot be drawn more than once within a stratum
        SELECT DISTINCT
            i.subject_id,
            i.hadm_id,
            i.stay_id,
            l.line_type
        FROM mimiciv_icu.icustays i
        INNER JOIN mimiciv_derived.invasive_line l ON i.stay_id = l.stay_id
        WHERE l.line_type IN ({type_list})
        AND NOT EXISTS (
            SELECT 1 FROM excluded_stays e WHERE e.stay_id = i.stay_id
        )
    ),
    eligible_controls AS (
        SELECT
            cs.*,
            ROW_NUMBER() OVER (
                PARTITION BY cs.line_type ORDER BY random()
            ) as line_rank
        FROM candidate_stays cs
    )
    SELECT ec.*
    FROM eligible_controls ec
    WHERE ec.line_rank <= CASE
        {quota_sql}
        ELSE 0
    END
    ORDER BY ec.line_type, ec.line_rank;
    """
    return sql, params


def get_control_cohort(engine, clabsi_cohort, matching_ratio=4):
    """
    Select controls stratified by line type, excluding infected patients.

    For every line type among the unique CLABSI ICU stays, up to
    ``matching_ratio`` times as many control stays are drawn at random.
    Stays are excluded when their admission has a CLABSI ICD code or any
    positive blood culture.

    Parameters
    ----------
    engine : SQLAlchemy engine
        Database connection engine
    clabsi_cohort : pd.DataFrame
        Case cohort; must contain ``stay_id`` and ``line_type`` columns.
        Each stay is represented by its first listed line type.
    matching_ratio : int, default 4
        Number of controls requested per CLABSI stay within each line type.

    Returns
    -------
    pd.DataFrame
        Columns subject_id, hadm_id, stay_id, line_type and line_rank.
        Empty (and no query is run) when ``clabsi_cohort`` has no stays.

    Notes
    -----
    The draw uses the database's ``random()`` and is therefore not
    reproducible between runs. A stay with lines of several different
    types can still be selected once per type.
    """
    # Get unique stay-level line counts
    unique_stays = clabsi_cohort.drop_duplicates('stay_id')[['stay_id', 'line_type']]
    line_targets = unique_stays['line_type'].value_counts() * matching_ratio

    if line_targets.empty:
        # Nothing to match against; an empty IN list / CASE would be invalid SQL.
        controls_df = pd.DataFrame(
            columns=['subject_id', 'hadm_id', 'stay_id', 'line_type', 'line_rank']
        )
    else:
        sql, params = _build_control_query(line_targets)
        controls_df = pd.read_sql(text(sql), engine, params=params)

    print("\nControl Selection Summary:")
    print("-" * 30)
    print(f"Total controls: {len(controls_df)}")
    print(f"Unique patients: {controls_df['subject_id'].nunique()}")
    print(f"Unique ICU stays: {controls_df['stay_id'].nunique()}")
    print("\nControls by Line Type:")
    print(controls_df['line_type'].value_counts())

    # Compare to CLABSI cohort
    print("\nMatching Ratio Verification:")
    unique_clabsi = unique_stays['line_type'].value_counts()
    unique_controls = controls_df['line_type'].value_counts()
    ratios = pd.DataFrame({
        'CLABSI_stays': unique_clabsi,
        'Control_stays': unique_controls,
        'Ratio': unique_controls / unique_clabsi
    }).round(2)
    print(ratios)

    return controls_df

# Get controls. The name check skips this step when the CLABSI cohort was
# never built (it is only assigned when a database engine is available).
if 'clabsi_cohort' in locals():
    control_cohort = get_control_cohort(engine, clabsi_cohort)