In [1]:
"""
CLABSI Definition v2.0
----------------------
Updates:
- Expanded central line definitions to include all relevant types:
  * PICC (Peripherally Inserted Central Catheter)
  * Multi Lumen central lines
  * Dialysis catheters
  * Hickman/tunneled catheters
  * Portacath/implanted ports  
  * Triple/Cordis introducers
  * PA catheters
- Excluded non-central lines:
  * Arterial lines
  * Midlines
  * Peripheral IVs
  * Specialized cardiac devices (IABP, Impella, etc.)

This notebook validates CLABSI cases using:
1. ICD Codes (99931, 99932, T80211A)
2. Positive blood cultures
3. Central line presence
4. CDC/NHSN criteria for pathogen classification
"""

import os
import warnings
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
from sqlalchemy import create_engine, text
warnings.filterwarnings('ignore')

# Database connection
def connect_db():
    try:
        engine = create_engine('postgresql://postgres:ramiel12@localhost:5432/mimiciv')
        connection = engine.connect()
        print("Connected to mimiciv!")
        return engine
    except Exception as e:
        print(f"Connection failed: {e}")
        return None

engine = connect_db()

Connected to mimiciv!


In [2]:
# Main CLABSI identification query with improved organism classification
clabsi_query = text("""
WITH clabsi_icd AS (
    SELECT DISTINCT hadm_id, MIN(icd_code) as icd_code
    FROM mimiciv_hosp.diagnoses_icd
    WHERE icd_code IN ('99931', '99932', 'T80211A')
    GROUP BY hadm_id
),
line_data AS (
    SELECT stay_id, line_type, starttime, endtime,
           EXTRACT(EPOCH FROM (endtime - starttime))/86400 as line_duration_days
    FROM mimiciv_derived.invasive_line
    WHERE line_type IN (
        'PICC',
        'Multi Lumen',
        'Dialysis',
        'Triple Introducer', 
        'Pre-Sep',
        'Hickman',
        'Portacath',
        'Cordis/Introducer',
        'Continuous Cardiac Output PA',
        'PA'
    )
    AND endtime > starttime  -- ensure valid duration
),
line_hadm AS (
    SELECT l.*, i.hadm_id
    FROM line_data l
    INNER JOIN mimiciv_icu.icustays i ON l.stay_id = i.stay_id
),
cultures AS (
    SELECT 
        m.hadm_id,
        m.charttime,
        m.spec_type_desc,
        m.org_name,
        COUNT(*) OVER (
            PARTITION BY m.hadm_id, m.org_name
        ) as cultures_of_org,
        COUNT(*) OVER (
            PARTITION BY m.hadm_id, 
            DATE_TRUNC('day', m.charttime),
            m.org_name
        ) as daily_org_cultures
    FROM mimiciv_hosp.microbiologyevents m
    WHERE spec_type_desc = 'BLOOD CULTURE'
    AND org_name IS NOT NULL
)
SELECT DISTINCT
    i.stay_id,
    i.hadm_id,
    c.icd_code,
    l.line_type,
    l.line_duration_days,
    l.starttime as line_start,
    l.endtime as line_end,
    pc.charttime as culture_time,
    pc.org_name,
    pc.cultures_of_org,
    pc.daily_org_cultures,
    EXTRACT(EPOCH FROM (pc.charttime - l.starttime))/86400 as days_to_culture,
    CASE
        -- High frequency confirmed pathogens
        WHEN pc.org_name LIKE '%STAPH AUREUS COAG +%' OR
             pc.org_name = 'STAPHYLOCOCCUS, COAGULASE NEGATIVE' OR
             pc.org_name = 'ENTEROCOCCUS FAECIUM' OR
             pc.org_name = 'STAPHYLOCOCCUS EPIDERMIDIS' OR
             pc.org_name = 'CANDIDA ALBICANS' OR
             pc.org_name = 'ENTEROCOCCUS FAECALIS'
        THEN 'Common Pathogen'
        
        -- Gram negative pathogens
        WHEN pc.org_name = 'KLEBSIELLA PNEUMONIAE' OR
             pc.org_name = 'ESCHERICHIA COLI' OR
             pc.org_name = 'PSEUDOMONAS AERUGINOSA' OR
             pc.org_name = 'SERRATIA MARCESCENS'
        THEN 'Gram Negative Pathogen'
        
        -- Less common Candida species
        WHEN pc.org_name = 'CANDIDA GLABRATA' OR
             pc.org_name = 'CANDIDA PARAPSILOSIS' OR
             pc.org_name = 'CANDIDA TROPICALIS'
        THEN 'Candida Species'
        
        -- Other recognized pathogens
        WHEN pc.org_name = 'BACTEROIDES FRAGILIS GROUP' OR
             pc.org_name = 'VIRIDANS STREPTOCOCCI'
        THEN 'Other Pathogen'
        
        -- Multiple cultures of same organism required
        WHEN pc.daily_org_cultures >= 2 
        THEN 'Confirmed by Multiple Cultures'
        
        ELSE 'Indeterminate'
    END as organism_classification,
    CASE
        WHEN pc.org_name LIKE '%STAPH%' OR pc.org_name LIKE '%STREP%'
        THEN 'Gram Positive'
        WHEN pc.org_name IN ('ESCHERICHIA COLI', 'KLEBSIELLA PNEUMONIAE', 
                           'PSEUDOMONAS AERUGINOSA', 'SERRATIA MARCESCENS')
        THEN 'Gram Negative'
        WHEN pc.org_name LIKE '%CANDIDA%'
        THEN 'Fungal'
        ELSE 'Other'
    END as organism_group,
    pc.daily_org_cultures >= 2 as multiple_cultures
FROM mimiciv_icu.icustays i
INNER JOIN clabsi_icd c ON i.hadm_id = c.hadm_id
INNER JOIN line_hadm l ON i.stay_id = l.stay_id
INNER JOIN cultures pc ON i.hadm_id = pc.hadm_id
WHERE 
    pc.charttime > l.starttime 
    AND pc.charttime <= l.starttime + INTERVAL '14 days'
    AND l.line_duration_days >= 2  -- Line present ≥2 calendar days
ORDER BY stay_id, line_start, culture_time;
""")

# Execute query
clabsi_df = pd.read_sql(clabsi_query, engine)

# Display initial results
print(f"\nFound {len(clabsi_df)} potential CLABSI events")
print(f"Unique patients: {len(clabsi_df['stay_id'].unique())}")

# Display organism distribution
print("\nOrganism Classification Distribution:")
print(clabsi_df['organism_classification'].value_counts())

print("\nOrganism Group Distribution:")
print(clabsi_df['organism_group'].value_counts())

print("\nSample of identified cases:")
print(clabsi_df.head())


Found 940 potential CLABSI events
Unique patients: 231

Organism Classification Distribution:
organism_classification
Common Pathogen                   684
Candida Species                    72
Gram Negative Pathogen             70
Confirmed by Multiple Cultures     45
Other Pathogen                     36
Indeterminate                      33
Name: count, dtype: int64

Organism Group Distribution:
organism_group
Gram Positive    420
Other            276
Fungal           174
Gram Negative     70
Name: count, dtype: int64

Sample of identified cases:
    stay_id   hadm_id icd_code    line_type  line_duration_days  \
0  30015010  28133020  99931           PICC            9.938889   
1  30017005  24426241  T80211A     Dialysis           63.048611   
2  30017005  24426241  T80211A  Multi Lumen           19.035417   
3  30170059  28102452  T80211A         PICC           10.310417   
4  30170059  28102452  T80211A         PICC           10.310417   

           line_start            line_en

In [3]:
# Cell: Summarize which lines you are capturing under your new definition
line_summary_query = text("""
SELECT line_type, COUNT(*) as num_lines
FROM mimiciv_derived.invasive_line
WHERE line_type IN (
    'PICC',
    'Multi Lumen',
    'Dialysis',
    'Triple Introducer', 
    'Pre-Sep',
    'Hickman',
    'Portacath',
    'Cordis/Introducer',
    'Continuous Cardiac Output PA',
    'PA'
)
GROUP BY line_type
ORDER BY num_lines DESC;
""")

line_summary_df = pd.read_sql(line_summary_query, engine)
print("Line Summary:")
print(line_summary_df)


Line Summary:
                      line_type  num_lines
0                   Multi Lumen      25502
1                          PICC      14595
2                            PA       5845
3             Cordis/Introducer       5783
4                      Dialysis       5504
5  Continuous Cardiac Output PA       1922
6                     Portacath       1602
7                       Hickman        388
8                       Pre-Sep        124
9             Triple Introducer         56


In [4]:
# Detailed organism analysis incorporating CDC/NHSN classifications
org_analysis_v2 = text("""
WITH clabsi_icd AS (
    SELECT DISTINCT hadm_id, MIN(icd_code) as icd_code
    FROM mimiciv_hosp.diagnoses_icd
    WHERE icd_code IN ('99931', '99932', 'T80211A')
    GROUP BY hadm_id
),
line_data AS (
    SELECT stay_id, line_type, starttime, endtime,
           EXTRACT(EPOCH FROM (endtime - starttime))/86400 as line_duration_days
    FROM mimiciv_derived.invasive_line
    WHERE line_type IN (
        'PICC', 'Multi Lumen', 'Dialysis', 'Triple Introducer', 
        'Pre-Sep', 'Hickman', 'Portacath', 'Cordis/Introducer',
        'Continuous Cardiac Output PA', 'PA'
    )
    AND endtime > starttime
),
line_hadm AS (
    SELECT l.*, i.hadm_id
    FROM line_data l
    INNER JOIN mimiciv_icu.icustays i ON l.stay_id = i.stay_id
),
cultures AS (
    SELECT 
        m.hadm_id,
        m.charttime,
        m.spec_type_desc,
        m.org_name,
        COUNT(*) OVER (
            PARTITION BY m.hadm_id, 
            DATE_TRUNC('day', m.charttime),
            m.org_name
        ) as daily_org_cultures
    FROM mimiciv_hosp.microbiologyevents m
    WHERE spec_type_desc = 'BLOOD CULTURE'
    AND org_name IS NOT NULL
),
organism_stats AS (
    SELECT 
        org_name,
        COUNT(DISTINCT i.stay_id) as unique_stays,
        AVG(c.daily_org_cultures) as avg_daily_cultures,
        AVG(EXTRACT(EPOCH FROM (c.charttime - l.starttime))/86400) as avg_days_to_culture,
        COUNT(*) as total_cultures,
        SUM(CASE WHEN c.daily_org_cultures >= 2 THEN 1 ELSE 0 END) as multi_culture_cases,
        COUNT(DISTINCT l.line_type) as line_types_affected
    FROM mimiciv_icu.icustays i
    INNER JOIN clabsi_icd cla ON i.hadm_id = cla.hadm_id
    INNER JOIN line_hadm l ON i.stay_id = l.stay_id
    INNER JOIN cultures c ON i.hadm_id = c.hadm_id
    WHERE 
        c.charttime > l.starttime 
        AND c.charttime <= l.starttime + INTERVAL '14 days'
        AND l.line_duration_days >= 2
    GROUP BY org_name
)
SELECT 
    org_name,
    unique_stays,
    total_cultures,
    ROUND(avg_daily_cultures::numeric, 2) as avg_daily_cultures,
    ROUND(avg_days_to_culture::numeric, 2) as avg_days_to_culture,
    multi_culture_cases,
    line_types_affected,
    CASE
        WHEN org_name IN (
            'STAPH AUREUS COAG +', 'ESCHERICHIA COLI',
            'KLEBSIELLA PNEUMONIAE', 'PSEUDOMONAS AERUGINOSA',
            'ENTEROCOCCUS FAECIUM', 'CANDIDA ALBICANS'
        ) THEN 'Primary Pathogen'
        WHEN org_name IN (
            'STAPHYLOCOCCUS, COAGULASE NEGATIVE',
            'STAPHYLOCOCCUS EPIDERMIDIS'
        ) AND multi_culture_cases > 0 THEN 'Validated Common Contaminant'
        WHEN org_name LIKE 'CANDIDA%' THEN 'Other Candida Species'
        WHEN org_name IN (
            'ENTEROCOCCUS FAECALIS',
            'SERRATIA MARCESCENS',
            'BACTEROIDES FRAGILIS GROUP'
        ) THEN 'Secondary Pathogen'
        ELSE 'Other Organism'
    END as organism_category
FROM organism_stats
ORDER BY unique_stays DESC, total_cultures DESC
LIMIT 20;
""")

# Execute query
org_results = pd.read_sql(org_analysis_v2, engine)

# Display results
print("\nDetailed Organism Analysis:")
print("-------------------------")
print(f"Total organisms analyzed: {len(org_results)}")

print("\nOrganism Category Distribution:")
print(org_results['organism_category'].value_counts())

print("\nTop Organisms by Frequency:")
print(org_results[['org_name', 'unique_stays', 'avg_daily_cultures', 'avg_days_to_culture']].head(10))


Detailed Organism Analysis:
-------------------------
Total organisms analyzed: 20

Organism Category Distribution:
organism_category
Primary Pathogen                6
Other Organism                  5
Other Candida Species           4
Secondary Pathogen              3
Validated Common Contaminant    2
Name: count, dtype: int64

Top Organisms by Frequency:
                             org_name  unique_stays  avg_daily_cultures  \
0  STAPHYLOCOCCUS, COAGULASE NEGATIVE            58               12.52   
1                 STAPH AUREUS COAG +            41                7.16   
2                ENTEROCOCCUS FAECIUM            39                4.73   
3          STAPHYLOCOCCUS EPIDERMIDIS            35               11.27   
4                    CANDIDA ALBICANS            19                1.93   
5               ENTEROCOCCUS FAECALIS            13                3.94   
6                    CANDIDA GLABRATA            11                1.36   
7              PSEUDOMONAS AERUGINOSA   

In [5]:
# Comprehensive CLABSI validation incorporating CDC criteria and timing rules
clabsi_validation_v2 = text("""
WITH clabsi_icd AS (
    SELECT DISTINCT hadm_id, MIN(icd_code) as icd_code
    FROM mimiciv_hosp.diagnoses_icd
    WHERE icd_code IN ('99931', '99932', 'T80211A')
    GROUP BY hadm_id
),
line_data AS (
    SELECT stay_id, line_type, starttime, endtime,
           EXTRACT(EPOCH FROM (endtime - starttime))/86400 as line_duration_days
    FROM mimiciv_derived.invasive_line
    WHERE line_type IN (
        'PICC', 'Multi Lumen', 'Dialysis', 'Triple Introducer', 
        'Pre-Sep', 'Hickman', 'Portacath', 'Cordis/Introducer',
        'Continuous Cardiac Output PA', 'PA'
    )
    AND endtime > starttime
    AND EXTRACT(EPOCH FROM (endtime - starttime))/86400 >= 2  -- Minimum 2 day duration
),
line_hadm AS (
    SELECT l.*, i.hadm_id
    FROM line_data l
    INNER JOIN mimiciv_icu.icustays i ON l.stay_id = i.stay_id
),
cultures AS (
    SELECT 
        m.hadm_id,
        m.charttime,
        m.spec_type_desc,
        m.org_name,
        COUNT(*) OVER (
            PARTITION BY m.hadm_id, m.org_name
        ) as total_org_cultures,
        COUNT(*) OVER (
            PARTITION BY m.hadm_id, 
            DATE_TRUNC('day', m.charttime),
            m.org_name
        ) as daily_org_cultures
    FROM mimiciv_hosp.microbiologyevents m
    WHERE spec_type_desc = 'BLOOD CULTURE'
    AND org_name IS NOT NULL
),
base_cohort AS (
    SELECT 
        i.stay_id,
        i.hadm_id,
        l.line_type,
        l.line_duration_days,
        l.starttime as line_start,
        l.endtime as line_end,
        c.charttime as culture_time,
        c.org_name,
        c.daily_org_cultures,
        c.total_org_cultures,
        EXTRACT(EPOCH FROM (c.charttime - l.starttime))/86400 as days_to_culture,
        CASE
            -- Primary pathogens (single culture sufficient)
            WHEN c.org_name IN (
                'STAPH AUREUS COAG +', 'ESCHERICHIA COLI',
                'KLEBSIELLA PNEUMONIAE', 'PSEUDOMONAS AERUGINOSA',
                'ENTEROCOCCUS FAECIUM', 'CANDIDA ALBICANS'
            ) THEN TRUE
            
            -- Common skin contaminants (require multiple cultures)
            WHEN c.org_name IN (
                'STAPHYLOCOCCUS, COAGULASE NEGATIVE',
                'STAPHYLOCOCCUS EPIDERMIDIS'
            ) AND c.daily_org_cultures >= 2 THEN TRUE
            
            -- Other recognized pathogens
            WHEN c.org_name LIKE 'CANDIDA%' OR
                 c.org_name IN (
                    'ENTEROCOCCUS FAECALIS',
                    'SERRATIA MARCESCENS',
                    'BACTEROIDES FRAGILIS GROUP'
                 ) THEN TRUE
            
            ELSE FALSE
        END as is_confirmed_pathogen,
        CASE
            WHEN c.org_name LIKE '%STAPH%' OR c.org_name LIKE '%STREP%'
            THEN 'Gram Positive'
            WHEN c.org_name IN (
                'ESCHERICHIA COLI', 'KLEBSIELLA PNEUMONIAE',
                'PSEUDOMONAS AERUGINOSA', 'SERRATIA MARCESCENS'
            ) THEN 'Gram Negative'
            WHEN c.org_name LIKE '%CANDIDA%'
            THEN 'Fungal'
            ELSE 'Other'
        END as organism_group
    FROM mimiciv_icu.icustays i
    INNER JOIN clabsi_icd cla ON i.hadm_id = cla.hadm_id
    INNER JOIN line_hadm l ON i.stay_id = l.stay_id
    INNER JOIN cultures c ON i.hadm_id = c.hadm_id
    WHERE 
        c.charttime > l.starttime 
        AND c.charttime <= l.starttime + INTERVAL '14 days'
),
validation_summary AS (
    SELECT
        stay_id,
        COUNT(DISTINCT org_name) as distinct_organisms,
        COUNT(DISTINCT CASE WHEN is_confirmed_pathogen THEN org_name END) as confirmed_pathogens,
        COUNT(DISTINCT organism_group) as distinct_organism_groups,
        MIN(days_to_culture) as earliest_culture_days,
        MAX(days_to_culture) as latest_culture_days,
        bool_or(daily_org_cultures >= 2) as has_multiple_cultures,
        COUNT(*) as total_positive_cultures,
        MAX(line_duration_days) as max_line_duration,
        -- Calculate validation status here
        CASE
            WHEN COUNT(DISTINCT CASE WHEN is_confirmed_pathogen THEN org_name END) > 0 THEN 1
            WHEN bool_or(daily_org_cultures >= 2) AND COUNT(DISTINCT org_name) > 1 THEN 2
            WHEN bool_or(daily_org_cultures >= 2) THEN 3
            ELSE 4
        END as validation_rank
    FROM base_cohort
    GROUP BY stay_id
)
SELECT
    stay_id,
    distinct_organisms,
    confirmed_pathogens,
    distinct_organism_groups,
    earliest_culture_days,
    latest_culture_days,
    has_multiple_cultures,
    total_positive_cultures,
    max_line_duration,
    CASE
        WHEN validation_rank = 1 THEN 'Confirmed CLABSI'
        WHEN validation_rank = 2 THEN 'Probable CLABSI'
        WHEN validation_rank = 3 THEN 'Possible CLABSI'
        ELSE 'Requires Review'
    END as validation_status
FROM validation_summary
ORDER BY 
    validation_rank,
    confirmed_pathogens DESC,
    total_positive_cultures DESC;
""")

# Execute query and analyze results
validation_results = pd.read_sql(clabsi_validation_v2, engine)

print("\nCLABSI Validation Results:")
print("------------------------")
print(f"Total cases analyzed: {len(validation_results)}")

print("\nValidation Status Distribution:")
print(validation_results['validation_status'].value_counts())

print("\nKey Statistics:")
print(f"Average distinct organisms per case: {validation_results['distinct_organisms'].mean():.2f}")
print(f"Cases with confirmed pathogens: {(validation_results['confirmed_pathogens'] > 0).sum()}")
print(f"Cases with multiple cultures: {validation_results['has_multiple_cultures'].sum()}")
print(f"Average line duration (days): {validation_results['max_line_duration'].mean():.2f}")

print("\nTime to Culture Statistics (days):")
print(validation_results[['earliest_culture_days', 'latest_culture_days']].describe())

print("\nDetailed Results Sample:")
print(validation_results.head())


CLABSI Validation Results:
------------------------
Total cases analyzed: 231

Validation Status Distribution:
validation_status
Confirmed CLABSI    194
Requires Review      26
Possible CLABSI      11
Name: count, dtype: int64

Key Statistics:
Average distinct organisms per case: 1.29
Cases with confirmed pathogens: 194
Cases with multiple cultures: 173
Average line duration (days): 7.50

Time to Culture Statistics (days):
       earliest_culture_days  latest_culture_days
count             231.000000           231.000000
mean                3.994411             6.506578
std                 3.947134             4.638826
min                 0.003472             0.003472
25%                 0.636111             1.970486
50%                 2.531250             5.918056
75%                 6.415278            11.272917
max                13.959028            13.993056

Detailed Results Sample:
    stay_id  distinct_organisms  confirmed_pathogens  \
0  38320222                   4         

In [6]:
"""
CLABSI Definition 2.0 Final Documentation
---------------------------------------

Major Improvements from Version 1.0:

1. Expanded Line Type Definitions
   - Added comprehensive central line classifications
   - Included specialized lines (PA, dialysis, etc.)
   - Excluded non-central lines
   - Added duration validation (≥2 days)

2. Enhanced Organism Classification
   - Primary pathogens (single culture sufficient)
   - Common skin contaminants (multiple cultures required)
   - Other recognized pathogens
   - Gram classification
   - Daily culture tracking

3. Improved Validation Criteria
   - CDC/NHSN alignment
   - Multiple culture requirements
   - Time window specifications
   - Line duration requirements

Results Summary:
---------------
Total Cases: {len(validation_results)}
Distribution:
- Confirmed CLABSI: {len(validation_results[validation_results['validation_status'] == 'Confirmed CLABSI'])}
- Probable CLABSI: {len(validation_results[validation_results['validation_status'] == 'Probable CLABSI'])}
- Possible CLABSI: {len(validation_results[validation_results['validation_status'] == 'Possible CLABSI'])}
- Requires Review: {len(validation_results[validation_results['validation_status'] == 'Requires Review'])}

Notes:
- All timing based on documented line placement
- Multiple cultures counted within 24-hour periods
- Line types validated against CDC guidelines
- Organism classification aligned with NHSN

Version 2.0 Changes:
- More precise organism classification
- Better handling of common contaminants 
- Improved validation logic
- Enhanced documentation requirements
"""

# Print summary statistics
print("CLABSI Definition 2.0 Summary")
print("--------------------------")
print(f"Total validated cases: {len(validation_results)}")
print("\nValidation Status Distribution:")
print(validation_results['validation_status'].value_counts())

CLABSI Definition 2.0 Summary
--------------------------
Total validated cases: 231

Validation Status Distribution:
validation_status
Confirmed CLABSI    194
Requires Review      26
Possible CLABSI      11
Name: count, dtype: int64
