In [4]:
import os

import pandas as pd
from sqlalchemy import create_engine, text

# The connection URL is taken from the environment so that no database
# password is stored in (or committed with) the notebook.
#   - Set MIMICIV_DB_URL to a full SQLAlchemy URL to override the target.
#   - The default URL below deliberately carries no password; libpq then
#     falls back to the PGPASSWORD environment variable or ~/.pgpass.
engine = create_engine(
    os.environ.get('MIMICIV_DB_URL', 'postgresql://postgres@localhost:5432/mimiciv')
)

# Sanity check: report the row count of a few key tables, or the error
# message if a table cannot be queried.
key_tables = [
    ('mimiciv_icu', 'procedureevents'),
    ('mimiciv_hosp', 'microbiologyevents'),
    ('mimiciv_derived', 'first_day_lab'),
]

for schema_name, table_name in key_tables:
    qualified_name = f"{schema_name}.{table_name}"
    count_sql = f"""
    SELECT COUNT(*) as row_count 
    FROM {schema_name}.{table_name}
    """
    # Any failure (missing table, connection problem, ...) is reported on the
    # same line as a successful count would be, so the loop always continues.
    try:
        count_frame = pd.read_sql(count_sql, engine)
        outcome = f"{count_frame['row_count'][0]} rows"
    except Exception as err:
        outcome = str(err)
    print(f"{qualified_name}: {outcome}")

mimiciv_icu.procedureevents: 808706 rows
mimiciv_hosp.microbiologyevents: 3988224 rows
mimiciv_derived.first_day_lab: 94458 rows


In [6]:
# Second verification pass: a larger set of tables, organised under the group
# labels used as dictionary keys. Each value is a list of (schema, table) pairs.
table_groups = {
    'Line Tables': [
        ('mimiciv_hosp', 'procedures_icd'),
        ('mimiciv_icu', 'chartevents'),
        ('mimiciv_derived', 'invasive_line'),
    ],
    'Patient Tables': [
        ('mimiciv_hosp', 'diagnoses_icd'),
        ('mimiciv_derived', 'sofa'),
        ('mimiciv_derived', 'first_day_vitalsign'),
        ('mimiciv_icu', 'icustays'),
    ],
    'Clinical Tables': [
        ('mimiciv_hosp', 'prescriptions'),
        ('mimiciv_derived', 'blood_differential'),
        ('mimiciv_derived', 'first_day_lab'),
    ],
}

# Print a header per group, then one "<schema>.<table>: <count or error>" line
# for every table in that group.
for group_label, group_tables in table_groups.items():
    print(f"\n{group_label}:")
    for schema_name, table_name in group_tables:
        count_sql = f"""
        SELECT COUNT(*) as row_count 
        FROM {schema_name}.{table_name}
        """
        # A failing query is reported inline instead of aborting the sweep.
        try:
            count_frame = pd.read_sql(count_sql, engine)
            outcome = f"{count_frame['row_count'][0]} rows"
        except Exception as err:
            outcome = str(err)
        print(f"{schema_name}.{table_name}: {outcome}")


Line Tables:
mimiciv_hosp.procedures_icd: 859655 rows
mimiciv_icu.chartevents: 432997491 rows
mimiciv_derived.invasive_line: 108165 rows

Patient Tables:
mimiciv_hosp.diagnoses_icd: 6364488 rows
mimiciv_derived.sofa: 8219121 rows
mimiciv_derived.first_day_vitalsign: 94458 rows
mimiciv_icu.icustays: 94458 rows

Clinical Tables:
mimiciv_hosp.prescriptions: 20292611 rows
mimiciv_derived.blood_differential: 4154226 rows
mimiciv_derived.first_day_lab: 94458 rows


In [8]:
# Quality checks on key tables. Each check is a standalone SQL statement; they
# are collected in `quality_checks` and executed in order below.

# ICU stays: total rows, rows whose outtime precedes intime, and the mean
# (outtime - intime) expressed in days.
icu_stay_times_sql = """
    SELECT 
        COUNT(*) as total_stays,
        COUNT(CASE WHEN outtime < intime THEN 1 END) as invalid_times,
        AVG(EXTRACT(EPOCH FROM (outtime - intime))/3600/24) as avg_los_days
    FROM mimiciv_icu.icustays
    """

# Invasive lines: total rows and how many lack a line_type or a starttime.
invasive_line_completeness_sql = """
    SELECT 
        COUNT(*) as total_lines,
        COUNT(CASE WHEN line_type IS NULL THEN 1 END) as missing_type,
        COUNT(CASE WHEN starttime IS NULL THEN 1 END) as missing_start
    FROM mimiciv_derived.invasive_line
    """

# Diagnoses: number of rows carrying one of the listed CLABSI-related codes.
clabsi_code_count_sql = """
    SELECT COUNT(*) as clabsi_codes
    FROM mimiciv_hosp.diagnoses_icd
    WHERE icd_code IN ('99931', '99932', 'T80211A')  -- Common CLABSI codes
    """

quality_checks = [
    icu_stay_times_sql,
    invasive_line_completeness_sql,
    clabsi_code_count_sql,
]

# Run the checks in list order, printing a numbered header before each result.
for check_number, check_sql in enumerate(quality_checks, start=1):
    print(f"\nQuality Check {check_number}:")
    print(pd.read_sql(check_sql, engine))


Quality Check 1:
   total_stays  invalid_times  avg_los_days
0        94458              0      3.630025

Quality Check 2:
   total_lines  missing_type  missing_start
0       108165             0              0

Quality Check 3:
   clabsi_codes
0          1844


In [10]:
# Examine the distribution of invasive line types.
# (The query groups by line_type only; no location/site column is read here.)
#
# The CTE `line_counts` aggregates mimiciv_derived.invasive_line per line_type:
#   - count:             number of rows for that line type
#   - unique_patients:   COUNT(DISTINCT stay_id)
#   - avg_duration_days: mean of (endtime - starttime), converted from seconds
#                        to days via /3600/24
# The outer SELECT adds each type's share of all rows (percentage, 2 decimals),
# rounds the average duration to 2 decimals, and sorts by count descending.
#
# NOTE(review): `unique_patients` counts distinct stay_id values, not a patient
# identifier -- presumably one stay_id per ICU stay, so a patient with several
# stays would be counted more than once. Confirm before reporting this figure
# as a patient count.
# NOTE(review): rows with a NULL endtime should not contribute to the average
# (SQL AVG ignores NULL inputs) -- confirm whether open-ended lines exist.
line_analysis = """
WITH line_counts AS (
    SELECT 
        line_type,
        COUNT(*) as count,
        COUNT(DISTINCT stay_id) as unique_patients,
        AVG(EXTRACT(EPOCH FROM (endtime - starttime))/3600/24) as avg_duration_days
    FROM mimiciv_derived.invasive_line
    WHERE line_type IS NOT NULL
    GROUP BY line_type
)
SELECT 
    line_type,
    count,
    unique_patients,
    ROUND(count * 100.0 / SUM(count) OVER (), 2) as percentage,
    ROUND(avg_duration_days::numeric, 2) as avg_duration_days
FROM line_counts
ORDER BY count DESC
"""

# Run the query through the shared engine and print the resulting frame.
print("Line Type Distribution:")
line_dist = pd.read_sql(line_analysis, engine)
print(line_dist)

Line Type Distribution:
                       line_type  count  unique_patients  percentage  \
0                       Arterial  38800            32238       35.87   
1                    Multi Lumen  25502            22562       23.58   
2                           PICC  14595            13616       13.49   
3                             PA   5845             5611        5.40   
4              Cordis/Introducer   5783             5408        5.35   
5                       Dialysis   5504             4464        5.09   
6   Continuous Cardiac Output PA   1922             1868        1.78   
7                Sheath (Venous)   1871             1497        1.73   
8                      Portacath   1602             1550        1.48   
9                        Midline   1562             1504        1.44   
10                          IABP   1076             1038        0.99   
11                           AVA    838              818        0.77   
12                  ICP Catheter    718 