In [1]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine, text
import warnings
warnings.filterwarnings('ignore')

def connect_db():
    """Establish database connection to MIMIC IV"""
    try:
        engine = create_engine('postgresql://postgres:ramiel12@localhost:5432/mimiciv')
        print("Successfully connected to MIMIC IV database!")
        return engine
    except Exception as e:
        print(f"Connection failed: {e}")
        return None

# Create connection
engine = connect_db()

Successfully connected to MIMIC IV database!


In [2]:
# Define the initial cohort query
cohort_query = text("""
WITH first_icu_stays AS (
    SELECT 
        ie.subject_id,
        ie.hadm_id,
        ie.stay_id,
        ie.intime as icu_admission,
        ie.outtime as icu_discharge,
        EXTRACT(EPOCH FROM (ie.outtime - ie.intime))/3600 as icu_los_hours,
        ROW_NUMBER() OVER (PARTITION BY ie.subject_id ORDER BY ie.intime) as icu_stay_number,
        p.anchor_age + EXTRACT(EPOCH FROM adm.admittime - MAKE_TIMESTAMP(p.anchor_year, 1, 1, 0, 0, 0))/31556908.8 AS admission_age
    FROM mimiciv_icu.icustays ie
    INNER JOIN mimiciv_hosp.admissions adm 
        ON ie.hadm_id = adm.hadm_id
    INNER JOIN mimiciv_hosp.patients p 
        ON ie.subject_id = p.subject_id
    WHERE p.anchor_age + EXTRACT(EPOCH FROM adm.admittime - MAKE_TIMESTAMP(p.anchor_year, 1, 1, 0, 0, 0))/31556908.8 >= 18
),
first_lines AS (
    SELECT 
        fis.*,
        il.starttime as line_placement_time,
        il.endtime as line_removal_time,
        il.line_type,
        EXTRACT(EPOCH FROM (il.endtime - il.starttime))/24/3600 as line_days,
        ROW_NUMBER() OVER (PARTITION BY fis.stay_id ORDER BY il.starttime) as line_number
    FROM first_icu_stays fis
    INNER JOIN mimiciv_derived.invasive_line il 
        ON fis.stay_id = il.stay_id
    WHERE fis.icu_stay_number = 1  -- First ICU stay only
        AND fis.icu_los_hours >= 48  -- Survived at least 48 hours
        AND il.line_type IN (
            'PICC', 'Multi Lumen', 'Dialysis', 'Triple Introducer',
            'Pre-Sep', 'Hickman', 'Portacath', 'Cordis/Introducer',
            'Continuous Cardiac Output PA', 'PA'
        )
        AND EXTRACT(EPOCH FROM (il.endtime - il.starttime))/24/3600 >= 2  -- Line in place >2 days
),
-- Check for potential secondary BSIs
other_infections AS (
    SELECT DISTINCT
        hadm_id,
        charttime
    FROM mimiciv_hosp.microbiologyevents
    WHERE spec_type_desc NOT IN ('BLOOD CULTURE', '')
        AND org_name IS NOT NULL
),
blood_cultures AS (
    SELECT 
        qs.stay_id,
        qs.subject_id,
        qs.hadm_id,
        me.charttime,
        me.spec_type_desc,
        me.org_name,
        CASE 
            WHEN LOWER(me.org_name) SIMILAR TO '%(coagulase-negative staphylococci|staphylococcus epidermidis|staphylococcus haemolyticus|staphylococcus hominis|propionibacterium|corynebacterium|diphtheroids|bacillus species|micrococcus)%' 
                THEN 'common_commensal'
            WHEN LOWER(me.org_name) SIMILAR TO '%(campylobacter|salmonella|shigella|listeria|vibrio|yersinia|difficile|enterohemorrhagic|enteropathogenic|blastomyces|histoplasma|coccidioides|paracoccidioides|cryptococcus|pneumocystis)%' 
                THEN 'excluded'
            ELSE 'recognized_pathogen'
        END as organism_type
    FROM first_lines qs
    INNER JOIN mimiciv_hosp.microbiologyevents me 
        ON qs.hadm_id = me.hadm_id
    WHERE me.spec_type_desc = 'BLOOD CULTURE'
        AND me.org_name IS NOT NULL
        AND me.charttime > qs.line_placement_time + INTERVAL '2 days'
        AND me.charttime <= qs.line_removal_time
        AND qs.line_number = 1  -- Only consider first line
        -- Exclude if other infection within ±3 days
        AND NOT EXISTS (
            SELECT 1 
            FROM other_infections oi 
            WHERE oi.hadm_id = me.hadm_id 
                AND oi.charttime BETWEEN me.charttime - INTERVAL '3 days' 
                AND me.charttime + INTERVAL '3 days'
        )
),
clabsi_events AS (
    SELECT DISTINCT
        stay_id,
        subject_id,
        hadm_id,
        MIN(charttime) as infection_date
    FROM blood_cultures bc
    WHERE (organism_type = 'recognized_pathogen')
       OR (organism_type = 'common_commensal' 
           AND EXISTS (
               SELECT 1 
               FROM blood_cultures bc2 
               WHERE bc2.stay_id = bc.stay_id 
                   AND bc2.org_name = bc.org_name 
                   AND bc2.charttime != bc.charttime
                   AND bc2.charttime <= bc.charttime + INTERVAL '2 days'
           ))
    GROUP BY stay_id, subject_id, hadm_id
),
mortality_outcomes AS (
    SELECT 
        qs.stay_id,
        qs.subject_id,
        qs.hadm_id,
        qs.line_placement_time,
        CASE 
            WHEN p.dod IS NOT NULL 
                AND p.dod <= (qs.line_placement_time + INTERVAL '30 days')
                THEN 1
            ELSE 0
        END as mortality_30d,
        p.dod as death_date
    FROM first_lines qs
    LEFT JOIN mimiciv_hosp.patients p 
        ON qs.subject_id = p.subject_id
    WHERE qs.line_number = 1  -- Only consider first line
)
SELECT 
    qs.*,
    CASE 
        WHEN ce.stay_id IS NOT NULL THEN 1
        ELSE 0
    END as has_clabsi,
    ce.infection_date,
    mo.mortality_30d,
    mo.death_date
FROM first_lines qs
LEFT JOIN clabsi_events ce 
    ON qs.stay_id = ce.stay_id
LEFT JOIN mortality_outcomes mo 
    ON qs.stay_id = mo.stay_id
WHERE qs.line_number = 1  -- Only include first line for each stay
ORDER BY qs.subject_id, qs.icu_admission;
""")

# Execute query and create initial cohort
try:
    cohort_df = pd.read_sql(cohort_query, engine)
    print(f"Successfully created cohort with {len(cohort_df)} rows")
    print("\nCohort Summary:")
    print(f"Total patients: {cohort_df['subject_id'].nunique()}")
    print(f"CLABSI cases: {cohort_df['has_clabsi'].sum()}")
    print(f"30-day mortality cases: {cohort_df['mortality_30d'].sum()}")
    
    # Additional statistics
    print("\nDetailed Statistics:")
    print(f"CLABSI rate: {(cohort_df['has_clabsi'].sum() / len(cohort_df) * 100):.2f}%")
    print(f"30-day mortality rate: {(cohort_df['mortality_30d'].sum() / len(cohort_df) * 100):.2f}%")
    print("\nLine Type Distribution:")
    print(cohort_df['line_type'].value_counts())
    
    # Average age and LOS
    print("\nPatient Demographics:")
    print(f"Mean age: {cohort_df['admission_age'].mean():.1f} years")
    print(f"Median ICU LOS: {cohort_df['icu_los_hours'].median()/24:.1f} days")
    
except Exception as e:
    print(f"Error executing cohort query: {e}")

Successfully created cohort with 13328 rows

Cohort Summary:
Total patients: 13328
CLABSI cases: 114
30-day mortality cases: 3004

Detailed Statistics:
CLABSI rate: 0.86%
30-day mortality rate: 22.54%

Line Type Distribution:
line_type
Multi Lumen                     7461
PICC                            3048
Cordis/Introducer               1169
Dialysis                         567
PA                               516
Portacath                        318
Continuous Cardiac Output PA     151
Pre-Sep                           49
Hickman                           39
Triple Introducer                 10
Name: count, dtype: int64

Patient Demographics:
Mean age: 65.4 years
Median ICU LOS: 5.7 days


In [3]:
# ============================================
# NEXT CELL in your Jupyter Notebook
# ============================================

import numpy as np

# Shuffle your DataFrame (optional but common)
cohort_df = cohort_df.sample(frac=1.0, random_state=42).reset_index(drop=True)

# Create a random number column for splitting
np.random.seed(42)  # ensures reproducible splits
cohort_df['rand_val'] = np.random.rand(len(cohort_df))

# Split into training (70%) and validation (30%) sets
train_df = cohort_df[cohort_df['rand_val'] < 0.7].copy()
val_df   = cohort_df[cohort_df['rand_val'] >= 0.7].copy()

print("============= DATA SPLIT =============")
print(f"Total rows in cohort_df: {len(cohort_df):,}")
print(f"Training set size:       {len(train_df):,}")
print(f"Validation set size:     {len(val_df):,}")

train_clabsi_count = train_df['has_clabsi'].sum()
val_clabsi_count   = val_df['has_clabsi'].sum()
print(f"\nTraining CLABSI cases: {train_clabsi_count} "
      f"({train_clabsi_count/len(train_df)*100:.2f}%)")
print(f"Validation CLABSI cases: {val_clabsi_count} "
      f"({val_clabsi_count/len(val_df)*100:.2f}%)")

# If you'd like to also check mortality distribution:
train_mort_count = train_df['mortality_30d'].sum()
val_mort_count   = val_df['mortality_30d'].sum()
print(f"\nTraining 30-day mortality: {train_mort_count} "
      f"({train_mort_count/len(train_df)*100:.2f}%)")
print(f"Validation 30-day mortality: {val_mort_count} "
      f"({val_mort_count/len(val_df)*100:.2f}%)")

# Now you have train_df and val_df for further modeling.


Total rows in cohort_df: 13,328
Training set size:       9,431
Validation set size:     3,897

Training CLABSI cases: 83 (0.88%)
Validation CLABSI cases: 31 (0.80%)

Training 30-day mortality: 2092 (22.18%)
Validation 30-day mortality: 912 (23.40%)


In [4]:
# Extract stay_ids from our existing cohort splits and convert to regular Python integers
stay_ids = tuple(int(x) for x in cohort_df['stay_id'].unique())

# Demographics feature query
demo_query = text("""
SELECT 
    ie.stay_id,
    p.gender,
    p.anchor_age + EXTRACT(EPOCH FROM adm.admittime - MAKE_TIMESTAMP(p.anchor_year, 1, 1, 0, 0, 0))/31556908.8 AS age,
    adm.race as ethnicity
FROM mimiciv_icu.icustays ie
INNER JOIN mimiciv_hosp.patients p 
    ON ie.subject_id = p.subject_id
INNER JOIN mimiciv_hosp.admissions adm 
    ON ie.hadm_id = adm.hadm_id
WHERE ie.stay_id IN :stay_ids
""").bindparams(stay_ids=stay_ids)

# Execute query and join to cohort
try:
    demo_df = pd.read_sql(demo_query, engine)
    
    # Join to existing splits
    train_demo = train_df.merge(demo_df, on='stay_id', how='left')
    val_demo = val_df.merge(demo_df, on='stay_id', how='left')
    
    print("\nDemographic Features Summary:")
    print("-" * 30)
    print("\nTraining Set:")
    print(f"Age (mean ± std): {train_demo['age'].mean():.1f} ± {train_demo['age'].std():.1f}")
    print("\nGender distribution:")
    print(train_demo['gender'].value_counts(normalize=True).multiply(100).round(1))
    print("\nEthnicity distribution:")
    print(train_demo['ethnicity'].value_counts(normalize=True).multiply(100).round(1))
    
    # Check for any missing values
    print("\nMissing values:")
    print(demo_df.isnull().sum())
    
except Exception as e:
    print(f"Error in demographic feature extraction: {e}")


Demographic Features Summary:
------------------------------

Training Set:
Age (mean ± std): 65.4 ± 16.1

Gender distribution:
gender
M    56.0
F    44.0
Name: proportion, dtype: float64

Ethnicity distribution:
ethnicity
WHITE                                        61.2
UNKNOWN                                      15.8
BLACK/AFRICAN AMERICAN                        6.8
OTHER                                         3.5
UNABLE TO OBTAIN                              1.5
WHITE - OTHER EUROPEAN                        1.5
HISPANIC/LATINO - PUERTO RICAN                1.2
ASIAN                                         1.0
ASIAN - CHINESE                               1.0
HISPANIC/LATINO - DOMINICAN                   0.7
HISPANIC OR LATINO                            0.7
WHITE - RUSSIAN                               0.6
PATIENT DECLINED TO ANSWER                    0.6
BLACK/CAPE VERDEAN                            0.6
BLACK/CARIBBEAN ISLAND                        0.6
BLACK/AFRICAN             

In [5]:
# Convert stay_ids to regular Python integers
stay_ids = tuple(int(x) for x in cohort_df['stay_id'].unique())

# Lab values query for first 24 hours
lab_query = text("""
WITH first_day_labs AS (
    SELECT 
        ie.stay_id,
        -- CBC
        AVG(cbc.wbc) as wbc_mean,
        MIN(cbc.wbc) as wbc_min,
        MAX(cbc.wbc) as wbc_max,
        AVG(cbc.platelet) as platelet_mean,
        MIN(cbc.platelet) as platelet_min,
        MAX(cbc.platelet) as platelet_max,
        AVG(cbc.hemoglobin) as hemoglobin_mean,
        MIN(cbc.hemoglobin) as hemoglobin_min,
        MAX(cbc.hemoglobin) as hemoglobin_max,
        -- Chemistry
        AVG(chem.aniongap) as aniongap_mean,
        AVG(chem.bicarbonate) as bicarbonate_mean,
        AVG(chem.creatinine) as creatinine_mean,
        AVG(chem.chloride) as chloride_mean,
        AVG(chem.glucose) as glucose_mean,
        AVG(chem.sodium) as sodium_mean,
        AVG(chem.potassium) as potassium_mean,
        -- Coagulation
        AVG(coag.inr) as inr_mean,
        AVG(coag.pt) as pt_mean,
        AVG(coag.ptt) as ptt_mean
    FROM mimiciv_icu.icustays ie
    LEFT JOIN mimiciv_derived.complete_blood_count cbc
        ON ie.subject_id = cbc.subject_id
        AND cbc.charttime >= ie.intime
        AND cbc.charttime <= ie.intime + INTERVAL '24 hours'
    LEFT JOIN mimiciv_derived.chemistry chem
        ON ie.subject_id = chem.subject_id
        AND chem.charttime >= ie.intime
        AND chem.charttime <= ie.intime + INTERVAL '24 hours'
    LEFT JOIN mimiciv_derived.coagulation coag
        ON ie.subject_id = coag.subject_id
        AND coag.charttime >= ie.intime
        AND coag.charttime <= ie.intime + INTERVAL '24 hours'
    WHERE ie.stay_id IN :stay_ids
    GROUP BY ie.stay_id
)
SELECT * FROM first_day_labs;
""").bindparams(stay_ids=stay_ids)

# Execute query
try:
    lab_df = pd.read_sql(lab_query, engine)
    
    print("\nLaboratory Features Summary:")
    print("-" * 30)
    print(f"\nTotal rows: {len(lab_df)}")
    
    # Basic statistics for each lab value
    print("\nLab Value Statistics:")
    print(lab_df.describe().round(2))
    
    # Missing value analysis
    missing_counts = lab_df.isnull().sum()
    missing_percents = (lab_df.isnull().sum() / len(lab_df) * 100).round(2)
    print("\nMissing Values (Count and Percentage):")
    for col, count in missing_counts.items():
        if count > 0:
            print(f"{col}: {count} ({missing_percents[col]}%)")

except Exception as e:
    print(f"Error in lab feature extraction: {e}")


Laboratory Features Summary:
------------------------------

Total rows: 13328

Lab Value Statistics:
           stay_id  wbc_mean   wbc_min   wbc_max  platelet_mean  platelet_min  \
count     13328.00  13212.00  13212.00  13212.00       13213.00      13213.00   
mean   34981256.06     13.67     11.83     15.61         196.15        180.21   
std     2898093.69      9.52      8.12     11.48         106.80        104.78   
min    30003598.00      0.10      0.10      0.10           7.33          5.00   
25%    32428727.00      8.85      7.50      9.90         125.25        109.00   
50%    34979983.00     12.25     10.60     13.80         178.00        162.00   
75%    37474407.00     16.35     14.30     18.80         245.67        230.00   
max    39999230.00    318.58    225.30    378.30        2180.50       2001.00   

       platelet_max  hemoglobin_mean  hemoglobin_min  hemoglobin_max  \
count      13213.00         13211.00        13211.00        13211.00   
mean         212.99    

In [6]:
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler
import numpy as np

# Separate lab features for processing
lab_features = lab_df.drop('stay_id', axis=1)

# Initialize KNN imputer and Min-Max scaler
imputer = KNNImputer(n_neighbors=5)
scaler = MinMaxScaler()

# Impute missing values
print("Imputing missing values...")
lab_features_imputed = pd.DataFrame(
    imputer.fit_transform(lab_features),
    columns=lab_features.columns,
    index=lab_features.index
)

# Scale the features
print("Scaling features...")
lab_features_scaled = pd.DataFrame(
    scaler.fit_transform(lab_features_imputed),
    columns=lab_features.columns,
    index=lab_features.index
)

# Add back stay_id
lab_features_processed = lab_features_scaled.copy()
lab_features_processed['stay_id'] = lab_df['stay_id']

print("\nProcessed Lab Features Summary:")
print("-" * 30)
print("\nStatistics after processing:")
print(lab_features_processed.describe().round(3))

# Store transformers for later use on validation set
lab_transformers = {
    'imputer': imputer,
    'scaler': scaler
}

Imputing missing values...
Scaling features...

Processed Lab Features Summary:
------------------------------

Statistics after processing:
        wbc_mean    wbc_min    wbc_max  platelet_mean  platelet_min  \
count  13328.000  13328.000  13328.000      13328.000     13328.000   
mean       0.043      0.052      0.041          0.087         0.088   
std        0.030      0.036      0.030          0.049         0.052   
min        0.000      0.000      0.000          0.000         0.000   
25%        0.028      0.033      0.026          0.054         0.053   
50%        0.038      0.047      0.036          0.079         0.079   
75%        0.051      0.063      0.049          0.109         0.112   
max        1.000      1.000      1.000          1.000         1.000   

       platelet_max  hemoglobin_mean  hemoglobin_min  hemoglobin_max  \
count     13328.000        13328.000       13328.000       13328.000   
mean          0.087            0.414           0.463           0.418   
std

In [7]:
# Convert stay_ids to regular Python integers
stay_ids = tuple(int(x) for x in cohort_df['stay_id'].unique())

# Vital signs query for first 24 hours
vital_query = text("""
WITH first_day_vitals AS (
    SELECT 
        ie.stay_id,
        AVG(vs.heart_rate) as heart_rate_mean,
        MIN(vs.heart_rate) as heart_rate_min,
        MAX(vs.heart_rate) as heart_rate_max,
        AVG(vs.sbp) as sbp_mean,
        MIN(vs.sbp) as sbp_min,
        MAX(vs.sbp) as sbp_max,
        AVG(vs.dbp) as dbp_mean,
        MIN(vs.dbp) as dbp_min,
        MAX(vs.dbp) as dbp_max,
        AVG(vs.mbp) as mbp_mean,
        MIN(vs.mbp) as mbp_min,
        MAX(vs.mbp) as mbp_max,
        AVG(vs.resp_rate) as resp_rate_mean,
        MIN(vs.resp_rate) as resp_rate_min,
        MAX(vs.resp_rate) as resp_rate_max,
        AVG(vs.temperature) as temperature_mean,
        MIN(vs.temperature) as temperature_min,
        MAX(vs.temperature) as temperature_max,
        AVG(vs.spo2) as spo2_mean,
        MIN(vs.spo2) as spo2_min,
        MAX(vs.spo2) as spo2_max
    FROM mimiciv_icu.icustays ie
    LEFT JOIN mimiciv_derived.vitalsign vs
        ON ie.subject_id = vs.subject_id
        AND vs.charttime >= ie.intime
        AND vs.charttime <= ie.intime + INTERVAL '24 hours'
    WHERE ie.stay_id IN :stay_ids
    GROUP BY ie.stay_id
)
SELECT * FROM first_day_vitals;
""").bindparams(stay_ids=stay_ids)

# Execute query
try:
    vitals_df = pd.read_sql(vital_query, engine)
    
    print("\nVital Signs Summary:")
    print("-" * 30)
    print(f"\nTotal rows: {len(vitals_df)}")
    
    # Basic statistics for vitals
    print("\nVital Signs Statistics:")
    print(vitals_df.describe().round(2))
    
    # Missing value analysis
    missing_counts = vitals_df.isnull().sum()
    missing_percents = (vitals_df.isnull().sum() / len(vitals_df) * 100).round(2)
    print("\nMissing Values (Count and Percentage):")
    for col, count in missing_counts.items():
        if count > 0:
            print(f"{col}: {count} ({missing_percents[col]}%)")

except Exception as e:
    print(f"Error in vital signs extraction: {e}")


Vital Signs Summary:
------------------------------

Total rows: 13328

Vital Signs Statistics:
           stay_id  heart_rate_mean  heart_rate_min  heart_rate_max  sbp_mean  \
count     13328.00         13301.00        13301.00        13301.00  13300.00   
mean   34981256.06            87.36           71.88          107.34    114.95   
std     2898093.69            16.63           16.08           21.77     15.09   
min    30003598.00            37.58            1.00           49.00     66.08   
25%    32428727.00            75.65           61.00           91.00    104.38   
50%    34979983.00            85.58           70.00          105.00    112.17   
75%    37474407.00            98.12           82.00          120.00    123.04   
max    39999230.00           161.44          143.00          257.00    198.72   

        sbp_min   sbp_max  dbp_mean   dbp_min   dbp_max  ...   mbp_max  \
count  13300.00  13300.00  13300.00  13300.00  13300.00  ...  13301.00   
mean      86.62    148.42

In [8]:
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler
import numpy as np

# Separate vital features for processing
vital_features = vitals_df.drop('stay_id', axis=1)

# Initialize KNN imputer and Min-Max scaler
imputer = KNNImputer(n_neighbors=5)
scaler = MinMaxScaler()

# Impute missing values
print("Imputing missing values...")
vital_features_imputed = pd.DataFrame(
    imputer.fit_transform(vital_features),
    columns=vital_features.columns,
    index=vital_features.index
)

# Scale the features
print("Scaling features...")
vital_features_scaled = pd.DataFrame(
    scaler.fit_transform(vital_features_imputed),
    columns=vital_features.columns,
    index=vital_features.index
)

# Add back stay_id
vital_features_processed = vital_features_scaled.copy()
vital_features_processed['stay_id'] = vitals_df['stay_id']

print("\nProcessed Vital Signs Summary:")
print("-" * 30)
print("\nStatistics after processing:")
print(vital_features_processed.describe().round(3))

# Store transformers for later use on validation set
vital_transformers = {
    'imputer': imputer,
    'scaler': scaler
}

Imputing missing values...
Scaling features...

Processed Vital Signs Summary:
------------------------------

Statistics after processing:
       heart_rate_mean  heart_rate_min  heart_rate_max   sbp_mean    sbp_min  \
count        13328.000       13328.000       13328.000  13328.000  13328.000   
mean             0.402           0.499           0.280      0.368      0.459   
std              0.134           0.113           0.105      0.114      0.089   
min              0.000           0.000           0.000      0.000      0.000   
25%              0.307           0.423           0.202      0.289      0.406   
50%              0.388           0.486           0.269      0.348      0.456   
75%              0.489           0.570           0.341      0.429      0.506   
max              1.000           1.000           1.000      1.000      1.000   

         sbp_max   dbp_mean    dbp_min    dbp_max   mbp_mean  ...  \
count  13328.000  13328.000  13328.000  13328.000  13328.000  ...   
m

In [9]:
# Convert stay_ids to regular Python integers
stay_ids = tuple(int(x) for x in cohort_df['stay_id'].unique())

# Clinical scores query with the correct sofa table
scores_query = text("""
SELECT 
    ie.stay_id,
    s.sofa AS sofa_score,
    a.apsiii AS apsiii_score,
    sa.sapsii AS sapsii_score
FROM mimiciv_icu.icustays ie
LEFT JOIN mimiciv_derived.first_day_sofa s
    ON ie.stay_id = s.stay_id
LEFT JOIN mimiciv_derived.apsiii a
    ON ie.stay_id = a.stay_id
LEFT JOIN mimiciv_derived.sapsii sa
    ON ie.stay_id = sa.stay_id
WHERE ie.stay_id IN :stay_ids;
""").bindparams(stay_ids=stay_ids)

# Execute query
try:
    scores_df = pd.read_sql(scores_query, engine)
    
    print("\nClinical Scores Summary:")
    print("-" * 30)
    print(f"\nTotal rows: {len(scores_df)}")
    
    # Basic statistics
    print("\nScores Statistics:")
    print(scores_df.describe().round(2))
    
    # Missing value analysis
    missing_counts = scores_df.isnull().sum()
    missing_percents = (scores_df.isnull().sum() / len(scores_df) * 100).round(2)
    print("\nMissing Values (Count and Percentage):")
    for col, count in missing_counts.items():
        if count > 0:
            print(f"{col}: {count} ({missing_percents[col]}%)")

except Exception as e:
    print(f"Error in scores extraction: {e}")



Clinical Scores Summary:
------------------------------

Total rows: 13328

Scores Statistics:
           stay_id  sofa_score  apsiii_score  sapsii_score
count     13328.00    13328.00      13328.00      13328.00
mean   34981256.06        6.56         52.85         42.03
std     2898093.69        3.84         22.89         14.63
min    30003598.00        0.00          3.00          0.00
25%    32428727.00        4.00         36.00         32.00
50%    34979983.00        6.00         49.00         40.00
75%    37474407.00        9.00         66.00         51.00
max    39999230.00       23.00        184.00        115.00

Missing Values (Count and Percentage):


In [10]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np

# Separate score features for processing
score_features = scores_df.drop('stay_id', axis=1)

# Initialize Min-Max scaler
scaler = MinMaxScaler()

# Scale the features
print("Scaling features...")
score_features_scaled = pd.DataFrame(
    scaler.fit_transform(score_features),
    columns=score_features.columns,
    index=score_features.index
)

# Add back stay_id
score_features_processed = score_features_scaled.copy()
score_features_processed['stay_id'] = scores_df['stay_id']

print("\nProcessed Clinical Scores Summary:")
print("-" * 30)
print("\nStatistics after processing:")
print(score_features_processed.describe().round(3))

# Store transformers for later use on validation set
score_transformers = {
    'scaler': scaler
}

Scaling features...

Processed Clinical Scores Summary:
------------------------------

Statistics after processing:
       sofa_score  apsiii_score  sapsii_score       stay_id
count   13328.000     13328.000     13328.000  1.332800e+04
mean        0.285         0.275         0.365  3.498126e+07
std         0.167         0.126         0.127  2.898094e+06
min         0.000         0.000         0.000  3.000360e+07
25%         0.174         0.182         0.278  3.242873e+07
50%         0.261         0.254         0.348  3.497998e+07
75%         0.391         0.348         0.443  3.747441e+07
max         1.000         1.000         1.000  3.999923e+07


In [11]:
# Convert stay_ids to regular Python integers
stay_ids = tuple(int(x) for x in cohort_df['stay_id'].unique())

# Revised comorbidity query with correct column names
comorbidity_query = text("""
WITH cohort_admissions AS (
    SELECT DISTINCT ie.subject_id, ie.hadm_id, ie.stay_id
    FROM mimiciv_icu.icustays ie
    WHERE ie.stay_id IN :stay_ids
)
SELECT 
    ca.stay_id,
    COALESCE(ch.myocardial_infarct, 0) as mi,
    COALESCE(ch.congestive_heart_failure, 0) as chf,
    COALESCE(ch.chronic_pulmonary_disease, 0) as copd,
    COALESCE(GREATEST(COALESCE(ch.diabetes_without_cc, 0), COALESCE(ch.diabetes_with_cc, 0)), 0) as diabetes,
    COALESCE(ch.mild_liver_disease, 0) as liver_disease,
    COALESCE(ch.cerebrovascular_disease, 0) as cva,
    COALESCE(ch.malignant_cancer, 0) as cancer,
    COALESCE(ch.aids, 0) as aids
FROM cohort_admissions ca
LEFT JOIN mimiciv_derived.charlson ch
    ON ca.hadm_id = ch.hadm_id
ORDER BY ca.stay_id;
""").bindparams(stay_ids=stay_ids)

# Execute query
try:
    comorbidity_df = pd.read_sql(comorbidity_query, engine)
    
    print("\nComorbidity Summary:")
    print("-" * 30)
    print(f"\nTotal rows: {len(comorbidity_df)}")
    
    # Prevalence of each comorbidity
    print("\nComorbidity Prevalence:")
    for col in comorbidity_df.columns:
        if col != 'stay_id':
            n_cases = comorbidity_df[col].sum()
            pct = (n_cases / len(comorbidity_df) * 100)
            print(f"{col}: {n_cases} cases ({pct:.1f}%)")
    
    # Missing value analysis
    missing_counts = comorbidity_df.isnull().sum()
    missing_percents = (comorbidity_df.isnull().sum() / len(comorbidity_df) * 100).round(2)
    print("\nMissing Values (Count and Percentage):")
    for col, count in missing_counts.items():
        if count > 0:
            print(f"{col}: {count} ({missing_percents[col]}%)")

except Exception as e:
    print(f"Error in comorbidity extraction: {e}")


Comorbidity Summary:
------------------------------

Total rows: 13328

Comorbidity Prevalence:
mi: 2523 cases (18.9%)
chf: 4180 cases (31.4%)
copd: 3515 cases (26.4%)
diabetes: 4185 cases (31.4%)
liver_disease: 2059 cases (15.4%)
cva: 2356 cases (17.7%)
cancer: 1690 cases (12.7%)
aids: 80 cases (0.6%)

Missing Values (Count and Percentage):


In [12]:
# Convert stay_ids to regular Python integers
stay_ids = tuple(int(x) for x in cohort_df['stay_id'].unique())

# RRT query using correct derived table
rrt_query = text("""
SELECT 
    s.stay_id,
    COALESCE(MAX(CASE WHEN r.dialysis_active = 1 THEN 1 ELSE 0 END), 0) as rrt
FROM (SELECT DISTINCT stay_id FROM mimiciv_icu.icustays WHERE stay_id IN :stay_ids) s
LEFT JOIN mimiciv_derived.rrt r
    ON s.stay_id = r.stay_id
GROUP BY s.stay_id
ORDER BY s.stay_id;
""").bindparams(stay_ids=stay_ids)

# Execute query
try:
    rrt_df = pd.read_sql(rrt_query, engine)
    
    print("\nRRT Summary:")
    print("-" * 30)
    print(f"\nTotal rows: {len(rrt_df)}")
    
    # RRT prevalence
    n_cases = rrt_df['rrt'].sum()
    pct = (n_cases / len(rrt_df) * 100)
    print(f"\nRRT cases: {n_cases} ({pct:.1f}%)")
    
    # Missing value analysis
    missing_counts = rrt_df.isnull().sum()
    missing_percents = (rrt_df.isnull().sum() / len(rrt_df) * 100).round(2)
    print("\nMissing Values (Count and Percentage):")
    for col, count in missing_counts.items():
        if count > 0:
            print(f"{col}: {count} ({missing_percents[col]}%)")

except Exception as e:
    print(f"Error in RRT extraction: {e}")


RRT Summary:
------------------------------

Total rows: 13328

RRT cases: 1773 (13.3%)

Missing Values (Count and Percentage):


In [13]:
# Convert stay_ids to regular Python integers
stay_ids = tuple(int(x) for x in cohort_df['stay_id'].unique())

# Tracheostomy query - simply looking for presence of tracheostomy
trach_query = text("""
SELECT 
    s.stay_id,
    CASE 
        WHEN COUNT(v.ventilation_status) > 0 THEN 1
        ELSE 0
    END as has_tracheostomy
FROM (SELECT DISTINCT stay_id FROM mimiciv_icu.icustays WHERE stay_id IN :stay_ids) s
LEFT JOIN mimiciv_derived.ventilation v
    ON s.stay_id = v.stay_id
    AND v.ventilation_status = 'Tracheostomy'
GROUP BY s.stay_id
ORDER BY s.stay_id;
""").bindparams(stay_ids=stay_ids)

# Execute query
try:
    trach_df = pd.read_sql(trach_query, engine)
    
    print("\nTracheostomy Summary:")
    print("-" * 30)
    print(f"\nTotal rows: {len(trach_df)}")
    
    # Tracheostomy prevalence
    n_cases = trach_df['has_tracheostomy'].sum()
    pct = (n_cases / len(trach_df) * 100)
    print(f"\nTracheostomy cases: {n_cases} ({pct:.1f}%)")
    
    # Missing value analysis
    missing_counts = trach_df.isnull().sum()
    missing_percents = (trach_df.isnull().sum() / len(trach_df) * 100).round(2)
    print("\nMissing Values (Count and Percentage):")
    for col, count in missing_counts.items():
        if count > 0:
            print(f"{col}: {count} ({missing_percents[col]}%)")

except Exception as e:
    print(f"Error in tracheostomy extraction: {e}")


Tracheostomy Summary:
------------------------------

Total rows: 13328

Tracheostomy cases: 797 (6.0%)

Missing Values (Count and Percentage):


In [14]:
# Convert stay_ids to regular Python integers
stay_ids = tuple(int(x) for x in cohort_df['stay_id'].unique())

# Ostomy query focusing just on presence of ostomies
ostomy_query = text("""
WITH ostomy_status AS (
    SELECT 
        s.stay_id,
        MAX(CASE WHEN 
            ce.itemid IN (
                227458,  -- Ostomy
                227637,  -- Ostomy Care
                228341,  -- Ostomy Bag
                228342,  -- Ostomy Output
                228343,  -- Ostomy Type
                228344,  -- Ostomy/RNWL
                228345,  -- Ostomy/RNTL
                228346,  -- Ostomy/RNY
                228347   -- Ostomy/ROY
            )
        THEN 1 ELSE 0 END) as has_ostomy
    FROM (SELECT DISTINCT stay_id FROM mimiciv_icu.icustays WHERE stay_id IN :stay_ids) s
    LEFT JOIN mimiciv_icu.chartevents ce 
        ON s.stay_id = ce.stay_id
    GROUP BY s.stay_id
)
SELECT 
    s.stay_id,
    COALESCE(os.has_ostomy, 0) as has_ostomy
FROM (SELECT DISTINCT stay_id FROM mimiciv_icu.icustays WHERE stay_id IN :stay_ids) s
LEFT JOIN ostomy_status os 
    ON s.stay_id = os.stay_id
ORDER BY s.stay_id;
""").bindparams(stay_ids=stay_ids)

# Execute query
try:
    ostomy_df = pd.read_sql(ostomy_query, engine)
    
    print("\nOstomy Summary:")
    print("-" * 30)
    print(f"\nTotal rows: {len(ostomy_df)}")
    
    # Ostomy prevalence
    n_cases = ostomy_df['has_ostomy'].sum()
    pct = (n_cases / len(ostomy_df) * 100)
    print(f"\nOstomy cases: {n_cases} ({pct:.1f}%)")
    
    # Missing value analysis
    missing_counts = ostomy_df.isnull().sum()
    missing_percents = (ostomy_df.isnull().sum() / len(ostomy_df) * 100).round(2)
    print("\nMissing Values (Count and Percentage):")
    for col, count in missing_counts.items():
        if count > 0:
            print(f"{col}: {count} ({missing_percents[col]}%)")

except Exception as e:
    print(f"Error in ostomy extraction: {e}")


Ostomy Summary:
------------------------------

Total rows: 13328

Ostomy cases: 550 (4.1%)

Missing Values (Count and Percentage):


In [15]:
# Convert stay_ids to regular Python integers
stay_ids = tuple(int(x) for x in cohort_df['stay_id'].unique())

# Query for LOS before CVC
cvc_timing_query = text("""
SELECT 
    s.stay_id,
    EXTRACT(EPOCH FROM (MIN(il.starttime) - icu.intime))/3600 as los_before_cvc
FROM (SELECT DISTINCT stay_id FROM mimiciv_icu.icustays WHERE stay_id IN :stay_ids) s
LEFT JOIN mimiciv_icu.icustays icu
    ON s.stay_id = icu.stay_id
LEFT JOIN mimiciv_derived.invasive_line il
    ON s.stay_id = il.stay_id
    AND il.line_type IN ('PICC', 'Multi Lumen', 'Dialysis', 'Triple Introducer',
                        'Pre-Sep', 'Hickman', 'Portacath', 'Cordis/Introducer',
                        'Continuous Cardiac Output PA', 'PA')
GROUP BY s.stay_id, icu.intime
ORDER BY s.stay_id;
""").bindparams(stay_ids=stay_ids)

# Execute query
try:
    cvc_timing_df = pd.read_sql(cvc_timing_query, engine)
    
    print("\nCVC Timing Summary:")
    print("-" * 30)
    print(f"\nTotal rows: {len(cvc_timing_df)}")
    
    # Basic statistics
    print("\nLOS before CVC (hours) statistics:")
    print(cvc_timing_df['los_before_cvc'].describe())
    
    # Missing value analysis
    missing_counts = cvc_timing_df.isnull().sum()
    missing_percents = (cvc_timing_df.isnull().sum() / len(cvc_timing_df) * 100).round(2)
    print("\nMissing Values (Count and Percentage):")
    for col, count in missing_counts.items():
        if count > 0:
            print(f"{col}: {count} ({missing_percents[col]}%)")

except Exception as e:
    print(f"Error in CVC timing extraction: {e}")


CVC Timing Summary:
------------------------------

Total rows: 13328

LOS before CVC (hours) statistics:
count    13328.000000
mean        17.031826
std         37.703097
min       -336.261667
25%          0.936875
50%          3.489306
75%         14.437083
max        772.566667
Name: los_before_cvc, dtype: float64

Missing Values (Count and Percentage):


In [16]:
# Clean and scale the CVC timing data
cvc_timing_processed = cvc_timing_df.copy()

# 1. Handle negative values - set them to 0 since they likely represent lines present on admission
cvc_timing_processed['los_before_cvc'] = cvc_timing_processed['los_before_cvc'].clip(lower=0)

# 2. Scale the data using MinMaxScaler
scaler = MinMaxScaler()
cvc_timing_processed['los_before_cvc_scaled'] = scaler.fit_transform(
    cvc_timing_processed[['los_before_cvc']]
)

# Show results
print("\nProcessed CVC Timing Summary:")
print("-" * 30)
print("\nOriginal LOS before CVC (hours) statistics:")
print(cvc_timing_processed['los_before_cvc'].describe())
print("\nScaled LOS before CVC statistics:")
print(cvc_timing_processed['los_before_cvc_scaled'].describe())

# Store transformer for validation set
cvc_transformers = {
    'scaler': scaler
}


Processed CVC Timing Summary:
------------------------------

Original LOS before CVC (hours) statistics:
count    13328.000000
mean        17.117585
std         37.432228
min          0.000000
25%          0.936875
50%          3.489306
75%         14.437083
max        772.566667
Name: los_before_cvc, dtype: float64

Scaled LOS before CVC statistics:
count    13328.000000
mean         0.022157
std          0.048452
min          0.000000
25%          0.001213
50%          0.004517
75%          0.018687
max          1.000000
Name: los_before_cvc_scaled, dtype: float64


In [17]:
# Convert stay_ids to regular Python integers
stay_ids = tuple(int(x) for x in cohort_df['stay_id'].unique())

# Query for multiple lines - simplified version
lines_query = text("""
WITH line_counts AS (
    SELECT 
        s.stay_id,
        COUNT(DISTINCT il.line_type) as unique_line_types,
        CASE WHEN COUNT(*) > 1 THEN 1 ELSE 0 END as multiple_lines
    FROM (SELECT DISTINCT stay_id FROM mimiciv_icu.icustays WHERE stay_id IN :stay_ids) s
    LEFT JOIN mimiciv_derived.invasive_line il
        ON s.stay_id = il.stay_id
    WHERE il.line_type IN (
        'PICC', 
        'Multi Lumen', 
        'Dialysis', 
        'Triple Introducer',
        'Pre-Sep', 
        'Hickman', 
        'Portacath', 
        'Cordis/Introducer',
        'Continuous Cardiac Output PA', 
        'PA'
    )
    GROUP BY s.stay_id
)
SELECT 
    s.stay_id,
    COALESCE(lc.multiple_lines, 0) as multiple_lines,
    COALESCE(lc.unique_line_types, 0) as n_line_types
FROM (SELECT DISTINCT stay_id FROM mimiciv_icu.icustays WHERE stay_id IN :stay_ids) s
LEFT JOIN line_counts lc
    ON s.stay_id = lc.stay_id
ORDER BY s.stay_id;
""").bindparams(stay_ids=stay_ids)

# Execute query
try:
    lines_df = pd.read_sql(lines_query, engine)
    
    print("\nMultiple Lines Summary:")
    print("-" * 30)
    print(f"\nTotal rows: {len(lines_df)}")
    
    # Multiple lines prevalence
    n_cases = lines_df['multiple_lines'].sum()
    pct = (n_cases / len(lines_df) * 100)
    print(f"\nMultiple lines cases: {n_cases} ({pct:.1f}%)")
    
    # Line type counts statistics
    print("\nLine Type Counts:")
    print(lines_df['n_line_types'].value_counts().sort_index())
    
    # Missing value analysis
    missing_counts = lines_df.isnull().sum()
    missing_percents = (lines_df.isnull().sum() / len(lines_df) * 100).round(2)
    print("\nMissing Values (Count and Percentage):")
    for col, count in missing_counts.items():
        if count > 0:
            print(f"{col}: {count} ({missing_percents[col]}%)")

except Exception as e:
    print(f"Error in lines extraction: {e}")


Multiple Lines Summary:
------------------------------

Total rows: 13328

Multiple lines cases: 5804 (43.5%)

Line Type Counts:
n_line_types
1    8307
2    3739
3    1018
4     207
5      48
6       8
7       1
Name: count, dtype: int64

Missing Values (Count and Percentage):


In [18]:
# First modify n_line_types to bin 4+ together
lines_df['n_line_types_binned'] = lines_df['n_line_types'].apply(lambda x: min(x, 4))

# Create dummy variables
line_types_encoded = pd.get_dummies(
    lines_df['n_line_types_binned'], 
    prefix='n_line_types'
)

# Add back stay_id and multiple_lines flag
lines_processed = pd.concat([
    lines_df[['stay_id', 'multiple_lines']], 
    line_types_encoded
], axis=1)

# Show results
print("\nProcessed Multiple Lines Summary:")
print("-" * 30)
print(f"\nTotal rows: {len(lines_processed)}")

print("\nEncoded columns:")
print(list(lines_processed.columns))

print("\nSample of first few rows:")
print(lines_processed.head())

print("\nDistribution of binned categories:")
for col in line_types_encoded.columns:
    count = line_types_encoded[col].sum()
    pct = (count / len(line_types_encoded) * 100)
    print(f"{col}: {count} cases ({pct:.1f}%)")


Processed Multiple Lines Summary:
------------------------------

Total rows: 13328

Encoded columns:
['stay_id', 'multiple_lines', 'n_line_types_1', 'n_line_types_2', 'n_line_types_3', 'n_line_types_4']

Sample of first few rows:
    stay_id  multiple_lines  n_line_types_1  n_line_types_2  n_line_types_3  \
0  30003598               1           False            True           False   
1  30004530               0            True           False           False   
2  30005000               1           False            True           False   
3  30005362               0            True           False           False   
4  30005707               1            True           False           False   

   n_line_types_4  
0           False  
1           False  
2           False  
3           False  
4           False  

Distribution of binned categories:
n_line_types_1: 8307 cases (62.3%)
n_line_types_2: 3739 cases (28.1%)
n_line_types_3: 1018 cases (7.6%)
n_line_types_4: 264 cases (2.0%)


In [19]:
# Convert boolean to integers (True/False to 1/0)
bool_columns = ['n_line_types_1', 'n_line_types_2', 'n_line_types_3', 'n_line_types_4']
lines_processed[bool_columns] = lines_processed[bool_columns].astype(int)

print("\nProcessed Multiple Lines Summary (with 1/0):")
print("-" * 30)
print("\nSample of first few rows:")
print(lines_processed.head())

print("\nDistribution of binned categories:")
for col in bool_columns:
    count = lines_processed[col].sum()
    pct = (count / len(lines_processed) * 100)
    print(f"{col}: {count} cases ({pct:.1f}%)")


Processed Multiple Lines Summary (with 1/0):
------------------------------

Sample of first few rows:
    stay_id  multiple_lines  n_line_types_1  n_line_types_2  n_line_types_3  \
0  30003598               1               0               1               0   
1  30004530               0               1               0               0   
2  30005000               1               0               1               0   
3  30005362               0               1               0               0   
4  30005707               1               1               0               0   

   n_line_types_4  
0               0  
1               0  
2               0  
3               0  
4               0  

Distribution of binned categories:
n_line_types_1: 8307 cases (62.3%)
n_line_types_2: 3739 cases (28.1%)
n_line_types_3: 1018 cases (7.6%)
n_line_types_4: 264 cases (2.0%)


In [20]:
# List of DataFrames we have:
# 1. Demographic features (demo_df)
# 2. Lab values (lab_features_processed) - scaled
# 3. Vital signs (vital_features_processed) - scaled
# 4. Clinical scores (score_features_processed) - scaled
# 5. Comorbidities (comorbidity_df)
# 6. RRT (rrt_df)
# 7. LOS before CVC (cvc_timing_processed) - scaled
# 8. Tracheostomy (trach_df)
# 9. Ostomy (ostomy_df)
# 10. Multiple lines (lines_processed)

# Merge all features using stay_id as key
merged_features = pd.merge(demo_df, lab_features_processed, on='stay_id', how='left')
merged_features = pd.merge(merged_features, vital_features_processed, on='stay_id', how='left')
merged_features = pd.merge(merged_features, score_features_processed, on='stay_id', how='left')
merged_features = pd.merge(merged_features, comorbidity_df, on='stay_id', how='left')
merged_features = pd.merge(merged_features, rrt_df, on='stay_id', how='left')
merged_features = pd.merge(merged_features, cvc_timing_processed, on='stay_id', how='left')
merged_features = pd.merge(merged_features, trach_df, on='stay_id', how='left')
merged_features = pd.merge(merged_features, ostomy_df, on='stay_id', how='left')
merged_features = pd.merge(merged_features, lines_processed, on='stay_id', how='left')

# Check merged dataset
print("\nMerged Features Summary:")
print("-" * 30)
print(f"\nTotal rows: {len(merged_features)}")
print(f"Total features: {len(merged_features.columns)}")

# Check for any missing values after merge
print("\nMissing Values Summary:")
missing = merged_features.isnull().sum()[merged_features.isnull().sum() > 0]
if len(missing) > 0:
    print(missing)
else:
    print("No missing values found")

# Show sample of first few rows
print("\nSample of merged features (first 5 columns):")
print(merged_features.iloc[:5, :5])


Merged Features Summary:
------------------------------

Total rows: 13328
Total features: 65

Missing Values Summary:
No missing values found

Sample of merged features (first 5 columns):
    stay_id gender        age               ethnicity  wbc_mean
0  37510196      F  77.018296  BLACK/AFRICAN AMERICAN  0.047413
1  33987268      F  81.280232                   WHITE  0.078656
2  32128372      F  75.149738  BLACK/AFRICAN AMERICAN  0.013240
3  32824762      M  62.365832              PORTUGUESE  0.052280
4  34100191      M  47.149517      BLACK/CAPE VERDEAN  0.046974


In [21]:
# 1. First let's categorize our columns
binary_features = [
    'multiple_lines', 'n_line_types_1', 'n_line_types_2', 
    'n_line_types_3', 'n_line_types_4', 'has_tracheostomy', 
    'has_ostomy', 'rrt'
]

scaled_features = [
    'wbc_mean', 'platelet_mean', 'hemoglobin_mean', 
    'los_before_cvc_scaled',
    # Add other lab and vital features here
]

categorical_features = ['gender', 'ethnicity']

# Verification checks
print("Feature Verification Summary:")
print("-" * 30)

# Check binary features
print("\nBinary Features Check:")
for col in binary_features:
    unique_vals = sorted(merged_features[col].unique())
    print(f"{col}: unique values = {unique_vals}")

# Check scaled features
print("\nScaled Features Check (should be between 0-1):")
for col in scaled_features:
    min_val = merged_features[col].min()
    max_val = merged_features[col].max()
    print(f"{col}: range = [{min_val:.3f}, {max_val:.3f}]")

# Check categorical features
print("\nCategorical Features Unique Values:")
for col in categorical_features:
    n_unique = merged_features[col].nunique()
    print(f"{col}: {n_unique} unique values")

# Check for duplicate column names
print("\nDuplicate Column Check:")
duplicates = merged_features.columns[merged_features.columns.duplicated()].tolist()
if duplicates:
    print(f"Found duplicate columns: {duplicates}")
else:
    print("No duplicate columns found")

# Save the final dataset
try:
    # Save as CSV
    merged_features.to_csv('clabsi_features_final.csv', index=False)
    print("\nDataset saved successfully as 'clabsi_features_final.csv'")
    
    # Also save as pickle to preserve data types
    merged_features.to_pickle('clabsi_features_final.pkl')
    print("Dataset saved successfully as 'clabsi_features_final.pkl'")
except Exception as e:
    print(f"Error saving dataset: {e}")

# Print column names for reference
print("\nFull list of features:")
for i, col in enumerate(merged_features.columns, 1):
    print(f"{i}. {col}")

Feature Verification Summary:
------------------------------

Binary Features Check:
multiple_lines: unique values = [0, 1]
n_line_types_1: unique values = [0, 1]
n_line_types_2: unique values = [0, 1]
n_line_types_3: unique values = [0, 1]
n_line_types_4: unique values = [0, 1]
has_tracheostomy: unique values = [0, 1]
has_ostomy: unique values = [0, 1]
rrt: unique values = [0, 1]

Scaled Features Check (should be between 0-1):
wbc_mean: range = [0.000, 1.000]
platelet_mean: range = [0.000, 1.000]
hemoglobin_mean: range = [0.000, 1.000]
los_before_cvc_scaled: range = [0.000, 1.000]

Categorical Features Unique Values:
gender: 2 unique values
ethnicity: 33 unique values

Duplicate Column Check:
No duplicate columns found

Dataset saved successfully as 'clabsi_features_final.csv'
Dataset saved successfully as 'clabsi_features_final.pkl'

Full list of features:
1. stay_id
2. gender
3. age
4. ethnicity
5. wbc_mean
6. wbc_min
7. wbc_max
8. platelet_mean
9. platelet_min
10. platelet_max
11. 

In [22]:
# 1. First identify redundant features to remove
redundant_features = [
    # Remove raw LOS since we have scaled
    'los_before_cvc',
    
    # Remove min/max keeping only means for labs
    'wbc_min', 'wbc_max',
    'platelet_min', 'platelet_max',
    'hemoglobin_min', 'hemoglobin_max',
    
    # Remove min/max keeping only means for vitals
    'heart_rate_min', 'heart_rate_max',
    'sbp_min', 'sbp_max',
    'dbp_min', 'dbp_max',
    'mbp_min', 'mbp_max',
    'resp_rate_min', 'resp_rate_max',
    'temperature_min', 'temperature_max',
    'spo2_min', 'spo2_max'
]

# 2. Create clean dataset removing redundant features
clean_features = merged_features.drop(columns=redundant_features)

# 3. Encode categorical variables
# Encode gender
clean_features['gender'] = (clean_features['gender'] == 'M').astype(int)

# One-hot encode ethnicity, dropping first category to avoid multicollinearity
ethnicity_encoded = pd.get_dummies(clean_features['ethnicity'], prefix='ethnicity', drop_first=True)
clean_features = pd.concat([clean_features.drop('ethnicity', axis=1), ethnicity_encoded], axis=1)

# Show results
print("Features After Processing:")
print("-" * 30)
print(f"\nTotal features before: {len(merged_features.columns)}")
print(f"Total features after: {len(clean_features.columns)}")

print("\nSample of first few rows (first 5 columns):")
print(clean_features.iloc[:5, :5])

print("\nFinal list of features:")
for i, col in enumerate(clean_features.columns, 1):
    print(f"{i}. {col}")

# Save processed dataset
clean_features.to_csv('clabsi_features_processed.csv', index=False)
clean_features.to_pickle('clabsi_features_processed.pkl')

Features After Processing:
------------------------------

Total features before: 65
Total features after: 75

Sample of first few rows (first 5 columns):
    stay_id  gender        age  wbc_mean  platelet_mean
0  37510196       0  77.018296  0.047413       0.070251
1  33987268       0  81.280232  0.078656       0.119488
2  32128372       0  75.149738  0.013240       0.034819
3  32824762       1  62.365832  0.052280       0.088657
4  34100191       1  47.149517  0.046974       0.005645

Final list of features:
1. stay_id
2. gender
3. age
4. wbc_mean
5. platelet_mean
6. hemoglobin_mean
7. aniongap_mean
8. bicarbonate_mean
9. creatinine_mean
10. chloride_mean
11. glucose_mean
12. sodium_mean
13. potassium_mean
14. inr_mean
15. pt_mean
16. ptt_mean
17. heart_rate_mean
18. sbp_mean
19. dbp_mean
20. mbp_mean
21. resp_rate_mean
22. temperature_mean
23. spo2_mean
24. sofa_score
25. apsiii_score
26. sapsii_score
27. mi
28. chf
29. copd
30. diabetes
31. liver_disease
32. cva
33. cancer
34. aids

In [23]:
# Get list of ethnicity columns
ethnicity_columns = [col for col in clean_features.columns if col.startswith('ethnicity_')]

# Remove ethnicity columns
clean_features_no_eth = clean_features.drop(columns=ethnicity_columns)

print("Features After Removing Ethnicity:")
print("-" * 30)
print(f"\nTotal features before: {len(clean_features.columns)}")
print(f"Total features after: {len(clean_features_no_eth.columns)}")

print("\nFinal list of features:")
for i, col in enumerate(clean_features_no_eth.columns, 1):
    print(f"{i}. {col}")

# Save updated dataset
clean_features_no_eth.to_csv('clabsi_features_final_no_eth.csv', index=False)
clean_features_no_eth.to_pickle('clabsi_features_final_no_eth.pkl')

Features After Removing Ethnicity:
------------------------------

Total features before: 75
Total features after: 43

Final list of features:
1. stay_id
2. gender
3. age
4. wbc_mean
5. platelet_mean
6. hemoglobin_mean
7. aniongap_mean
8. bicarbonate_mean
9. creatinine_mean
10. chloride_mean
11. glucose_mean
12. sodium_mean
13. potassium_mean
14. inr_mean
15. pt_mean
16. ptt_mean
17. heart_rate_mean
18. sbp_mean
19. dbp_mean
20. mbp_mean
21. resp_rate_mean
22. temperature_mean
23. spo2_mean
24. sofa_score
25. apsiii_score
26. sapsii_score
27. mi
28. chf
29. copd
30. diabetes
31. liver_disease
32. cva
33. cancer
34. aids
35. rrt
36. los_before_cvc_scaled
37. has_tracheostomy
38. has_ostomy
39. multiple_lines
40. n_line_types_1
41. n_line_types_2
42. n_line_types_3
43. n_line_types_4


In [24]:
# Verify splits from cohort_df
print("Original Splits:")
print("-" * 30)
print("\nTraining:")
print(f"Total: {len(train_df)}")
print(f"CLABSI cases: {train_df['has_clabsi'].sum()}")
print(f"30-day mortality: {train_df['mortality_30d'].sum()}")

print("\nValidation:")
print(f"Total: {len(val_df)}")
print(f"CLABSI cases: {val_df['has_clabsi'].sum()}")
print(f"30-day mortality: {val_df['mortality_30d'].sum()}")

Original Splits:
------------------------------

Training:
Total: 9431
CLABSI cases: 83
30-day mortality: 2092

Validation:
Total: 3897
CLABSI cases: 31
30-day mortality: 912


In [25]:
# Merge features with original splits
train_final = pd.merge(
    train_df[['stay_id', 'has_clabsi', 'mortality_30d']], 
    clean_features_no_eth,
    on='stay_id',
    how='left'
)

val_final = pd.merge(
    val_df[['stay_id', 'has_clabsi', 'mortality_30d']], 
    clean_features_no_eth,
    on='stay_id',
    how='left'
)

print("\nFinal Datasets:")
print("-" * 30)
print(f"\nTraining set shape: {train_final.shape}")
print(f"Validation set shape: {val_final.shape}")


Final Datasets:
------------------------------

Training set shape: (9431, 45)
Validation set shape: (3897, 45)
