In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sqlalchemy import create_engine, text
import warnings
warnings.filterwarnings('ignore')

# Connect to database
engine = create_engine('postgresql://postgres:ramiel12@localhost:5432/mimiciv')

# Use your exact cohort query
cohort_query = text("""
WITH first_icu_stays AS (
    SELECT 
        ie.subject_id,
        ie.hadm_id,
        ie.stay_id,
        ie.intime as icu_admission,
        ie.outtime as icu_discharge,
        EXTRACT(EPOCH FROM (ie.outtime - ie.intime))/3600 as icu_los_hours,
        ROW_NUMBER() OVER (PARTITION BY ie.subject_id ORDER BY ie.intime) as icu_stay_number,
        p.anchor_age + EXTRACT(EPOCH FROM adm.admittime - MAKE_TIMESTAMP(p.anchor_year, 1, 1, 0, 0, 0))/31556908.8 AS admission_age
    FROM mimiciv_icu.icustays ie
    INNER JOIN mimiciv_hosp.admissions adm 
        ON ie.hadm_id = adm.hadm_id
    INNER JOIN mimiciv_hosp.patients p 
        ON ie.subject_id = p.subject_id
    WHERE p.anchor_age + EXTRACT(EPOCH FROM adm.admittime - MAKE_TIMESTAMP(p.anchor_year, 1, 1, 0, 0, 0))/31556908.8 >= 18
),
first_lines AS (
    SELECT 
        fis.*,
        il.starttime as line_placement_time,
        il.endtime as line_removal_time,
        il.line_type,
        EXTRACT(EPOCH FROM (il.endtime - il.starttime))/24/3600 as line_days,
        ROW_NUMBER() OVER (PARTITION BY fis.stay_id ORDER BY il.starttime) as line_number
    FROM first_icu_stays fis
    INNER JOIN mimiciv_derived.invasive_line il 
        ON fis.stay_id = il.stay_id
    WHERE fis.icu_stay_number = 1  -- First ICU stay only
        AND fis.icu_los_hours >= 48  -- Survived at least 48 hours
        AND il.line_type IN (
            'PICC', 'Multi Lumen', 'Dialysis', 'Triple Introducer',
            'Pre-Sep', 'Hickman', 'Portacath', 'Cordis/Introducer',
            'Continuous Cardiac Output PA', 'PA'
        )
        AND EXTRACT(EPOCH FROM (il.endtime - il.starttime))/24/3600 >= 2  -- Line in place >2 days
),
other_infections AS (
    SELECT DISTINCT
        hadm_id,
        charttime
    FROM mimiciv_hosp.microbiologyevents
    WHERE spec_type_desc NOT IN ('BLOOD CULTURE', '')
        AND org_name IS NOT NULL
),
blood_cultures AS (
    SELECT 
        qs.stay_id,
        qs.subject_id,
        qs.hadm_id,
        me.charttime,
        me.spec_type_desc,
        me.org_name,
        CASE 
            WHEN LOWER(me.org_name) SIMILAR TO '%(coagulase-negative staphylococci|staphylococcus epidermidis|staphylococcus haemolyticus|staphylococcus hominis|propionibacterium|corynebacterium|diphtheroids|bacillus species|micrococcus)%' 
                THEN 'common_commensal'
            WHEN LOWER(me.org_name) SIMILAR TO '%(campylobacter|salmonella|shigella|listeria|vibrio|yersinia|difficile|enterohemorrhagic|enteropathogenic|blastomyces|histoplasma|coccidioides|paracoccidioides|cryptococcus|pneumocystis)%' 
                THEN 'excluded'
            ELSE 'recognized_pathogen'
        END as organism_type
    FROM first_lines qs
    INNER JOIN mimiciv_hosp.microbiologyevents me 
        ON qs.hadm_id = me.hadm_id
    WHERE me.spec_type_desc = 'BLOOD CULTURE'
        AND me.org_name IS NOT NULL
        AND me.charttime > qs.line_placement_time + INTERVAL '2 days'
        AND me.charttime <= qs.line_removal_time
        AND qs.line_number = 1  -- Only consider first line
        AND NOT EXISTS (
            SELECT 1 
            FROM other_infections oi 
            WHERE oi.hadm_id = me.hadm_id 
                AND oi.charttime BETWEEN me.charttime - INTERVAL '3 days' 
                AND me.charttime + INTERVAL '3 days'
        )
),
clabsi_events AS (
    SELECT DISTINCT
        stay_id,
        subject_id,
        hadm_id,
        MIN(charttime) as infection_date
    FROM blood_cultures bc
    WHERE (organism_type = 'recognized_pathogen')
       OR (organism_type = 'common_commensal' 
           AND EXISTS (
               SELECT 1 
               FROM blood_cultures bc2 
               WHERE bc2.stay_id = bc.stay_id 
                   AND bc2.org_name = bc.org_name 
                   AND bc2.charttime != bc.charttime
                   AND bc2.charttime <= bc.charttime + INTERVAL '2 days'
           ))
    GROUP BY stay_id, subject_id, hadm_id
),
mortality_outcomes AS (
    SELECT 
        qs.stay_id,
        qs.subject_id,
        qs.hadm_id,
        qs.line_placement_time,
        CASE 
            WHEN p.dod IS NOT NULL 
                AND p.dod <= (qs.line_placement_time + INTERVAL '30 days')
                THEN 1
            ELSE 0
        END as mortality_30d,
        p.dod as death_date
    FROM first_lines qs
    LEFT JOIN mimiciv_hosp.patients p 
        ON qs.subject_id = p.subject_id
    WHERE qs.line_number = 1  -- Only consider first line
)
SELECT 
    qs.*,
    CASE 
        WHEN ce.stay_id IS NOT NULL THEN 1
        ELSE 0
    END as has_clabsi,
    ce.infection_date,
    mo.mortality_30d,
    mo.death_date
FROM first_lines qs
LEFT JOIN clabsi_events ce 
    ON qs.stay_id = ce.stay_id
LEFT JOIN mortality_outcomes mo 
    ON qs.stay_id = mo.stay_id
WHERE qs.line_number = 1  -- Only include first line for each stay
ORDER BY qs.subject_id, qs.icu_admission;
""")


# Get full cohort with outcomes
cohort_df = pd.read_sql(cohort_query, engine)

# Load processed features
features_df = pd.read_pickle('clabsi_features_final_no_eth.pkl')

# Merge cohort outcomes with processed features
model_df = pd.merge(
    features_df,
    cohort_df[['stay_id', 'has_clabsi', 'mortality_30d']],
    on='stay_id',
    how='inner'
)

# Create stratified split maintaining both outcome proportions
X = model_df.drop(['has_clabsi', 'mortality_30d', 'stay_id'], axis=1)
y_clabsi = model_df['has_clabsi']
y_mortality = model_df['mortality_30d']

# Split data maintaining proportions of both outcomes
X_train, X_val, y_clabsi_train, y_clabsi_val, y_mortality_train, y_mortality_val = train_test_split(
    X, y_clabsi, y_mortality,
    test_size=0.30,
    random_state=42,
    stratify=pd.DataFrame({'clabsi': y_clabsi, 'mortality': y_mortality})
)

# Verify the splits maintain similar proportions to your original cohort
print("\nDataset Summary:")
print("-" * 30)
print(f"Total samples: {len(model_df)}")
print(f"Training samples: {len(X_train)} ({len(X_train)/len(model_df)*100:.1f}%)")
print(f"Validation samples: {len(X_val)} ({len(X_val)/len(model_df)*100:.1f}%)")

print("\nCLABSI Distribution:")
print(f"Overall CLABSI rate: {y_clabsi.mean()*100:.2f}% ({sum(y_clabsi)} cases)")
print(f"Training CLABSI rate: {y_clabsi_train.mean()*100:.2f}% ({sum(y_clabsi_train)} cases)")
print(f"Validation CLABSI rate: {y_clabsi_val.mean()*100:.2f}% ({sum(y_clabsi_val)} cases)")

print("\n30-day Mortality Distribution:")
print(f"Overall mortality rate: {y_mortality.mean()*100:.2f}% ({sum(y_mortality)} cases)")
print(f"Training mortality rate: {y_mortality_train.mean()*100:.2f}% ({sum(y_mortality_train)} cases)")
print(f"Validation mortality rate: {y_mortality_val.mean()*100:.2f}% ({sum(y_mortality_val)} cases)")


Dataset Summary:
------------------------------
Total samples: 13328
Training samples: 9329 (70.0%)
Validation samples: 3999 (30.0%)

CLABSI Distribution:
Overall CLABSI rate: 0.86% (114 cases)
Training CLABSI rate: 0.86% (80 cases)
Validation CLABSI rate: 0.85% (34 cases)

30-day Mortality Distribution:
Overall mortality rate: 22.54% (3004 cases)
Training mortality rate: 22.53% (2102 cases)
Validation mortality rate: 22.56% (902 cases)


In [13]:
# Analyze CLABSI cases in detail
clabsi_cases = analysis_df[analysis_df['has_clabsi']==1]
non_clabsi = analysis_df[analysis_df['has_clabsi']==0]

print("CLABSI Cohort Analysis:")
print("-" * 30)
print(f"Total cases: {len(analysis_df)}")
print(f"CLABSI cases: {len(clabsi_cases)} ({len(clabsi_cases)/len(analysis_df)*100:.2f}%)")
print(f"Non-CLABSI cases: {len(non_clabsi)}")

# Look at characteristics of CLABSI cases
print("\nCLABSI Cases Characteristics:")
print("-" * 30)
for col in ['multiple_lines', 'rrt', 'temperature_mean']:
    print(f"\n{col}:")
    print(clabsi_cases[col].describe())

CLABSI Cohort Analysis:
------------------------------
Total cases: 9329
CLABSI cases: 80 (0.86%)
Non-CLABSI cases: 9249

CLABSI Cases Characteristics:
------------------------------

multiple_lines:
count    80.000000
mean      0.687500
std       0.466437
min       0.000000
25%       0.000000
50%       1.000000
75%       1.000000
max       1.000000
Name: multiple_lines, dtype: float64

rrt:
count    80.000000
mean      0.312500
std       0.466437
min       0.000000
25%       0.000000
50%       0.000000
75%       1.000000
max       1.000000
Name: rrt, dtype: float64

temperature_mean:
count    80.000000
mean      0.632567
std       0.084640
min       0.405044
25%       0.575850
50%       0.631528
75%       0.694234
max       0.825916
Name: temperature_mean, dtype: float64


In [21]:
# 1. Create domain-specific feature combinations
X_train_enhanced = X_train.copy()
X_val_enhanced = X_val.copy()

# Clinical risk factor combinations
X_train_enhanced['lines_rrt'] = X_train['multiple_lines'] * X_train['rrt']
X_train_enhanced['temp_elevation'] = (X_train['temperature_mean'] > X_train['temperature_mean'].mean()).astype(int)
X_train_enhanced['high_risk'] = ((X_train['multiple_lines'] == 1) & 
                                (X_train['temperature_mean'] > X_train['temperature_mean'].mean())).astype(int)

# Create same features for validation set
X_val_enhanced['lines_rrt'] = X_val['multiple_lines'] * X_val['rrt']
X_val_enhanced['temp_elevation'] = (X_val['temperature_mean'] > X_val['temperature_mean'].mean()).astype(int)
X_val_enhanced['high_risk'] = ((X_val['multiple_lines'] == 1) & 
                              (X_val['temperature_mean'] > X_val['temperature_mean'].mean())).astype(int)

# Configure XGBoost for rare events
xgb_model_rare = xgb.XGBClassifier(
    max_depth=3,
    min_child_weight=3,
    gamma=0.5,
    subsample=0.8,
    colsample_bytree=0.8,
    learning_rate=0.01,
    scale_pos_weight=(1-y_clabsi_train.mean())/y_clabsi_train.mean(),
    n_estimators=1000,
    random_state=42
)

# Train model
xgb_model_rare.fit(
    X_train_enhanced, 
    y_clabsi_train,
    eval_set=[(X_val_enhanced, y_clabsi_val)],
    early_stopping_rounds=50,
    verbose=True
)

# Get predictions
y_pred_proba = xgb_model_rare.predict_proba(X_val_enhanced)[:, 1]

# Calculate metrics
fpr, tpr, _ = roc_curve(y_clabsi_val, y_pred_proba)
auc_score = auc(fpr, tpr)
precision, recall, _ = precision_recall_curve(y_clabsi_val, y_pred_proba)
auprc_score = auc(recall, precision)
brier = brier_score_loss(y_clabsi_val, y_pred_proba)

print("\nRare Event XGBoost Performance Metrics:")
print("-" * 30)
print(f"AUC: {auc_score:.3f}")
print(f"AUPRC: {auprc_score:.3f}")
print(f"Brier Score: {brier:.3f}")

# Analyze high-risk predictions
risk_thresh = np.percentile(y_pred_proba, 95)
high_risk_mask = y_pred_proba >= risk_thresh

print("\nHigh Risk Predictions Analysis:")
print("-" * 30)
print(f"Number of high-risk predictions: {sum(high_risk_mask)}")
print(f"CLABSI cases in high-risk group: {sum(y_clabsi_val[high_risk_mask])}")
print(f"Precision in high-risk group: {sum(y_clabsi_val[high_risk_mask])/sum(high_risk_mask):.3f}")

TypeError: XGBClassifier.fit() got an unexpected keyword argument 'early_stopping_rounds'