In [None]:
from db import attach_duckdb, duckdb_to_df, load_sql

# Attach DuckDB to the data source registered under the name "remote_mimic"
# (the cell output reports a successful attachment to remote PostgreSQL).
attach_duckdb("remote_mimic")
# Presumably reads the query text from rev-cohort.sql — verify against db.load_sql.
query = load_sql("rev-cohort.sql")
# Run the query; the result exposes .head(), and the preview renders as a DataFrame.
df = duckdb_to_df(query)
# NOTE(review): in the preview, rows 1 and 2 are identical in every visible
# column (same subject_id / hadm_id / stay_id). Confirm whether the SQL emits
# duplicate rows; duplicates could land on both sides of a later train/test split.
df.head()


DuckDB attached to remote PostgreSQL successfully.


Unnamed: 0,subject_id,hadm_id,label,anchor_age,gender,stay_id,icu_intime,charlson_comorbidity_index,heart_rate_mean,sbp_mean,...,hematocrit,hemoglobin,wbc,platelet,creatinine,bun,pf_ratio,ck_mb,prev_mi,stroke_history
0,13467195,21154580,0,68,M,34490721,2181-05-14 08:48:07,2,,,...,43.0,15.5,5.9,225.0,1.1,14.0,,,0,0
1,13468386,24561318,0,63,M,34950661,2174-06-24 10:48:52,5,,,...,48.8,16.9,10.4,201.0,1.0,23.0,,,1,0
2,13468386,24561318,0,63,M,34950661,2174-06-24 10:48:52,5,,,...,48.8,16.9,10.4,201.0,1.0,23.0,,,1,0
3,13466375,21011631,0,70,M,36305045,2149-09-19 12:19:26,6,,,...,44.7,14.7,26.2,341.0,2.3,74.0,,,0,0
4,13466461,24748943,0,88,F,33383412,2136-01-16 09:25:50,10,,,...,31.7,10.0,9.6,192.0,1.7,42.0,,22.0,1,0


In [None]:
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer

# --- Step A: Define Feature Groups ---
# Continuous features: median-imputed here, standardized later (after the split).
continuous_cols = ['anchor_age', 'heart_rate_mean', 'sbp_mean', 'dbp_mean', 'mbp_mean', 'resp_rate_mean',
                   'spo2_mean', 'hematocrit', 'hemoglobin', 'wbc', 'platelet', 'creatinine',
                   'bun', 'pf_ratio', 'ck_mb']
# Score/Binary features: missing values are filled with 0.
score_cols = ['charlson_comorbidity_index', 'prev_mi', 'stroke_history']

# --- Step B: Handle Missing Values (The "Dual Strategy") ---
# 1. Impute Continuous with the TRAINING-SET median.
#    Fitting the imputer on every row would leak validation/test statistics
#    into the training data. train_test_split chooses rows from the sample
#    count, the stratify labels and random_state only, so splitting the row
#    positions with the same test_size / stratify / random_state as Step D
#    selects exactly the rows that become X_train there.
#    Keep these three arguments in sync with the Step D split.
train_rows, holdout_rows = train_test_split(
    np.arange(len(df)), test_size=0.3, stratify=df['label'].values, random_state=42
)
imputer = SimpleImputer(strategy='median')
imputer.fit(df.iloc[train_rows][continuous_cols])
# Apply the train-derived medians to all rows; df is updated in place so
# that later cells see the imputed values, as before.
df[continuous_cols] = imputer.transform(df[continuous_cols])

# 2. Impute Scores with ZERO (Assume NULL = Absence of condition)
df[score_cols] = df[score_cols].fillna(0)

# --- Step C: Encoding ---
# LabelEncoder maps the sorted distinct gender values to 0..k-1.
le = LabelEncoder()
df['gender'] = le.fit_transform(df['gender'])

# Combine all features into the model matrix X and the label vector y.
feature_cols = continuous_cols + score_cols + ['gender']
X = df[feature_cols].values
y = df['label'].values

In [None]:
# --- Step D: Stratified Splitting ---
# 70% train / 30% hold-out, then the hold-out is halved into validation and test.
# Passing the labels to `stratify` keeps the class ratio equal across partitions.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42
)

# --- Step E: Scaling (Standardization) ---
# The scaler learns mean/variance from the training partition only, so no
# statistics from validation or test reach the model; the same fitted
# transform is then applied to all three partitions.
scaler = StandardScaler().fit(X_train)
X_train, X_val, X_test = (scaler.transform(part) for part in (X_train, X_val, X_test))

# ==========================================
# 3. Addressing Class Imbalance
# ==========================================
# pos_weight for BCEWithLogitsLoss = (# negative labels) / (# positive labels),
# counted on the training labels.
num_pos = (y_train == 1).sum()
num_neg = (y_train == 0).sum()
pos_weight_value = num_neg / num_pos

divider = "-" * 30
print("\n".join([
    divider,
    f"Train Shape: {X_train.shape}",
    f"Class Balance (Train): {num_neg} Survivors vs {num_pos} Deaths",
    f"Calculated pos_weight: {pos_weight_value:.4f}",
    divider,
]))

# ==========================================
# 4. Prepare for PyTorch
# ==========================================
# float32 tensors; the labels get a trailing axis so they line up with a
# single-output model.
X_train_tensor = torch.from_numpy(X_train).float()
y_train_tensor = torch.as_tensor(y_train, dtype=torch.float32).unsqueeze(1)
pos_weight_tensor = torch.tensor([pos_weight_value], dtype=torch.float32)

print("Ready for Model Training.")

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# 1. Wrap every split as a TensorDataset of float features and column-vector targets
def _as_dataset(features, targets):
    """Pair a feature array with its labels; labels gain a trailing axis of size 1."""
    return TensorDataset(torch.FloatTensor(features), torch.FloatTensor(targets).unsqueeze(1))


train_data = _as_dataset(X_train, y_train)
val_data = _as_dataset(X_val, y_val)
test_data = _as_dataset(X_test, y_test)

# Mini-batches of 64: training batches are reshuffled every epoch,
# validation batches keep their original order.
train_loader = DataLoader(train_data, shuffle=True, batch_size=64)
val_loader = DataLoader(val_data, batch_size=64)

# 2. Define the Architecture
class MortalityPredictor(nn.Module):
    """Small feed-forward binary classifier (input_dim -> 64 -> 32 -> 1).

    ``forward`` returns raw logits, not probabilities. The notebook trains
    this model with ``nn.BCEWithLogitsLoss``, which applies the sigmoid
    internally; a trailing ``nn.Sigmoid`` layer would squash the loss input
    twice and distort the ``pos_weight`` re-weighting. Use ``predict_proba``
    when probabilities in [0, 1] are needed.

    Removing the parameter-free sigmoid layer leaves the ``state_dict`` keys
    (``network.0``, ``network.3``, ``network.6``) unchanged.

    Args:
        input_dim: number of input features per sample.
    """

    def __init__(self, input_dim):
        super().__init__()
        self.network = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Dropout(0.3),  # Prevent overfitting
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(32, 1),  # Single logit; the sigmoid lives in the loss
        )

    def forward(self, x):
        """Return one logit per input row, shape (batch, 1)."""
        return self.network(x)

    def predict_proba(self, x):
        """Return the sigmoid of the logits, i.e. values in [0, 1], shape (batch, 1)."""
        return torch.sigmoid(self.forward(x))

# 3. Training Setup
# Seed torch so weight initialisation and DataLoader shuffling repeat across runs.
torch.manual_seed(42)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = MortalityPredictor(input_dim=X_train.shape[1]).to(device)
# BCEWithLogitsLoss applies the sigmoid internally and expects raw logits.
# pos_weight up-weights the positive class; it has to live on the same device
# as the model outputs, otherwise the loss raises a device-mismatch error on CUDA.
criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight_tensor.to(device))
optimizer = optim.Adam(model.parameters(), lr=0.001)

# 4. Training Loop
epochs = 20
for epoch in range(epochs):
    model.train()  # enable dropout
    running_loss = 0.0
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    # Report the mean per-batch loss for the epoch.
    print(f"Epoch {epoch+1}/{epochs}, Loss: {running_loss/len(train_loader):.4f}")

In [None]:
from sklearn.metrics import roc_auc_score, precision_recall_curve, auc

# Switch to inference mode (dropout off) and score the validation split.
model.eval()
y_true, y_pred = [], []

with torch.no_grad():  # no gradients needed for evaluation
    for batch_features, batch_labels in val_loader:  # Use val or test loader
        batch_scores = model(batch_features.to(device))
        # Accumulate one entry per sample, in loader order.
        y_true += list(batch_labels.cpu().numpy())
        y_pred += list(batch_scores.cpu().numpy())

# ROC AUC depends only on how the scores rank the samples.
roc_auc = roc_auc_score(y_true, y_pred)
print(f"Model AUC-ROC: {roc_auc:.4f}")