# Phase 5: Prognostic Outcome Assessment Model

This notebook builds an ensemble prognostic model that combines predictions and features from prior phases to estimate neurodevelopmental outcome risk (continuous 0â€“1 and categorical prognosis).

## 1. Import Required Libraries

In [1]:
# Imports
import os
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, precision_recall_fscore_support, confusion_matrix
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression

sns.set(style="whitegrid")
print("Libraries imported.")

Libraries imported.


## 2. Load Predictions and Features from Previous Phases

In [2]:
# Paths
DATA_PATH = os.path.join('..', 'data', 'preprocessed_normalized_dataset.csv')
PHASE4_FEATURES_PATH = os.path.join('..', 'models', 'phase4_prediction_features.json')
SEIZURE_METRICS_PATH = os.path.join('..', 'models', 'seizure_model_metrics.json')
SEPSIS_METRICS_PATH = os.path.join('..', 'models', 'sepsis_model_metrics.json')
CARDIAC_METRICS_PATH = os.path.join('..', 'models', 'cardiac_model_metrics.json')
RENAL_METRICS_PATH = os.path.join('..', 'models', 'renal_model_metrics.json')

# Load base data
base_df = pd.read_csv(DATA_PATH)
print("Loaded base data:", base_df.shape)

# Create/confirm the Phase2 outcome label if present (e.g., neuro_outcome_72h)
if 'neuro_outcome_72h' not in base_df.columns:
    # Heuristic outcome: combine severity, temp stability, metabolic recovery
    # Normalize inputs as proxy
    sev = base_df.get('hie_severity', pd.Series(1, index=base_df.index)).fillna(1)
    temp_stab = (base_df.get('temp_grad_1h', pd.Series(0, index=base_df.index)).abs() < 0.2).astype(int)
    metabolic_ok = ((base_df['lactate'] < 4.0) & (base_df['ph'] >= 7.30)).astype(int)
    # Map to categorical risk: 0 best .. 4 worst, then normalize to 0..1
    score = (4 - (temp_stab + metabolic_ok)) + sev
    score_norm = (score - score.min()) / (score.max() - score.min() + 1e-9)
    base_df['neuro_outcome_72h'] = score_norm

# Convert to categorical prognosis buckets
# Excellent/Good/Moderate/Guarded/Poor
bins = [0.0, 0.2, 0.4, 0.6, 0.8, 1.01]
labels = ['Excellent', 'Good', 'Moderate', 'Guarded', 'Poor']
base_df['prognosis_bucket'] = pd.cut(base_df['neuro_outcome_72h'], bins=bins, labels=labels, include_lowest=True)
base_df['prognosis_target'] = (base_df['neuro_outcome_72h'] > 0.6).astype(int)  # binary: Guarded/Poor vs others

print(base_df[['neuro_outcome_72h', 'prognosis_bucket']].head())

# Build feature space: concatenate Phase4 predictions (placeholder: use risk labels) + Phase3 temp features
feature_cols = [
    # Physiological features
    'rectal_temp','heart_rate','systolic_bp','diastolic_bp','spo2','ph','lactate',
    'temp_grad_5m','temp_grad_30m','temp_grad_1h','hr_roll_mean','hr_roll_std','map_mmHg','pulse_pressure',
    # Phase4 label proxies (in production, replace with model outputs)
    'seizure_risk_high','sepsis_risk_flag','cardiac_distress_flag','renal_dysfunction_risk'
]
feature_cols = [c for c in feature_cols if c in base_df.columns]

work_df = base_df.dropna(subset=feature_cols + ['prognosis_target']).copy()
X = work_df[feature_cols].to_numpy(dtype=float)
y_bin = work_df['prognosis_target'].to_numpy(dtype=int)
print("Feature matrix:", X.shape)


Loaded base data: (43200, 51)
   neuro_outcome_72h prognosis_bucket
0                  0        Excellent
1                  0        Excellent
2                  0        Excellent
3                  0        Excellent
4                  0        Excellent
Feature matrix: (43200, 3)


## 3. Feature Engineering for Prognosis

In [3]:
# Aggregate per patient to summarize course features
if 'patient_id' in work_df.columns:
    # Use actual column names from preprocessed data
    available_agg_cols = {
        'rectal_temperature_c': ['mean','std'],
        'heart_rate_bpm': ['mean','std'],
        'systolic_bp_mmhg': ['mean','min'],
        'diastolic_bp_mmhg': ['mean','min'],
        'lactate_mmol': ['mean','max'],
        'pH': ['mean','min'],
        'neuro_outcome_72h': ['max']
    }
    # Filter to only existing columns
    agg_cols = {k: v for k, v in available_agg_cols.items() if k in work_df.columns}
    
    agg_df = work_df.groupby('patient_id').agg(agg_cols)
    # Flatten MultiIndex columns
    agg_df.columns = [f"{a}_{b}" for a,b in agg_df.columns]
    agg_df = agg_df.reset_index()
    # Targets
    y_patient = agg_df['neuro_outcome_72h_max'].to_numpy(dtype=int)
    # Features
    feature_cols_patient = [c for c in agg_df.columns if c not in ['patient_id','neuro_outcome_72h_max']]
    X_patient = agg_df[feature_cols_patient].to_numpy(dtype=float)
else:
    # Fallback to sample-level
    X_patient = X
    y_patient = y_bin
    feature_cols_patient = feature_cols

print("Patient-level feature matrix:", X_patient.shape)

KeyError: "Column(s) ['cardiac_distress_flag', 'heart_rate', 'lactate', 'map_mmHg', 'ph', 'rectal_temp', 'sepsis_risk_flag', 'temp_grad_1h'] do not exist"

## 4. Build Prognostic Models (Binary + Multiclass)

In [None]:
# Train/test split
X_tr, X_te, y_tr, y_te = train_test_split(X_patient, y_patient, test_size=0.2, random_state=42, stratify=y_patient)

# Binary prognosis (Guarded/Poor vs others)
binary_models = {
    'logreg': Pipeline([
        ('scaler', StandardScaler()),
        ('clf', LogisticRegression(max_iter=1000, class_weight='balanced'))
    ]),
    'rf': RandomForestClassifier(n_estimators=300, random_state=42, class_weight='balanced'),
    'gb': GradientBoostingClassifier(random_state=42)
}

binary_metrics = {}
for name, clf in binary_models.items():
    clf.fit(X_tr, y_tr)
    preds = clf.predict(X_te)
    proba = clf.predict_proba(X_te)[:, 1] if hasattr(clf, 'predict_proba') else None
    auc = roc_auc_score(y_te, proba) if proba is not None else np.nan
    precision, recall, f1, _ = precision_recall_fscore_support(y_te, preds, average='binary')
    binary_metrics[name] = {'auc': float(auc), 'precision': float(precision), 'recall': float(recall), 'f1': float(f1)}

pd.DataFrame(binary_metrics).T

## 5. Model Evaluation and Calibration

In [None]:
# Pick best binary model by AUC
best_bin_name = max(binary_metrics.items(), key=lambda kv: kv[1]['auc'])[0]
best_bin_model = binary_models[best_bin_name]
print("Best binary prognosis model:", best_bin_name, binary_metrics[best_bin_name])

# Confusion matrix
from sklearn.metrics import RocCurveDisplay
if hasattr(best_bin_model, 'predict_proba'):
    RocCurveDisplay.from_estimator(best_bin_model, X_te, y_te)
    plt.title(f"Binary Prognosis ROC - {best_bin_name}")
    plt.show()

cm = confusion_matrix(y_te, best_bin_model.predict(X_te))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title('Binary Prognosis - Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()


## 6. Prognosis Category Mapping

In [None]:
# Map probability to prognosis categories

def prognosis_text(prob):
    if prob < 0.2:
        return "Excellent"
    elif prob < 0.4:
        return "Good"
    elif prob < 0.6:
        return "Moderate"
    elif prob < 0.8:
        return "Guarded"
    else:
        return "Poor"

# Example inference for a few samples
if hasattr(best_bin_model, 'predict_proba'):
    example_probs = best_bin_model.predict_proba(X_te[:10])[:, 1]
    mapped = [prognosis_text(p) for p in example_probs]
    print(list(zip(example_probs.round(3), mapped)))

## 7. Save Prognostic Model & Metadata

In [None]:
import pickle
import json
os.makedirs(os.path.join('..', 'models'), exist_ok=True)

# Save best binary prognosis model
with open(os.path.join('..', 'models', f'prognosis_model_{best_bin_name}.pkl'), 'wb') as f:
    pickle.dump(best_bin_model, f)

with open(os.path.join('..', 'models', 'prognosis_model_metrics.json'), 'w') as f:
    json.dump(binary_metrics, f, indent=2)

with open(os.path.join('..', 'models', 'prognosis_feature_columns.json'), 'w') as f:
    json.dump(feature_cols_patient, f, indent=2)

print("Prognosis model and metadata saved.")