End-to-end pipeline (current focus: readmission prediction with XGBoost + Optuna). The structure is prepared so that mortality and prolonged length-of-stay (LOS) targets can be added later.

In [None]:
# Environment & core imports
import os, sys, json, random, platform, importlib, datetime
from pathlib import Path
import numpy as np, pandas as pd
# Reproducibility: a single seed feeds both the stdlib and the NumPy global RNGs.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Resolve the project root: step up one level when launched from notebooks/.
_cwd = Path.cwd()
if _cwd.name == 'notebooks':
    PROJECT_ROOT = _cwd.parent
else:
    PROJECT_ROOT = _cwd
DATA_DIR = PROJECT_ROOT / 'data'
ARTIFACTS_DIR = PROJECT_ROOT / 'artifacts'
RUNS_ROOT = PROJECT_ROOT / 'runs'
print(f"Project root: {PROJECT_ROOT}")
print(f"Data dir exists: {DATA_DIR.exists()}")


def _package_version(name):
    """Return the package's ``__version__``, '?' if it has none, or 'NA(<error>)' if import fails."""
    try:
        return getattr(importlib.import_module(name), '__version__', '?')
    except Exception as exc:
        return f'NA({exc})'


# Snapshot of interpreter, platform, and key library versions for the run log.
VERSIONS = {'python': sys.version.split()[0], 'platform': platform.platform()}
VERSIONS.update(
    {name: _package_version(name)
     for name in ('xgboost', 'optuna', 'shap', 'sklearn', 'pandas', 'numpy')}
)
print('Versions:', json.dumps(VERSIONS, indent=2))

### Labels
Load readmission labels (or synthesize) and report prevalence.

In [None]:
# Load or generate labels (readmission focus)
import pandas as pd, random
# Locate the labels file: the first candidate that exists wins.
LABEL_CANDIDATES = [DATA_DIR / 'labels.csv', PROJECT_ROOT / 'labels.csv']
LABELS_PATH = next((cand for cand in LABEL_CANDIDATES if cand.exists()), None)
labels_df = None
if LABELS_PATH is not None:
    labels_df = pd.read_csv(LABELS_PATH)
else:
    # No labels on disk: fall back to random placeholder labels (~4.3% positives)
    # drawn for every row of the cohort file so the rest of the notebook can run.
    cohort_path = DATA_DIR / 'initial_cohort.csv'
    if not cohort_path.exists():
        raise FileNotFoundError('initial_cohort.csv missing; cannot synthesize labels')
    subj = pd.read_csv(cohort_path)
    random.seed(SEED)  # re-seed so the draw does not depend on earlier RNG use
    synth = pd.Series([1 if random.random() < 0.043 else 0 for _ in range(len(subj))])
    labels_df = pd.DataFrame({'subject_id': subj['subject_id'],'hadm_id': -1,'readmission_label': synth.values})
    LABELS_PATH = '<synthetic>'
    # Make the fallback impossible to miss: downstream metrics are not real.
    print('WARNING: no labels file found; using synthetic random labels (metrics are not meaningful).')

# Normalize column name: accept a few case-insensitive aliases for the target.
if 'readmission_label' not in labels_df.columns:
    lower_map = {c.lower(): c for c in labels_df.columns}
    for alias in ['readmission_label','readmission','readmit','readmit_30d','readmission_30d']:
        if alias in lower_map:
            labels_df = labels_df.rename(columns={lower_map[alias]: 'readmission_label'})
            break
if 'readmission_label' not in labels_df.columns:
    raise ValueError('Could not identify readmission label column')
if 'subject_id' not in labels_df.columns:
    raise ValueError('Labels are missing the subject_id column')

# Explicit raises instead of `assert`: assertions are stripped under `python -O`.
if labels_df['subject_id'].isna().any():
    raise ValueError('Labels contain rows with a missing subject_id')
# One row per subject; drop_duplicates keeps the first occurrence.
labels_df = labels_df.drop_duplicates('subject_id')
if labels_df['readmission_label'].isna().any():
    raise ValueError('Labels contain rows with a missing readmission_label')
labels_df['readmission_label'] = labels_df['readmission_label'].astype(int)
# Later cells stratify on this column and train a binary:logistic model.
if not labels_df['readmission_label'].isin([0, 1]).all():
    raise ValueError('readmission_label must be binary (0/1)')

prev = labels_df['readmission_label'].mean()
print(f"Labels source: {LABELS_PATH} | shape={labels_df.shape} | prevalence={prev:.4f}")

### Features
Load (or regenerate) prepared feature matrix aligned to subjects.

In [None]:
# Load feature matrix (regenerate if tiny/corrupt)
import pandas as pd, json, hashlib
feature_path = ARTIFACTS_DIR / 'features_full.parquet'
regenerated = False
# A parquet file under 1000 bytes is treated as corrupt and rebuilt from the
# extraction cache. A missing file is NOT rebuilt here (it raises below).
if feature_path.exists() and feature_path.stat().st_size < 1000:
    print('Corrupted feature parquet detected; attempting regeneration.')
    cache_dir = DATA_DIR / 'extracted_cache'
    try:
        from src.features import build_features, build_feature_provenance  # type: ignore

        def load_opt(name):
            """Read a cached parquet table, or return None when it is absent."""
            p = cache_dir / name
            return pd.read_parquet(p) if p.exists() else None

        demo = load_opt('demographics.parquet')
        first_adm = load_opt('first_admissions.parquet')
        vitals = load_opt('vitals_48h.parquet')
        labs = load_opt('labs_48h.parquet')
        rx = load_opt('prescriptions_48h.parquet')
        proc = load_opt('procedures_48h.parquet')
        feats = build_features(first_adm, demo, vitals, labs, rx, proc)
        # presumably build_features returns a frame indexed by subject_id — TODO confirm
        feats = feats.reindex(labels_df['subject_id']).fillna(0.0)
        feats.to_parquet(feature_path)
        prov = build_feature_provenance(feats)
        (ARTIFACTS_DIR / 'feature_provenance.json').write_text(json.dumps(prov, indent=2))
        (ARTIFACTS_DIR / 'feature_columns.json').write_text(json.dumps(list(feats.columns)))
        regenerated = True
        print('Regenerated features:', feats.shape)
    except Exception as e:
        # Best effort: report and fall through to the normal load below.
        print('Feature regeneration failed:', e)
if not feature_path.exists():
    raise FileNotFoundError(f'Missing {feature_path}; ensure extraction step executed.')
feature_df = pd.read_parquet(feature_path)
if 'subject_id' in feature_df.columns:
    feature_df = feature_df.set_index('subject_id')

# Alignment sanity check. reindex + fillna(0.0) would otherwise turn an id
# mismatch (e.g. str vs int subject_id) into an all-zero matrix without notice.
n_subjects = len(labels_df)
n_unmatched = int((~labels_df['subject_id'].isin(feature_df.index)).sum())
if n_subjects and n_unmatched == n_subjects:
    raise ValueError(
        'No labelled subject_id matches the feature index; '
        'check subject_id dtype/values in labels vs features.'
    )
if n_unmatched:
    print(f'WARNING: {n_unmatched} of {n_subjects} labelled subjects have no feature row; filling with 0.0.')

# Row order now follows labels_df; remaining gaps (and NaNs) become 0.0.
feature_df = feature_df.reindex(labels_df['subject_id']).fillna(0.0)
print('Features loaded shape:', feature_df.shape, '| regenerated' if regenerated else '')

### Train/Validation/Test Split
Create 60/20/20 stratified split and compute imbalance weight.

In [None]:
# Train/valid/test split (60/20/20) + class weight factor
from sklearn.model_selection import train_test_split
# Targets, subject ids, and the feature matrix, all taken in their current row order.
readmit_y = labels_df['readmission_label'].astype(int).to_numpy()
subject_index = feature_df.index.to_numpy()
X = feature_df.values

# Stage 1: hold out 40% (stratified), leaving 60% for training.
_stage1 = train_test_split(
    X, readmit_y, subject_index,
    test_size=0.4, stratify=readmit_y, random_state=SEED,
)
X_tr, X_temp, y_tr, y_temp, sid_tr, sid_temp = _stage1

# Stage 2: cut the hold-out in half -> 20% validation, 20% test.
_stage2 = train_test_split(
    X_temp, y_temp, sid_temp,
    test_size=0.5, stratify=y_temp, random_state=SEED,
)
X_val, X_te, y_val, y_te, sid_val, sid_te = _stage2

# Negative/positive ratio on the training split (guarded against a zero positive rate).
pos_rate = y_tr.mean()
scale_pos_weight = (1 - pos_rate) / max(pos_rate, 1e-6)
print(f'Split -> train {X_tr.shape} valid {X_val.shape} test {X_te.shape} | pos_rate_train={pos_rate:.4f} | spw≈{scale_pos_weight:.2f}')

### Metrics Helpers
Utility functions to compute threshold-dependent metrics and cost.

In [None]:
from sklearn.metrics import roc_auc_score, average_precision_score, f1_score, confusion_matrix
import numpy as np
C_FP = 1.0  # cost charged per false positive
C_FN = 5.0  # cost charged per false negative
beta = 2.0  # F-beta weight used by metrics_at


def metrics_at(proba, y, thr):
    """Compute threshold-dependent metrics for binary predictions.

    Args:
        proba: Predicted positive-class scores (array-like).
        y: True binary labels (0/1), same length as ``proba``.
        thr: Decision threshold; a score ``>= thr`` is predicted positive.

    Returns:
        dict with keys ``f1``, ``precision``, ``recall``, ``cost``
        (``C_FP*fp + C_FN*fn``) and ``fbeta``.

    The confusion counts are tallied directly, so the function also works
    when ``y`` and the predictions contain a single class (where unpacking
    a 1x1 confusion matrix into four values would fail).
    """
    y_true = np.asarray(y).astype(int)
    pred = (np.asarray(proba) >= thr).astype(int)
    tp = int(np.sum((pred == 1) & (y_true == 1)))
    fp = int(np.sum((pred == 1) & (y_true == 0)))
    fn = int(np.sum((pred == 0) & (y_true == 1)))
    cost = C_FP * fp + C_FN * fn
    # F1 = 2TP / (2TP + FP + FN); defined as 0.0 when there is nothing to count.
    f1_den = 2 * tp + fp + fn
    f1 = 2 * tp / f1_den if f1_den else 0.0
    # The small epsilon keeps precision/recall/fbeta finite on empty denominators.
    prec = tp / (tp + fp + 1e-9)
    rec = tp / (tp + fn + 1e-9)
    fbeta = (1 + beta**2) * prec * rec / (beta**2 * prec + rec + 1e-9)
    return dict(f1=float(f1), precision=prec, recall=rec, cost=cost, fbeta=fbeta)

### Baseline Model
Train a simple class-weighted logistic regression for reference AUC.

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
# Reference model: median imputation -> scaling without centering ->
# class-balanced logistic regression (liblinear solver).
_baseline_steps = [
    ("imp", SimpleImputer(strategy='median')),
    ("sc", StandardScaler(with_mean=False)),
    ("lr", LogisticRegression(max_iter=500, class_weight='balanced', solver='liblinear')),
]
baseline_pipe = Pipeline(_baseline_steps)
baseline_pipe.fit(X_tr, y_tr)

# Score the validation split with the positive-class probability column.
baseline_val_proba = baseline_pipe.predict_proba(X_val)[:, 1]
baseline_auc = roc_auc_score(y_val, baseline_val_proba)
print('Baseline Logistic Validation AUC:', round(baseline_auc,4))

### XGBoost Data Structures
Create the DMatrix for the test split. Training and validation DMatrix objects are built per fold inside the Optuna objective.

In [None]:
# XGBoost test DMatrix (train/val splits handled by CV inside objective).
# Wraps the held-out test features and labels in XGBoost's native container so
# they can be passed to a Booster's predict().
import xgboost as xgb
D_te = xgb.DMatrix(X_te, label=y_te)

### Optuna Study Setup
Initialize study for AUC maximization with TPE + median pruning.

In [None]:
import optuna, xgboost as xgb
from optuna.samplers import TPESampler
from optuna.pruners import MedianPruner
# Seeded TPE sampler for repeatable suggestions; median pruner with its default settings.
_tpe_sampler = TPESampler(seed=SEED)
_median_pruner = MedianPruner()
study = optuna.create_study(
    direction='maximize',
    sampler=_tpe_sampler,
    pruner=_median_pruner,
)
print('Study created.')

### Objective Definition
Define Optuna objective: 5-fold stratified CV with early stopping (mean validation AUC).

In [None]:
# Optuna objective: 5-fold stratified CV AUC with early stopping
import numpy as np, xgboost as xgb
from sklearn.model_selection import StratifiedKFold
EARLY_STOP = 50     # rounds without validation-AUC improvement before stopping
MAX_ROUNDS = 1200   # upper end of the searched boosting-round budget
N_FOLDS = 5         # stratified CV folds per trial


def objective(trial: optuna.Trial):
    """Optuna objective: mean validation AUC over stratified K-fold CV on the training split.

    Each fold trains a booster with early stopping and contributes its best
    validation AUC. After every fold the running mean is reported to the trial
    so the study's pruner can stop unpromising trials; without these reports a
    configured pruner never has anything to act on.

    Args:
        trial: The Optuna trial supplying hyperparameter suggestions.

    Returns:
        Mean of the per-fold best validation AUCs.

    Raises:
        optuna.TrialPruned: When the pruner rejects the trial before the last fold.

    Side effects:
        Stores ``fold_aucs``, ``best_iterations`` (early-stopped round index per
        fold) and ``cv_mean_auc`` as trial user attributes.
    """
    params = {
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'tree_method': 'hist',
        # Suggested under the name 'learning_rate'; XGBoost's native key is 'eta'.
        'eta': trial.suggest_float('learning_rate', 0.005, 0.2, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 8),
        'min_child_weight': trial.suggest_float('min_child_weight', 1.0, 10.0),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'lambda': trial.suggest_float('lambda', 1e-3, 10.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-3, 5.0, log=True),
        'gamma': trial.suggest_float('gamma', 0.0, 5.0),
        'scale_pos_weight': scale_pos_weight,
    }
    # With early stopping this acts as an upper bound on boosting rounds.
    rounds = trial.suggest_int('n_estimators', 300, MAX_ROUNDS)
    skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)
    fold_aucs = []
    best_iterations = []
    for fold, (tr_idx, va_idx) in enumerate(skf.split(X_tr, y_tr), 1):
        Dtr = xgb.DMatrix(X_tr[tr_idx], label=y_tr[tr_idx])
        Dva = xgb.DMatrix(X_tr[va_idx], label=y_tr[va_idx])
        booster = xgb.train(params, Dtr, num_boost_round=rounds,
                            evals=[(Dva, 'valid')],
                            early_stopping_rounds=EARLY_STOP, verbose_eval=False)
        fold_aucs.append(float(booster.best_score))
        best_iterations.append(int(booster.best_iteration))
        # Feed the pruner; never prune after the final fold, since the full
        # CV estimate is already available at that point.
        trial.report(float(np.mean(fold_aucs)), step=fold)
        if fold < N_FOLDS and trial.should_prune():
            raise optuna.TrialPruned()
    mean_auc = float(np.mean(fold_aucs))
    trial.set_user_attr('fold_aucs', fold_aucs)
    trial.set_user_attr('best_iterations', best_iterations)
    trial.set_user_attr('cv_mean_auc', mean_auc)
    return mean_auc

### Run Hyperparameter Search
Execute trials optimizing mean 5-fold CV AUC (early stopping each fold).

In [None]:
N_TRIALS = 15  # adjust upward for thorough search
# Run the search, then echo the winning score and hyperparameters.
study.optimize(
    objective,
    n_trials=N_TRIALS,
    show_progress_bar=False,
)
print(f'Best AUC: {study.best_value}')
print(f'Best Params: {study.best_params}')

### Inspect Trials
Overview of trials and (optional) optimization history plot.

In [None]:
# Tabular view of every trial (params, value, state).
trials_df = study.trials_dataframe()
print('Trials:', trials_df.shape)
# Optional plot. A figure created inside a try block is not the cell's last
# expression, so it must be shown explicitly to be rendered.
try:
    history_fig = optuna.visualization.plot_optimization_history(study)
    history_fig.show()
except Exception as exc:
    # Plotting is best effort (e.g. plotting backend missing); say so instead of hiding it.
    print('Optimization history plot skipped:', exc)

### Final Model Training
Train final booster on combined train+validation using best params.

In [None]:
import xgboost as xgb, numpy as np
# Refit on train + validation with the best hyperparameters found by the study.
X_tr_full = np.vstack([X_tr, X_val])
y_tr_full = np.concatenate([y_tr, y_val])
D_full = xgb.DMatrix(X_tr_full, label=y_tr_full)
params = study.best_params.copy()
# Map naming differences (study uses 'learning_rate'; native API uses 'eta').
params_fixed = {
    'objective':'binary:logistic','eval_metric':'auc','tree_method':'hist','eta':params['learning_rate'],
    'max_depth':params['max_depth'],'min_child_weight':params['min_child_weight'],'subsample':params['subsample'],
    'colsample_bytree':params['colsample_bytree'],'lambda':params['lambda'],'alpha':params['alpha'],
    'gamma':params['gamma'],'scale_pos_weight':scale_pos_weight,
}
# During the search 'n_estimators' was only an upper bound: each fold stopped
# early and was scored at its best iteration. Training the final model for the
# full bound would not match the configuration that earned the best score, so
# the round count is re-derived with early-stopped CV on the combined data.
max_rounds = params['n_estimators']
cv_history = xgb.cv(
    params_fixed, D_full,
    num_boost_round=max_rounds,
    nfold=N_FOLDS, stratified=True,
    early_stopping_rounds=EARLY_STOP,
    seed=SEED, as_pandas=True, verbose_eval=False,
)
# With early stopping the returned history ends at the best iteration.
final_rounds = max(1, len(cv_history))
print(f'Boosting rounds: using {final_rounds} (search upper bound {max_rounds}).')
final_booster = xgb.train(params_fixed, D_full, num_boost_round=final_rounds, evals=[(D_full,'train')], verbose_eval=False)
print('Final booster trained.')

### Calibration & Threshold
Fit isotonic on validation; pick F1-optimal threshold on calibrated validation.

In [None]:
# Manual isotonic calibration (avoids CalibratedClassifierCV API issues)
from xgboost import XGBClassifier
from sklearn.isotonic import IsotonicRegression
import numpy as np

# Rebuild the tuned configuration as an sklearn-style classifier. Parameters
# whose names differ between the two APIs are passed explicitly.
base_params = {name: params_fixed[name] for name in ('max_depth', 'subsample', 'colsample_bytree')}
base_model = XGBClassifier(
    objective='binary:logistic',
    tree_method='hist',
    eval_metric='logloss',
    verbosity=0,
    n_estimators=final_rounds,
    learning_rate=params['learning_rate'],
    min_child_weight=params['min_child_weight'],
    reg_lambda=params['lambda'],
    reg_alpha=params['alpha'],
    gamma=params['gamma'],
    scale_pos_weight=scale_pos_weight,
    **base_params,
)
# Fit on training only (validation reserved for calibration mapping)
base_model.fit(X_tr, y_tr)
val_proba_raw = base_model.predict_proba(X_val)[:, 1]

# Monotone map from raw validation scores to observed outcomes; scores outside
# the fitted range are clipped to its ends.
iso = IsotonicRegression(out_of_bounds='clip')
iso.fit(val_proba_raw, y_val)
print('Isotonic calibration fitted on validation set.')


def predict_calibrated(X):
    """Return isotonic-calibrated positive-class probabilities from base_model for the rows of X."""
    raw_scores = base_model.predict_proba(X)[:, 1]
    return iso.transform(raw_scores)


# Derive operating threshold on calibrated validation probabilities:
# scan a fixed grid and keep the first threshold reaching the highest F1.
val_cal = predict_calibrated(X_val)
ths = np.linspace(0.01, 0.9, 300)
_candidates = [{**metrics_at(val_cal, y_val, cut), 'threshold': float(cut)} for cut in ths]
threshold_info = max(_candidates, key=lambda cand: cand['f1'])
print('Selected threshold (calibrated validation):', threshold_info)

### Test Evaluation
Apply calibrated model + selected threshold; report core metrics.

In [None]:
# Evaluate calibrated model on test set
from sklearn.metrics import brier_score_loss
import json
# Calibrated test-set scores plus the operating point chosen on validation.
cal_proba_test = predict_calibrated(X_te)
thr = threshold_info['threshold']
th_metrics = metrics_at(cal_proba_test, y_te, thr)

# Threshold-free quality measures.
auc = roc_auc_score(y_te, cal_proba_test)
pr = average_precision_score(y_te, cal_proba_test)
brier = brier_score_loss(y_te, cal_proba_test)

# Flat, JSON-friendly summary; key order is kept stable for downstream logging.
report = {
    'auc': float(auc),
    'pr_auc': float(pr),
    'brier': float(brier),
    'threshold': float(thr),
}
for _name in ('f1', 'precision', 'recall', 'cost'):
    report[f'{_name}_at_threshold'] = float(th_metrics[_name])
print(json.dumps(report, indent=2))

### SHAP Summary
Compute SHAP values on a sample for global importance.

In [None]:
# SHAP global explanation (sample subset)
import numpy as np
# Global importance: mean |SHAP| per feature over a random sample of up to 400
# rows from train + validation. The whole step is optional and is skipped on error.
try:
    import shap
    X_tr_full = np.vstack([X_tr, X_val])
    _n_rows = X_tr_full.shape[0]
    sample_idx = np.random.choice(_n_rows, size=min(400, _n_rows), replace=False)
    X_sample = X_tr_full[sample_idx]
    import xgboost as xgb
    explainer = shap.TreeExplainer(final_booster)
    shap_val = explainer.shap_values(X_sample)
    mean_abs = np.mean(np.abs(shap_val), axis=0)
    # Column positions of the 20 largest mean |SHAP| values, largest first.
    top_order = np.argsort(-mean_abs)[:20]
    print('Top SHAP feature indices (first 10 of 20):', top_order[:10])
except Exception as e:
    print('SHAP skipped:', e)

### Bootstrap AUC CI
Estimate uncertainty of test ROC AUC via stratified bootstrap.

In [None]:
# Bootstrap 95% CI for test AUC
import numpy as np
from sklearn.metrics import roc_auc_score
R = 1000  # number of bootstrap replicates
rng = np.random.default_rng(42)

# Class index arrays do not change between replicates, so build them once.
idx_pos = np.flatnonzero(y_te == 1)
idx_neg = np.flatnonzero(y_te == 0)
if idx_pos.size == 0 or idx_neg.size == 0:
    # ROC AUC is undefined without both classes; fail with a clear message.
    raise ValueError('Bootstrap AUC needs both classes in the test split')

auc_samples = np.empty(R)
for rep in range(R):
    # Stratified resampling: draw positives and negatives separately (with
    # replacement) so every replicate keeps the original class counts.
    b_pos = rng.choice(idx_pos, size=idx_pos.size, replace=True)
    b_neg = rng.choice(idx_neg, size=idx_neg.size, replace=True)
    b_idx = np.concatenate([b_pos, b_neg])
    auc_samples[rep] = roc_auc_score(y_te[b_idx], cal_proba_test[b_idx])

# Percentile interval over the replicate AUCs.
ci_low, ci_high = np.percentile(auc_samples, [2.5,97.5])
print(f'AUC bootstrap mean={auc_samples.mean():.4f} 95% CI=({ci_low:.4f},{ci_high:.4f}) n={R}')

### 5-Fold CV Comparison
Compare XGB vs logistic AUC on combined train+validation data.

In [None]:
# 5-fold CV AUC comparison: XGB vs Logistic baseline
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import xgboost as xgb
# Pool train + validation and score both model families on identical folds.
X_cv = np.vstack([X_tr, X_val])
y_cv = np.concatenate([y_tr, y_val])
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
auc_xgb, auc_lr = [], []
params_cv = dict(params_fixed, eval_metric='auc')
# Falls back to final_rounds when params_cv carries no 'n_estimators' entry.
_n_rounds_cv = params_cv.get('n_estimators', final_rounds)
for fold, (tr_idx, va_idx) in enumerate(skf.split(X_cv, y_cv), 1):
    Xtr, ytr = X_cv[tr_idx], y_cv[tr_idx]
    Xva, yva = X_cv[va_idx], y_cv[va_idx]

    # Logistic baseline: a fresh pipeline per fold so nothing leaks across folds.
    lr_pipe = Pipeline([
        ('imp', SimpleImputer(strategy='median')),
        ('sc', StandardScaler(with_mean=False)),
        ('lr', LogisticRegression(max_iter=500, class_weight='balanced', solver='liblinear')),
    ])
    lr_pipe.fit(Xtr, ytr)
    lr_scores = lr_pipe.predict_proba(Xva)[:, 1]
    auc_lr.append(roc_auc_score(yva, lr_scores))

    # XGBoost with the tuned parameters and a fixed number of rounds (no early stopping).
    Dtr = xgb.DMatrix(Xtr, label=ytr)
    Dva = xgb.DMatrix(Xva, label=yva)
    booster = xgb.train(params_cv, Dtr, num_boost_round=_n_rounds_cv, evals=[(Dva,'valid')], verbose_eval=False)
    auc_xgb.append(roc_auc_score(yva, booster.predict(Dva)))
print('CV AUC Logistic: mean', f'{np.mean(auc_lr):.4f}', '±', f'{np.std(auc_lr):.4f}')
print('CV AUC XGB     : mean', f'{np.mean(auc_xgb):.4f}', '±', f'{np.std(auc_xgb):.4f}')

### Persist Artifacts
Save model, calibration objects, metrics, threshold, and metadata.

In [None]:
# Persist artifacts (model + calibration + metadata)
import json, joblib, hashlib, time, subprocess
OUT_DIR = ARTIFACTS_DIR
# parents=True so a fresh checkout without the directory tree still works.
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Two models are saved:
#   - model_readmission.json : final_booster (refit on train + validation)
#   - base_model.joblib      : train-only classifier that isotonic.joblib was
#                              fitted against and that produced `report`
# The isotonic map and threshold belong to base_model, not to final_booster.
final_booster.save_model(str(OUT_DIR / 'model_readmission.json'))
joblib.dump(iso, OUT_DIR / 'isotonic.joblib')
joblib.dump(base_model, OUT_DIR / 'base_model.joblib')
with open(OUT_DIR / 'best_params.json','w',encoding='utf-8') as f:
    json.dump(study.best_params, f, indent=2)
with open(OUT_DIR / 'metrics.json','w',encoding='utf-8') as f:
    json.dump(report, f, indent=2)
with open(OUT_DIR / 'threshold.txt','w',encoding='utf-8') as f:
    f.write(str(report['threshold']))

# Best-effort commit id; git missing or not-a-repo both fall back to 'UNKNOWN'.
try:
    git_commit = subprocess.check_output(
        ['git','rev-parse','HEAD'], text=True, stderr=subprocess.DEVNULL
    ).strip()
except (OSError, subprocess.CalledProcessError):
    git_commit = 'UNKNOWN'

# Short fingerprint of the ordered feature names, to detect schema drift.
feat_cols = list(feature_df.columns)
feat_sig = hashlib.sha256(('|'.join(feat_cols)).encode()).hexdigest()[:16]
meta = {
    'saved_utc': time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime()),
    'git_commit': git_commit,
    'n_features': len(feat_cols),
    'feature_sig_sha256_16': feat_sig,
    'prevalence_train': float(y_tr.mean()),
    'prevalence_valid': float(y_val.mean()),
    'prevalence_test': float(y_te.mean()),
    'optuna_best_value': float(study.best_value),
    # Plain floats so JSON encoding never depends on NumPy scalar types.
    'threshold_info': {key: float(val) for key, val in threshold_info.items()},
    'calibration': 'isotonic_on_validation',
    # Records which saved artifacts the metrics, threshold and calibration apply to.
    'metrics_source': 'base_model.joblib + isotonic.joblib',
    'uncalibrated_refit_model': 'model_readmission.json',
}
with open(OUT_DIR / 'run_metadata.json','w',encoding='utf-8') as f:
    json.dump(meta, f, indent=2)
print('Artifacts saved ->', OUT_DIR)

### Experiment Registry
Append current run metrics to CSV registry for tracking.

In [None]:
# Append metrics to experiment registry
import csv, time
# One CSV row per run: a wall-clock timestamp followed by the report metrics.
REG_PATH = PROJECT_ROOT / 'experiment_registry.csv'
row = {'ts': time.time(), **report}
# A header is needed for a new file AND for an existing but empty one
# (e.g. a registry that was created or truncated by hand).
write_header = (not REG_PATH.exists()) or REG_PATH.stat().st_size == 0
with open(REG_PATH, 'a', newline='', encoding='utf-8') as f:
    w = csv.DictWriter(f, fieldnames=list(row))
    if write_header:
        w.writeheader()
    w.writerow(row)
print('Logged metrics to', REG_PATH)

### Single Prediction Demo
Show calibrated probability for one test instance.

In [None]:
# Single example calibrated probability demo.
# The isotonic map `iso` was fitted on base_model's validation scores, so the
# raw score fed into it must come from base_model as well. final_booster is a
# separate model (refit on train + validation) that `iso` was not fitted on.
raw_proba = base_model.predict_proba(X_te[:1])[0, 1]
calib_proba = iso.transform([raw_proba])[0]
print('Single test example calibrated probability:', float(calib_proba))

In [None]:
import numpy as np
# Draw up to 400 random rows from the combined train + validation matrix.
_n_available = X_tr_full.shape[0]
sample_idx = np.random.choice(_n_available, size=min(400, _n_available), replace=False)
X_sample = X_tr_full[sample_idx]
# SHAP is optional: any failure (including a missing package) is reported and skipped.
try:
    import shap
    explainer = shap.TreeExplainer(final_booster)
    shap_val = explainer.shap_values(X_sample)
    mean_abs = np.mean(np.abs(shap_val), axis=0)
    # Rank features by mean |SHAP|, largest first, and keep the top 20.
    top_order = np.argsort(-mean_abs)[:20]
    print('Top 20 SHAP feature indices:', top_order[:10], '...')
except Exception as e:
    print('SHAP skipped:', e)