# Inference Parity Validation
This notebook validates that `run_pipeline_on_unseen_data` uses the exact same scoring logic as the training/evaluation pipeline by delegating to `shared_inference.score_features`.

In [1]:
# Setup: imports (duckdb connection creation left to user environment)
import duckdb, pandas as pd, importlib, os
from project.unseen_data_evaluation import run_pipeline_on_unseen_data
from project.shared_inference import score_features, _resolve_models_dir
from project import features as features_mod
# Re-execute project.features so edits to it are picked up without restarting
# the kernel. Names that other modules already bound with
# `from project.features import ...` keep pointing at the pre-reload objects.
# The trailing semicolon stops IPython from echoing the module repr, which
# would otherwise store an absolute local filesystem path in the saved output.
importlib.reload(features_mod);

<module 'project.features' from 'c:\\Users\\Almog Luz\\Documents\\GitHub\\mlhc-final-project\\project\\features.py'>

In [2]:
# Example connection (requires loaded MIMIC tables) - adapt as needed.
# This opens a fresh in-memory DuckDB database, so it contains no tables until
# they are attached or registered on `con`.
con = duckdb.connect(database=':memory:')
# TODO: attach or create the required tables before running pipeline

# Placeholder subject IDs (replace with valid ones in your environment).
# The same list is passed to run_pipeline_on_unseen_data and is used to build
# the synthetic debug tables further down.
subject_ids = [10006, 10011, 10023]

In [5]:
# (Temporary) Minimal synthetic tables for parity debugging (drop when real MIMIC attached)
import pandas as pd, datetime

# Only fabricate data when the connection does not already expose `admissions`.
existing_tables = con.execute("SHOW TABLES").fetchdf().name.values
if 'admissions' in existing_tables:
    print('Skipping synthetic table creation (admissions already present).')
else:
    n_subjects = len(subject_ids)

    # One admission per subject; every column below holds the same value on each row.
    admission_constants = {
        'admittime': pd.Timestamp('2020-01-01'),
        'dischtime': pd.Timestamp('2020-01-04'),  # 72h -> passes LOS >=54h
        'deathtime': None,
        'admission_type': 'EMERGENCY',
        'admission_location': 'EMERGENCY ROOM',
        'discharge_location': 'HOME',
        'diagnosis': 'SYNTH',
        'insurance': 'Medicare',
        'language': 'ENGLISH',
        'marital_status': 'SINGLE',
        'ethnicity': 'WHITE',
    }
    adm_synth = pd.DataFrame({
        'subject_id': subject_ids,
        'hadm_id': [sid * 10 for sid in subject_ids],
        **{column: [value] * n_subjects for column, value in admission_constants.items()},
    })
    con.register('admissions', adm_synth)

    # One patient row per subject, again with constant demographic values.
    patient_constants = {
        'gender': 'M',
        'dob': pd.Timestamp('1980-01-01'),
        'dod': None,
        'expire_flag': 0,
    }
    pats = pd.DataFrame({
        'subject_id': subject_ids,
        **{column: [value] * n_subjects for column, value in patient_constants.items()},
    })
    con.register('patients', pats)

    # Empty modality tables (kept minimal): column names only, zero rows.
    empty_table_columns = {
        'chartevents': ['subject_id','hadm_id','charttime','itemid','valuenum','valueuom'],
        'labevents': ['subject_id','hadm_id','charttime','itemid','valuenum','value','valueuom','flag'],
        'prescriptions': ['subject_id','hadm_id','startdate','enddate','drug','drug_type','formulary_drug_cd','route'],
        'procedureevents_mv': ['subject_id','hadm_id','starttime','endtime','itemid','ordercategoryname','ordercategorydescription','location'],
    }
    for table_name, column_names in empty_table_columns.items():
        con.register(table_name, pd.DataFrame(columns=column_names))
    print('Synthetic minimal tables registered for debugging.')

Synthetic minimal tables registered for debugging.


In [6]:
# Run pipeline scoring (uses shared_inference internally)
from pathlib import Path
from project.shared_inference import _resolve_models_dir
models_dir = _resolve_models_dir()
# Files this notebook expects to find in the models directory: a model and an
# isotonic artifact per task, plus the list of feature columns.
needed = [
    'model_readmission.joblib',
    'isotonic_readmission.joblib',
    'model_mortality.joblib',
    'isotonic_mortality.joblib',
    'model_prolonged_los.joblib',
    'isotonic_prolonged_los.joblib',
    'feature_columns.json'
]
missing = [n for n in needed if not (Path(models_dir)/n).exists()]
if missing:
    # Warn only; the pipeline call below is still attempted.
    print('WARNING: Missing expected artifacts:', missing)
pipeline_df = run_pipeline_on_unseen_data(subject_ids, con)
print('Pipeline output shape:', pipeline_df.shape)
print('Columns:', list(pipeline_df.columns))
# Show a preview only. The frame is no longer echoed as the cell's last
# expression, which rendered it a second time and in full.
display(pipeline_df.head())

DEBUG: client_module=_duckdb, client_class=DuckDBPyConnection, requested_subjects=3
DEBUG: resolved backend -> duckdb=True, bigquery=False
DEBUG: first_adm.shape -> (3, 13)
DEBUG: remaining after LOS filter -> 3
DEBUG: modality sizes -> demo: 3 vitals: 0 labs: 0 rx: 0 proc: 0
DEBUG(build_features): first_adm columns -> ['subject_id', 'hadm_id', 'admittime', 'dischtime', 'deathtime', 'admission_type', 'admission_location', 'discharge_location', 'diagnosis', 'insurance', 'language', 'marital_status', 'ethnicity', 'los_hours']
DEBUG(build_features): demo columns -> ['subject_id', 'gender', 'dob', 'dod', 'expire_flag']
DEBUG: raw features shape -> (3, 7)
DEBUG: invoking shared_inference.score_features for unified parity scoring


RuntimeError: Feature extraction incomplete; 1422 required columns missing. First few: ['eth_ASIAN', 'eth_BLACK', 'eth_HISPANIC', 'eth_OTHER', 'aprotinin cc/hr__mean', 'aprotinin ml/hr__mean', 'art bp diastolic__mean', 'art bp systolic__mean', 'arterial blood pressure diastolic__mean', 'arterial blood pressure systolic__mean']

In [7]:
# Diagnostics for missing feature columns: list what the saved artifact requires.
from pathlib import Path
import json
feat_cols_path = Path(models_dir)/'feature_columns.json'
if feat_cols_path.exists():
    # Decode explicitly as UTF-8: Path.read_text() otherwise uses the platform
    # locale encoding (e.g. cp1252 on Windows), which can corrupt or reject
    # non-ASCII feature names.
    feature_cols_artifact = json.loads(feat_cols_path.read_text(encoding='utf-8'))
    print('Total required feature columns (artifact):', len(feature_cols_artifact))
    print('Sample first 25:', feature_cols_artifact[:25])
else:
    print('feature_columns.json missing; cannot list required feature set.')

Total required feature columns (artifact): 1429
Sample first 25: ['age', 'gender_M', 'gender_F', 'eth_ASIAN', 'eth_BLACK', 'eth_HISPANIC', 'eth_OTHER', 'eth_WHITE', 'aprotinin cc/hr__mean', 'aprotinin ml/hr__mean', 'art bp diastolic__mean', 'art bp systolic__mean', 'arterial blood pressure diastolic__mean', 'arterial blood pressure systolic__mean', 'arterial bp #2 [diastolic]__mean', 'arterial bp #2 [systolic]__mean', 'arterial bp [diastolic]__mean', 'arterial bp [systolic]__mean', 'arterial o2 saturation__mean', 'atr sens thresh mv__mean', 'atr stim thresh ma__mean', 'atrial pacemaker threshold [value]__mean', 'baseline current/ma__mean', 'blood temperature cco (c)__mean', 'bp cuff [diastolic]__mean']


In [None]:
# Parity self-check: Columns present and non-null
# Names are matched case-insensitively so this cell gives the same answer
# whether or not the canonicalization cell has already renamed the
# probability columns to lowercase.
wanted_cols = ['mortality_proba','prolonged_LOS_proba','readmission_proba']
actual_by_lower = {str(c).lower(): c for c in pipeline_df.columns}
absent_cols = [w for w in wanted_cols if w.lower() not in actual_by_lower]
# Fail with a readable message instead of a KeyError from the column lookup.
assert not absent_cols, f'Missing probability columns in pipeline output: {absent_cols}'
# `cols` holds the names exactly as they currently appear in pipeline_df.
cols = [actual_by_lower[w.lower()] for w in wanted_cols]
assert pipeline_df[cols].notna().all().all(), 'NaNs present in probability outputs'
print('Parity OK: probabilities populated via shared scorer.')

In [None]:
# Canonicalize probability column names & debug
import re, numpy as np, pandas as pd

# Alternate lowercase spellings of a task name -> the canonical task name.
# Any base not listed here is kept as-is (already lower-cased).
base_aliases = {
    'prolonged-los': 'prolonged_los',
    'prolonged_los_label': 'prolonged_los',
    'readmit': 'readmission',
    'mort': 'mortality',
}

prob_cols = [col for col in pipeline_df.columns if col.lower().endswith('_proba')]
print('Discovered probability columns:', prob_cols)

# Build {current column name: canonical column name} for every probability column.
canonical_map = {}
for col in prob_cols:
    stem = re.sub(r'_proba$','', col, flags=re.IGNORECASE).lower()
    canonical_map[col] = f'{base_aliases.get(stem, stem)}_proba'

if canonical_map:
    pipeline_df = pipeline_df.rename(columns=canonical_map)

# Report whether the three expected outputs exist after renaming.
expected = ['readmission_proba','mortality_proba','prolonged_los_proba']
missing_expected = [name for name in expected if name not in pipeline_df.columns]
if not missing_expected:
    print('All expected probability columns present.')
else:
    print('ERROR: Missing expected probability columns:', missing_expected)
display(pipeline_df.head())

In [None]:
# (Optional) Placeholder for a feature-level parity check.
# This cell performs no comparison: it only imports score_features and prints a status line.
from project.shared_inference import score_features

# A real re-score needs the feature DataFrame that the pipeline scored, which is not
# available in this cell (the pipeline encapsulates it). In practice, expose features.
# NOTE(review): the disabled assertion below subtracts the pipeline probabilities from
# themselves, so it would pass trivially and prove nothing.
# assert (pipeline_df.set_index('subject_id')[cols] - pipeline_df.set_index('subject_id')[cols]).abs().max().max() < 1e-12
print('Calibration-aware parity path active. To perform strict matrix parity, expose features from pipeline.')

In [None]:
# Strict calibrated parity check (robust)
# Re-scores the feature matrix captured by the shared scorer and compares the
# result with the pipeline's probabilities, column by column.
from project.shared_inference import get_last_aligned_features, score_features
import numpy as np
aligned = get_last_aligned_features()
if aligned is None:
    raise RuntimeError('No aligned features captured. Ensure pipeline scoring called shared scorer.')
# Re-score using exact aligned matrix
rescore_df = score_features(aligned)
print('Rescore columns:', list(rescore_df.columns))
# Prepare pipeline subset
# NOTE(review): assumes aligned.index holds subject_id values - confirm.
pipe_idx = pipeline_df.set_index('subject_id').loc[aligned.index]
# Identify probability columns in both
pipe_prob_cols = [c for c in pipe_idx.columns if c.lower().endswith('_proba')]
rescore_prob_cols = [c for c in rescore_df.columns if c.lower().endswith('_proba')]
def norm(s):
    """Lower-case a column name so both frames can be matched case-insensitively."""
    return s.lower()
pipe_norm = {norm(c): c for c in pipe_prob_cols}
rescore_norm = {norm(c): c for c in rescore_prob_cols}
overlap = sorted(set(pipe_norm.keys()) & set(rescore_norm.keys()))
if not overlap:
    raise RuntimeError(f'No overlapping probability columns between pipeline {pipe_prob_cols} and rescore {rescore_prob_cols}.')
col_report = []
for n in overlap:
    pc = pipe_norm[n]; rc = rescore_norm[n]
    # Coerce to plain float arrays so missing values of any kind become NaN.
    # NOTE(review): the comparison is positional; it assumes score_features
    # returns rows in the same order as `aligned` - confirm.
    pv = pipe_idx[pc].to_numpy(dtype=float, na_value=np.nan)
    rv = rescore_df[rc].to_numpy(dtype=float, na_value=np.nan)
    if pv.shape != rv.shape:
        raise RuntimeError(f'Shape mismatch for {pc} vs {rc}: {pv.shape} vs {rv.shape}')
    if pv.size == 0:
        # diff.max() on an empty array would raise an unrelated ValueError.
        raise RuntimeError(f'No rows to compare for {pc} vs {rc}.')
    # Reject NaNs explicitly: Python's max() over the per-column maxima drops a
    # NaN that is not in first position, which would let the assert below pass.
    nan_rows = np.isnan(pv) | np.isnan(rv)
    if nan_rows.any():
        raise RuntimeError(f'NaN probabilities for {pc} vs {rc} in {int(nan_rows.sum())} row(s); parity cannot be established.')
    diff = np.abs(pv - rv)
    col_report.append((pc, float(diff.max()), float(diff.mean())))
max_abs = max(r[1] for r in col_report)
mean_abs = sum(r[2] for r in col_report)/len(col_report)
print('Per-column diffs:')
for name, mx, mn in col_report:
    print(f'  {name}: max={mx:.3e} mean={mn:.3e}')
print(f'Global max abs diff: {max_abs:.3e} | global mean abs diff: {mean_abs:.3e}')
assert max_abs < 1e-12, f'Parity failure: max diff {max_abs}'
print('Calibrated prediction parity confirmed (<=1e-12).')