In [1]:
# Path setup: make the 'project' package importable whatever directory the IDE
# chose as the kernel's working directory.
import sys, os, pathlib

_here = pathlib.Path.cwd()
# Candidate roots: the CWD, its parent, and a 'project' folder under either one.
cands = [_here, _here.parent, _here / 'project', _here.parent / 'project']
for p in cands:
    _entry = str(p)
    # Skip candidates that do not exist on disk or are already importable.
    if not p.is_dir() or _entry in sys.path:
        continue
    # Each insert goes to the front, so the LAST qualifying candidate ends up at sys.path[0].
    sys.path.insert(0, _entry)
print('sys.path[0]=', sys.path[0])

sys.path[0]= c:\Users\Almog Luz\Documents\GitHub\mlhc-final-project\project\project


In [None]:
# User configuration
import os, pathlib, datetime as _dt

# --- Cloud project ---------------------------------------------------------
# Default copied from exploration notebook; change if your project differs.
GCP_PROJECT_ID = 'ml-for-healthcare-2025'

# --- Cohort CSV ------------------------------------------------------------
# The notebook lives inside 'project/', so the cohort CSV normally sits under 'data/'.
# Several relative locations are probed so the cell also works from the repo root.
_COHORT_CANDIDATES = [
    'data/initial_cohort.csv',  # notebook cwd inside project/
    os.path.join('project', 'data', 'initial_cohort.csv'),  # if run from repo root
    os.path.join('..', 'project', 'data', 'initial_cohort.csv'),  # defensive
]
# First existing candidate wins. If none exists, keep the first candidate so the
# downstream loading cell raises a clear error that names a concrete path.
INITIAL_COHORT_CSV = next(
    (_path for _path in _COHORT_CANDIDATES if os.path.exists(_path)),
    _COHORT_CANDIDATES[0],
)

# --- Labels ----------------------------------------------------------------
LABELS_CSV = None  # if provided, skip generation; must have subject_id + *_label columns
GENERATE_LABELS = True  # build labels if LABELS_CSV is None

# --- Split / reproducibility -----------------------------------------------
TEST_SIZE = 0.2
RANDOM_STATE = 42

# --- Per-run artifacts directory -------------------------------------------
# Local time is fine here; uniqueness matters more than timezone.
RUN_ID = _dt.datetime.now().strftime('%Y%m%d_%H%M%S')

# Where runs/ goes (avoids accidental project/project nesting):
#   * cwd is itself named 'project'        -> 'runs' right here
#   * cwd contains a 'project' directory   -> 'project/runs'
#   * neither heuristic matches            -> plain 'runs'
_cwd_is_project = os.path.basename(os.getcwd()) == 'project'
RUNS_ROOT = (
    os.path.join('project', 'runs')
    if not _cwd_is_project and os.path.isdir('project')
    else 'runs'
)
# Collapse any accidental duplicate segment (idempotent safeguard)
RUNS_ROOT = RUNS_ROOT.replace('project' + os.sep + 'project', 'project')

ARTIFACTS_DIR = os.path.join(RUNS_ROOT, RUN_ID)
OUTPUT_MODELS_DIR = os.path.join(ARTIFACTS_DIR, 'models')
os.makedirs(OUTPUT_MODELS_DIR, exist_ok=True)
# Expose models dir for downstream inference scripts (unseen evaluation) via env var
os.environ['MLHC_MODELS_DIR'] = os.path.abspath(OUTPUT_MODELS_DIR)

print(f'Run ID: {RUN_ID}')
print('Artifacts root:', ARTIFACTS_DIR)
print('Models dir:', OUTPUT_MODELS_DIR)

# Echo the effective configuration as the cell's displayed value.
GCP_PROJECT_ID, INITIAL_COHORT_CSV, LABELS_CSV, GENERATE_LABELS, ARTIFACTS_DIR, OUTPUT_MODELS_DIR

Run ID: 20250905_113628
Artifacts root: project\runs\20250905_113628
Models dir: project\runs\20250905_113628\models


('ml-for-healthcare-2025',
 'data/initial_cohort.csv',
 None,
 True,
 'project\\runs\\20250905_113628',
 'project\\runs\\20250905_113628\\models')

In [3]:
# Load cohort & init BigQuery client (robust path resolution)
from google.cloud import bigquery
import pandas as pd, json

# Fail fast, with guidance on which relative path fits which working directory.
if not os.path.exists(INITIAL_COHORT_CSV):
    _missing_cohort_msg = (
        'Missing cohort CSV. Looked for: ' + INITIAL_COHORT_CSV + '\n'
        'If running from repo root, set INITIAL_COHORT_CSV to project/data/initial_cohort.csv; '
        'if running inside project/, set to data/initial_cohort.csv.'
    )
    raise FileNotFoundError(_missing_cohort_msg)

cohort_df = pd.read_csv(INITIAL_COHORT_CSV)
if 'subject_id' not in cohort_df.columns:
    raise ValueError('INITIAL_COHORT_CSV must contain subject_id')

# Rows with a missing subject_id are dropped before the integer cast.
_ids_present = cohort_df['subject_id'].dropna()
subject_ids = _ids_present.astype(int).tolist()
print(f'Cohort size: {len(subject_ids)} from {INITIAL_COHORT_CSV}')

# Keep a copy of the cohort file next to this run's other artifacts.
_snapshot_dir = os.path.join(ARTIFACTS_DIR, 'data')
os.makedirs(_snapshot_dir, exist_ok=True)
cohort_copy_path = os.path.join(_snapshot_dir, 'cohort_snapshot.csv')
cohort_df.to_csv(cohort_copy_path, index=False)
print('Saved cohort snapshot to', cohort_copy_path)

# Best effort: a failed client construction is reported and leaves bq_client as
# None, which the label-building cell checks before using it.
try:
    bq_client = bigquery.Client(project=GCP_PROJECT_ID)
    print('BigQuery client ready.')
except Exception as exc:
    bq_client = None
    print('Warning: BigQuery client not initialized ->', exc)


Cohort size: 32513 from data/initial_cohort.csv
Saved cohort snapshot to project\runs\20250905_113628\data\cohort_snapshot.csv
BigQuery client ready.
BigQuery client ready.


In [4]:
# Build labels if needed (requires BigQuery access)
import datetime as _dt

if LABELS_CSV is not None:
    # A labels file was configured (or set by an earlier run of this cell): load it.
    if not os.path.exists(LABELS_CSV):
        raise FileNotFoundError(f'Provided LABELS_CSV not found: {LABELS_CSV}')
    labels_df = pd.read_csv(LABELS_CSV)
    print(f'Loaded existing labels (n={len(labels_df)})')
elif GENERATE_LABELS:
    # No labels file yet: generate one under this run's artifacts directory.
    if bq_client is None:
        raise RuntimeError('Cannot generate labels without BigQuery client/auth. Provide LABELS_CSV instead.')
    from project.labels import build_and_save_labels
    _labels_dir = os.path.join(ARTIFACTS_DIR, 'data')
    os.makedirs(_labels_dir, exist_ok=True)
    # LABELS_CSV is rebound here so the training cell picks up the generated file.
    LABELS_CSV = os.path.join(_labels_dir, 'labels.csv')
    # NOTE(review): presumably build_and_save_labels writes the CSV at LABELS_CSV -- confirm in project.labels.
    labels_df = build_and_save_labels(bq_client, subject_ids, LABELS_CSV)
    print(f'Generated labels file: {LABELS_CSV} (n={len(labels_df)})')
else:
    raise ValueError('LABELS_CSV is None and GENERATE_LABELS is False -> nothing to train.')

labels_df.head()

Generated labels file: project\runs\20250905_113628\data\labels.csv (n=28473)


Unnamed: 0,subject_id,hadm_id,mortality_label,prolonged_los_label,readmission_label
0,2,163353,0,0,0
1,3,145834,0,1,0
2,4,185777,0,1,0
3,5,178980,0,0,0
4,7,118037,0,0,0


In [None]:
# Train models (with diagnostic wrapper) -- force reload to pick up latest changes in train.py
import importlib, sys, os, glob, json, traceback
# Ensure fresh copy of training module so old 'base_estimator' usage is not retained
if 'project.train' in sys.modules:
    import project.train as _train_mod
    importlib.reload(_train_mod)
    print('Reloaded project.train from', _train_mod.__file__)
else:
    import project.train as _train_mod
    print('Imported project.train from', _train_mod.__file__)
from project.train import train_from_labels  # now reflects reloaded code

# Sanity: show _fit_model source first line to confirm it uses "estimator="
import inspect
_fit_src = inspect.getsource(_train_mod._fit_model).splitlines()[0]
print('First line of _fit_model():', _fit_src)

PROFILE = True  # Toggle: set to False for a normal (quieter) run

# Training needs a labels file; the label-building cell sets LABELS_CSV when it generates one.
if LABELS_CSV is None:
    raise RuntimeError('LABELS_CSV is not set; cannot proceed with training.')
try:
    metrics = train_from_labels(
        GCP_PROJECT_ID,
        LABELS_CSV,
        OUTPUT_MODELS_DIR,
        test_size=TEST_SIZE,
        random_state=RANDOM_STATE,
        use_cache_first=True,
        profile=PROFILE,
    )
    metrics_path = os.path.join(OUTPUT_MODELS_DIR, 'metrics.json')
    print('Training complete. Metrics summary below (saved to', metrics_path, '):')
except Exception as e:
    # On failure, dump enough context (paths, directory contents, interpreter,
    # sklearn signature) to diagnose the problem, then re-raise the original error.
    print('Training failed:', type(e).__name__, str(e))
    print('\n=== TRACEBACK ===')
    traceback.print_exc()
    print('\n=== ENV / PATH DIAGNOSTICS ===')
    print('Working directory:', os.getcwd())
    print('Existing cohort paths:')
    for p in ['data/initial_cohort.csv','project/data/initial_cohort.csv','../project/data/initial_cohort.csv']:
        print(' -', p, 'FOUND' if os.path.exists(p) else 'missing')
    print('\nLabel file:', LABELS_CSV, 'FOUND' if os.path.exists(LABELS_CSV) else 'missing')
    print('\nModels dir (after failure) contents:')
    if os.path.exists(OUTPUT_MODELS_DIR):
        for fp in glob.glob(os.path.join(OUTPUT_MODELS_DIR,'*')):
            print(' -', os.path.basename(fp))
    else:
        print(' (models dir not created)')
    print('\nPython version:', sys.version)
    print('sys.path[0:5]=', sys.path[:5])
    # Show current CalibratedClassifierCV signature to aid debugging
    import inspect as _insp
    from sklearn.calibration import CalibratedClassifierCV as _CC
    print('CalibratedClassifierCV signature:', _insp.signature(_CC))
    raise

# Jupyter only renders the LAST top-level expression of a cell. A bare `metrics`
# inside the `try:` body is evaluated and discarded, so the summary promised by the
# message above was never shown. This line is reached only on success because the
# handler re-raises.
metrics

Imported project.train from c:\Users\Almog Luz\Documents\GitHub\mlhc-final-project\project\train.py
First line of _fit_model(): def _fit_model(X_t, y: np.ndarray) -> CalibratedClassifierCV:
[1/5] Loading labels CSV ...
Copied labels to c:\Users\Almog Luz\Documents\GitHub\mlhc-final-project\project\project\runs\20250905_113628\labels.csv
[2/5] Building / loading feature matrices ...
[2/5] Building / loading feature matrices ...
Loaded features from cache directory (skipped BigQuery): c:\Users\Almog Luz\Documents\GitHub\mlhc-final-project\project\data\extracted_cache
Loaded features from cache directory (skipped BigQuery): c:\Users\Almog Luz\Documents\GitHub\mlhc-final-project\project\data\extracted_cache
[3/5] Fitting preprocessing pipeline on 28473 patients x 1117 features ...
[3/5] Fitting preprocessing pipeline on 28473 patients x 1117 features ...


[4/5] Training targets:   0%|          | 0/3 [00:00<?, ?model/s]

CalibratedClassifierCV created with method=isotonic (using 'estimator' kwarg).
CalibratedClassifierCV created with method=isotonic (using 'estimator' kwarg).
CalibratedClassifierCV created with method=isotonic (using 'estimator' kwarg).
CalibratedClassifierCV created with method=isotonic (using 'estimator' kwarg).
CalibratedClassifierCV created with method=isotonic (using 'estimator' kwarg).
[5/5] Writing metrics.json
Training complete. Metrics summary below (saved to project\runs\20250905_113628\models\metrics.json ):
[5/5] Writing metrics.json
Training complete. Metrics summary below (saved to project\runs\20250905_113628\models\metrics.json ):


In [6]:
# Inspect saved artifacts: print the run directory as an indented tree, then dump metrics.json
import json, glob
print('Artifacts root contents:')
for root, dirs, files in os.walk(ARTIFACTS_DIR):
    # Depth below ARTIFACTS_DIR = number of path separators left once the root prefix is stripped.
    level = root.replace(ARTIFACTS_DIR, '').count(os.sep)
    indent = '  ' * level
    # basename() is empty when `root` ends with a separator; fall back to the full root path then.
    print(f"{indent}{os.path.basename(root) or ARTIFACTS_DIR}/")
    subindent = '  ' * (level + 1)
    for f in files:
        print(f"{subindent}{f}")
metrics_path = os.path.join(OUTPUT_MODELS_DIR, 'metrics.json')
if os.path.exists(metrics_path):
    print('\nmetrics.json:')
    # Context manager closes the handle deterministically; a bare open(...).read()
    # leaves it open until garbage collection.
    with open(metrics_path, 'r', encoding='utf-8') as metrics_file:
        print(metrics_file.read())

Artifacts root contents:
20250905_113628/
  labels.csv
  data/
    cohort_snapshot.csv
    labels.csv
  models/
    feature_columns.json
    feature_importance_mortality.csv
    feature_importance_prolonged_los.csv
    feature_importance_readmission.csv
    metrics.json
    model_mortality.joblib
    model_prolonged_los.joblib
    model_readmission.joblib
    mortality_calibration.png
    mortality_curves.json
    mortality_pr.png
    mortality_roc.png
    preprocessor.joblib
    prolonged_los_calibration.png
    prolonged_los_curves.json
    prolonged_los_pr.png
    prolonged_los_roc.png
    readmission_calibration.png
    readmission_curves.json
    readmission_pr.png
    readmission_roc.png

metrics.json:
{
  "mortality": {
    "n": 5695,
    "positives": 622,
    "prevalence": 0.10921861281826163,
    "roc_auc": 0.8260170006648907,
    "pr_auc": 0.38438358294712693,
    "brier": 0.08057889225707435,
    "ece": 0.006782419778244072,
    "threshold": 0.22849839012180953,
    "precisi

In [None]:
# Sanity check: detect accidental nested project/project structure and report
cwd = os.getcwd()
# The location of a nested copy depends on where the kernel is running:
#   * cwd is itself named 'project'  -> the nested copy is ./project
#   * otherwise (e.g. repo root)     -> the nested copy is ./project/project
# Testing only ./project/project misses the nesting when the notebook is opened
# from inside the project directory.
if os.path.basename(cwd) == 'project':
    suspect_path = 'project'
else:
    suspect_path = os.path.join('project', 'project')
nested_issue = os.path.isdir(suspect_path)
if nested_issue:
    # Report the absolute path so the message is unambiguous whatever the cwd is.
    print('\n[WARNING] Detected nested directory:', os.path.abspath(suspect_path))
    print('This likely came from earlier path logic. Current run artifacts are stored in:', ARTIFACTS_DIR)
    print('Recommended manual cleanup (run from repo root):')
    print('  - Move any needed runs from project/project/runs/* to project/runs/')
    print('  - Then remove the extra nested directory: project/project')
else:
    print('\nNo nested project/project directory detected.')

# Displayed value: the check result plus the absolute directory that holds this run's folder.
{'nested_issue': nested_issue, 'runs_root': os.path.abspath(os.path.join(ARTIFACTS_DIR, '..'))}