In [None]:
# Make the 'project' package importable regardless of which directory the
# IDE used as the kernel's working directory.
import sys, os, pathlib

_cwd = pathlib.Path.cwd()
# Probe the working directory, its parent, and a 'project' folder under each.
cands = [_cwd, _cwd.parent, _cwd / 'project', _cwd.parent / 'project']
for p in cands:
    # Skip locations that do not exist on disk.
    if not p.is_dir():
        continue
    # Skip locations already registered, so sys.path gets no duplicates.
    if str(p) in sys.path:
        continue
    # Each hit is pushed to the front, so the last qualifying candidate wins.
    sys.path.insert(0, str(p))
print('sys.path[0]=', sys.path[0])

# (Inspection placeholder) retaining content.

sys.path[0]= c:\Users\Almog Luz\Documents\GitHub\mlhc-final-project


In [None]:
# Run configuration: GCP project, input files, per-run output folders and
# train/test split settings.
import os, pathlib, datetime as _dt

GCP_PROJECT_ID = 'ml-for-healthcare-2025'
TEST_SIZE = 0.2
RANDOM_STATE = 42

# Labels are generated later unless a ready-made CSV path is supplied here.
LABELS_CSV = None
GENERATE_LABELS = True

# Places the cohort CSV may live, depending on the kernel's working directory.
_COHORT_CANDIDATES = [
    'data/initial_cohort.csv',
    os.path.join('project', 'data', 'initial_cohort.csv'),
    os.path.join('..', 'project', 'data', 'initial_cohort.csv'),
]
# First candidate that exists wins; otherwise fall back to the first entry so
# the loading cell can report a meaningful missing-file error.
INITIAL_COHORT_CSV = _COHORT_CANDIDATES[0]
for _candidate in _COHORT_CANDIDATES:
    if os.path.exists(_candidate):
        INITIAL_COHORT_CSV = _candidate
        break

# Timestamp-based identifier, used as the name of this run's folder.
RUN_ID = _dt.datetime.now().strftime('%Y%m%d_%H%M%S')

# Runs go under project/runs only when we are outside 'project' and a
# 'project' folder is visible from here; in every other case use ./runs.
_inside_project = os.path.basename(os.getcwd()) == 'project'
if not _inside_project and os.path.isdir('project'):
    RUNS_ROOT = os.path.join('project', 'runs')
else:
    RUNS_ROOT = 'runs'
# Collapse an accidental doubled 'project/project' prefix.
RUNS_ROOT = RUNS_ROOT.replace('project' + os.sep + 'project', 'project')

ARTIFACTS_DIR = os.path.join(RUNS_ROOT, RUN_ID)
OUTPUT_ARTIFACTS_DIR = os.path.join(ARTIFACTS_DIR, 'artifacts')
os.makedirs(OUTPUT_ARTIFACTS_DIR, exist_ok=True)
# Publish the absolute artifacts path through the environment.
os.environ['MLHC_ARTIFACTS_DIR'] = os.path.abspath(OUTPUT_ARTIFACTS_DIR)

print(f'Run ID: {RUN_ID}')
print('Artifacts root:', ARTIFACTS_DIR)
print('Artifacts dir:', OUTPUT_ARTIFACTS_DIR)
GCP_PROJECT_ID, INITIAL_COHORT_CSV, LABELS_CSV, GENERATE_LABELS, ARTIFACTS_DIR, OUTPUT_ARTIFACTS_DIR

Run ID: 20250905_141147
Artifacts root: runs\20250905_141147
Models dir: runs\20250905_141147\models


('ml-for-healthcare-2025',
 'data/initial_cohort.csv',
 None,
 True,
 'runs\\20250905_141147',
 'runs\\20250905_141147\\models')

In [9]:
# Read the cohort CSV, snapshot it into the run folder, and try to create a
# BigQuery client (the notebook continues without one if creation fails).
from google.cloud import bigquery
import pandas as pd, json

# Stop early with a hint about the expected locations when the file is absent.
_cohort_found = os.path.exists(INITIAL_COHORT_CSV)
if not _cohort_found:
    raise FileNotFoundError(
        'Missing cohort CSV. Looked for: ' + INITIAL_COHORT_CSV + '\n'
        'If running from repo root, set INITIAL_COHORT_CSV to project/data/initial_cohort.csv; '
        'if running inside project/, set to data/initial_cohort.csv.'
    )

cohort_df = pd.read_csv(INITIAL_COHORT_CSV)
if 'subject_id' not in cohort_df.columns:
    raise ValueError('INITIAL_COHORT_CSV must contain subject_id')

# Drop rows without an id before casting, then hand over a plain list of ints.
_subject_series = cohort_df['subject_id'].dropna()
subject_ids = _subject_series.astype(int).tolist()
print(f'Cohort size: {len(subject_ids)} from {INITIAL_COHORT_CSV}')

# Keep a copy of the exact cohort used next to this run's other outputs.
_snapshot_dir = os.path.join(ARTIFACTS_DIR, 'data')
os.makedirs(_snapshot_dir, exist_ok=True)
cohort_copy_path = os.path.join(_snapshot_dir, 'cohort_snapshot.csv')
cohort_df.to_csv(cohort_copy_path, index=False)
print('Saved cohort snapshot to', cohort_copy_path)

# Best effort: on any failure bq_client is left as None and a warning is shown.
try:
    bq_client = bigquery.Client(project=GCP_PROJECT_ID)
    print('BigQuery client ready.')
except Exception as e:
    bq_client = None
    print('Warning: BigQuery client not initialized ->', e)


Cohort size: 32513 from data/initial_cohort.csv
Saved cohort snapshot to runs\20250905_141147\data\cohort_snapshot.csv
BigQuery client ready.
BigQuery client ready.


In [10]:
# Obtain labels_df: load the CSV named by LABELS_CSV when one is configured,
# otherwise generate labels through BigQuery when GENERATE_LABELS is set.
import datetime as _dt

if LABELS_CSV is not None:
    # A labels file was supplied (or produced by an earlier run of this cell).
    if not os.path.exists(LABELS_CSV):
        raise FileNotFoundError(f'Provided LABELS_CSV not found: {LABELS_CSV}')
    labels_df = pd.read_csv(LABELS_CSV)
    print(f'Loaded existing labels (n={len(labels_df)})')
elif GENERATE_LABELS:
    # Generation needs a working BigQuery client.
    if bq_client is None:
        raise RuntimeError('Cannot generate labels without BigQuery client/auth. Provide LABELS_CSV instead.')
    from project.labels import build_and_save_labels
    _labels_dir = os.path.join(ARTIFACTS_DIR, 'data')
    os.makedirs(_labels_dir, exist_ok=True)
    # LABELS_CSV is rebound here so later cells find the generated file.
    LABELS_CSV = os.path.join(_labels_dir, 'labels.csv')
    labels_df = build_and_save_labels(bq_client, subject_ids, LABELS_CSV)
    print(f'Generated labels file: {LABELS_CSV} (n={len(labels_df)})')
else:
    raise ValueError('LABELS_CSV is None and GENERATE_LABELS is False -> nothing to train.')

labels_df.head()

Generated labels file: runs\20250905_141147\data\labels.csv (n=28473)


Unnamed: 0,subject_id,hadm_id,mortality_label,prolonged_los_label,readmission_label
0,2,163353,0,0,0
1,3,145834,0,1,0
2,4,185777,0,1,0
3,5,178980,0,0,0
4,7,118037,0,0,0


In [None]:
# Train models (with diagnostic wrapper)
#
# Steps:
#   1. Import project.train, reloading it when it is already loaded so edits
#      to the module are picked up without restarting the kernel.
#   2. Call train_from_labels with the settings below.
#   3. On failure, print the traceback plus path/environment diagnostics and
#      re-raise the original exception.
#   4. On success, display the returned metrics as the cell's output.
import importlib, sys, os, glob, json, traceback
import inspect

_already_loaded = 'project.train' in sys.modules
import project.train as _train_mod
if _already_loaded:
    importlib.reload(_train_mod)
    print('Reloaded project.train from', _train_mod.__file__)
else:
    print('Imported project.train from', _train_mod.__file__)
# Import after the reload so the name binds to the refreshed function.
from project.train import train_from_labels

# Print the first source line of _fit_model to show which version is loaded.
_fit_src = inspect.getsource(_train_mod._fit_model).splitlines()[0]
print('First line of _fit_model():', _fit_src)

# Training options forwarded to train_from_labels.
PROFILE = True
SKIP_IMPORTANCE = True
MODEL_TYPE = 'logreg'
CALIB_CV = 5
CV_FOLDS = 0
FINALIZE_MODE = False
if FINALIZE_MODE:
    # Finalize mode turns permutation importance back on.
    SKIP_IMPORTANCE = False
    print('[FINALIZE] Full calibration & permutation importance enabled.')

if LABELS_CSV is None:
    raise RuntimeError('LABELS_CSV is not set; cannot proceed with training.')

try:
    metrics = train_from_labels(
        GCP_PROJECT_ID,
        LABELS_CSV,
        OUTPUT_ARTIFACTS_DIR,
        test_size=TEST_SIZE,
        random_state=RANDOM_STATE,
        use_cache_first=True,
        profile=PROFILE,
        model_type=MODEL_TYPE,
        calib_cv=CALIB_CV,
        skip_importance=SKIP_IMPORTANCE,
        cv_folds=CV_FOLDS,
        finalize_mode=FINALIZE_MODE,
    )
    metrics_path = os.path.join(OUTPUT_ARTIFACTS_DIR, 'metrics.json')
    print('Training complete. Metrics summary below (saved to', metrics_path, '):')
except Exception as e:
    print('Training failed:', type(e).__name__, str(e))
    print('\n=== TRACEBACK ===')
    traceback.print_exc()
    print('\n=== ENV / PATH DIAGNOSTICS ===')
    print('Working directory:', os.getcwd())
    print('Existing cohort paths:')
    for p in ['data/initial_cohort.csv','project/data/initial_cohort.csv','../project/data/initial_cohort.csv']:
        print(' -', p, 'FOUND' if os.path.exists(p) else 'missing')
    print('\nLabel file:', LABELS_CSV, 'FOUND' if os.path.exists(LABELS_CSV) else 'missing')
    print('\nArtifacts dir (after failure) contents:')
    if os.path.exists(OUTPUT_ARTIFACTS_DIR):
        for fp in glob.glob(os.path.join(OUTPUT_ARTIFACTS_DIR,'*')):
            print(' -', os.path.basename(fp))
    else:
        print(' (artifacts dir not created)')
    # This diagnostic is optional: a failure here must not replace the
    # training error that is re-raised below.
    try:
        from sklearn.calibration import CalibratedClassifierCV as _CC
        print('CalibratedClassifierCV signature:', inspect.signature(_CC))
    except Exception as _diag_err:
        print('CalibratedClassifierCV signature: unavailable ->', _diag_err)
    raise

# IPython only displays a bare expression when it is the last top-level
# statement of the cell; inside the try block above it was silently dropped.
metrics

Reloaded project.train from c:\Users\Almog Luz\Documents\GitHub\mlhc-final-project\project\train.py
First line of _fit_model(): def _fit_model(
[1/5] Loading labels CSV ...
Copied labels to c:\Users\Almog Luz\Documents\GitHub\mlhc-final-project\project\runs\20250905_141147\labels.csv
[2/5] Building / loading feature matrices ...
[2/5] Building / loading feature matrices ...
Loaded features from cache directory (skipped BigQuery): c:\Users\Almog Luz\Documents\GitHub\mlhc-final-project\project\data\extracted_cache
Loaded features from cache directory (skipped BigQuery): c:\Users\Almog Luz\Documents\GitHub\mlhc-final-project\project\data\extracted_cache
[3/5] Fitting preprocessing pipeline on 28473 patients x 1618 features ...
[3/5] Fitting preprocessing pipeline on 28473 patients x 1618 features ...


[4/5] Training targets:   0%|          | 0/3 [00:00<?, ?model/s]

CalibratedClassifierCV(logreg) method=isotonic cv=5
CalibratedClassifierCV(logreg) method=isotonic cv=5
CalibratedClassifierCV(logreg) method=isotonic cv=5
CalibratedClassifierCV(logreg) method=isotonic cv=5
CalibratedClassifierCV(logreg) method=isotonic cv=5
[5/5] Writing metrics.json

=== STAGE TIMINGS (seconds) ===
total                           217.017
model_fit_readmission            84.501
model_fit_mortality              50.707
model_fit_prolonged_los          36.856
feature_extraction_total         23.903
metrics_mortality                 6.192
metrics_prolonged_los             4.577
metrics_readmission               3.569
preprocessor_fit_transform        3.200
split_readmission                 0.505
split_mortality                   0.494
split_prolonged_los               0.464
plot_prolonged_los                0.257
plot_readmission                  0.234
plot_mortality                    0.227
curves_prolonged_los              0.020
curves_mortality                  0.017


In [12]:
# Inspect saved artifacts
#
# Prints an indented tree of everything under ARTIFACTS_DIR, then dumps
# metrics.json from OUTPUT_ARTIFACTS_DIR when that file exists.
import os, json, glob

print('Artifacts root contents:')
for root, dirs, files in os.walk(ARTIFACTS_DIR):
    # Depth below the run folder: relpath gives '.' for the root itself and
    # one path component per nesting level otherwise.
    rel_root = os.path.relpath(root, ARTIFACTS_DIR)
    level = 0 if rel_root == os.curdir else rel_root.count(os.sep) + 1
    indent = '  ' * level
    print(f"{indent}{os.path.basename(root) or ARTIFACTS_DIR}/")
    subindent = '  ' * (level + 1)
    for f in files:
        print(f"{subindent}{f}")

# The training cell writes to OUTPUT_ARTIFACTS_DIR; the previously used
# OUTPUT_MODELS_DIR is not defined by the configuration cell.
metrics_path = os.path.join(OUTPUT_ARTIFACTS_DIR, 'metrics.json')
if os.path.exists(metrics_path):
    print('\nmetrics.json:')
    # Context manager closes the file handle deterministically.
    with open(metrics_path, 'r', encoding='utf-8') as metrics_file:
        print(metrics_file.read())

Artifacts root contents:
20250905_141147/
  labels.csv
  data/
    cohort_snapshot.csv
    labels.csv
  models/
    feature_columns.json
    metrics.json
    model_mortality.joblib
    model_prolonged_los.joblib
    model_readmission.joblib
    mortality_calibration.png
    mortality_curves.json
    mortality_pr.png
    mortality_roc.png
    preprocessor.joblib
    prolonged_los_calibration.png
    prolonged_los_curves.json
    prolonged_los_pr.png
    prolonged_los_roc.png
    readmission_calibration.png
    readmission_curves.json
    readmission_pr.png
    readmission_roc.png
    run_metadata.json

metrics.json:
{
  "mortality": {
    "n": 5695,
    "positives": 622,
    "prevalence": 0.10921861281826163,
    "roc_auc": 0.8229600247955413,
    "pr_auc": 0.37501952036150565,
    "brier": 0.08109804094620389,
    "ece": 0.007166416276734353,
    "threshold": 0.2051432121558919,
    "precision": 0.34965719882468166,
    "recall": 0.5739549839228296,
    "f1": 0.4345709068776628,
    "s