In [None]:
# Make the 'project' package importable from wherever the notebook is started:
# the working directory, its parent, and a project/ folder under either one
# are added to sys.path when they exist on disk.
import sys
import os
import pathlib

cwd = pathlib.Path().resolve()
candidates = [
    cwd,
    cwd.parent,
    cwd / 'project',
    cwd.parent / 'project',
]
# Every hit is pushed to the FRONT of sys.path, so entries later in
# `candidates` end up ahead of earlier ones in the final search order.
for p in map(str, candidates):
    if p in sys.path or not os.path.isdir(p):
        continue
    sys.path.insert(0, p)

# Fail fast, with the full search path in the message, if the package is
# still not importable after the adjustments above.
try:
    import project  # noqa: F401
    print('Import check OK. sys.path[0]=', sys.path[0])
except Exception as e:
    raise RuntimeError(f'Failed to import project package. sys.path= {sys.path}') from e

In [None]:
# Notebook settings: the GCP project to bill queries to and the input CSVs
GCP_PROJECT_ID = 'YOUR_GCP_PROJECT'  # <-- set this
# CSV with a subject_id column listing the subjects to score
SUBJECTS_CSV = os.path.join(*('project', 'data', 'test_example.csv'))
# Optional: path to a labels CSV with subject_id and any of the *_label columns
LABELS_CSV = None
# Echo the settings as the cell output
(GCP_PROJECT_ID, SUBJECTS_CSV, LABELS_CSV)

In [None]:
# Score every subject listed in SUBJECTS_CSV with the unseen-data pipeline
import pandas as pd
from google.cloud import bigquery
from project.unseen_data_evaluation import run_pipeline_on_unseen_data

bq_client = bigquery.Client(project=GCP_PROJECT_ID)

subjects_df = pd.read_csv(SUBJECTS_CSV)
if not ('subject_id' in subjects_df.columns):
    raise ValueError('SUBJECTS_CSV must contain subject_id column')

# Rows without an id are dropped; the rest are passed on as plain Python ints.
subject_ids = (
    subjects_df['subject_id']
    .dropna()
    .astype(int)
    .tolist()
)

preds = run_pipeline_on_unseen_data(subject_ids, bq_client)
print('Predictions shape:', preds.shape)
preds.head()

In [None]:
# Optional: compute metrics if LABELS_CSV provided
#
# Steps when LABELS_CSV is set:
#   1. read the labels CSV and require a subject_id column;
#   2. map variant label column names onto the canonical *_label names;
#   3. inner-join the labels onto `preds` by subject_id;
#   4. for each task whose label and probability columns are both present,
#      compute binary metrics on the rows where both values are non-missing;
#   5. print the per-task metrics as JSON.
# Raises ValueError if subject_id is missing or if several label columns
# collapse onto the same canonical name.
if LABELS_CSV:
    from project.metrics_utils import compute_binary_metrics, metrics_to_dict
    import json
    labels_df = pd.read_csv(LABELS_CSV)
    if 'subject_id' not in labels_df.columns:
        raise ValueError('Labels CSV must contain subject_id')
    # Normalize possible variant label column names (matched case-insensitively).
    # Names that already end in '_label' need no entry here: they are only
    # lower-cased below.
    rename_map = {
        'prolonged_los': 'prolonged_los_label',
        'prolonged_los>7d': 'prolonged_los_label',
        'mortality': 'mortality_label',
        'readmission': 'readmission_label',
    }
    # original column name -> name to use after normalization
    canonical_names = {}
    for c in labels_df.columns:
        lc = c.lower()
        if lc in rename_map:
            canonical_names[c] = rename_map[lc]
        elif lc.endswith('_label'):
            canonical_names[c] = lc
        else:
            canonical_names[c] = c
    # Two source columns landing on one name (e.g. 'mortality' and
    # 'mortality_label') would make merged[y_col] a DataFrame and fail
    # obscurely later, so reject that input up front.
    targets = list(canonical_names.values())
    clashes = sorted({t for t in targets if targets.count(t) > 1})
    if clashes:
        raise ValueError(
            f'Labels CSV has several columns that map to the same name: {clashes}'
        )
    labels_df = labels_df.rename(columns=canonical_names)
    merged = preds.merge(labels_df, on='subject_id', how='inner')
    results = {}
    for y_col, p_col, name in [
        ('mortality_label', 'mortality_proba', 'mortality'),
        ('prolonged_los_label', 'prolonged_LOS_proba', 'prolonged_los'),
        ('readmission_label', 'readmission_proba', 'readmission'),
    ]:
        if y_col not in merged.columns or p_col not in merged.columns:
            continue
        # Drop rows where EITHER value is missing in one step, so labels and
        # probabilities stay row-aligned and no NaN reaches the metrics.
        scored = merged[[y_col, p_col]].dropna()
        if scored.empty:
            continue
        y = scored[y_col].astype(int).values
        p = scored[p_col].astype(float).values
        m = compute_binary_metrics(y, p, threshold_objective='f1')
        results[name] = metrics_to_dict(m)
    if not results:
        # Explain an otherwise silent '{}' output.
        print('No metrics computed: no task had both a label column and a '
              'probability column with overlapping, non-missing rows.')
    print(json.dumps(results, indent=2))
else:
    print('Set LABELS_CSV to compute metrics.')