In [None]:
# Environment setup for the sanity checks: adjust the import path, then load
# the third-party and project modules.
import sys
from pathlib import Path

# Put the parent of the current working directory at the front of sys.path.
# NOTE(review): presumably the kernel is started from a sub-folder of the
# repository so that the parent contains the `ehrtriage` package — confirm.
# Path.cwd() follows the kernel's working directory, not the notebook file.
sys.path.insert(0, str(Path.cwd().parent))

import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# Project modules resolved through the sys.path entry added above (or an
# installed copy of the package, if one exists).
from ehrtriage.config import DATA_DIR, MODELS_DIR, load_config
from ehrtriage.synthetic_data import generate_synthetic_data
from ehrtriage.cohort import build_readmission_cohort, build_icu_mortality_cohort
from ehrtriage.features import build_snapshot_features
from ehrtriage.schemas import Event, PatientTimeline

# Reaching this line means every import above resolved without raising.
print("✓ All imports successful")

## 1. Check Synthetic Data Generation

In [None]:
# Generate a small synthetic dataset and verify its size and column schema.
# The patient count and seed are named once so the size assertion below
# cannot drift away from the value actually requested.
N_PATIENTS = 50
RANDOM_SEED = 12345

print("Generating synthetic data...")
patients, admissions, icu_stays, events = generate_synthetic_data(
    n_patients=N_PATIENTS,
    random_seed=RANDOM_SEED
)

# Validate outputs
assert len(patients) == N_PATIENTS, f"Expected {N_PATIENTS} patients"
assert len(admissions) > 0, "Expected admissions"
assert len(events) > 0, "Expected events"

# Check schema: same columns as before, but a failure now names the table
# and lists every column that is missing instead of raising a bare assertion.
required_columns = {
    'patients': (patients, ['subject_id', 'age', 'gender']),
    'admissions': (admissions, ['hadm_id', 'admittime', 'dischtime']),
    'events': (events, ['event_type', 'event_code', 'value']),
}
for table_name, (table, expected) in required_columns.items():
    missing = [col for col in expected if col not in table.columns]
    assert not missing, f"{table_name} is missing columns: {missing}"

print("✓ Synthetic data generation: PASSED")
print(f"  - Patients: {len(patients)}")
print(f"  - Admissions: {len(admissions)}")
print(f"  - Events: {len(events)}")

## 2. Check Cohort Building

In [None]:
# Derive the readmission cohort from the admissions table and validate it
print("\nBuilding readmission cohort...")
readmission_cohort = build_readmission_cohort(admissions)

# The cohort must have rows and carry the label column
assert len(readmission_cohort) >= 1, "Cohort should not be empty"
assert 'readmit_30d' in readmission_cohort.columns, "Missing label column"

# Every label must be binary
readmit_labels = readmission_cohort['readmit_30d']
assert readmit_labels.isin([0, 1]).all(), "Labels should be 0 or 1"

print("✓ Readmission cohort building: PASSED")
print(f"  - Cohort size: {len(readmission_cohort)}")
print(f"  - Readmission rate: {readmission_cohort['readmit_30d'].mean():.2%}")

In [None]:
# Derive the ICU mortality cohort from ICU stays plus patients and validate it
print("\nBuilding ICU mortality cohort...")
icu_cohort = build_icu_mortality_cohort(icu_stays, patients)

# The cohort must have rows and carry the label column
assert len(icu_cohort) >= 1, "Cohort should not be empty"
assert 'mortality_label' in icu_cohort.columns, "Missing label column"

# Every label must be binary
mortality_labels = icu_cohort['mortality_label']
assert mortality_labels.isin([0, 1]).all(), "Labels should be 0 or 1"

print("✓ ICU mortality cohort building: PASSED")
print(f"  - Cohort size: {len(icu_cohort)}")
print(f"  - Mortality rate: {icu_cohort['mortality_label'].mean():.2%}")

## 3. Check Feature Engineering

In [None]:
# Test snapshot feature building
print("\nTesting snapshot feature building...")

# Filter events for one admission: take the first cohort row and keep its
# events charted no later than that row's discharge time.
sample_hadm = readmission_cohort.iloc[0]
sample_events = events[
    (events['hadm_id'] == sample_hadm['hadm_id']) &
    (pd.to_datetime(events['charttime']) <= pd.to_datetime(sample_hadm['discharge_time']))
]

print(f"Sample admission has {len(sample_events)} events")

# Build features for a small slice of the cohort (not just the one admission)
config = load_config()
sample_cohort = readmission_cohort.head(10)  # Use first 10 for testing

try:
    features_df = build_snapshot_features(
        events_df=events,
        cohort_df=sample_cohort,
        config=config,
        task='readmission'
    )

    # One feature row per cohort row, at least one column, and no column
    # that is null in every row.
    assert len(features_df) == len(sample_cohort), "Feature rows should match cohort"
    assert features_df.shape[1] > 0, "Should have features"
    assert not features_df.isnull().all().any(), "No completely null columns"

    print("✓ Snapshot feature building: PASSED")
    print(f"  - Feature shape: {features_df.shape}")
    print(f"  - Sample features: {list(features_df.columns[:5])}")
except Exception as e:
    # Kept non-fatal so the following cells still run. The exception class
    # is printed as well because str(e) alone can be uninformative (a
    # KeyError, for example, renders as just the quoted key).
    print(f"✗ Snapshot feature building: FAILED")
    print(f"  Error: {type(e).__name__}: {e}")

## 4. Check Sequence Building

In [None]:
from ehrtriage.sequence_builder import build_sequence_for_stay

print("\nTesting sequence building...")

# Upper bound on the number of time bins. Named once so the value passed to
# the builder and the value checked by the assertion cannot diverge.
MAX_SEQUENCE_LENGTH = 24  # 24 time bins

# Build sequence for one admission. `sample_hadm` and `config` come from the
# snapshot-feature cell earlier in the notebook, so that cell must run first.
try:
    sequence, mask, times = build_sequence_for_stay(
        stay_id=sample_hadm['hadm_id'],
        events_df=events,
        start_time=pd.to_datetime(sample_hadm['index_admit_time']),
        end_time=pd.to_datetime(sample_hadm['discharge_time']),
        config=config,
        max_length=MAX_SEQUENCE_LENGTH
    )

    assert sequence.shape[0] <= MAX_SEQUENCE_LENGTH, "Sequence length should not exceed max_length"
    assert sequence.shape[1] > 0, "Should have feature dimensions"
    assert len(mask) == len(sequence), "Mask length should match sequence"
    assert mask.sum() > 0, "Should have at least some valid time bins"

    print("✓ Sequence building: PASSED")
    print(f"  - Sequence shape: {sequence.shape}")
    print(f"  - Valid time bins: {mask.sum()} / {len(mask)}")
except Exception as e:
    # Kept non-fatal so the following cells still run. The exception class
    # is printed as well because str(e) alone can be uninformative (a
    # KeyError, for example, renders as just the quoted key).
    print(f"✗ Sequence building: FAILED")
    print(f"  Error: {type(e).__name__}: {e}")

## 5. Check Model Loading and Prediction

In [None]:
from ehrtriage.models.baselines import LogisticBaseline

print("\nTesting model loading and prediction...")

try:
    model_dir = MODELS_DIR / "artifacts" / "readmission"
    # The check is skipped (not failed) when no trained artifact is on disk.
    if (model_dir / "logistic_model.pkl").exists():
        model = LogisticBaseline.load(model_dir, "logistic")

        # Make a prediction on dummy data. The width follows the model's
        # feature list when it exposes one; 100 is only a fallback guess.
        n_features = len(model.feature_names) if hasattr(model, 'feature_names') else 100
        # Seeded generator so the printed prediction is identical on every
        # Restart & Run All instead of changing with each execution.
        rng = np.random.default_rng(12345)
        dummy_input = rng.standard_normal((1, n_features))

        proba = model.predict_proba(dummy_input)

        assert proba.shape == (1, 2), "Prediction shape should be (1, 2)"
        assert np.isclose(proba.sum(), 1.0), "Probabilities should sum to 1"
        assert (proba >= 0).all() and (proba <= 1).all(), "Probabilities should be in [0, 1]"

        print("✓ Model loading and prediction: PASSED")
        print(f"  - Model type: Logistic Regression")
        print(f"  - Prediction: {proba[0, 1]:.4f}")
    else:
        print("⊘ Model not found. Run training first.")
except Exception as e:
    # Kept non-fatal so the following cells still run. The exception class
    # is printed as well because str(e) alone can be uninformative (a
    # KeyError, for example, renders as just the quoted key).
    print(f"✗ Model loading: FAILED")
    print(f"  Error: {type(e).__name__}: {e}")

## 6. Check Schema Validation

In [None]:
print("\nTesting Pydantic schemas...")

# Build a single Event from keyword fields and spot-check two of them
heart_rate_fields = {
    "time": "2024-01-01T12:00:00",
    "type": "vital",
    "code": "HR",
    "value": 80.0,
}
event = Event(**heart_rate_fields)
assert event.type == "vital"
assert event.value == 80.0

# Build a PatientTimeline holding two events and check what it stored
timeline_events = [
    Event(time="2024-01-01T12:00:00", type="vital", code="HR", value=80.0),
    Event(time="2024-01-01T13:00:00", type="lab", code="CREATININE", value=1.2),
]
timeline = PatientTimeline(
    subject_id="TEST001",
    stay_id="ADM001",
    events=timeline_events,
)
assert len(timeline.events) == 2
assert timeline.subject_id == "TEST001"

print("✓ Schema validation: PASSED")
print(f"  - Event schema: OK")
print(f"  - PatientTimeline schema: OK")

## 7. Check Explanation Generation

In [None]:
from ehrtriage.explain.text_generator import generate_logistic_explanation

print("\nTesting explanation generation...")

# Hand-written (feature name, attribution) pairs standing in for real output
attributions = [
    ('HR_mean', 0.15),
    ('CREATININE_mean', 0.12),
    ('BP_SYSTOLIC_min', -0.08),
]

explanation = generate_logistic_explanation(
    risk_score=0.75, task='readmission', attributions=attributions
)

# The result must be non-empty text ...
assert isinstance(explanation, str), "Explanation should be a string"
assert len(explanation) > 0, "Explanation should not be empty"

# ... and mention at least one of the disclaimer keywords, case-insensitively
explanation_lower = explanation.lower()
assert any(
    keyword in explanation_lower for keyword in ("research", "prototype")
), "Should contain disclaimer"

print("✓ Explanation generation: PASSED")
print(f"  - Explanation length: {len(explanation)} characters")
print(f"  - Sample: {explanation[:150]}...")

## Summary

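All sanity checks completed. Review the results above for any failures: checks 3–5 report errors as ✗ lines without stopping the notebook, while a failed assertion in the other checks raises an exception in its cell.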

### Expected Status:
- ✓ = Passed
- ✗ = Failed
- ⊘ = Skipped (e.g., models not trained yet)

If any checks failed, review the error messages and ensure:
1. Dependencies are installed correctly
2. Data has been generated
3. Models have been trained (if testing model-related checks)