In [None]:
import sys
from pathlib import Path

# Add parent directory to path
sys.path.insert(0, str(Path.cwd().parent))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from ehrtriage.config import DATA_DIR, MODELS_DIR, load_config
from ehrtriage.schemas import Event, PatientTimeline

# Notebook-wide plot defaults: seaborn grid theme and a wide default figure
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})

# Reaching this line means every import above resolved
print("✓ Imports successful")

## 1. Load Synthetic Data

In [None]:
# Read the four synthetic parquet tables from DATA_DIR / "synthetic"
synthetic_dir = DATA_DIR / "synthetic"

patients_df, admissions_df, icu_stays_df, events_df = (
    pd.read_parquet(synthetic_dir / filename)
    for filename in (
        "patients.parquet",
        "admissions.parquet",
        "icu_stays.parquet",
        "events.parquet",
    )
)

# Report the row count of each table with thousands separators
for label, frame in (
    ("Patients", patients_df),
    ("Admissions", admissions_df),
    ("ICU Stays", icu_stays_df),
    ("Events", events_df),
):
    print(f"{label}: {len(frame):,}")

In [None]:
# Demographics: preview the patient table, then summarise the age column
print("\n=== Patient Demographics ===")
print(patients_df.head())
print("\nAge distribution:")
age = patients_df['age']
print(age.describe())

# Two side-by-side panels: age histogram (left) and gender counts (right)
fig, (age_ax, gender_ax) = plt.subplots(1, 2, figsize=(10, 5))

age.hist(bins=30, edgecolor='black', ax=age_ax)
age_ax.set(xlabel='Age', ylabel='Count', title='Age Distribution')

# rot=0 keeps the category labels horizontal
patients_df['gender'].value_counts().plot(kind='bar', ax=gender_ax, rot=0)
gender_ax.set(xlabel='Gender', ylabel='Count', title='Gender Distribution')

fig.tight_layout()
plt.show()

## 2. Explore Admissions and Readmissions

In [None]:
# Preview the admissions table (shown before the derived column is added)
print("\n=== Admissions Overview ===")
print(admissions_df.head())

# Length of stay in fractional days: (discharge - admit) in seconds / 86400.
# The result is stored on admissions_df as the 'los_days' column.
discharge_ts = pd.to_datetime(admissions_df['dischtime'])
admit_ts = pd.to_datetime(admissions_df['admittime'])
stay_duration = discharge_ts - admit_ts
admissions_df['los_days'] = stay_duration.dt.total_seconds() / 86400

print("\nLength of stay (days):")
print(admissions_df['los_days'].describe())

# Two panels: length-of-stay histogram (left) and admission-type counts (right)
fig, (los_ax, type_ax) = plt.subplots(1, 2, figsize=(10, 5))

admissions_df['los_days'].hist(bins=30, edgecolor='black', ax=los_ax)
los_ax.set(
    xlabel='Length of Stay (days)',
    ylabel='Count',
    title='Length of Stay Distribution',
)

# rot=45 tilts the admission-type labels so they do not overlap
admissions_df['admission_type'].value_counts().plot(kind='bar', ax=type_ax, rot=45)
type_ax.set(xlabel='Admission Type', ylabel='Count', title='Admission Types')

fig.tight_layout()
plt.show()

## 3. Analyze Clinical Events

In [None]:
# Preview the events table and tabulate how often each event type occurs
print("\n=== Clinical Events ===")
print(events_df.head())

event_type_counts = events_df['event_type'].value_counts()
print("\nEvent type distribution:")
print(event_type_counts)

# Two panels: event-type counts (left) and events-per-admission histogram (right)
fig, (type_ax, per_adm_ax) = plt.subplots(1, 2, figsize=(12, 5))

# rot=45 tilts the event-type labels so they do not overlap
event_type_counts.plot(kind='bar', ax=type_ax, rot=45)
type_ax.set(xlabel='Event Type', ylabel='Count', title='Event Type Distribution')

# Number of event rows recorded under each hadm_id
events_per_adm = events_df.groupby('hadm_id').size()
events_per_adm.hist(bins=30, edgecolor='black', ax=per_adm_ax)
per_adm_ax.set(
    xlabel='Events per Admission',
    ylabel='Count',
    title='Events per Admission Distribution',
)

fig.tight_layout()
plt.show()

In [None]:
# Split the event log by type and count how often each event code appears
event_type = events_df['event_type']

# Rows whose event_type is 'vital'
vital_events = events_df.loc[event_type.eq('vital')]
print("\nVital signs:")
print(vital_events['event_code'].value_counts())

# Rows whose event_type is 'lab'
lab_events = events_df.loc[event_type.eq('lab')]
print("\nLab tests:")
print(lab_events['event_code'].value_counts())

## 4. Load and Examine Cohorts

In [None]:
# Try to load processed cohorts (if training has been run)
processed_dir = DATA_DIR / "processed"

# Only the file read is guarded: a missing cohort file is an expected state
# before training has run, while errors in the summary/plot code below should
# surface instead of being reported as a missing file.
try:
    readmission_cohort = pd.read_parquet(processed_dir / "readmission_cohort.parquet")
except FileNotFoundError:
    print("Cohort files not found. Run training script first.")
else:
    readmit_flag = readmission_cohort['readmit_30d']

    print("\n=== Readmission Cohort ===")
    print(f"Total: {len(readmission_cohort):,}")
    print(f"Readmissions: {readmit_flag.sum():,}")
    print(f"Readmission rate: {readmit_flag.mean():.2%}")

    # value_counts() orders rows by frequency, not by label value, so pinning
    # 'No Readmission' to bar 0 would mislabel the plot whenever readmissions
    # are the majority. Sort by label and derive each tick text from the index
    # value itself; this also stays correct when only one class is present.
    readmit_counts = readmit_flag.value_counts().sort_index()
    readmit_labels = [
        'Readmission' if flag else 'No Readmission'
        for flag in readmit_counts.index
    ]

    # Plot readmission distribution
    fig, ax = plt.subplots(figsize=(8, 5))
    readmit_counts.plot(kind='bar', ax=ax)
    ax.set_xlabel('30-Day Readmission')
    ax.set_ylabel('Count')
    ax.set_title('Readmission Cohort Distribution')
    ax.set_xticks(list(range(len(readmit_labels))))
    ax.set_xticklabels(readmit_labels, rotation=0)
    fig.tight_layout()
    plt.show()

In [None]:
# Load the ICU mortality cohort (if training has been run).
# Only the file read is guarded: a missing cohort file is an expected state
# before training has run, while errors in the summary/plot code below should
# surface instead of being reported as a missing file.
try:
    icu_cohort = pd.read_parquet(processed_dir / "icu_mortality_cohort.parquet")
except FileNotFoundError:
    print("Cohort files not found. Run training script first.")
else:
    mortality_flag = icu_cohort['mortality_label']

    print("\n=== ICU Mortality Cohort ===")
    print(f"Total: {len(icu_cohort):,}")
    print(f"Deaths: {mortality_flag.sum():,}")
    print(f"Mortality rate: {mortality_flag.mean():.2%}")

    # value_counts() orders rows by frequency, not by label value, so pinning
    # 'Survived' to bar 0 would mislabel the plot whenever deaths are the
    # majority. Sort by label and derive each tick text from the index value
    # itself; this also stays correct when only one class is present.
    mortality_counts = mortality_flag.value_counts().sort_index()
    mortality_labels = [
        'Died' if flag else 'Survived'
        for flag in mortality_counts.index
    ]

    # Plot mortality distribution
    fig, ax = plt.subplots(figsize=(8, 5))
    mortality_counts.plot(kind='bar', ax=ax)
    ax.set_xlabel('48-Hour Mortality')
    ax.set_ylabel('Count')
    ax.set_title('ICU Mortality Cohort Distribution')
    ax.set_xticks(list(range(len(mortality_labels))))
    ax.set_xticklabels(mortality_labels, rotation=0)
    fig.tight_layout()
    plt.show()

## 5. Test Model Predictions

In [None]:
# Load a trained model (if available)
from ehrtriage.models.baselines import LogisticBaseline

# Only the model load is guarded: missing artifacts are an expected state
# before training has run, while errors in the reporting/plot code below
# should surface instead of being reported as missing model files.
try:
    model_dir = MODELS_DIR / "artifacts" / "readmission"
    model = LogisticBaseline.load(model_dir, "logistic")
except FileNotFoundError:
    print("Model files not found. Run training script first.")
else:
    print("✓ Loaded readmission model")

    # Get feature importance as (feature, importance) pairs.
    # Materialised into a list so it can be iterated twice (print, then plot)
    # even if the model returns a one-shot iterator.
    importance = list(model.get_feature_importance(top_k=10))
    print("\nTop 10 most important features:")
    for feat, imp in importance:
        print(f"  {feat}: {imp:.4f}")

    if importance:
        # Plot feature importance
        features, importances = zip(*importance)
        positions = range(len(features))
        fig, ax = plt.subplots(figsize=(10, 6))
        ax.barh(positions, importances)
        ax.set_yticks(list(positions))
        ax.set_yticklabels(features)
        ax.set_xlabel('Importance (Coefficient)')
        ax.set_title('Top 10 Feature Importances - Readmission Model')
        fig.tight_layout()
        plt.show()
    else:
        # zip(*[]) cannot be unpacked into two names, so skip the plot
        print("No feature importances available to plot.")

## 6. Visualize Example Patient Timeline

In [None]:
# Pick a random admission and visualize timeline.
# The seed is fixed so Restart & Run All reproduces the same figure; change
# SAMPLE_SEED (or set it to None) to look at a different admission.
SAMPLE_SEED = 42
sample_hadm_id = admissions_df['hadm_id'].sample(1, random_state=SAMPLE_SEED).iloc[0]

# Convert charttime once, before sorting, so the ordering is chronological
# even if the column is stored as text, and so every panel shares one time axis.
sample_events = (
    events_df[events_df['hadm_id'] == sample_hadm_id]
    .assign(charttime=lambda frame: pd.to_datetime(frame['charttime']))
    .sort_values('charttime')
)

print(f"\n=== Patient Timeline for hadm_id={sample_hadm_id} ===")
print(f"Number of events: {len(sample_events)}")
print("\nFirst 10 events:")
print(sample_events.head(10)[['charttime', 'event_type', 'event_code', 'value']])


def _mark_empty_panel(ax):
    """Write a centred note on a panel that has no events to draw."""
    ax.text(0.5, 0.5, 'No events recorded', transform=ax.transAxes,
            ha='center', va='center')


def _plot_value_panel(ax, panel_events, fmt, title):
    """Draw one value-over-time line per event code on ``ax``.

    ``panel_events`` must have 'charttime', 'event_code' and 'value' columns;
    ``fmt`` is the matplotlib format string used for every line. The panel is
    always titled, so an admission without events of this type is still
    readable rather than a blank axis.
    """
    ax.set_title(title)
    ax.set_ylabel('Value')
    ax.grid(True)
    if len(panel_events) == 0:
        _mark_empty_panel(ax)
        return
    for code in panel_events['event_code'].unique():
        code_events = panel_events[panel_events['event_code'] == code]
        ax.plot(code_events['charttime'], code_events['value'], fmt, label=code)
    ax.legend()


# Plot timeline
fig, (vital_ax, lab_ax, med_ax) = plt.subplots(3, 1, figsize=(14, 10), sharex=True)

# Vital signs
vitals = sample_events[sample_events['event_type'] == 'vital']
_plot_value_panel(vital_ax, vitals, 'o-', 'Vital Signs Over Time')

# Lab values
labs = sample_events[sample_events['event_type'] == 'lab']
_plot_value_panel(lab_ax, labs, 's-', 'Lab Results Over Time')

# Medications: one row of tick marks per medication code
meds = sample_events[sample_events['event_type'] == 'medication']
med_ax.set_title('Medications Over Time')
med_ax.set_ylabel('Medication')
med_ax.set_xlabel('Time')  # bottom panel of the shared x-axis, so always labelled
med_ax.grid(True)
if len(meds) > 0:
    med_codes = list(meds['event_code'].unique())
    for row, code in enumerate(med_codes):
        code_events = meds[meds['event_code'] == code]
        med_ax.scatter(code_events['charttime'], [row] * len(code_events),
                       label=code, s=100, marker='|')
    # Name each row on the y-axis instead of showing its integer position
    med_ax.set_yticks(list(range(len(med_codes))))
    med_ax.set_yticklabels(med_codes)
else:
    _mark_empty_panel(med_ax)

fig.tight_layout()
plt.show()

## Summary

This notebook explored:
- Synthetic patient demographics and admissions
- Clinical event distributions
- Cohort characteristics
- Model feature importance
- Example patient timelines

**Remember**: This notebook uses synthetic data for demonstration purposes only. It is not intended for clinical use.