# 01 · Exploratory Data Analysis (EDA)

**Goal:** Load the CHMMOTv1 labels table, inspect distributions, define outcome bins, and export cleaned cohort + splits.

> Put your labels file at `data/raw/CHMMOTv1_labels.xlsx` (or `.csv`) before running.

In [None]:

from pathlib import Path
import pandas as pd
import numpy as np
from src.io_utils import read_labels, ensure_dirs
from src.preprocessing import bin_t2star_ms, make_splits, add_missing_indicators

# Input and output locations, relative to the directory the notebook runs from.
RAW = Path('data/raw')
PROC = Path('data/processed')
ensure_dirs(PROC)

# === 1) Load ===
labels_path = RAW / 'CHMMOTv1_labels.xlsx'  # change to your filename
df = read_labels(labels_path)
print(df.head())

# === 2) Basic cleaning ===
# Standardize column names (example placeholders; update to match actual columns).
# str(c) guards against non-string headers (e.g. numeric column labels read
# from a spreadsheet), which have no .strip() method.
df.columns = [str(c).strip().lower().replace(' ', '_') for c in df.columns]

# Example expected columns (adjust to your file):
# 'patient_id', 'cardiac_t2star_ms', 'hepatic_t2star_ms', 'ferritin', 'ast', 'alt', 'alp', 'sex', 'age'
outcome_col = 'cardiac_t2star_ms'
# Raise explicitly instead of using `assert`: assertions are removed when
# Python runs with -O, which would let a missing column slip through to a
# less helpful error further down.
if outcome_col not in df.columns:
    raise KeyError("Make sure the column 'cardiac_t2star_ms' exists")

# NOTE(review): rows with a missing outcome presumably end up with the literal
# string 'nan' as their severity_bin after .astype(str), and would then be
# treated as their own class when stratifying -- confirm against
# bin_t2star_ms and decide whether such rows should be dropped.
n_missing_outcome = int(df[outcome_col].isna().sum())
if n_missing_outcome:
    print(f"Warning: {n_missing_outcome} row(s) have a missing '{outcome_col}' value")

# Outcome (example: cardiac severity bins)
df['severity_bin'] = bin_t2star_ms(df[outcome_col]).astype(str)

# Add missingness indicators for key labs (only those present in this file)
lab_cols = [c for c in ['ferritin', 'ast', 'alt', 'alp'] if c in df.columns]
df = add_missing_indicators(df, lab_cols)

# === 3) Export cleaned cohort ===
df.to_parquet(PROC / 'cohort_clean.parquet', index=False)

# === 4) Train/valid/test splits (stratified by severity_bin) ===
train_df, val_df, test_df = make_splits(
    df,
    y_col='severity_bin',
    test_size=0.2,
    val_size=0.2,
    random_state=42,  # fixed seed so the splits are reproducible across runs
)
train_df.to_parquet(PROC / 'split_train.parquet', index=False)
val_df.to_parquet(PROC / 'split_valid.parquet', index=False)
test_df.to_parquet(PROC / 'split_test.parquet', index=False)

print('Saved processed splits to', PROC)
