In [20]:
import numpy as np
import pandas as pd

# --- File locations (relative paths) ---------------------------------------
RAW_PATH = "../data/hospital_length_of_stay_raw.csv"
CLEAN_PATH = "../data/hospital_length_of_stay_clean.csv"

# --- Column groups -----------------------------------------------------------
# Columns that receive a "<col>_missing" indicator and are median-imputed.
MISSING_FLAG_COLS = [
    "heart_rate",
    "systolic_bp",
    "lab_abnormalities",
]

# Columns that are rounded and cast to an integer dtype during cleaning.
INT_LIKE_COLUMNS = [
    "age",
    "heart_rate", "systolic_bp", "lab_abnormalities",
    "num_comorbidities",
]

# --- Plausibility limits -----------------------------------------------------
# column name -> (low, high); values outside the pair are clipped to it.
CLIP_BOUNDS = dict(
    heart_rate=(30, 220),
    systolic_bp=(70, 250),
)


In [21]:
def clean_los_dataframe(
    df: pd.DataFrame,
    missing_flag_cols=MISSING_FLAG_COLS,
    clip_bounds=CLIP_BOUNDS,
    int_like_columns=INT_LIKE_COLUMNS,
) -> pd.DataFrame:
    """
    Clean the LOS dataset and return a new DataFrame (the input is not mutated).

    Steps, in order:
      1) Clip each column named in ``clip_bounds`` to its (low, high) limits.
      2) Add a 0/1 ``<col>_missing`` indicator for each column in
         ``missing_flag_cols`` (computed before imputation).
      3) Fill NaNs in those same columns with the column median
         (the median of the already-clipped values).
      4) Round each column in ``int_like_columns`` and cast it to an
         integer dtype.

    Columns named in any argument but absent from ``df`` are skipped.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame; a copy is made before any change.
    missing_flag_cols : iterable of str
        Columns that get a missingness indicator and median imputation.
    clip_bounds : dict
        Maps a column name to a ``(low, high)`` pair used for clipping.
    int_like_columns : iterable of str
        Columns to round and cast to an integer dtype.

    Returns
    -------
    pd.DataFrame
        The cleaned copy, with one extra ``<col>_missing`` column per
        flagged column that exists in ``df``.

    Notes
    -----
    Only ``missing_flag_cols`` are imputed. A column in ``int_like_columns``
    can therefore still contain NaNs at step 4, either because it is not
    imputed at all or because it is entirely NaN and has no median. Such a
    column is cast to pandas' nullable ``Int64`` dtype (NaN becomes
    ``pd.NA``) instead of raising on the integer cast. Columns without NaNs
    are cast to plain ``int`` exactly as before.
    """
    df = df.copy()

    # 1) Clip outliers (NaNs pass through unchanged)
    for col, (low, high) in clip_bounds.items():
        if col in df.columns:
            df[col] = df[col].clip(lower=low, upper=high)

    # 2) Missing flags (must happen BEFORE imputation, otherwise the
    #    information about which values were missing is lost)
    for col in missing_flag_cols:
        if col in df.columns:
            df[f"{col}_missing"] = df[col].isna().astype(int)

    # 3) Median imputation
    for col in missing_flag_cols:
        if col in df.columns:
            fill_value = df[col].median()
            # An all-NaN (or empty) column has a NaN median: nothing to fill.
            if pd.notna(fill_value):
                df[col] = df[col].fillna(fill_value)

    # 4) Integer realism
    for col in int_like_columns:
        if col in df.columns:
            # Round first so the cast does not truncate toward zero.
            rounded = df[col].round()
            if rounded.isna().any():
                # NaNs cannot be stored in a plain int column; use the
                # nullable integer dtype so missing values are preserved.
                df[col] = rounded.astype("Int64")
            else:
                df[col] = rounded.astype(int)

    return df


In [None]:
# Load the raw extract and run it through the cleaning function.
df_raw = pd.read_csv(RAW_PATH)
df_clean = clean_los_dataframe(df_raw)

# Sanity checks on the cleaned frame.
# (a) None of the imputed columns may still contain a NaN.
assert not df_clean[MISSING_FLAG_COLS].isna().any().any(), "Imputation failed: still have NaNs in key cols."

# (b) Every indicator column may only hold the values 0 and 1.
for col in MISSING_FLAG_COLS:
    assert set(df_clean[f"{col}_missing"].unique()) <= {0, 1}, f"Missing flag {col}_missing not binary."


heart_rate           0.1444
systolic_bp          0.1256
lab_abnormalities    0.1096
dtype: float64

heart_rate_missing           0.1444
systolic_bp_missing          0.1256
lab_abnormalities_missing    0.1096
dtype: float64

Unnamed: 0,age,num_comorbidities,heart_rate,systolic_bp,lab_abnormalities,length_of_stay_days,heart_rate_missing,systolic_bp_missing,lab_abnormalities_missing
count,2500.0,2500.0,2500.0,2500.0,2500.0,2500.0,2500.0,2500.0,2500.0
mean,65.6704,2.2456,87.652,131.3268,1.5196,11.951959,0.1444,0.1256,0.1096
std,15.922863,1.513604,20.663109,20.980946,1.21285,5.232723,0.351565,0.331464,0.312453
min,14.0,0.0,30.0,70.0,0.0,3.350352,0.0,0.0,0.0
25%,55.0,1.0,77.0,119.0,1.0,9.166354,0.0,0.0,0.0
50%,66.0,2.0,87.0,132.0,1.0,11.037407,0.0,0.0,0.0
75%,77.0,3.0,96.0,144.0,2.0,13.444791,0.0,0.0,0.0
max,126.0,9.0,220.0,217.0,7.0,63.007113,1.0,1.0,1.0


In [23]:
# Write the cleaned frame to disk; index=False leaves the row index out of the file.
df_clean.to_csv(path_or_buf=CLEAN_PATH, index=False)
print(f"Saved cleaned data to: {CLEAN_PATH}")

Saved cleaned data to: ../data/hospital_length_of_stay_clean.csv
