In [24]:
import numpy as np
import pandas as pd

# Input/output CSV locations (relative paths)
RAW_PATH = "../data/hospital_length_of_stay_raw.csv"
CLEAN_PATH = "../data/hospital_length_of_stay_clean.csv"

# Columns that get a "<col>_missing" indicator flag and are then median-imputed
# by clean_los_dataframe. Names are the cleaned (post-rename) column names.
MISSING_FLAG_COLS = [
    "heart_rate_on_admission",
    "systolic_bp",
    "lab_abnormality_count",
]

# Columns that need to be converted to ints for realism
# (rounded, then cast, as the last cleaning step). Post-rename names.
INT_LIKE_COLUMNS = [
    "age",
    "heart_rate_on_admission",
    "systolic_bp",
    "lab_abnormality_count",
    "number_of_comorbidities",
]

# Columns that need to be clipped because of extreme values.
# Maps column name -> (low, high); values outside the range are clipped to the bound.
CLIP_BOUNDS = {
    "heart_rate_on_admission": (30, 220),
    "systolic_bp": (70, 250),
}


# Column renaming (raw -> clean). Applied first, so every other constant in
# this cell refers to the clean names on the right-hand side.
RENAME_MAP = {
    "num_comorbidities": "number_of_comorbidities",
    "heart_rate": "heart_rate_on_admission",
    "lab_abnormalities": "lab_abnormality_count",
    "length_of_stay_days": "length_of_stay",
}



In [25]:
def clean_los_dataframe(
    df: pd.DataFrame,
    rename_map=RENAME_MAP,
    missing_flag_cols=MISSING_FLAG_COLS,
    clip_bounds=CLIP_BOUNDS,
    int_like_columns=INT_LIKE_COLUMNS,
) -> pd.DataFrame:
    """
    Clean the LOS dataset and return a new DataFrame (the input is not mutated).

    Steps, in order:
      0) Rename raw columns to standardized analysis names
      1) Clip extreme physiologic outliers
      2) Create missingness indicator flags
      3) Median-impute the flagged columns
      4) Round/cast selected columns to integers for realism

    All column names in `missing_flag_cols`, `clip_bounds` and
    `int_like_columns` are matched AFTER the rename in step 0. Any listed
    column that is absent from the frame is silently skipped.

    Args:
        df: Raw input frame.
        rename_map: Mapping of raw column name -> clean column name.
        missing_flag_cols: Columns that get a "<col>_missing" 0/1 flag and
            are then filled with their own median.
        clip_bounds: Mapping of column -> (low, high) clipping bounds.
        int_like_columns: Columns to round and cast to an integer dtype.

    Returns:
        The cleaned DataFrame. Int-like columns are plain `int` when they
        hold no missing values; a column that still contains NaN at step 4
        (it was never imputed, or its median was itself NaN) is cast to the
        nullable "Int64" dtype instead, so the missing entries survive as
        <NA> rather than aborting the whole cleaning run.
    """
    df = df.copy()

    # 0) Rename columns to standardized analysis names
    df = df.rename(columns=rename_map)

    # 1) Clip outliers (NaNs pass through unchanged)
    for col, (low, high) in clip_bounds.items():
        if col in df.columns:
            df[col] = df[col].clip(lower=low, upper=high)

    # 2) Missing flags (must happen BEFORE imputation, or the information
    #    about which values were originally absent is lost)
    for col in missing_flag_cols:
        if col in df.columns:
            df[f"{col}_missing"] = df[col].isna().astype(int)

    # 3) Median imputation (only for the flagged columns)
    for col in missing_flag_cols:
        if col in df.columns:
            df[col] = df[col].fillna(df[col].median())

    # 4) Integer realism. Only the flagged columns were imputed above, so an
    #    int-like column outside that list (e.g. age) may still hold NaN;
    #    a plain int cast would raise on those.
    for col in int_like_columns:
        if col in df.columns:
            # Round first so the cast never truncates (e.g. 71.9999 -> 71)
            rounded = df[col].round()
            if rounded.isna().any():
                # Nullable integer dtype keeps the gaps as <NA>
                df[col] = rounded.astype("Int64")
            else:
                df[col] = rounded.astype(int)

    return df


In [26]:
# Load the raw extract and run it through the cleaning pipeline.
df_raw = pd.read_csv(RAW_PATH)
df_clean = clean_los_dataframe(df_raw)

# Sanity checks on the cleaned frame (expected to hold on every run).
# 1) No NaNs may remain in any of the imputed columns.
remaining_nans = int(df_clean[MISSING_FLAG_COLS].isna().to_numpy().sum())
assert remaining_nans == 0, "Imputation failed: still have NaNs in key cols."

# 2) Every missingness indicator must contain only 0 and/or 1.
for col in MISSING_FLAG_COLS:
    observed_values = set(df_clean[f"{col}_missing"].unique())
    assert observed_values <= {0, 1}, f"Missing flag {col}_missing not binary."


In [27]:
# Persist the cleaned frame to CLEAN_PATH as CSV; index=False leaves the
# DataFrame's row index out of the file. Then echo the destination path.
df_clean.to_csv(CLEAN_PATH, index=False)
print(f"Saved cleaned data to: {CLEAN_PATH}")

Saved cleaned data to: ../data/hospital_length_of_stay_clean.csv
