## Notebook Objectives

1. Examine length of stay (LOS) and total charges by demographic group
2. Identify potential disparities in utilization and cost
3. Quantify outcome differences across age and gender
4. Provide equity-relevant insights for policy and planning

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from pathlib import Path
from scipy import stats

# Notebook-wide rendering preferences.
# Never truncate the column list when a DataFrame is displayed.
pd.options.display.max_columns = None
# Floats are shown with thousands separators and two decimals.
pd.options.display.float_format = "{:,.2f}".format

# Use seaborn's "whitegrid" theme for every figure drawn below.
sns.set_style(style="whitegrid")

In [None]:
# Input CSV. The path is relative, so it resolves against the current
# working directory of the notebook kernel.
DATA_PATH = Path("../data/processed/hospital_inpatient_discharges_cleaned.csv")

# Load the whole file into `df`, the table the remaining cells operate on.
df = pd.read_csv(filepath_or_buffer=DATA_PATH)

In [None]:
# Preview the first five rows as a quick check of the loaded table.
df.head(n=5)

In [None]:
# Demographic fields this notebook looks for in the dataset.
demographic_cols = ["age", "gender", "race", "ethnicity"]

# Cell output: the subset of those fields actually present in `df`,
# in the order listed above.
present_columns = set(df.columns)
[name for name in demographic_cols if name in present_columns]

In [None]:
# Derive a categorical `age_group` column from numeric `age`.
if "age" in df.columns:
    # Bin edges are right-inclusive: [0, 17], (17, 35], (35, 50], ...
    bins = [0, 17, 35, 50, 65, 80, 120]
    # NOTE(review): with these edges an age of exactly 80 falls in "66–80",
    # so the last band effectively covers ages above 80.
    labels = ["0–17", "18–35", "36–50", "51–65", "66–80", "80+"]

    # include_lowest=True closes the first interval on the left. Without it
    # the first bin is (0, 17], so age 0 would be assigned NaN and silently
    # dropped from every age-group analysis below.
    df["age_group"] = pd.cut(
        df["age"],
        bins=bins,
        labels=labels,
        include_lowest=True,
    )

In [None]:
# Descriptive statistics of length of stay for each age group.
#
# The table is bound to a name and repeated as the cell's final top-level
# expression: Jupyter only auto-renders a trailing top-level expression, so
# a bare expression nested inside `if` is evaluated and then discarded.
los_by_age_summary = None
if "age_group" in df.columns:
    # observed=False spells out the behaviour this cell has always relied
    # on (every defined category gets a row) instead of depending on the
    # implicit default; it only matters when `age_group` is categorical.
    los_by_age_summary = (
        df.groupby("age_group", observed=False)["length_of_stay"].describe()
    )
los_by_age_summary

In [None]:
# Box plot: distribution of length of stay within each age group.
if "age_group" in df.columns:
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.boxplot(data=df, x="age_group", y="length_of_stay", ax=ax)
    ax.set_title("Length of Stay by Age Group")
    ax.set_xlabel("Age Group")
    ax.set_ylabel("Length of Stay (Days)")
    plt.show()

In [None]:
# Descriptive statistics of total charges for each age group.
#
# The result is stored and then emitted as the cell's last top-level
# expression; an expression nested inside `if` is not rendered by Jupyter.
charges_by_age_summary = None
if "age_group" in df.columns:
    # observed=False makes the "list every defined category" behaviour
    # explicit rather than relying on the implicit default; it only matters
    # when `age_group` is categorical.
    charges_by_age_summary = (
        df.groupby("age_group", observed=False)["total_charges"].describe()
    )
charges_by_age_summary

In [None]:
# Box plot: total charges within each age group, on a log-scaled y-axis.
if "age_group" in df.columns:
    plt.figure(figsize=(8, 4))
    ax = sns.boxplot(data=df, x="age_group", y="total_charges")
    ax.set_yscale("log")
    ax.set(
        title="Total Charges by Age Group (Log Scale)",
        xlabel="Age Group",
        ylabel="Total Charges",
    )
    plt.show()

In [None]:
# Descriptive statistics of length of stay for each gender value.
#
# Bound to a name and repeated on the last line so the notebook actually
# renders it; an expression nested inside `if` produces no cell output.
los_by_gender_summary = None
if "gender" in df.columns:
    los_by_gender_summary = df.groupby("gender")["length_of_stay"].describe()
los_by_gender_summary

In [None]:
# Box plot: distribution of length of stay for each gender value.
if "gender" in df.columns:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.boxplot(data=df, x="gender", y="length_of_stay", ax=ax)
    ax.set_title("Length of Stay by Gender")
    ax.set_xlabel("Gender")
    ax.set_ylabel("Length of Stay (Days)")
    plt.show()

In [None]:
# Descriptive statistics of total charges for each gender value.
#
# Bound to a name and repeated on the last line so the notebook actually
# renders it; an expression nested inside `if` produces no cell output.
charges_by_gender_summary = None
if "gender" in df.columns:
    charges_by_gender_summary = df.groupby("gender")["total_charges"].describe()
charges_by_gender_summary

In [None]:
# Box plot: total charges for each gender value, on a log-scaled y-axis.
if "gender" in df.columns:
    plt.figure(figsize=(6, 4))
    ax = sns.boxplot(data=df, x="gender", y="total_charges")
    ax.set_yscale("log")
    ax.set(
        title="Total Charges by Gender (Log Scale)",
        xlabel="Gender",
        ylabel="Total Charges",
    )
    plt.show()

In [None]:
# One-way ANOVA of length of stay across age groups.
#
# The result is kept in `anova_result` and emitted as the cell's last
# expression; previously the call's return value was thrown away.
anova_result = None
if "age_group" in df.columns:
    # Drop missing labels first: `.unique()` would otherwise include NaN,
    # and `age_group == NaN` matches no rows, feeding an empty sample
    # into the test.
    age_labels = df["age_group"].dropna().unique()
    los_groups = [
        df.loc[df["age_group"] == label, "length_of_stay"].dropna()
        for label in age_labels
    ]
    # Ignore groups left with no usable length-of-stay values.
    los_groups = [sample for sample in los_groups if len(sample) > 0]

    # f_oneway needs at least two samples to compare.
    if len(los_groups) >= 2:
        anova_result = stats.f_oneway(*los_groups)
    else:
        print("Skipping ANOVA: fewer than two non-empty age groups.")
anova_result

In [None]:
# Two-sample t-test of length of stay between the two gender categories.
#
# The result is kept in `ttest_result` and emitted as the cell's last
# expression; previously the call's return value was thrown away.
ttest_result = None
if "gender" in df.columns:
    groups = df["gender"].dropna().unique()

    # The test compares exactly two samples, so it only runs when the
    # column holds exactly two distinct non-missing values.
    if len(groups) == 2:
        g1 = df.loc[df["gender"] == groups[0], "length_of_stay"]
        g2 = df.loc[df["gender"] == groups[1], "length_of_stay"]
        # nan_policy="omit" ignores missing length-of-stay values.
        ttest_result = stats.ttest_ind(g1, g2, nan_policy="omit")
    else:
        # Say why nothing was computed instead of skipping silently.
        print(
            f"Skipping t-test: expected 2 gender categories, found {len(groups)}."
        )
ttest_result

In [None]:
# Cut-offs: 95th percentile of charges, 90th percentile of length of stay.
charge_cutoff = df["total_charges"].quantile(0.95)
los_cutoff = df["length_of_stay"].quantile(0.90)

# Boolean flags for rows at or above each cut-off.
df["high_cost_case"] = df["total_charges"].ge(charge_cutoff)
df["prolonged_los"] = df["length_of_stay"].ge(los_cutoff)

In [None]:
# Share of high-cost vs. other cases within each age group
# (normalize="index" makes every row sum to 1).
#
# Stored and repeated on the last line so the notebook renders the table;
# an expression nested inside `if` produces no cell output.
high_cost_share_by_age = None
if "age_group" in df.columns:
    high_cost_share_by_age = pd.crosstab(
        df["age_group"], df["high_cost_case"], normalize="index"
    )
high_cost_share_by_age

In [None]:
# Share of prolonged-stay vs. other cases within each age group
# (normalize="index" makes every row sum to 1).
#
# Stored and repeated on the last line so the notebook renders the table;
# an expression nested inside `if` produces no cell output.
prolonged_los_share_by_age = None
if "age_group" in df.columns:
    prolonged_los_share_by_age = pd.crosstab(
        df["age_group"], df["prolonged_los"], normalize="index"
    )
prolonged_los_share_by_age

In [None]:
# Age group x gender breakdown: mean length of stay, median charges, and
# the number of rows with a non-missing length of stay.
#
# `intersectional` is repeated as the cell's last top-level expression so
# the notebook renders it; placed inside `if` it produced no output.
intersectional = None
if {"age_group", "gender"}.issubset(df.columns):
    intersectional = (
        # observed=False makes the current behaviour explicit (all category
        # combinations are listed) instead of relying on the implicit
        # default; it only matters when a grouping column is categorical.
        df.groupby(["age_group", "gender"], observed=False)
          .agg(
              avg_los=("length_of_stay", "mean"),
              median_charges=("total_charges", "median"),
              cases=("length_of_stay", "count"),
          )
    )
intersectional

In [None]:
# Write per-group mean length of stay and mean total charges to CSV.
OUTPUT_DIR = Path("../outputs/tables")
# Create the directory (and any missing parents); no error if it exists.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Grouping column -> output file name. Columns absent from `df` are skipped.
equity_exports = {
    "age_group": "age_group_equity_summary.csv",
    "gender": "gender_equity_summary.csv",
}

for group_col, file_name in equity_exports.items():
    if group_col not in df.columns:
        continue
    (
        # observed=False pins the existing output shape (one row per defined
        # category) instead of relying on the implicit default; it has no
        # effect on non-categorical grouping columns.
        df.groupby(group_col, observed=False)[["length_of_stay", "total_charges"]]
        .mean()
        .to_csv(OUTPUT_DIR / file_name)
    )

## Key Insights

* Older age groups consistently exhibit **longer LOS and higher charges**
* Cost and LOS distributions differ by gender, though effect sizes may vary
* High-cost and prolonged LOS cases are **unevenly distributed across age groups**
* Intersectional analysis (age group × gender) reveals compounding effects (e.g., among elderly males and elderly females)
* Results highlight the importance of **age-adjusted and equity-aware planning**