## Notebook Objectives

1. Analyze the overall length-of-stay (LOS) distribution
2. Examine LOS by demographic groups
3. Evaluate LOS by diagnosis and hospital
4. Identify drivers of prolonged hospitalization
5. Generate operationally relevant insights

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from pathlib import Path
from scipy import stats

# Display preferences: show every column, and render floats with
# thousands separators and two decimal places.
pd.options.display.max_columns = None
pd.options.display.float_format = "{:,.2f}".format

# Use seaborn's "whitegrid" theme for the figures drawn below.
sns.set_style("whitegrid")

In [None]:
# Relative location of the processed discharge extract.
DATA_PATH = Path("../data/processed/hospital_inpatient_discharges_cleaned.csv")

# Read the CSV into the working DataFrame used by every later cell.
df = pd.read_csv(filepath_or_buffer=DATA_PATH)

In [None]:
# Preview the first five rows of the loaded data.
df.iloc[:5]

In [None]:
# Summary statistics for the length-of-stay column.
df.loc[:, "length_of_stay"].describe()

In [None]:
# Histogram (30 bins) with a KDE overlay of length of stay.
fig, ax = plt.subplots(figsize=(8, 4))
sns.histplot(df["length_of_stay"], bins=30, kde=True, ax=ax)
ax.set_title("Overall Length of Stay Distribution")
ax.set_xlabel("Days")
ax.set_ylabel("Frequency")
plt.show()

In [None]:
# Bucket ages into labelled bands (only when an "age" column is present).
if "age" in df.columns:
    bins = [0, 17, 35, 50, 65, 80, 120]
    labels = ["0–17", "18–35", "36–50", "51–65", "66–80", "80+"]

    # pd.cut intervals are right-closed and, by default, exclude the left
    # edge of the first bin, which would turn age 0 (newborns) into NaN.
    # include_lowest=True keeps age 0 in the "0–17" band.
    # Ages that are missing or above 120 still map to NaN.
    df["age_group"] = pd.cut(
        df["age"],
        bins=bins,
        labels=labels,
        include_lowest=True,
    )

In [None]:
# Length-of-stay summary statistics per age band.
if "age_group" in df.columns:
    # An expression nested inside an `if` body is never rendered as cell
    # output, so print the table explicitly. observed=False keeps every
    # age band in the result when the column is categorical and avoids
    # pandas' FutureWarning about the changing default.
    print(
        df.groupby("age_group", observed=False)["length_of_stay"].describe()
    )

In [None]:
# Box plot of length of stay for each age band.
if "age_group" in df.columns:
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.boxplot(x="age_group", y="length_of_stay", data=df, ax=ax)
    ax.set_title("Length of Stay by Age Group")
    ax.set_xlabel("Age Group")
    ax.set_ylabel("Length of Stay (Days)")
    plt.show()

In [None]:
# Length-of-stay summary statistics per gender value.
if "gender" in df.columns:
    # An expression nested inside an `if` body is never rendered as cell
    # output, so print the table explicitly.
    print(df.groupby("gender")["length_of_stay"].describe())

In [None]:
# Box plot of length of stay for each gender value.
if "gender" in df.columns:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.boxplot(x="gender", y="length_of_stay", data=df, ax=ax)
    ax.set_title("Length of Stay by Gender")
    ax.set_xlabel("Gender")
    ax.set_ylabel("Length of Stay (Days)")
    plt.show()

In [None]:
# The ten most frequent principal diagnoses (value_counts sorts by
# descending frequency, so the first ten index entries are the top ten).
if "principal_diagnosis" in df.columns:
    dx_counts = df["principal_diagnosis"].value_counts()
    top_dx = dx_counts.index[:10]

In [None]:
# Mean length of stay for each of the top diagnoses, longest first.
# Relies on `top_dx` from the preceding cell.
if "principal_diagnosis" in df.columns:
    dx_los = (
        df[df["principal_diagnosis"].isin(top_dx)]
        .groupby("principal_diagnosis")["length_of_stay"]
        .mean()
        .sort_values(ascending=False)
    )
    # A bare name inside an `if` body is never rendered as cell output,
    # so print the result explicitly.
    print(dx_los)

In [None]:
# Bar chart of the per-diagnosis mean length of stay held in `dx_los`.
if "principal_diagnosis" in df.columns:
    fig, ax = plt.subplots(figsize=(10, 4))
    dx_los.plot(kind="bar", ax=ax)
    ax.set_title("Average Length of Stay by Top Diagnoses")
    ax.set_xlabel("Diagnosis")
    ax.set_ylabel("Average LOS (Days)")
    plt.show()

In [None]:
# Per-hospital mean/median length of stay and discharge count, limited to
# hospitals with at least 100 discharges and ordered by mean LOS (highest
# first).
if "hospital_name" in df.columns:
    hospital_los = (
        df.groupby("hospital_name")
          .agg(
              avg_los=("length_of_stay", "mean"),
              median_los=("length_of_stay", "median"),
              discharges=("hospital_name", "count")
          )
          .query("discharges >= 100")
          .sort_values("avg_los", ascending=False)
    )

    # An expression nested inside an `if` body is never rendered as cell
    # output, so print the ten longest-stay hospitals explicitly.
    print(hospital_los.head(10))

In [None]:
# 90th percentile of length of stay; used as the prolonged-stay cut-off.
los_threshold = df.loc[:, "length_of_stay"].quantile(q=0.90)
los_threshold

In [None]:
# Flag stays whose length is at or above the cut-off.
df["prolonged_los"] = df["length_of_stay"].ge(los_threshold)

In [None]:
# Share of stays flagged / not flagged as prolonged.
df.loc[:, "prolonged_los"].value_counts(normalize=True)

In [None]:
# Row-normalised share of prolonged vs. non-prolonged stays per age band.
if "age_group" in df.columns:
    # An expression nested inside an `if` body is never rendered as cell
    # output, so print the table explicitly.
    print(pd.crosstab(df["age_group"], df["prolonged_los"], normalize="index"))

In [None]:
# Two-sample t-test of length of stay between rows coded "M" and "F".
# NOTE(review): assumes gender is coded exactly "M"/"F" — confirm against
# the cleaned data; other codings would leave both samples empty.
if "gender" in df.columns:
    male_los = df[df["gender"] == "M"]["length_of_stay"]
    female_los = df[df["gender"] == "F"]["length_of_stay"]

    # An expression nested inside an `if` body is never rendered as cell
    # output, so print the test result explicitly. NaNs are ignored.
    print(stats.ttest_ind(male_los, female_los, nan_policy="omit"))

In [None]:
# One-way ANOVA of length of stay across age bands.
if "age_group" in df.columns:
    # Build one sample per age band that actually occurs. groupby skips
    # rows whose age_group is NaN; iterating over .unique() would include
    # NaN, and `== NaN` matches nothing, yielding an empty sample that
    # makes f_oneway return NaN.
    groups = [
        los.dropna()
        for _, los in df.groupby("age_group", observed=True)["length_of_stay"]
    ]
    # Drop bands left empty once missing length-of-stay values are removed.
    groups = [g for g in groups if len(g) > 0]

    # f_oneway requires at least two samples. Print explicitly because an
    # expression nested inside an `if` body is never rendered as output.
    if len(groups) >= 2:
        print(stats.f_oneway(*groups))
    else:
        print("ANOVA skipped: fewer than two non-empty age groups")

In [None]:
# Correlation of every int64/float64 column with length of stay,
# strongest positive correlation first.
numeric_cols = df.select_dtypes(include=["int64", "float64"]).columns
(
    df.loc[:, numeric_cols]
    .corr()
    .loc[:, "length_of_stay"]
    .sort_values(ascending=False)
)

## Key Insights

* LOS distribution is **heavily right-skewed**
* Older age groups tend to have **longer hospital stays**
* Certain diagnoses are consistently associated with prolonged LOS
* Hospital-level variation suggests **process and efficiency differences**
* Top 10% of stays consume a disproportionate share of bed-days