## Notebook Objectives

1. Load the cleaned dataset
2. Generate descriptive statistics
3. Analyze distributions of key variables
4. Examine the relationship between length of stay (LOS) and total charges
5. Produce reusable summary tables and figures

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from pathlib import Path

# Notebook-wide display settings: never truncate wide frames, and render
# floats with thousands separators and two decimal places.
pd.options.display.max_columns = None
pd.options.display.float_format = "{:,.2f}".format

# Seaborn theme applied to every figure drawn after this cell.
sns.set_style(style="whitegrid")

In [None]:
# Relative path to the cleaned CSV; it is resolved against the current
# working directory when the file is opened.
DATA_PATH = Path("../data/processed/hospital_inpatient_discharges_cleaned.csv")
# Load the whole file into one DataFrame using pandas' default parsing options.
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first rows (pandas' default of five) as a quick sanity check.
df.head()

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

In [None]:
# Print the column listing with non-null counts and dtypes, plus memory usage.
df.info()

In [None]:
# Descriptive statistics from describe(), flipped so that each variable is a
# row and each statistic is a column.
df.describe().T

In [None]:
# Total number of rows in the dataset.
len(df)

In [None]:
# Number of rows per discharge year, sorted by year.
#
# The result is bound at top level and left as the cell's final expression:
# Jupyter only renders a trailing top-level expression, so an expression
# nested inside an `if` body is evaluated and then silently discarded.
year_counts = (
    df["discharge_year"].value_counts().sort_index()
    if "discharge_year" in df.columns
    else None  # column absent: the cell renders nothing
)
year_counts

In [None]:
# Bar chart of the number of rows per discharge year; skipped entirely when
# the column is absent.
if "discharge_year" in df.columns:
    yearly_counts = df["discharge_year"].value_counts().sort_index()

    plt.figure(figsize=(8, 4))
    yearly_counts.plot(kind="bar")
    # The title says "Discharges" so it agrees with the y-axis label and with
    # the `discharge_year` column being counted (it previously read
    # "Admissions").
    plt.title("Discharges by Year")
    plt.xlabel("Year")
    plt.ylabel("Number of Discharges")
    plt.show()

In [None]:
# Descriptive statistics for length of stay.
#
# Computed with a conditional expression and echoed as the final top-level
# expression so that Jupyter actually renders it; an expression inside an
# `if` body produces no cell output.
los_summary = (
    df["length_of_stay"].describe()
    if "length_of_stay" in df.columns
    else None  # column absent: the cell renders nothing
)
los_summary

In [None]:
# Histogram of length of stay (30 bins) with a KDE curve overlaid.
los_values = df["length_of_stay"]

fig, ax = plt.subplots(figsize=(8, 4))
sns.histplot(los_values, bins=30, kde=True, ax=ax)
ax.set_title("Length of Stay Distribution")
ax.set_xlabel("Days")
ax.set_ylabel("Frequency")
plt.show()

In [None]:
# Quartiles plus the upper-tail percentiles (90th, 95th, 99th) of length of stay.
los_quantile_levels = [0.25, 0.5, 0.75, 0.9, 0.95, 0.99]
df["length_of_stay"].quantile(los_quantile_levels)

In [None]:
# Descriptive statistics for total charges.
#
# Computed with a conditional expression and echoed as the final top-level
# expression so that Jupyter actually renders it; an expression inside an
# `if` body produces no cell output.
charges_summary = (
    df["total_charges"].describe()
    if "total_charges" in df.columns
    else None  # column absent: the cell renders nothing
)
charges_summary

In [None]:
# Histogram (30 bins, KDE overlaid) of total charges on a log(1 + x) scale.
log_charges = np.log1p(df["total_charges"])

fig, ax = plt.subplots(figsize=(8, 4))
sns.histplot(log_charges, bins=30, kde=True, ax=ax)
ax.set_title("Log-Transformed Total Charges Distribution")
ax.set_xlabel("log(1 + Total Charges)")
ax.set_ylabel("Frequency")
plt.show()

In [None]:
# Quartiles plus the upper-tail percentiles (90th, 95th, 99th) of total charges.
charge_quantile_levels = [0.25, 0.5, 0.75, 0.9, 0.95, 0.99]
df["total_charges"].quantile(charge_quantile_levels)

In [None]:
# Plot at most 5,000 randomly drawn rows; the fixed seed keeps the sample
# identical across runs.
n_points = min(5000, len(df))
sample_df = df.sample(n=n_points, random_state=42)

# Semi-transparent markers so that overlapping points remain distinguishable.
fig, ax = plt.subplots(figsize=(6, 5))
sns.scatterplot(
    data=sample_df,
    x="length_of_stay",
    y="total_charges",
    alpha=0.4,
    ax=ax,
)
ax.set_title("Length of Stay vs Total Charges")
ax.set_xlabel("Length of Stay (days)")
ax.set_ylabel("Total Charges")
plt.show()

In [None]:
# Pairwise correlation matrix (pandas' default method) for the two variables.
los_charge_cols = ["length_of_stay", "total_charges"]
df[los_charge_cols].corr()

In [None]:
# Columns holding text values.
# "string" is requested alongside "object" because pandas can store text in a
# dedicated string dtype, which an object-only filter may not select.
# NOTE(review): confirm against the pandas version pinned for this project.
categorical_cols = df.select_dtypes(include=["object", "string"]).columns

# Print the ten most frequent values of each text column.
for col in categorical_cols:
    top_values = df[col].value_counts().head(10)
    print(f"\nTop values for: {col}")
    print(top_values)

In [None]:
# Per-hospital summary: discharge count, mean length of stay and median total
# charges, ordered from the busiest hospital down.
#
# `hospital_summary` is only defined when the `hospital_name` column exists.
# The ten-row preview is echoed as the final top-level expression so that
# Jupyter renders it; an expression inside an `if` body is never displayed.
if "hospital_name" in df.columns:
    hospital_summary = (
        df.groupby("hospital_name")
          .agg(
              discharges=("hospital_name", "count"),
              avg_los=("length_of_stay", "mean"),
              median_charges=("total_charges", "median"),
          )
          .sort_values("discharges", ascending=False)
    )
    top_hospitals = hospital_summary.head(10)
else:
    top_hospitals = None  # column absent: the cell renders nothing

top_hospitals

In [None]:
# Write the summary tables to disk, creating the output folder (and any
# missing parents) if it does not exist yet.
OUTPUT_DIR = Path("../outputs/tables")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

numeric_stats_path = OUTPUT_DIR / "numeric_summary_statistics.csv"
df.describe().to_csv(numeric_stats_path)

# The hospital-level table is written only when the dataset has a
# `hospital_name` column.
if "hospital_name" in df.columns:
    hospital_table_path = OUTPUT_DIR / "hospital_level_summary.csv"
    hospital_summary.to_csv(hospital_table_path)

## Key Insights

* LOS is **right-skewed**, with most admissions being short stays
* Hospital charges show **extreme right skew**, justifying log transformation
* Strong positive relationship between LOS and total charges
* A small percentage of cases account for a large share of total costs
* Discharge volumes may vary significantly by year and facility