## Notebook Objectives

1. Benchmark hospitals by discharge volume, length of stay (LOS), and total charges
2. Identify high- and low-performing facilities
3. Analyze efficiency using LOS-adjusted charge metrics (charges per day)
4. Highlight inter-hospital variation
5. Produce performance tables for reporting

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from pathlib import Path

# Notebook-wide display settings: show every column when a frame is rendered
# and print floats with thousands separators and two decimals.
for option_name, option_value in (
    ("display.max_columns", None),
    ("display.float_format", "{:,.2f}".format),
):
    pd.set_option(option_name, option_value)

# Use seaborn's "whitegrid" theme for every figure drawn below.
sns.set_style("whitegrid")

In [None]:
# Location of the processed inpatient-discharge CSV, relative to this notebook.
DATA_PATH = Path("../data/processed/hospital_inpatient_discharges_cleaned.csv")

# Read the whole file into memory; every later cell works off this frame.
df = pd.read_csv(filepath_or_buffer=DATA_PATH)

In [None]:
# Show the first five rows as a quick look at the loaded columns.
df.head(n=5)

In [None]:
# Number of discharge rows per hospital, largest first.
discharges_per_hospital = df.groupby("hospital_name").size()
hospital_volume = discharges_per_hospital.sort_values(ascending=False)

# Display the ten highest-volume hospitals.
hospital_volume.head(10)

In [None]:
# Histogram of per-hospital discharge counts (30 bins).
plt.figure(figsize=(8, 4))
sns.histplot(data=hospital_volume, bins=30)
plt.xlabel("Number of Discharges")
plt.ylabel("Number of Hospitals")
plt.title("Distribution of Hospital Discharge Volumes")
plt.show()

In [None]:
# Per-hospital length-of-stay summary: discharge count plus mean and median LOS.
los_summary = df.groupby("hospital_name").agg(
    discharges=pd.NamedAgg(column="hospital_name", aggfunc="count"),
    avg_los=pd.NamedAgg(column="length_of_stay", aggfunc="mean"),
    median_los=pd.NamedAgg(column="length_of_stay", aggfunc="median"),
)

# Keep only hospitals with at least 100 discharges, ordered from the shortest
# average stay to the longest.
hospital_los = los_summary.query("discharges >= 100").sort_values(by="avg_los")

# Display the ten hospitals with the lowest average LOS.
hospital_los.head(10)

In [None]:
# Restrict the discharge rows to the hospitals that passed the volume filter
# used to build hospital_los.
in_los_table = df["hospital_name"].isin(hospital_los.index)
los_plot_data = df[in_los_table]

# One box per hospital showing the spread of length of stay.
plt.figure(figsize=(8, 4))
sns.boxplot(data=los_plot_data, x="hospital_name", y="length_of_stay")
plt.title("Length of Stay Distribution by Hospital")
plt.xlabel("Hospital")
plt.ylabel("Length of Stay (Days)")
plt.xticks(rotation=90)  # vertical hospital names on the x-axis
plt.show()

In [None]:
# Per-hospital charge summary: discharge count plus mean and median total charges.
charge_summary = df.groupby("hospital_name").agg(
    discharges=pd.NamedAgg(column="hospital_name", aggfunc="count"),
    avg_charges=pd.NamedAgg(column="total_charges", aggfunc="mean"),
    median_charges=pd.NamedAgg(column="total_charges", aggfunc="median"),
)

# Keep only hospitals with at least 100 discharges, ordered from the lowest
# average charges to the highest.
hospital_charges = charge_summary.query("discharges >= 100").sort_values(by="avg_charges")

# Display the ten hospitals with the lowest average charges.
hospital_charges.head(10)

In [None]:
# Restrict the discharge rows to the hospitals that passed the volume filter
# used to build hospital_charges.
in_charge_table = df["hospital_name"].isin(hospital_charges.index)
charge_plot_data = df[in_charge_table]

# One box per hospital showing the spread of total charges on a log-scaled y-axis.
plt.figure(figsize=(8, 4))
sns.boxplot(data=charge_plot_data, x="hospital_name", y="total_charges")
plt.yscale("log")
plt.title("Total Charges Distribution by Hospital (Log Scale)")
plt.xlabel("Hospital")
plt.ylabel("Total Charges")
plt.xticks(rotation=90)  # vertical hospital names on the x-axis
plt.show()

In [None]:
# Charges per inpatient day for each discharge.
# Rows whose length of stay is zero or negative are masked to NaN before the
# division: dividing by zero would produce +/-inf, and a single inf makes the
# per-hospital mean of this column inf as well. NaN values are skipped by
# pandas' mean/median instead.
df["charges_per_day"] = df["total_charges"] / (
    df["length_of_stay"].where(df["length_of_stay"] > 0)
)

In [None]:
# Output column -> (source column, aggregation) for the per-hospital
# charges-per-day summary.
efficiency_aggregations = {
    "discharges": ("hospital_name", "count"),
    "avg_charges_per_day": ("charges_per_day", "mean"),
    "median_charges_per_day": ("charges_per_day", "median"),
}
efficiency_summary = df.groupby("hospital_name").agg(**efficiency_aggregations)

# Keep only hospitals with at least 100 discharges, ordered from the lowest
# average charges per day to the highest.
hospital_efficiency = efficiency_summary.query("discharges >= 100").sort_values(
    by="avg_charges_per_day"
)

# Display the ten hospitals with the lowest average charges per day.
hospital_efficiency.head(10)

In [None]:
# Combine the LOS table with the charge columns; the inner join keeps only
# hospitals present in both tables.
charge_columns = hospital_charges[["avg_charges", "median_charges"]]
perf = hospital_los.join(charge_columns, how="inner")

# Standardize each metric across hospitals: (value - mean) / standard deviation.
for metric, z_column in (("avg_los", "los_z"), ("avg_charges", "charges_z")):
    metric_values = perf[metric]
    perf[z_column] = (metric_values - metric_values.mean()) / metric_values.std()

# Composite score: the sign is flipped so that a shorter average stay and lower
# average charges produce a higher score.
perf["performance_score"] = -(perf["los_z"] + perf["charges_z"])

# Display the ten highest-scoring hospitals.
perf.sort_values(by="performance_score", ascending=False).head(10)

In [None]:
# 95th-percentile cut-offs for average LOS and average charges across hospitals.
los_q95 = perf["avg_los"].quantile(0.95)
charge_q95 = perf["avg_charges"].quantile(0.95)

# A hospital is flagged when it is at or above either cut-off.
has_long_stays = perf["avg_los"] >= los_q95
has_high_charges = perf["avg_charges"] >= charge_q95
outliers = perf[has_long_stays | has_high_charges]

outliers

In [None]:
# Directory for the exported tables; created (with parents) if it is missing.
OUTPUT_DIR = Path("../outputs/tables")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# File name -> table to write. Each table is saved as CSV with its index.
tables_to_export = {
    "hospital_los_performance.csv": hospital_los,
    "hospital_charge_performance.csv": hospital_charges,
    "hospital_efficiency_metrics.csv": hospital_efficiency,
    "hospital_combined_performance_scores.csv": perf,
}
for file_name, table in tables_to_export.items():
    table.to_csv(OUTPUT_DIR / file_name)

## Key Hospital Performance Insights

* Substantial variation exists in LOS and charges across hospitals
* High-volume hospitals are not always the most efficient
* Charges per day reveal hidden inefficiencies not visible in total charges
* Composite performance scores help identify consistently high performers
* Outlier hospitals warrant deeper case-mix and process analysis