In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the input CSV; the relative path is resolved against the current
# working directory.
df = pd.read_csv("nss2025.csv")

# ---- Distinct-provider count per CAH3 subject ----
# Build the summary in explicit steps: group on the (code, subject) pair,
# count unique values of "provider" in each group, then order the rows from
# the largest count to the smallest.
# NOTE(review): the result is called `cah1_summary` although the grouping
# keys are the CAH3 columns; the name is kept so that any other cell which
# refers to it keeps working.
subject_groups = df.groupby(["cah3_code", "cah3_subject"], as_index=False)
provider_counts = subject_groups.agg(num_providers=("provider", "nunique"))
cah1_summary = provider_counts.sort_values(by="num_providers", ascending=False)

# ---- Rank-vs-count scatter ----
plt.figure(figsize=(20, 10))

# X is the 0-based position of each row in the sorted summary (its rank);
# Y is that row's provider count.
subject_rank = range(len(cah1_summary))
plt.scatter(subject_rank, cah1_summary["num_providers"], marker="x")

plt.title("CAH3 Subjects Sorted by Number of Providers")
plt.xlabel("CAH3 Subject (sorted by provider count)")
plt.ylabel("Number of Providers")

# Faint grid so the points stay the dominant visual element.
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
