# 05 — Off-model sales tax inputs
Produces decile totals of AGI, wages, disposable-income proxy, and weights for 2024 & 2025.


In [1]:
# 05 — Sales-tax off-model inputs (2024)
import os, numpy as np, pandas as pd

os.makedirs("../outputs/vat", exist_ok=True)

parq = "../intermediate/ca_panel_2024.parquet"
csv  = "../intermediate/ca_panel_2024.csv"
panel_path = parq if os.path.exists(parq) else (csv if os.path.exists(csv) else None)
if panel_path is None:
    raise FileNotFoundError("Missing panel; run Step 01.")
df = pd.read_parquet(panel_path) if panel_path.endswith(".parquet") else pd.read_csv(panel_path)

# Normalize weight
if "weight" not in df.columns:
    wcol = [c for c in df.columns if c.lower() in ("household_weight","weight","hh_weight")]
    if not wcol:
        raise KeyError("No weight column found.")
    df["weight"] = pd.to_numeric(df[wcol[0]], errors="coerce").fillna(0.0)
else:
    df["weight"] = pd.to_numeric(df["weight"], errors="coerce").fillna(0.0)

# Equivalized income deciles (AGI / household_size)
df["equiv_income"] = df["household_agi"].astype(float) / np.maximum(df["household_size"].astype(float), 1.0)

# Minimal weighted decile helper (no dependency on module)
def add_weighted_deciles_local(df, income_col, weight_col, label="decile"):
    x = df[income_col].astype(float)
    w = df[weight_col].astype(float)
    s = pd.DataFrame({income_col: x, weight_col: w}).sort_values(income_col).reset_index(drop=True)
    v = s[income_col].to_numpy(float)
    ww = s[weight_col].to_numpy(float)
    if len(ww) == 0 or ww.sum() <= 0:
        df[label] = 1
        return df
    cw = np.cumsum(ww); total = cw[-1]
    cuts = [total*k/10 for k in range(1,10)]
    idxs = np.searchsorted(cw, cuts, side="left")
    edges = [-np.inf] + [v[min(i, len(v)-1)] for i in idxs] + [np.inf]
    df[label] = pd.cut(x, bins=edges, labels=range(1,11), include_lowest=True)
    return df

df = add_weighted_deciles_local(df, "equiv_income", "weight", "decile")

# Build the table
by_dec = df.groupby("decile").apply(
    lambda g: pd.Series({
        "households_weighted": g["weight"].sum(),
        "agi_sum": g["household_agi"].sum(),
        "wages_sum": g["employment_income"].clip(lower=0).sum(),
        # handy proxies:
        "consumption_allowance_sum": g.get("consumption_allowance", pd.Series(0.0, index=g.index)).sum(),
        "rebate_after_phaseout_sum": g.get("rebate_after_phaseout", pd.Series(0.0, index=g.index)).sum(),
    })
).reset_index()

out = "../outputs/vat/sales_tax_inputs_2024.csv"
by_dec.to_csv(out, index=False)

# ✅ checks
assert by_dec["households_weighted"].sum() > 0
print("✅ wrote", out)
print(by_dec.head().to_string(index=False))


✅ wrote ../outputs/vat/sales_tax_inputs_2024.csv
decile  households_weighted      agi_sum    wages_sum  consumption_allowance_sum  rebate_after_phaseout_sum
     1         1.890135e+06 6.264963e+05 5.208819e+05                  4640780.0               4.640780e+06
     2         1.568790e+06 1.659327e+06 1.706502e+06                  2192280.0               2.192280e+06
     3         1.489727e+06 7.642576e+06 7.431737e+06                  6429660.0               6.390954e+06
     4         1.611124e+06 7.349458e+06 6.539761e+06                  4059700.0               3.792341e+06
     5         1.551894e+06 2.177773e+07 1.853373e+07                  8678680.0               7.381687e+06


  by_dec = df.groupby("decile").apply(
  by_dec = df.groupby("decile").apply(
