# 04 — Distribution: baseline vs no-income-tax + rebate (equivalized-AGI deciles: AGI / household size)
Baseline = fed_income_tax + ca_income_tax
Reform   = 0 - rebate_after_phaseout  (income taxes removed; rebate as negative tax)
Checks: decile population shares sum ≈ 100%.


In [1]:
# 04 — Distribution (2024): Baseline vs No income tax + VAT rebate (phase-out)
import os, numpy as np, pandas as pd, importlib.util

# Import the VAT-rebate helper module straight from its file path
# (it is loaded by location rather than through a package import).
vat_path = os.path.abspath("../policy/vat_rebate.py")
spec = importlib.util.spec_from_file_location("vat_rebate", vat_path)
vr = importlib.util.module_from_spec(spec)
spec.loader.exec_module(vr)

# Make sure the output directory exists before anything is written to it
os.makedirs("../outputs/vat", exist_ok=True)

# Locate the Step 01 panel: the Parquet file is preferred, CSV is the fallback
parq = "../intermediate/ca_panel_2024.parquet"
csv  = "../intermediate/ca_panel_2024.csv"
panel_path = next((p for p in (parq, csv) if os.path.exists(p)), None)
if panel_path is None:
    raise FileNotFoundError("Missing panel; run Step 01.")

# Pick the reader that matches the file that was found
if panel_path.endswith(".parquet"):
    df = pd.read_parquet(panel_path)
else:
    df = pd.read_csv(panel_path)

# Normalize the household weight into a numeric "weight" column.
# An existing "weight" column is used as-is; otherwise the first column whose
# lower-cased name is one of the known aliases is taken as the source.
if "weight" in df.columns:
    _weight_source = "weight"
else:
    wcol = [c for c in df.columns if c.lower() in ("household_weight","weight","hh_weight")]
    if not wcol:
        raise KeyError("No weight column found.")
    _weight_source = wcol[0]
# Non-numeric entries are coerced to NaN and then counted as zero weight
df["weight"] = pd.to_numeric(df[_weight_source], errors="coerce").fillna(0.0)

# Columns the rest of this cell reads (expected to come from Step 01)
need = ["household_agi","household_size","fed_income_tax","ca_income_tax",
        "consumption_allowance","rebate_after_phaseout"]
missing = [c for c in need if c not in df.columns]
if missing:
    # Only the allowance / phase-out columns can be rebuilt here, and only
    # when the helper inputs size_bucket and is_married_couple are present.
    recompute = {"consumption_allowance", "rebate_after_phaseout"}.intersection(missing)
    if recompute:
        if not {"size_bucket","is_married_couple"}.issubset(df.columns):
            raise KeyError(f"Need size_bucket + is_married_couple to compute allowance/phaseout; missing: {missing}")
        # Allowance first, then phase-out (same order as before)
        if "consumption_allowance" not in df:
            df = vr.compute_allowance(df)
        if "rebate_after_phaseout" not in df:
            df = vr.apply_phaseout(df)
    # Re-check: anything still absent cannot be recovered in this step
    missing = [c for c in need if c not in df.columns]
    if missing:
        raise KeyError(f"Still missing required columns: {missing}")

# Build baseline and reform tax burdens (one value per household row).
# Baseline burden = federal + California income tax; missing values count as 0.
baseline_tax = df["fed_income_tax"].astype(float).fillna(0.0) + df["ca_income_tax"].astype(float).fillna(0.0)
# Reform removes income taxes and pays the rebate (negative burden).
# A missing rebate is treated as 0, mirroring the baseline handling. Without
# the fill, a NaN rebate makes (reform - baseline) * weight NaN for that row,
# so the household silently drops out of total_change while still being
# counted in the weighted means.
reform_tax = - df["rebate_after_phaseout"].astype(float).fillna(0.0)

# Grouping: equivalized income deciles (AGI / household_size).
# Household size is floored at 1 so the division never shrinks the divisor
# below one person (and never divides by zero).
df["equiv_income"] = df["household_agi"].astype(float) / np.maximum(df["household_size"].astype(float), 1.0)
# NOTE(review): add_weighted_deciles lives in the external vat_rebate helper;
# the code below expects it to return the frame with a "decile" column added
# whose values can be parsed as integers — confirm against the helper.
df = vr.add_weighted_deciles(df, income_col="equiv_income", weight_col="weight", label="decile")

# Weighted percentile cutoffs for the Top 5% and Top 1% groups.
# Rows are ordered by equivalized income and weights are accumulated; each
# cutoff is the first income at which the cumulative weight reaches the
# requested fraction of the total weight.
s = df[["equiv_income","weight"]].sort_values("equiv_income").reset_index(drop=True)
cw = s["weight"].cumsum()
tot = s["weight"].sum()
p95, p99 = (s.loc[cw >= q*tot, "equiv_income"].iloc[0] for q in (0.95, 0.99))

# 0/1 membership flags: a household is in a top group when its equivalized
# income is at or above the corresponding cutoff.
df["top_5pct"] = df["equiv_income"].ge(p95).astype(int)
df["top_1pct"] = df["equiv_income"].ge(p99).astype(int)

def wmean(x, w):
    """Return the weighted mean of ``x`` using weights ``w``.

    Both arguments are cast to float via ``.astype(float)``. The result is
    ``sum(x * w) / sum(w)`` as a Python float, or NaN when the total weight
    is not strictly positive.

    Note: for pandas Series inputs, NaN products are skipped by ``sum()`` in
    the numerator, but the weights of those rows still count in the
    denominator.
    """
    values = x.astype(float)
    weights = w.astype(float)
    total_weight = weights.sum()
    if total_weight > 0:
        return float((values * weights).sum() / total_weight)
    return np.nan

def _group_row(group_label, members):
    """Summarize one household group as a row of the distribution table.

    ``members`` is the slice of ``df`` belonging to the group. Tax burdens are
    looked up by the slice's index, means are weight-averaged with ``wmean``,
    and ``pop_share`` is the group's weight as a percentage of all weight.
    """
    weights = members["weight"]
    group_baseline = baseline_tax.loc[members.index]
    group_reform = reform_tax.loc[members.index]
    mean_baseline = wmean(group_baseline, weights)
    mean_reform = wmean(group_reform, weights)
    weighted_change = (group_reform - group_baseline) * weights
    return {
        "year": 2024,
        "group": group_label,
        "mean_tax_baseline": mean_baseline,
        "mean_tax_reform":   mean_reform,
        "mean_change":       mean_reform - mean_baseline,
        "total_change":      float(weighted_change.sum()),
        "pop_share":         float(100.0 * weights.sum()/df["weight"].sum()),
    }


rows = []

# Deciles 1..10, visited in numeric order of their labels
for d in sorted(df["decile"].dropna().unique(), key=lambda z: int(str(z))):
    rows.append(_group_row(f"decile_{int(str(d))}", df[df["decile"] == d]))

# Top 5% and Top 1% (these overlap the deciles; they are extra summary rows)
for label in ("top_5pct", "top_1pct"):
    rows.append(_group_row(label, df[df[label].eq(1)]))

dist = pd.DataFrame(rows)

# Share of total change (within deciles, so it sums to ~100 across deciles).
# The denominator uses decile rows only; the top_5pct / top_1pct rows overlap
# the deciles and would double-count if included.
dec_mask = dist["group"].str.startswith("decile_")
total_delta_deciles = dist.loc[dec_mask, "total_change"].sum()
if total_delta_deciles != 0:
    dist["share_of_total_change"] = 100.0 * dist["total_change"] / total_delta_deciles
else:
    # No net change across deciles: shares are undefined, so report NaN
    # instead of the inf / NaN mix a division by zero would produce.
    dist["share_of_total_change"] = np.nan

# ✅ checks — run BEFORE writing so a failed check never leaves a bad CSV behind
assert np.isclose(dist.loc[dec_mask, "pop_share"].sum(), 100.0, atol=0.2), "Decile pop shares should sum to ~100%"

out = "../outputs/vat/distribution_2024.csv"
dist.to_csv(out, index=False)
print("✅ wrote", out)
print(dist.head(12).to_string(index=False))


✅ wrote ../outputs/vat/distribution_2024.csv
 year     group  mean_tax_baseline  mean_tax_reform    mean_change  total_change  pop_share  share_of_total_change
 2024  decile_1       -2872.822256    -25898.613447  -23025.791190 -4.352185e+10  11.656511               5.144342
 2024  decile_2       -3991.946569    -33852.590773  -29860.644204 -4.684509e+10   9.674770               5.537152
 2024  decile_3       -2409.696325    -25475.567102  -23065.870776 -3.436186e+10   9.187187               4.061618
 2024  decile_4        1268.987260    -32546.189595  -33815.176855 -5.448043e+10   9.935842               6.439660
 2024  decile_5        3402.169135    -26887.577367  -30289.746502 -4.700649e+10   9.570574               5.556231
 2024  decile_6        7719.960270    -22429.338136  -30149.298406 -5.217881e+10  10.673153               6.167605
 2024  decile_7        7858.732386    -26433.754074  -34292.486460 -5.212744e+10   9.374394               6.161534
 2024  decile_8       22872.541237 