In [1]:
# === CA household counts by size (Single vs Married=MFJ only), 2024 ===
import os, numpy as np, pandas as pd
from policyengine_us import Microsimulation

period = 2024
sim = Microsimulation()

# -------- Helper: try multiple variable names, force map_to='household' --------
def calc_first(sim, period, candidates, *, decode_enums=True, required=False, default=None):
    """
    Return the first available variable mapped to household.
    Uses sim.calculate(...) (PolicyEngine API). 
    """
    for var in candidates:
        try:
            return pd.Series(
                sim.calculate(var, map_to="household", period=period, decode_enums=decode_enums)
            )
        except Exception:
            pass
    if required:
        raise ValueError(f"None of these variables exist (household-mapped): {candidates}")
    return default

# -------- 1) Pull columns (robust candidates) --------
# State can be code ("CA"), name ("California"), or FIPS (6). We'll normalize later.
state_any = calc_first(
    sim, period,
    candidates=["state_code","state_name","state_abbr","state_fips","household_state","state"],
    decode_enums=True,  # strings are fine if available
    required=True
)

hh_size = calc_first(sim, period, ["household_size","hh_size","household_members"], decode_enums=False, required=True)
fstat   = calc_first(sim, period, ["filing_status","tax_unit_filing_status","filingstatus"], decode_enums=True, required=True)
weight  = calc_first(sim, period, ["household_weight","hh_weight","weight"], decode_enums=False, required=False, default=None)

# If no explicit household weight, fall back to 1s (unweighted)
if weight is None:
    weight = pd.Series(np.ones(len(hh_size), dtype=float))
else:
    weight = weight.astype(float)

# -------- 2) Build DataFrame --------
df = pd.DataFrame({
    "state_any": state_any,
    "hh_size":   pd.to_numeric(hh_size, errors="coerce"),
    "fstat_raw": fstat.astype(str).str.strip(),
    "weight":    weight
})

# Guard: drop any rows missing hh_size/weight
df = df[df["hh_size"].notna() & df["weight"].notna()].reset_index(drop=True)
df["hh_size"] = df["hh_size"].round().astype(int)

# -------- 3) California filter (accept code, name, or FIPS) --------
def is_california(series):
    s = series.astype(str).str.strip().str.upper()
    return (
        s.eq("CA") | s.eq("CALIFORNIA") |
        s.eq("06") | s.eq("6")           # FIPS 06/6
    )

df_ca = df.loc[is_california(df["state_any"])].copy()
assert len(df_ca) > 0, "No California households found — check state variable decoding."

# -------- 4) Married vs Single grouping --------
# IMPORTANT: Married Households = MFJ only.
# We detect MFJ by strings containing 'JOINT' or 'MFJ'. Everything else => Single Households.
fs_upper = df_ca["fstat_raw"].str.upper()
df_ca["group"] = np.where(
    fs_upper.str.contains("JOINT", regex=False) | fs_upper.str.contains("MFJ", regex=False),
    "Married Households",
    "Single Households"
)

# Diagnostics: see raw values so we can adjust if needed
print("Unique filing_status values (first 20):", sorted(df_ca["fstat_raw"].dropna().unique().tolist()[:20]))

# -------- 5) Size buckets: 1..6 and "7 or more" --------
df_ca["size_bucket"] = np.where(df_ca["hh_size"] >= 7, "7 or more", df_ca["hh_size"].astype(str))

# -------- 6) Weighted counts (in thousands) --------
agg = (
    df_ca.groupby(["group","size_bucket"], as_index=False)["weight"]
         .sum()
         .rename(columns={"weight":"num_hh_000s"})
)
agg["num_hh_000s"] = agg["num_hh_000s"] / 1_000

# -------- 7) Order rows to match your layout --------
size_order = ["1","2","3","4","5","6","7 or more"]
order_map  = {s:i for i,s in enumerate(size_order)}

def ordered(group_name):
    g = agg[agg["group"] == group_name].copy()
    g["__ord"] = g["size_bucket"].map(order_map)
    return g.sort_values("__ord").drop(columns="__ord")

single_tbl  = ordered("Single Households")
married_tbl = ordered("Married Households")

# -------- 8) Attach consumption allowance and totals (your constants) --------
# Single: sizes 1..7; Married: sizes 2..7 (7+ capped at 7 allowance)
# We'll load from your policy/vat_rebate.py constants to avoid drift.
import importlib.util
vat_path = os.path.abspath("../policy/vat_rebate.py")
spec = importlib.util.spec_from_file_location("vat_rebate", vat_path)
vr = importlib.util.module_from_spec(spec); spec.loader.exec_module(vr)

def allowance_for(status, size_str):
    size = 7 if (size_str == "7 or more") else int(size_str)
    if status == "Single Households":
        return float(vr.ALLOW_SINGLE.get(size, vr.ALLOW_SINGLE[7]))
    else:
        return float(vr.ALLOW_MARRIED.get(size, vr.ALLOW_MARRIED[7]))

def attach_allowance(tbl, status):
    tbl = tbl.copy()
    tbl["Consumption Allowance ($)"] = tbl["size_bucket"].map(lambda s: allowance_for(status, s))
    tbl["Total Tax-Exempt Consumption ($1,000's)"] = (tbl["num_hh_000s"] * tbl["Consumption Allowance ($)"]).astype(float)
    return tbl

single_tbl  = attach_allowance(single_tbl,  "Single Households")
married_tbl = attach_allowance(married_tbl, "Married Households")

# -------- 9) Subtotals and grand total --------
single_subtotal_hh   = single_tbl["num_hh_000s"].sum()
single_subtotal_exmp = single_tbl["Total Tax-Exempt Consumption ($1,000's)"].sum()

married_subtotal_hh   = married_tbl["num_hh_000s"].sum()
married_subtotal_exmp = married_tbl["Total Tax-Exempt Consumption ($1,000's)"].sum()

grand_total_exmp = single_subtotal_exmp + married_subtotal_exmp

# -------- 10) Assemble presentation table --------
rows = []
rows.append({"Section":"Single Households", "Household Size":"", "Number of Households (1,000's)":"", "Consumption Allowance ($)":"", "Total Tax-Exempt Consumption ($1,000's)":""})
rows += [
    {
        "Section":"", 
        "Household Size": s, 
        "Number of Households (1,000's)": round(v_hh, 0),
        "Consumption Allowance ($)": v_allow, 
        "Total Tax-Exempt Consumption ($1,000's)": round(v_tot, 0),
    }
    for s, v_hh, v_allow, v_tot in zip(
        single_tbl["size_bucket"], single_tbl["num_hh_000s"], single_tbl["Consumption Allowance ($)"], single_tbl["Total Tax-Exempt Consumption ($1,000's)"]
    )
]
rows.append({"Section":"", "Household Size":"(1) Subtotal", "Number of Households (1,000's)": round(single_subtotal_hh, 0), "Consumption Allowance ($)":"", "Total Tax-Exempt Consumption ($1,000's)": round(single_subtotal_exmp, 0)})

rows.append({"Section":"Married Households", "Household Size":"", "Number of Households (1,000's)":"", "Consumption Allowance ($)":"", "Total Tax-Exempt Consumption ($1,000's)":""
})
rows += [
    {
        "Section":"", 
        "Household Size": s, 
        "Number of Households (1,000's)": round(v_hh, 0),
        "Consumption Allowance ($)": v_allow, 
        "Total Tax-Exempt Consumption ($1,000's)": round(v_tot, 0),
    }
    for s, v_hh, v_allow, v_tot in zip(
        married_tbl["size_bucket"], married_tbl["num_hh_000s"], married_tbl["Consumption Allowance ($)"], married_tbl["Total Tax-Exempt Consumption ($1,000's)"]
    )
]
rows.append({"Section":"", "Household Size":"(2) Subtotal", "Number of Households (1,000's)": round(married_subtotal_hh, 0), "Consumption Allowance ($)":"", "Total Tax-Exempt Consumption ($1,000's)": round(married_subtotal_exmp, 0)})

rows.append({
    "Section":"", 
    "Household Size":"Total Tax-Exempt Consumption Expenditure (X) = (1) + (2)",
    "Number of Households (1,000's)": round(single_subtotal_hh + married_subtotal_hh, 0),
    "Consumption Allowance ($)":"", 
    "Total Tax-Exempt Consumption ($1,000's)": round(grand_total_exmp, 0)
})

final_table = pd.DataFrame(rows)

# -------- 11) Diagnostics & save --------
print("\nWeighted households (thousands) by group×size:")
print((df_ca.groupby(["group","size_bucket"])["weight"].sum()/1_000).unstack(0).fillna(0).round(1).to_string())

pd.set_option("display.float_format", lambda x: f"{x:,.0f}")
print("\n=== Final table ===\n")
print(final_table)

os.makedirs("../outputs/vat", exist_ok=True)
out_path = "../outputs/vat/allowance_table_by_size_CA_2024.csv"
final_table.to_csv(out_path, index=False)
print(f"\nSaved: {out_path}")

# -------- 12) Sanity checks --------
# A) Every row classified
assert df_ca["group"].isin(["Single Households","Married Households"]).all()

# B) Size totals reconcile with split
size_totals = df_ca.groupby("size_bucket")["weight"].sum().sort_index()
split_totals = (
    df_ca.loc[df_ca["group"]=="Single Households"].groupby("size_bucket")["weight"].sum()
    + df_ca.loc[df_ca["group"]=="Married Households"].groupby("size_bucket")["weight"].sum()
).reindex(size_totals.index).fillna(0.0)
assert np.allclose(size_totals.values, split_totals.values), "Size totals != Single+Married split."

print("✅ Checks passed.")


  from .autonotebook import tqdm as notebook_tqdm


ValueError: None of these variables exist (household-mapped): ['filing_status', 'tax_unit_filing_status', 'filingstatus']