In [1]:
# 01 — Data prep CA (2024; household-level; MFJ via spouse/HOH; exclude AGI<0)
import os, yaml, numpy as np, pandas as pd, importlib.util
from policyengine_us import Microsimulation

print("Step 01 start.")

# Load vat_rebate helpers by executing the module directly from its file path.
vat_path = os.path.abspath("../policy/vat_rebate.py")
spec = importlib.util.spec_from_file_location("vat_rebate", vat_path)
vr = importlib.util.module_from_spec(spec)
spec.loader.exec_module(vr)
print("Loaded:", vr.__file__)

# Load column mapping (short logical key -> variable name passed to hcalc).
# The encoding is pinned so the read does not depend on the platform's
# locale default (e.g. cp1252 on Windows).
with open("../config/columns.yaml", encoding="utf-8") as f:
    col_map = yaml.safe_load(f)
print("col_map:", col_map)

# Make sure the output folder exists, then create the simulation that every
# later calculation reads from, and fix the period used for all of them.
os.makedirs("../intermediate", exist_ok=True)
sim = Microsimulation()
YEAR = 2024

def hcalc(var, decode_enums=True):
    """Calculate `var` for YEAR at household level and return it as a Series.

    Args:
        var: Name of the variable to pass to `sim.calculate`.
        decode_enums: Forwarded unchanged to `sim.calculate`.

    Returns:
        A `pd.Series` wrapping whatever `sim.calculate` returns when called
        with `map_to="household"` and `period=YEAR`.
    """
    values = sim.calculate(
        var,
        map_to="household",
        period=YEAR,
        decode_enums=decode_enums,
    )
    return pd.Series(values)

# 1) Pull household-level arrays we need
# State codes are normalised to stripped, upper-case strings so the later
# comparison against "CA" is exact.
state_code = hcalc("state_code", decode_enums=True)
state_code = state_code.astype(str).str.strip().str.upper()

# The remaining variables are looked up through col_map, in this order.
household_size, household_weight, agi, wages, fed_tax, state_tax = (
    hcalc(col_map[_key], decode_enums=False)
    for _key in ("hh_size", "weight", "agi", "wages", "fed_tax", "state_tax")
)

# Household-level spouse/HOH
def try_household(var, decode=False):
    """Best-effort household-level lookup of `var`; never raises.

    Delegates to `hcalc` so the period and household mapping stay defined in
    one place.

    Args:
        var: Name of the variable to calculate.
        decode: Passed to `hcalc` as `decode_enums`.

    Returns:
        The `pd.Series` produced by `hcalc`, or `None` when the calculation
        raised. Callers treat `None` as "signal not available" and fall back
        to the next candidate.
    """
    try:
        return hcalc(var, decode_enums=decode)
    except Exception as exc:
        # Deliberately broad: which exception an unavailable variable raises
        # is not visible here, and a missing optional signal must not abort
        # the run. Report it rather than swallowing it silently, so the log
        # shows why a fallback signal was used.
        print(f"[info] household variable {var!r} unavailable ({type(exc).__name__}); skipping")
        return None

# Candidate spouse / head-of-household signals; each is None when its
# variable could not be calculated.
has_spouse, spouse_pres, spouse_count, hoh_elig = (
    try_household(_name, decode=False)
    for _name in (
        "has_spouse",
        "spouse_present",
        "head_spouse_count",
        "head_of_household_eligible",
    )
)

# 2) Build CA DataFrame
# Every column except state_code is coerced to numeric; values that cannot be
# parsed become NaN.
df = pd.DataFrame({
    "state_code": state_code,
    **{
        _col: pd.to_numeric(_vals, errors="coerce")
        for _col, _vals in (
            ("household_size", household_size),
            ("household_weight", household_weight),
            ("household_agi", agi),
            ("employment_income", wages),
            ("fed_income_tax", fed_tax),
            ("ca_income_tax", state_tax),
        )
    },
})

# Keep California rows only. mask_ca is indexed like the unfiltered frame and
# is reused later to filter the spouse/HOH series the same way.
mask_ca = df["state_code"] == "CA"
df = df[mask_ca].reset_index(drop=True)
print("CA households (raw):", len(df))

# 3) Align spouse/HOH to df (they were also household-mapped, so lengths should match)
def align_to_df(s):
    """Filter a household-level series to the CA rows kept in `df`.

    Args:
        s: A household-level series, or `None` when the signal is unavailable.

    Returns:
        `None` if `s` is `None`. Otherwise the values coerced to numeric
        (unparseable entries become NaN), restricted to rows where `mask_ca`
        is true, with the index reset to 0..n-1.
    """
    if s is None:
        return None
    numeric = pd.to_numeric(pd.Series(s), errors="coerce")
    ca_rows = numeric.loc[mask_ca]
    return ca_rows.reset_index(drop=True)

# Apply the same CA filter to every spouse/HOH signal (None stays None).
has_spouse, spouse_pres, spouse_count, hoh_elig = map(
    align_to_df,
    (has_spouse, spouse_pres, spouse_count, hoh_elig),
)

# 4) Derive filing_status: HOH ⇒ single; else spouse ⇒ mfj; else single.
# The first spouse signal that was successfully calculated wins, in the order
# has_spouse, spouse_present, head_spouse_count. Missing values count as
# "no spouse".
if has_spouse is not None:
    source_used = "has_spouse"
    spouse_any = has_spouse.fillna(0).astype(bool)
elif spouse_pres is not None:
    source_used = "spouse_present"
    spouse_any = spouse_pres.fillna(0).astype(bool)
elif spouse_count is not None:
    uniq = np.sort(spouse_count.dropna().unique())
    if uniq.size == 0 or uniq.max() < 2:
        # No value reaches 2, so any positive count is read as a spouse flag.
        source_used = "head_spouse_count>0"
        spouse_any = spouse_count.fillna(0) > 0
    else:
        # Values reach 2, so the count is read as head + spouse.
        source_used = "head_spouse_count>=2"
        spouse_any = spouse_count.fillna(0) >= 2
else:
    source_used = "no_spouse_signal"
    spouse_any = pd.Series(False, index=df.index)

# Head-of-household eligibility; treated as False everywhere when unavailable.
if hoh_elig is None:
    hoh_any = pd.Series(False, index=df.index)
else:
    hoh_any = hoh_elig.fillna(0) > 0

# "mfj" only when a spouse is present and the household is not HOH-eligible;
# every other combination is "single".
filing_status = np.where(spouse_any.to_numpy() & ~hoh_any.to_numpy(), "mfj", "single")
df["filing_status"] = filing_status.astype(str)
df["is_married_couple"] = df["filing_status"].str.lower().eq("mfj").astype(int)

print(f"[info] spouse signal used: {source_used}")
print("filing_status counts:", df["filing_status"].value_counts().to_dict())

# 5) Size bucket and exclude negative AGI
# Missing sizes default to 1; sizes are rounded to whole persons.
df["household_size"] = df["household_size"].fillna(1).round().astype(int)
# Bucket = household size limited to the range 1..7 (7 covers "7 or more").
df["size_bucket"] = df["household_size"].clip(lower=1, upper=7).astype(int)

# Keep rows with AGI >= 0. Rows whose AGI is NaN fail the comparison and are
# dropped (and counted) together with the negative ones.
before = len(df)
df = df[df["household_agi"].ge(0)].reset_index(drop=True)
print("Excluded negative-AGI households:", before - len(df))

# 6) Compute allowance + phaseout
# compute_allowance runs first and its result feeds apply_phaseout. The two
# columns read below (consumption_allowance, rebate_after_phaseout) are
# expected to be present on the returned frame.
df = vr.apply_phaseout(vr.compute_allowance(df))

# Back-compat cols: copies of the two result columns under their older names.
df["allowance_no_phaseout"] = df["consumption_allowance"]
df["allowance_phaseout"] = df["rebate_after_phaseout"]

# 7) Save intermediate
# Parquet is the preferred format; if writing it fails for any reason (for
# example no parquet engine installed) the same frame is written as CSV.
os.makedirs("../intermediate", exist_ok=True)
parq = "../intermediate/ca_panel_2024.parquet"
csv = "../intermediate/ca_panel_2024.csv"
try:
    df.to_parquet(parq, index=False)
except Exception as e:
    print("parquet save failed; writing CSV:", e)
    df.to_csv(csv, index=False)
    print("saved", csv, "rows:", len(df))
else:
    print("saved", parq, "rows:", len(df))

# 8) Sanity print: Singles should appear at sizes > 1
# Sum household weights (missing weight counts as 0) per size bucket and
# marital label, pivot the label into columns, and report in thousands.
w = df["household_weight"].fillna(0.0)
_status = np.where(df["is_married_couple"].eq(1), "Married", "Single")
tab = w.groupby([df["size_bucket"], _status]).sum().unstack(1).fillna(0.0)
tab = (tab / 1_000).round(1)
tab.index.name = "size_bucket"
print("\nWeighted CA households (thousands) by size × status:")
print(tab.to_string())

print("\n✅ Step 01 complete.")


  from .autonotebook import tqdm as notebook_tqdm


Step 01 start.
Loaded: c:\Users\Ali.Melad\Dropbox\Ali Work\Kyle\California VAT\policy_engile_cali_v2\policy\vat_rebate.py
col_map: {'agi': 'adjusted_gross_income', 'wages': 'employment_income', 'hh_size': 'household_size', 'weight': 'household_weight', 'fed_tax': 'income_tax', 'state_tax': 'ca_income_tax', 'filing_status': 'filing_status'}
CA households (raw): 1777
[info] spouse signal used: head_spouse_count>=2
filing_status counts: {'mfj': 1131, 'single': 646}
Excluded negative-AGI households: 30
parquet save failed; writing CSV: Unable to find a usable engine; tried using: 'pyarrow', 'fastparquet'.
A suitable version of pyarrow or fastparquet is required for parquet support.
Trying to import the above resulted in these errors:
 - Missing optional dependency 'pyarrow'. pyarrow is required for parquet support. Use pip or conda to install pyarrow.
 - Missing optional dependency 'fastparquet'. fastparquet is required for parquet support. Use pip or conda to install fastparquet.
saved ..