# 00 — Repo audit & config

This notebook confirms that the required columns can be computed at the household level for 2024 and writes their names to `config/columns.yaml` for downstream steps.

In [1]:
# 00 — Repo audit & config (2024 only)
import os, sys, yaml, numpy as np, pandas as pd
from policyengine_us import Microsimulation

print("Step 00 start.")

# Period passed to every calculation in this notebook.
YEAR = 2024
# Single simulation instance, read as a global by the helper functions below.
sim = Microsimulation()

def try_household(var, decode=True):
    """Attempt to calculate ``var`` mapped to the household entity.

    Uses the module-level ``sim`` and ``YEAR``.

    Args:
        var: Name of the variable to calculate.
        decode: Passed through to ``sim.calculate`` as ``decode_enums``.

    Returns:
        ``(True, pandas.Series)`` when the calculation succeeds, otherwise
        ``(False, message)`` where ``message`` is ``str()`` of the exception.
    """
    try:
        values = sim.calculate(
            var,
            map_to="household",
            period=YEAR,
            decode_enums=decode,
        )
        outcome = (True, pd.Series(values))
    except Exception as err:
        # Deliberately broad: any failure simply means "not usable here",
        # and the caller decides what to do with the message.
        outcome = (False, str(err))
    return outcome

def pick_first(candidates, *, decode=True, required=True):
    """Return the first candidate that ``try_household`` can calculate.

    Candidates are probed in the order given, and one status line is printed
    per probe (success or failure).

    Args:
        candidates: Iterable of variable names to try, in priority order.
        decode: Forwarded to ``try_household``.
        required: When true, exhausting the candidates is an error.

    Returns:
        ``(name, series)`` for the first candidate that succeeds, or
        ``(None, None)`` when none succeed and ``required`` is false.

    Raises:
        KeyError: If no candidate succeeds and ``required`` is true.
    """
    for name in candidates:
        found, payload = try_household(name, decode=decode)
        if not found:
            # On failure ``payload`` is the error message.
            print(f"  · {name} unavailable ({payload})")
            continue
        print(f"  ✓ {name} (household) len={len(payload)}")
        return name, payload

    if not required:
        return None, None
    raise KeyError(f"None available: {candidates}")

print("\nDetecting columns for household entity…")

# For each required quantity, candidate names are listed in priority order.
# pick_first keeps the first one that calculates at household level and, since
# `required` is left at its default, raises KeyError when none of them do.
agi_var, agi_s = pick_first(
    ["adjusted_gross_income", "household_agi", "agi_household", "agi_hh", "agi"],
    decode=False,
)
wage_var, wage_s = pick_first(
    ["employment_income", "wages", "wage_income", "labor_income"],
    decode=False,
)
size_var, size_s = pick_first(
    ["household_size", "hh_size", "household_members", "family_size"],
    decode=False,
)
wt_var, wt_s = pick_first(
    ["household_weight", "hh_weight", "weight", "marsupwt", "asec_weight"],
    decode=False,
)
fed_var, fed_s = pick_first(["income_tax"], decode=False)
st_var, st_s = pick_first(["ca_income_tax"], decode=False)

# Filing status is often not household-mapped. Record which candidate names do
# calculate at household level, purely for reference; later steps do NOT
# depend on it.
fs_candidates = ["filing_status","tax_unit_filing_status","filingstatus"]
fs_avail = [
    name
    for name in fs_candidates
    if try_household(name, decode=True)[0]  # keep only successful probes
]
if len(fs_avail) == 0:
    # Fall back to the canonical key so downstream code has a column name to write to.
    fs_avail = ["filing_status"]
print("\nFiling status candidates (record only):", fs_avail)

# Basic CA sample check: state_code must calculate per household, otherwise
# there is no way to select the California sample and we stop immediately.
ok_state, state_s = try_household("state_code", decode=True)
if not ok_state:
    # On failure state_s holds the error message rather than a Series.
    raise RuntimeError(f"state_code not available at household level: {state_s}")

# Compare case-insensitively on the string form of the state codes.
state_upper = state_s.astype(str).str.upper()
mask_ca = state_upper.eq("CA")
ca_count = int(mask_ca.sum())
print("CA households (raw, 2024):", ca_count)

# Collect the detected series into one small frame for a visual sanity check.
# Columns are keyed by the detected variable names, in the order listed here.
df0 = pd.DataFrame(
    dict(
        [
            (agi_var, agi_s),
            (wage_var, wage_s),
            (size_var, size_s),
            (wt_var, wt_s),
            (fed_var, fed_s),
            (st_var, st_s),
        ]
    )
)
print("\nSample rows (any state):")
print(df0.head(3))

# Quick checks — run BEFORE anything is written to disk, so that a failed
# audit can never produce a columns.yaml for downstream steps to pick up.
assert mask_ca.any(), "No CA households found with state_code=='CA'."
assert df0[agi_var].notna().any(), "AGI appears all missing."
assert df0[wage_var].notna().any(), "Wages appear all missing."

# Write config/columns.yaml (the path is relative to the current working directory).
os.makedirs("../config", exist_ok=True)
# Maps the logical names used by later steps to the variable names detected above.
col_map = {
    "agi": agi_var,
    "wages": wage_var,
    "hh_size": size_var,
    "weight": wt_var,
    "fed_tax": fed_var,
    "state_tax": st_var,
    "filing_status": fs_avail[0],  # will be overwritten in Step 01 with derived statuses
}
# Explicit encoding keeps the written file independent of the platform default.
with open("../config/columns.yaml", "w", encoding="utf-8") as f:
    # sort_keys=False preserves the key order of col_map in the YAML output.
    yaml.safe_dump(col_map, f, sort_keys=False)

print("\nWrote ../config/columns.yaml:")
print(col_map)
print("\n✅ Step 00 complete.")


  from .autonotebook import tqdm as notebook_tqdm


Step 00 start.

Detecting columns for household entity…
  ✓ adjusted_gross_income (household) len=21251
  ✓ employment_income (household) len=21251
  ✓ household_size (household) len=21251
  ✓ household_weight (household) len=21251
  ✓ income_tax (household) len=21251
  ✓ ca_income_tax (household) len=21251

Filing status candidates (record only): ['filing_status']
CA households (raw, 2024): 1777

Sample rows (any state):
   adjusted_gross_income  employment_income  household_size  household_weight  \
0          107805.242188        4022.857178               2      24047.990234   
1           85387.771484       92190.474609               3      13475.582031   
2           23692.609901           0.000000               2        186.740341   

    income_tax  ca_income_tax  
0  8968.628906            0.0  
1  4438.604889            0.0  
2   188.025589            0.0  

Wrote ../config/columns.yaml:
{'agi': 'adjusted_gross_income', 'wages': 'employment_income', 'hh_size': 'household_size'