In [2]:
import os, pandas as pd, numpy as np, importlib.util

# load vat_rebate + mapping
vat_path = os.path.abspath("../policy/vat_rebate.py")
spec = importlib.util.spec_from_file_location("vat_rebate", vat_path)
vr = importlib.util.module_from_spec(spec); spec.loader.exec_module(vr)

col_map = {}
with open("../config/columns.yaml") as f:
    for line in f:
        k,v=line.strip().split(": ")
        col_map[k]=v

def load_intermediate(year: int) -> pd.DataFrame:
    p = f"../intermediate/ca_panel_{year}.parquet"
    c = f"../intermediate/ca_panel_{year}.csv"
    if os.path.exists(p): return pd.read_parquet(p)
    if os.path.exists(c): return pd.read_csv(c)
    raise FileNotFoundError(f"Missing both {p} and {c}")

df = load_intermediate(2024).copy()

# See raw strings and weighted counts
fs_raw = df[col_map["filing_status"]].astype(str).str.strip().str.lower()
w = df[col_map["weight"]].astype(float)

print("Unique filing_status values (sample):")
print(fs_raw.drop_duplicates().head(20).tolist())

print("\nWeighted counts by filing_status string:")
print((w.groupby(fs_raw).sum()/1_000).round(1).sort_values(ascending=False).to_string())  # in thousands



Unique filing_status values (sample):
['single', 'married']

Weighted counts by filing_status string:
filing_status
married   11,573
single     5,081


In [9]:
from policyengine_us import Microsimulation
import pandas as pd

# Initialize simulation
sim = Microsimulation()
tbs = sim.tax_benefit_system

# Build catalog of variables with metadata
rows = []
for name, var in tbs.variables.items():
    rows.append({
        "name": name,
        "entity": var.entity.key,
        "label": getattr(var, "label", ""),
        "definition_period": getattr(var, "definition_period", "")
    })

df_vars = pd.DataFrame(rows)

# Save catalog to CSV
output_path = "policyengine_variables_catalog.csv"
df_vars.to_csv(output_path, index=False)

print(f"✅ Wrote {output_path} with {len(df_vars)} variables")
print(df_vars.head(20))


✅ Wrote policyengine_variables_catalog.csv with 3217 variables
                          name    entity  \
0   bonus_guaranteed_deduction  tax_unit   
1                  taxsim_age1  tax_unit   
2                  taxsim_age2  tax_unit   
3                  taxsim_age3  tax_unit   
4             taxsim_childcare  tax_unit   
5                 taxsim_dep13  tax_unit   
6                 taxsim_dep17  tax_unit   
7                 taxsim_dep18  tax_unit   
8                  taxsim_depx  tax_unit   
9             taxsim_dividends  tax_unit   
10               taxsim_fiitax  tax_unit   
11                 taxsim_gssi  tax_unit   
12               taxsim_intrec  tax_unit   
13                 taxsim_ltcg  tax_unit   
14                taxsim_mstat  tax_unit   
15                 taxsim_page  tax_unit   
16              taxsim_pbusinc  tax_unit   
17             taxsim_pensions  tax_unit   
18             taxsim_pprofinc  tax_unit   
19                taxsim_psemp  tax_unit   

            

In [None]:
# --- CA household counts by size (Single vs Married)
import pandas as pd
import numpy as np
from policyengine_us import Microsimulation

period = 2024
sim = Microsimulation()

# Helper to try multiple variable names 
def calc_first(sim, period, candidates, *, decode_enums=True, required=False, default=None):
    for var in candidates:
        try:
            return sim.calc(var, period, decode_enums=decode_enums)
        except Exception:
            pass
    if required:
        raise ValueError(f"None of these variables exist: {candidates}")
    return default

# 1) Pull columns (try common alternatives)
state_series = calc_first(
    sim, period,
    candidates=[
        "state_code", "state_name", "state_abbr",
        "state_fips", "household_state", "state"  # include 'state' last just in case
    ],
    decode_enums=True,  # strings are fine for our filter
    required=True
)

hh_size = calc_first(sim, period, ["household_size", "hh_size", "household_members"], required=True)
fstat   = calc_first(sim, period, ["filing_status", "tax_unit_filing_status", "filingstatus"], required=True)
weight  = calc_first(sim, period, ["household_weight", "hh_weight", "weight"], required=False, default=None)

# If there's no explicit household weight exposed, fall back to 1s (unweighted counts)
if weight is None:
    weight = np.ones_like(hh_size, dtype=float)

# 2) Build DataFrame
data = pd.DataFrame({
    "state_any": state_series,
    "hh_size": hh_size,
    "fstat": fstat,
    "weight": weight.astype(float),
})

# 3) California filter that works for FIPS, abbrev, or full name
def is_california(s):
    # Normalize to string
    ss = pd.Series(s).astype(str).str.strip().str.upper()
    return (
        ss.eq("CA") |
        ss.eq("CALIFORNIA") |
        ss.eq("06") | ss.eq("6")  # FIPS 06 / 6
    )

ca = data.loc[is_california(data["state_any"])].copy()

# 4) Married vs Single grouping
# Treat only "married filing jointly" as Married Households; everything else as Single Households.
f = ca["fstat"].astype(str).str.upper()
ca["group"] = np.where(
    f.str.contains("JOINT") | f.str.contains("MFJ"),
    "Married Households",
    "Single Households",
)

# 5) Size buckets: 1..6 and "7 or more"
ca["size_bucket"] = np.where(ca["hh_size"].astype(int) >= 7, "7 or more", ca["hh_size"].astype(int).astype(str))

# 6) Weighted counts (in thousands)
agg = (
    ca.groupby(["group", "size_bucket"], as_index=False)["weight"]
      .sum()
      .rename(columns={"weight": "num_hh_000s"})
)
agg["num_hh_000s"] = agg["num_hh_000s"] / 1_000

# 7) Order rows to match your layout
size_order = ["1", "2", "3", "4", "5", "6", "7 or more"]
order_map = {s: i for i, s in enumerate(size_order)}

def ordered(group_name):
    g = agg[agg["group"] == group_name].copy()
    g["__order"] = g["size_bucket"].map(order_map)
    return g.sort_values("__order").drop(columns="__order")

single_tbl = ordered("Single Households")
married_tbl = ordered("Married Households")

# 8) Subtotals and grand total
single_subtotal = single_tbl["num_hh_000s"].sum()
married_subtotal = married_tbl["num_hh_000s"].sum()
grand_total     = single_subtotal + married_subtotal

# 9) Assemble a presentation table matching your spec (with subtotal rows)
rows = []

rows.append({"Section": "Single Households", "Household Size": "", "Number of Households (1,000's)": ""})
rows += [
    {"Section": "", "Household Size": s, "Number of Households (1,000's)": v}
    for s, v in zip(single_tbl["size_bucket"], single_tbl["num_hh_000s"].round(0))
]
rows.append({"Section": "", "Household Size": "(1) Subtotal", "Number of Households (1,000's)": round(single_subtotal, 0)})

rows.append({"Section": "Married Households", "Household Size": "", "Number of Households (1,000's)": ""})
rows += [
    {"Section": "", "Household Size": s, "Number of Households (1,000's)": v}
    for s, v in zip(married_tbl["size_bucket"], married_tbl["num_hh_000s"].round(0))
]
rows.append({"Section": "", "Household Size": "(2) Subtotal", "Number of Households (1,000's)": round(married_subtotal, 0)})

rows.append({"Section": "", "Household Size": "Total Tax-Exempt Consumption Expenditure (X) = (1) + (2)",
             "Number of Households (1,000's)": round(grand_total, 0)})

final_table = pd.DataFrame(rows)
final_table


Unnamed: 0,Section,Household Size,"Number of Households (1,000's)"
0,Single Households,,
1,,1,3050.0
2,,2,1895.0
3,,3,1744.0
4,,4,924.0
5,,5,767.0
6,,6,279.0
7,,7 or more,144.0
8,,(1) Subtotal,8804.0
9,Married Households,,


In [4]:
import pandas as pd

panel_path = r"C:\Users\Ali.Melad\Dropbox\Ali Work\Kyle\California VAT\policy_engile_cali_v2\intermediate\ca_panel_2024.csv"
df = pd.read_csv(panel_path)
print("Rows:", len(df), "| Cols:", len(df.columns))


# Accept common variants/locations/extensions
candidate_globs = [
    "intermediate/ca_panel_2024.parquet",
    "intermediate/ca_panel_2024.csv",
    "intermediate/ca_panel_2024.csv.gz",
    "intermediate/ca_panel_2024.feather",
    "intermediate/*ca_panel*2024*.parquet",
    "intermediate/*ca_panel*2024*.csv*",
    "intermediate/*panel*2024*.parquet",
    "intermediate/*panel*2024*.csv*",
]
matches = []
for patt in candidate_globs:
    matches.extend(glob.glob(str(root / patt)))

if not matches:
    # helpful debug: show what's actually inside intermediate/
    interm = root / "intermediate"
    print("No panel found via patterns. Contents of 'intermediate/':")
    if interm.exists():
        for p in sorted(interm.iterdir()):
            print(" -", p.name)
    raise FileNotFoundError("Could not find a 2024 CA panel in 'intermediate/'. Re-run 01_data_prep_ca_2024.ipynb.")

panel_path = Path(matches[0])
print("Loading panel:", panel_path)

# Load by extension
if panel_path.suffix == ".parquet":
    df = pd.read_parquet(panel_path)
elif panel_path.suffix == ".feather":
    df = pd.read_feather(panel_path)
else:
    df = pd.read_csv(panel_path)

print("Rows:", len(df), "| Columns:", len(df.columns))


Rows: 1747 | Cols: 15


NameError: name 'glob' is not defined