In [8]:
# add_zip_from_blockgroup.py
# ---------------------------------------------
# Run from the SAME FOLDER as your CSV file.
# Input : Howard_demographics_block_2022-2023.csv (must have a 'county_name' column)
# Output: Howard_demographics_block_2022-2023_with_zip.csv
#
# How ZIPs are added:
# 1) Parse "county_name" like:
#    "Block Group 1; Census Tract 6011.03; Howard County; Maryland"
# 2) Build Block Group GEOID = state FIPS (24) + county FIPS (027) + tract(6) + blockGroup(1)
# 3) Join to a local crosswalk CSV to map Block Group → ZCTA (ZIP).
#
# Place ONE of these crosswalk files in the SAME FOLDER (any one works):
#  - bg_to_zcta_md.csv            (columns: bg_geoid,zcta5[,pop_share|aland_share])
#  - bg_to_zcta_us.csv            (nationwide; columns: bg_geoid,zcta5[,share])
#  - tract_to_zcta_md.csv         (columns: tract_geoid11,zcta5[,share])  <-- fallback (tract-level)
#  - tract_to_zcta_us.csv         (nationwide; columns: tract_geoid11,zcta5[,share])
#
# If multiple ZCTAs exist per geography, the row with the largest weight
# (pop_share/aland_share/area_share/weight/share) is chosen.

import os
import re
import sys
import pandas as pd

# ------------------ CONFIG ------------------
# Input and output CSV names. Both are relative paths, so they resolve against
# the current working directory (run the script from the data folder).
IN_CSV  = "Howard_demographics_block_2022-2023.csv"
OUT_CSV = "Howard_demographics_block_2022-2023_with_zip.csv"

# Crosswalk file names to look for, in priority order: within each list the
# first file that exists is used, and a block-group crosswalk is preferred over
# a tract crosswalk when both kinds are present.
BG_XWALK_CANDIDATES = [
    "bg_to_zcta_md.csv",     # Maryland-only block-group crosswalk
    "bg_to_zcta_us.csv",     # US block-group crosswalk
]
TRACT_XWALK_CANDIDATES = [
    "tract_to_zcta_md.csv",  # Maryland-only tract crosswalk
    "tract_to_zcta_us.csv",  # US tract crosswalk
]

# Column names recognised as a "weight" in a crosswalk, in priority order: the
# first one present in the crosswalk decides which ZCTA is dominant per key.
WEIGHT_COLS = ["pop_share", "aland_share", "area_share", "weight", "share"]

# FIPS (Howard County, Maryland). Kept as strings so the county code's leading
# zero survives; they form the first five characters of every GEOID built here.
STATE_FIPS_MD = "24"
COUNTY_FIPS_HOWARD = "027"

# ------------------ HELPERS ------------------
# Matches a label of the form
#   "Block Group <bg>; Census Tract <tract>; <county>; <state>"
# case-insensitively and with flexible whitespace around the semicolons.
# Named groups:
#   bg     - block-group digits
#   tract  - tract name made of digits and dots (e.g. "6011.03")
#   county - everything up to the next semicolon
#   state  - the rest of the string
TRACT_RE = re.compile(
    r"Block\s*Group\s*(?P<bg>\d+)\s*;\s*Census\s*Tract\s*(?P<tract>[0-9.]+)\s*;\s*(?P<county>[^;]+)\s*;\s*(?P<state>.+)$",
    re.IGNORECASE
)

def tract_to_6d(tract_str: str) -> str:
    """
    Convert a human-formatted tract name to the 6-digit Census tract code.

    The code is the 4-digit base number (left-padded with zeros) followed by
    the 2-digit suffix. The suffix is a decimal fraction, so a short suffix is
    padded on the RIGHT ('.3' means '.30'), and a missing suffix is '00'.

    Examples: '6011.03' -> '601103', '6012.3' -> '601230', '6012' -> '601200'.

    Parameters:
        tract_str: tract name such as '6011.03'; surrounding whitespace and
            any non-digit characters are ignored.

    Returns:
        The tract code as a string, or None when the input is not a string or
        contains no digits before the decimal point.
    """
    if not isinstance(tract_str, str):
        return None

    # partition() yields an empty suffix when there is no '.', which then pads
    # to '00' below, so both input shapes share one code path.
    whole, _, frac = tract_str.strip().partition(".")
    whole = re.sub(r"\D", "", whole)
    frac = re.sub(r"\D", "", frac)

    if not whole:
        # Nothing usable; do not fabricate an all-zero tract code.
        return None

    return f"{whole.zfill(4)}{frac.ljust(2, '0')}"

def parse_county_name(val: str):
    """
    Split a block-group label into its components.

    Expected shape of the label:
    'Block Group 1; Census Tract 6011.03; Howard County; Maryland'

    Returns a dict with keys 'bg', 'tract6', 'county' and 'state'. Every value
    is None when the input is not a string or does not match the pattern.
    """
    fields = {"bg": None, "tract6": None, "county": None, "state": None}

    match = TRACT_RE.search(val.strip()) if isinstance(val, str) else None
    if match is not None:
        fields["bg"] = match.group("bg").strip()
        # The tract name is normalised to the 6-digit code used in GEOIDs.
        fields["tract6"] = tract_to_6d(match.group("tract"))
        fields["county"] = match.group("county").strip()
        fields["state"] = match.group("state").strip()

    return fields

def build_bg_geoid(tract6: str, bg: str) -> str:
    """
    Assemble the block-group GEOID:
    state FIPS (24) + county FIPS (027) + 6-digit tract + block-group number.

    Returns None when either the tract code or the block group is missing.
    """
    if pd.isna(tract6):
        return None
    if pd.isna(bg):
        return None

    prefix = f"{STATE_FIPS_MD}{COUNTY_FIPS_HOWARD}"
    # int() normalises the block-group text (e.g. drops leading zeros).
    bg_number = int(bg)
    return f"{prefix}{tract6}{bg_number}"

def choose_dominant(df: pd.DataFrame, key_col: str, cols_keep, weight_cols=None):
    """
    Reduce a crosswalk to exactly one row per key.

    If a weight column is present, the highest-weight row per key is kept.
    Otherwise the first row per key (in file order) is kept.

    Parameters:
        df: crosswalk table.
        key_col: column identifying the geography (one output row per value).
        cols_keep: list of columns to return.
        weight_cols: candidate weight column names in priority order; the
            first one present in ``df`` is used. Defaults to WEIGHT_COLS.

    Returns:
        A DataFrame restricted to ``cols_keep`` with unique ``key_col`` values.
    """
    candidates = WEIGHT_COLS if weight_cols is None else weight_cols
    weight_col = next((c for c in candidates if c in df.columns), None)

    if weight_col is None:
        ranked = df
    else:
        # Crosswalks are often read with dtype=str; comparing weights as text
        # would rank "9.5" above "85.2", so coerce to numbers first.
        # Unparseable weights become NaN and sort last.
        weights = pd.to_numeric(df[weight_col], errors="coerce").reset_index(drop=True)
        # After reset_index the sorted index holds row positions; a stable
        # sort keeps file order among equal weights.
        order = weights.sort_values(ascending=False, kind="mergesort").index
        ranked = df.iloc[order]

    # De-duplicate on the key itself (not on whole rows) so the result is
    # always one row per key and a later left-merge cannot multiply rows.
    return ranked.drop_duplicates(subset=[key_col], keep="first")[cols_keep]

def find_first_existing(candidates):
    """
    Return the first path in ``candidates`` that exists on disk, or None when
    none of them do. Candidates are checked in the order given.
    """
    return next((path for path in candidates if os.path.exists(path)), None)

# ------------------ MAIN ------------------
if __name__ == "__main__":
    # Script flow: load the input CSV, parse 'county_name' into geography
    # parts, build GEOIDs, attach a ZIP (ZCTA) from the best available
    # crosswalk, write the result and print a coverage summary.
    if not os.path.exists(IN_CSV):
        sys.exit(f"Input file not found: {IN_CSV}")

    # Load input. Everything is read as text so codes keep leading zeros.
    df = pd.read_csv(IN_CSV, dtype=str, encoding="utf-8-sig")
    if "county_name" not in df.columns:
        sys.exit("The input CSV must contain a 'county_name' column.")

    # Parse the descriptive field. Building the frame from the list of dicts
    # (rather than .apply(pd.Series)) keeps the four columns present, and of
    # object dtype, even when the input has no rows or nothing parses.
    parsed = pd.DataFrame(
        [parse_county_name(v) for v in df["county_name"]],
        index=df.index,
        columns=["bg", "tract6", "county", "state"],
    )
    df = pd.concat([df, parsed], axis=1)

    # Optional filter (should already be Howard, MD). Rows that failed to
    # parse have missing county/state and are dropped here too.
    mask_hoco = (
        df["county"].str.contains("Howard", case=False, na=False)
        & df["state"].str.contains("Maryland", case=False, na=False)
    )
    df = df.loc[mask_hoco].copy()

    # GEOIDs. An explicit object-dtype Series keeps .str usable on an empty
    # frame, where DataFrame.apply(axis=1) would not return a Series.
    df["bg_geoid"] = pd.Series(
        [build_bg_geoid(t, b) for t, b in zip(df["tract6"], df["bg"])],
        index=df.index,
        dtype="object",
    )
    df["tract_geoid11"] = df["bg_geoid"].str.slice(0, 11)  # state2 + county3 + tract6

    # Name of the output ZIP column; stays None when no crosswalk was found.
    zip_col = None
    bg_xwalk_path = find_first_existing(BG_XWALK_CANDIDATES)
    tract_xwalk_path = find_first_existing(TRACT_XWALK_CANDIDATES)

    if bg_xwalk_path:
        # Preferred: BLOCK-GROUP crosswalk.
        xw = pd.read_csv(bg_xwalk_path, dtype={"bg_geoid": str, "zcta5": str}, encoding="utf-8-sig")
        missing_cols = [c for c in ("bg_geoid", "zcta5") if c not in xw.columns]
        if missing_cols:
            # Same fail-fast behaviour as the tract path, instead of a KeyError.
            sys.exit(
                f"Block-group crosswalk '{bg_xwalk_path}' is missing column(s): "
                f"{', '.join(missing_cols)}."
            )

        xw_dom = choose_dominant(xw, "bg_geoid", ["bg_geoid", "zcta5"])
        # Guarantee one row per key so the left-merge cannot multiply input rows.
        xw_dom = xw_dom.drop_duplicates(subset=["bg_geoid"], keep="first")
        merged = df.merge(xw_dom, on="bg_geoid", how="left")
        merged.rename(columns={"zcta5": "zip"}, inplace=True)
        zip_col = "zip"
        out = merged

    elif tract_xwalk_path:
        # Fallback: map via TRACT→ZCTA, then broadcast to BGs within the tract
        tx = pd.read_csv(tract_xwalk_path, dtype=str, encoding="utf-8-sig")
        # Accept either 'tract_geoid11' or a 'tract' column with the 11-digit GEOID
        if "tract_geoid11" not in tx.columns:
            if "tract" in tx.columns:
                tx["tract_geoid11"] = tx["tract"].astype(str).str.zfill(11)
            else:
                sys.exit(f"Tract crosswalk '{tract_xwalk_path}' must have 'tract_geoid11' (or 'tract').")

        if "zcta5" not in tx.columns:
            sys.exit(f"Tract crosswalk '{tract_xwalk_path}' must have 'zcta5' column.")

        # The file was read as text; weights must be numeric so the dominant
        # ZCTA is picked by value, not by string ordering.
        for col in WEIGHT_COLS:
            if col in tx.columns:
                tx[col] = pd.to_numeric(tx[col], errors="coerce")

        tx_dom = choose_dominant(tx, "tract_geoid11", ["tract_geoid11", "zcta5"])
        # Guarantee one row per key so the left-merge cannot multiply input rows.
        tx_dom = tx_dom.drop_duplicates(subset=["tract_geoid11"], keep="first")
        merged = df.merge(tx_dom, on="tract_geoid11", how="left")
        merged.rename(columns={"zcta5": "zip"}, inplace=True)
        zip_col = "zip"
        out = merged

    else:
        # No crosswalk found; write file without ZIP and message
        out = df.copy()
        out["zip"] = pd.NA
        print(
            "⚠️  No crosswalk CSV found in this folder.\n"
            "Place one of the following files here and rerun the script:\n"
            "  - bg_to_zcta_md.csv  (preferred)\n"
            "  - bg_to_zcta_us.csv\n"
            "  - tract_to_zcta_md.csv  (fallback)\n"
            "  - tract_to_zcta_us.csv  (fallback)\n"
            "Expected columns are documented at the top of this script."
        )

    # Save
    out.to_csv(OUT_CSV, index=False, encoding="utf-8-sig")
    print(f"✅ Wrote: {OUT_CSV}")

    # Simple sanity print
    if "zip" in out.columns:
        missing = int(out["zip"].isna().sum())
        total   = len(out)
        print(f"ZIP coverage: {total - missing}/{total} rows mapped.")
        if missing:
            print("Note: Some rows have no ZIP—check crosswalk coverage or parsing.")


⚠️  No crosswalk CSV found in this folder.
Place one of the following files here and rerun the script:
  - bg_to_zcta_md.csv  (preferred)
  - bg_to_zcta_us.csv
  - tract_to_zcta_md.csv  (fallback)
  - tract_to_zcta_us.csv  (fallback)
Expected columns are documented at the top of this script.
✅ Wrote: Howard_demographics_block_2022-2023_with_zip.csv
ZIP coverage: 0/332 rows mapped.
Note: Some rows have no ZIP—check crosswalk coverage or parsing.
