In [2]:
import os, io, zipfile, requests, pandas as pd

# ==== CONFIG ====
# Source table. One of its columns (see NAME_COL) holds a semicolon-separated
# geography label such as
# "Block Group 1; Census Tract 6011.03; Howard County; Maryland".
INPUT_CSV = "/demand_data/Howard_demographics_block_2022-2023.csv"
# Destination of the table once the ZCTA column has been attached.
OUTPUT_CSV = "/outputs/Howard_demographics_with_zc_2022-2023.csv"
# State FIPS code (Maryland).
STATE_FIPS = "24"
# County FIPS code (Howard County); change if needed.
COUNTY_FIPS = "027"
# ACS 5-year vintage queried for the NAME <-> GEOID lookup.
ACS_YEAR = "2023"
# Column of INPUT_CSV that carries the semicolon-separated label.
NAME_COL = "county_name"
# ============

# Scratch directory (the downloaded relationship archive is cached under it).
os.makedirs("tmp", exist_ok=True)

# 1) Read the input table and make sure the label column is actually there
#    before any network work is attempted.
df = pd.read_csv(INPUT_CSV)
has_name_col = NAME_COL in df.columns
if not has_name_col:
    raise ValueError(f"Expected a column named '{NAME_COL}' in {INPUT_CSV}")

def norm(s: str) -> str:
    """Return a geography label in a canonical comma-separated form.

    The value is converted with ``str()``, split on semicolons, each part has
    its surrounding whitespace removed and internal whitespace runs collapsed
    to a single space, and the parts are re-joined with ``", "``. Labels that
    contain no semicolon only get the whitespace clean-up. Letter casing is
    left untouched.
    """
    # str.split() with no argument drops leading/trailing whitespace and
    # splits on any whitespace run, so " ".join(...) both strips and collapses.
    return ", ".join(" ".join(part.split()) for part in str(s).split(";"))

# Canonical form of the labels from the input file (see norm()).
df["NAME_std"] = df[NAME_COL].astype(str).map(norm)

# 2) Build a NAME ↔ GEOID lookup for all block groups in the county via Census API
BASE = f"https://api.census.gov/data/{ACS_YEAR}/acs/acs5"
params = {
    "get": "NAME",
    "for": "block group:*",
    "in": f"state:{STATE_FIPS} county:{COUNTY_FIPS}",
}
r = requests.get(BASE, params=params, timeout=120)
r.raise_for_status()
data = r.json()
# The payload is used as a header row followed by one row per block group.
# Fail early when no data rows come back, otherwise every input row would
# silently end up unmatched further down.
if not data or len(data) < 2:
    raise ValueError(
        f"Census API returned no block groups for state {STATE_FIPS}, "
        f"county {COUNTY_FIPS} (ACS {ACS_YEAR})"
    )
lookup = pd.DataFrame(data[1:], columns=data[0])
# Construct 12-digit block-group GEOID (state 2 + county 3 + tract 6 + block group 1)
lookup["geo_id"] = (
    lookup["state"].str.zfill(2) +
    lookup["county"].str.zfill(3) +
    lookup["tract"].str.zfill(6) +
    lookup["block group"].str.zfill(1)
)
# Run the API names through the same normaliser as the input labels so both
# sides of the join share one format whichever separator the API emits.
# NOTE(review): recent ACS vintages appear to delimit NAME with ';' rather
# than ',' — confirm for ACS_YEAR; without this step such names never match.
lookup["NAME_std"] = lookup["NAME"].astype(str).map(norm)

# 3) Attach the block-group GEOID to every input row by matching on the
#    normalised name; rows without a match keep a missing geo_id.
name_to_geoid = lookup[["NAME_std", "geo_id"]]
df = pd.merge(df, name_to_geoid, how="left", on="NAME_std")

# Report (but do not abort on) rows that could not be matched.
missing = df["geo_id"].isna().sum()
if missing > 0:
    print(f"WARNING: {missing} rows did not match a block-group NAME. Check punctuation/county/year.")

# 4) Get the 2020 ZCTA↔Block relationship file (national), then derive BG→ZCTA crosswalk
#    The code below expects columns GEOID20 (15-digit block), ZCTA5CE20 (5-digit ZCTA), ALAND20.
#    NOTE(review): confirm this URL and those column names against the Census
#    relationship-file listing — they could not be verified from this script.
REL_URL = "https://www2.census.gov/geo/docs/maps-data/data/rel2020/zcta520/tab20_zcta520_tabblk20.zip"
rel_zip_path = "tmp/tab20_zcta520_tabblk20.zip"
if not os.path.exists(rel_zip_path):
    print("Downloading ZCTA↔Block relationship (2020)…")
    os.makedirs(os.path.dirname(rel_zip_path), exist_ok=True)
    # Stream to a side file and rename only after the transfer finished:
    # the archive never has to fit in memory, and an interrupted download
    # cannot leave a truncated zip at the cache path that later runs reuse.
    partial_path = rel_zip_path + ".part"
    with requests.get(REL_URL, timeout=300, stream=True) as rr:
        rr.raise_for_status()
        with open(partial_path, "wb") as f:
            for chunk in rr.iter_content(chunk_size=1 << 20):
                f.write(chunk)
    os.replace(partial_path, rel_zip_path)

with zipfile.ZipFile(rel_zip_path) as zf:
    # Take the first member whose name ends in ".csv".
    csv_members = [n for n in zf.namelist() if n.lower().endswith(".csv")]
    if not csv_members:
        raise ValueError(
            f"No .csv member found in {rel_zip_path}; archive contains: {zf.namelist()}"
        )
    rel_name = csv_members[0]
    with zf.open(rel_name) as f:
        # Read everything as text so GEOIDs / ZCTAs keep their leading zeros.
        rel = pd.read_csv(f, dtype=str)

# Fail with a clear message if the relationship table lacks the key columns.
missing_cols = sorted({"GEOID20", "ZCTA5CE20"} - set(rel.columns))
if missing_cols:
    raise KeyError(f"Relationship file is missing expected column(s): {missing_cols}")

# Keep only blocks of the configured state (first 2 digits of GEOID20 are state FIPS).
# na=False treats rows with a missing GEOID as "not this state" instead of
# producing a mask that cannot be used for boolean indexing.
rel = rel[rel["GEOID20"].str.startswith(STATE_FIPS, na=False)].copy()

# Extract block-group GEOID (first 12 digits of block GEOID)
rel["bg_geoid"] = rel["GEOID20"].str.slice(0, 12)
rel["ZCTA5CE20"] = rel["ZCTA5CE20"].fillna("")  # empty means 'no ZCTA'
# Land area per block; unparsable values count as 0. When the column is
# absent every block weighs 0 and only the tie-break below decides.
if "ALAND20" in rel.columns:
    rel["ALAND20"] = pd.to_numeric(rel["ALAND20"], errors="coerce").fillna(0)
else:
    rel["ALAND20"] = 0

# For each block group, pick the ZCTA covering the largest TOTAL land area.
# Areas are summed per (block group, ZCTA) first: one big block must not
# outweigh several smaller blocks that together cover more land.
# Blocks without a ZCTA ("") take part like any other value.
bg_zcta_area = rel.groupby(["bg_geoid", "ZCTA5CE20"], as_index=False)["ALAND20"].sum()
# Largest area first; equal areas are resolved by descending ZCTA code, which
# is deterministic and makes the blank code lose every tie.
bg_zcta_area = bg_zcta_area.sort_values(
    ["bg_geoid", "ALAND20", "ZCTA5CE20"], ascending=[True, False, False]
)
bg2zcta = (
    bg_zcta_area.drop_duplicates("bg_geoid")[["bg_geoid", "ZCTA5CE20"]]
    .rename(columns={"ZCTA5CE20": "zcta5"})
)

# 5) Join ZCTA to your rows via geo_id (the helper key column is dropped again)
df = df.merge(bg2zcta, left_on="geo_id", right_on="bg_geoid", how="left").drop(columns=["bg_geoid"])

# Optional cleanups: blank ZCTA → NaN
df["zcta5"] = df["zcta5"].replace({"": pd.NA})

# 6) Save
# Create the destination folder first so the run does not fail at the very
# last step just because the output directory does not exist yet.
out_dir = os.path.dirname(OUTPUT_CSV)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)
df.to_csv(OUTPUT_CSV, index=False)
print(f"✅ Wrote: {OUTPUT_CSV}  (rows: {len(df):,})")

# --- Sanity checks ---
print("Non-matched GEOIDs:", df["geo_id"].isna().sum())
print("Rows with missing ZCTA:", df["zcta5"].isna().sum())


FileNotFoundError: [Errno 2] No such file or directory: '/demand_data/Howard_demographics_block_2022-2023.csv'