In [9]:
# Build census_demographics_clean robustly (no stray commas)
from pathlib import Path
import duckdb


con = duckdb.connect((INTERIM_DATA_DIR / "nhgis_pl2020.duckdb").as_posix())

# Inventory current columns
cols = {r[0] for r in con.execute("PRAGMA table_info('pl20_blocks')").fetchall()}

# Geo parts present?
has_geo_parts = {"STATEA","COUNTYA","TRACTA","BLOCKA"}.issubset(cols)

# ds248 variable names
VAP_TOTAL = "U7D001"
RACE_VAP  = ["U7D003","U7D004","U7D005","U7D006","U7D007","U7D008","U7D009"]
HISP_VAP  = ["U7E002","U7E003"]

# Filter to what actually exists
race_vars = [v for v in RACE_VAP if v in cols]
hisp_vars = [v for v in HISP_VAP if v in cols]
have_vap_total = VAP_TOTAL in cols

# ----- SELECT list for base CTE -----
select_lines = []

if has_geo_parts:
    select_lines.append("(LPAD(STATEA,2,'0') || LPAD(COUNTYA,3,'0') || LPAD(TRACTA,6,'0') || LPAD(BLOCKA,4,'0')) AS GEOID20")
else:
    # Try reasonable fallbacks
    for alt in ["GEOID20","GEOID","GEOCODE","GISJOIN"]:
        if alt in cols:
            select_lines.append(f"CAST({alt} AS VARCHAR) AS GEOID20")
            break
    else:
        select_lines.append("CAST(NULL AS VARCHAR) AS GEOID20")

if have_vap_total:
    select_lines.append(f"CAST({VAP_TOTAL} AS BIGINT) AS vap_total")
# Cast race/hispanic counts that exist
select_lines += [f"CAST({v} AS BIGINT) AS {v}" for v in race_vars + hisp_vars]

select_sql = ",\n    ".join(select_lines)

# ----- SHARE columns (only if vap_total exists) -----
share_lines = []
if have_vap_total:
    share_lines += [f"{v} * 1.0 / NULLIF(vap_total,0) AS {v}_share" for v in race_vars]
    if set(hisp_vars) == set(HISP_VAP):  # both present
        share_lines += [
            "U7E002 * 1.0 / NULLIF(vap_total,0) AS hisp_share",
            "U7E003 * 1.0 / NULLIF(vap_total,0) AS nonhisp_share",
        ]

shares_sql = ",\n    ".join(share_lines) if share_lines else "/* no share columns available */"

# ----- Create table -----
sql = f"""
DROP TABLE IF EXISTS census_demographics_clean;
CREATE TABLE census_demographics_clean AS
WITH base AS (
  SELECT
    {select_sql}
  FROM pl20_blocks
)
SELECT
  base.*{"," if share_lines else ""}
  {shares_sql if share_lines else ""}
FROM base;
"""

# Debug: print the SQL if needed
# print(sql)

con.execute(sql)

print(con.execute("""
SELECT COUNT(*) AS rows,
       COUNT(DISTINCT GEOID20) AS uniq_blocks
FROM census_demographics_clean;
""").fetchdf())




      rows  uniq_blocks
0  8174955            0
