## Imports and Configurations

### API Endpoint URL
### State FIPS Code
### Variables to Fetch from API
### Connecting to DuckDB
Information on the variables that can be accessed from the 2020 Decennial Census Redistricting Data (PL 94-171) API is available at the following link: https://api.census.gov/data/2020/dec/pl/variables.html.

The variables of interest to us from this particular API pertain to racial demographics.

<ul>
<li><b>GEO_ID</b>: This variable will be used later to match with the corresponding census-block shapefile.</li>
<li><b>P1_001N</b>: Total population (includes both voting and non-voting age population).</li>
<li><b>P3_001N</b>: Total voting age population.</li>
<li><b>P3_003N</b>: Total white alone voting age population.</li>
<li><b>P3_004N</b>: Total black or African American alone voting age population.</li>
<li><b>P3_005N</b>: Total American Indian and Alaska Native alone voting age population.</li>
<li><b>P3_006N</b>: Total Asian alone voting age population.</li>
<li><b>P3_007N</b>: Total Native Hawaiian or other Pacific Islander alone voting age population.</li>
<li><b>P3_008N</b>: Total voting age population of one race not listed.</li>
<li><b>P3_009N</b>: Total voting age population of two or more races.</li>
<li><b>P4_002N</b>: Total Hispanic or Latino voting age population.</li>
<li><b>P4_003N</b>: Total not Hispanic or Latino voting age population.</li>
</ul>

In [None]:
# === 0) Imports & configuration ===
import os, time, random, requests, pandas as pd
from pathlib import Path
import duckdb

# --- Project paths ---
# The DuckDB database file is stored under this directory, which is created
# on first run if it does not exist yet.
BASE_DIR = Path("data/processed")
BASE_DIR.mkdir(parents=True, exist_ok=True)
DB_PATH = BASE_DIR / "pl20_tx.duckdb"

# --- Census API config ---
API_URL = "https://api.census.gov/data/2020/dec/pl"
STATE = "48"  # State FIPS code for Texas
# Optional API key, read from the CENSUS_API_KEY environment variable.
# Stays None when the variable is not set.
API_KEY = os.environ.get("CENSUS_API_KEY")

# --- Variables to fetch (PL-94-171) ---
# Voting-age-population (VAP) counts by race.
VAP_RACE_VARS = [
    "P3_003N",
    "P3_004N",
    "P3_005N",
    "P3_006N",
    "P3_007N",
    "P3_008N",
    "P3_009N",
]
# Hispanic VAP, Not-Hispanic VAP.
HISP_VARS = [
    "P4_002N",
    "P4_003N",
]
# Full request list, in the column order used for the raw table.
VARS = [
    "GEO_ID",   # Unique ID used to join with geospatial data
    "P1_001N",  # Total Texas population, includes both VAP and non-VAP
    "P3_001N",  # VAP total
    *VAP_RACE_VARS,
    *HISP_VARS,
]

# Connect DuckDB
con = duckdb.connect(DB_PATH.as_posix())
print("DuckDB:", duckdb.__version__, "DB:", DB_PATH)

In [None]:
# time.monotonic() reading taken at the end of the most recent _throttle call.
# Held in a one-element list so _throttle can update it without `global`.
_last_call = [0.0]


def _throttle(min_interval=0.9):
    """Sleep as needed so calls are spaced at least ``min_interval`` seconds apart.

    Args:
        min_interval: Minimum number of seconds that must have elapsed since
            the previous call finished before this call returns.
    """
    elapsed = time.monotonic() - _last_call[0]
    remaining = min_interval - elapsed
    if remaining > 0:
        time.sleep(remaining)
    # Stamp after any sleep so the next interval is measured from now.
    _last_call[0] = time.monotonic()

def census_get(params, max_retries=8, base_sleep=1.2):
    """GET ``API_URL`` with throttling, retries, and exponential backoff.

    Args:
        params: Query parameters for the request. The mapping is copied, so
            the caller's dict is never modified (the API key is added to the
            copy only).
        max_retries: Maximum number of HTTP attempts; must be at least 1.
        base_sleep: Base delay in seconds; the backoff for attempt ``k`` is
            ``base_sleep * 2 ** (k - 1)`` plus a small random jitter.

    Returns:
        The decoded JSON body of the first successful response, or an empty
        list when the server answers HTTP 204 (no content).

    Raises:
        ValueError: If ``max_retries`` is less than 1.
        requests.HTTPError: If the final attempt ended with an error status.
        requests.exceptions.RequestException: If the final attempt failed
            with a network-level error.
    """
    if max_retries < 1:
        raise ValueError("max_retries must be at least 1")

    query = dict(params)
    if API_KEY:
        query["key"] = API_KEY

    last = None
    for attempt in range(1, max_retries + 1):
        _throttle(0.9 + random.random() * 0.4)  # 0.9–1.3s between calls
        backoff = base_sleep * (2 ** (attempt - 1))
        is_final = attempt == max_retries
        try:
            r = requests.get(API_URL, params=query, timeout=60)
            if r.status_code == 204:
                # Empty body: there is nothing to decode, and retrying would
                # only repeat the same empty answer.
                return []
            if r.ok:
                return r.json()

            last = r
            if r.status_code == 429:
                # Prefer the server's Retry-After hint; fall back to our own
                # backoff when it is missing or not a plain number.
                try:
                    wait = float(r.headers.get("Retry-After") or "")
                except ValueError:
                    wait = backoff
                wait = max(wait, 0.0) + random.random() * 0.5
                if is_final:
                    print("[census_get] 429 rate-limited. Retries exhausted.")
                else:
                    print(f"[census_get] 429 rate-limited. Waiting {wait:.1f}s…")
                    time.sleep(wait)
                continue

            body = r.text[:300].replace("\n", " ")
            print(f"[census_get] Attempt {attempt}: HTTP {r.status_code} — {body}")
        except requests.exceptions.RequestException as e:
            print(f"[census_get] Attempt {attempt}: network error → {e}")
            last = e

        if is_final:
            # No further attempt follows, so sleeping would only delay the error.
            break
        wait = backoff + random.random() * 0.4
        print(f"[census_get] Backing off {wait:.1f}s…")
        time.sleep(wait)

    # `last` is either a failed Response (converted to HTTPError here) or the
    # exception raised by the final attempt.
    if hasattr(last, "raise_for_status"):
        last.raise_for_status()
    raise last

In [None]:
import pandas as pd

def list_counties(state=STATE):
    """Return the county codes of ``state`` as zero-padded 3-character strings.

    Args:
        state: State FIPS code placed in the ``in=state:...`` clause.

    Returns:
        A list with one entry per row returned by the county query.
    """
    payload = census_get({"get": "NAME", "for": "county:*", "in": f"state:{state}"})
    # The first row of the response holds the column names; the rest are data.
    rows = payload[1:]
    frame = pd.DataFrame(rows, columns=payload[0])
    county_codes = frame["county"].astype(str).str.zfill(3)
    return county_codes.tolist()

def list_tracts(state=STATE, county_fips=None):
    """Return the tract codes of one county as zero-padded 6-character strings.

    Args:
        state: State FIPS code placed in the ``in=state:...`` clause.
        county_fips: County FIPS code; zero-padded to 3 characters before use.
            Required, despite the ``None`` default kept for signature
            compatibility.

    Returns:
        A list with one entry per row returned by the tract query.

    Raises:
        ValueError: If ``county_fips`` is None. Without this check the literal
            text "None" would be sent as the county and the request would only
            fail after the full retry cycle.
    """
    if county_fips is None:
        raise ValueError("county_fips is required (county FIPS code, e.g. '001')")
    county = str(county_fips).zfill(3)
    js = census_get({"get": "NAME", "for": "tract:*", "in": f"state:{state} county:{county}"})
    # The first row of the response holds the column names; the rest are data.
    return pd.DataFrame(js[1:], columns=js[0])["tract"].astype(str).str.zfill(6).tolist()

def fetch_blocks_tract(state, county_fips, tract, fields=VARS):
    """Fetch all block-level rows for one tract as a DataFrame.

    Args:
        state: State FIPS code; zero-padded to 2 characters.
        county_fips: County FIPS code; zero-padded to 3 characters.
        tract: Tract code; zero-padded to 6 characters.
        fields: Variable names requested through the ``get`` parameter.

    Returns:
        A DataFrame whose columns come from the response header row, with the
        state/county/tract/block columns zero-padded to fixed widths. When the
        response has no data rows, an empty DataFrame with the requested
        fields plus the four geography columns.

    Raises:
        requests.HTTPError: For any non-429 HTTP error, or when the request is
            still rate-limited on the fifth local attempt.
    """
    geography = (
        f"state:{str(state).zfill(2)} "
        f"county:{str(county_fips).zfill(3)} "
        f"tract:{str(tract).zfill(6)}"
    )
    query = {"get": ",".join(fields), "for": "block:*", "in": geography}

    # local retry for bursty 429s (on top of the retries inside census_get)
    attempt = 1
    while True:
        try:
            payload = census_get(query)
        except requests.HTTPError as err:
            response = err.response
            retryable = (
                response is not None
                and response.status_code == 429
                and attempt < 5
            )
            if not retryable:
                raise
            pause = 5 * attempt + random.random() * 2
            print(f"[fetch_blocks_tract] 429 on tract {tract}. Sleeping {pause:.1f}s…")
            time.sleep(pause)
            attempt += 1
        else:
            break

    # A payload with at most one row has no data rows to return.
    if not payload or len(payload) <= 1:
        return pd.DataFrame(columns=fields + ["state", "county", "tract", "block"])

    data_rows = payload[1:]
    frame = pd.DataFrame(data_rows, columns=payload[0])
    pad_widths = {"state": 2, "county": 3, "tract": 6, "block": 4}
    for column, width in pad_widths.items():
        frame[column] = frame[column].astype(str).str.zfill(width)
    return frame

In [None]:
# Create Raw DuckDB Table
# Every raw column is declared VARCHAR; numeric casting is left to later steps.
RAW_REQ = [
    "GEO_ID",
    "P1_001N",
    "P3_001N",
    "P3_003N",
    "P3_004N",
    "P3_005N",
    "P3_006N",
    "P3_007N",
    "P3_008N",
    "P3_009N",
    "P4_002N",
    "P4_003N",
    "state",
    "county",
    "tract",
    "block",
]
ddl_cols = ", ".join(f"{c} VARCHAR" for c in RAW_REQ)
con.execute(f"CREATE TABLE IF NOT EXISTS raw_pl20_tx ({ddl_cols});")
con.execute("CREATE INDEX IF NOT EXISTS idx_raw_geo ON raw_pl20_tx(state, county, tract, block);")

# Progress table
# One row is appended per status change of a tract (see mark_progress).
con.execute("""CREATE TABLE IF NOT EXISTS etl_progress (  state TEXT,  county TEXT,  tract TEXT,  status TEXT,  rows   INTEGER,  updated_at TIMESTAMP DEFAULT now());""")
con.execute("CREATE INDEX IF NOT EXISTS idx_progress ON etl_progress(state, county, tract);")

def is_done(state, county, tract):
    """Return True if etl_progress has a row with status 'done' for this tract.

    Args:
        state: State code to match against ``etl_progress.state``.
        county: County code to match against ``etl_progress.county``.
        tract: Tract code to match against ``etl_progress.tract``.
    """
    query = "SELECT 1 FROM etl_progress WHERE state=? AND county=? AND tract=? AND status='done' LIMIT 1"
    match = con.execute(query, [state, county, tract]).fetchone()
    return match is not None

def mark_progress(state, county, tract, status, rows=None):
    """Append a status row for one tract to etl_progress, timestamped with now().

    Args:
        state: State code stored in the row.
        county: County code stored in the row.
        tract: Tract code stored in the row.
        status: Status label to record (the loader uses 'started', 'done',
            and 'error').
        rows: Row count to record; stored as NULL when omitted.
    """
    statement = "INSERT INTO etl_progress(state, county, tract, status, rows, updated_at) VALUES (?,?,?,?,?, now())"
    values = [state, county, tract, status, rows]
    con.execute(statement, values)

In [None]:
# Main load: walk every county of the state, then every tract of each county,
# and append that tract's block-level rows to raw_pl20_tx. Progress is logged
# per tract so a rerun skips tracts that already finished.
counties = list_counties(STATE)
print(f"Found {len(counties)} counties in state {STATE}")

for i, c in enumerate(counties, 1):
    tracts = list_tracts(STATE, c)
    print(f"[{i}/{len(counties)}] county {c} → {len(tracts)} tracts")
    for t in tracts:
        # Resume support: skip tracts already recorded with status 'done'.
        if is_done(STATE, c, t):
            continue
        try:
            mark_progress(STATE, c, t, "started", 0)
            df = fetch_blocks_tract(STATE, c, t, VARS)
            if not df.empty:
                # Expose the DataFrame to DuckDB under the name tmp_df so the
                # INSERT below can select from it; columns are listed
                # explicitly to match the raw table's column order.
                con.register("tmp_df", df)
                con.execute(f"""                    INSERT INTO raw_pl20_tx
                    SELECT {", ".join(VARS)}, state, county, tract, block
                    FROM tmp_df
                """                )
                con.unregister("tmp_df")
                print(f"  tract {t}: {len(df):,} rows")
            # A tract with zero rows is still recorded as done (rows = 0).
            mark_progress(STATE, c, t, "done", int(len(df)))
        except Exception as e:
            # Record the failure and move on to the next tract; the tract has
            # no 'done' row, so a later run will attempt it again.
            mark_progress(STATE, c, t, "error", 0)
            print(f"  tract {t}: ERROR → {e}")

In [None]:
# Drop & build clean
# The clean table is rebuilt from raw_pl20_tx on every run.
con.execute("DROP TABLE IF EXISTS census_demographics_clean;")

# SQL fragments generated from the variable lists so the SELECT below stays
# in sync with VAP_RACE_VARS / HISP_VARS.
race_casts = ", ".join([f"CAST({c} AS BIGINT) AS {c}" for c in VAP_RACE_VARS])
hisp_casts = ", ".join([f"CAST({c} AS BIGINT) AS {c}" for c in HISP_VARS])
# Each share divides by the voting-age total (P3_001N); NULLIF turns a zero
# denominator into NULL so blocks with no voting-age population yield NULL
# shares instead of a division by zero.
share_exprs = ", ".join([f"CAST({c} AS DOUBLE)/NULLIF(CAST(P3_001N AS DOUBLE),0) AS {c}_share_vap" for c in VAP_RACE_VARS + HISP_VARS])

# base:   cast the VARCHAR counts to BIGINT and derive two block identifiers,
#         GEOID20 (concatenated geography codes) and GEOID_from_GEO_ID
#         (GEO_ID with its '15000US' / '1000000US' prefix removed).
# shares: add one <var>_share_vap column per race / Hispanic variable.
con.execute(f"""CREATE TABLE census_demographics_clean AS
WITH base AS (
  SELECT
    state, county, tract, block, GEO_ID,
    CAST(P1_001N AS BIGINT) AS P1_001N,
    CAST(P3_001N AS BIGINT) AS P3_001N,
    {race_casts},
    {hisp_casts},
    state || county || tract || block AS GEOID20,
    REPLACE(REPLACE(GEO_ID,'15000US',''),'1000000US','') AS GEOID_from_GEO_ID
  FROM raw_pl20_tx
),
shares AS (
  SELECT
    *,
    {share_exprs}
  FROM base
)
SELECT * FROM shares;
""")


# Deduplicate to final
# Keep one row per GEOID20. The row-number helper column `rn` is only needed
# for the filter, so EXCLUDE drops it from the final table (and therefore
# from any export of that table).
con.execute("DROP TABLE IF EXISTS census_demographics;")
con.execute("""CREATE TABLE census_demographics AS
SELECT * EXCLUDE (rn)
FROM (
  SELECT *, ROW_NUMBER() OVER (PARTITION BY GEOID20 ORDER BY GEO_ID) AS rn
  FROM census_demographics_clean
) WHERE rn = 1;
""")

# After deduplication the row count and distinct GEOID20 count should match.
print(con.execute("SELECT COUNT(*) rows, COUNT(DISTINCT GEOID20) uniq FROM census_demographics;").fetchdf())

In [None]:
# Uniqueness of GEOID20: total rows and distinct GEOID20 values should be equal.
print(con.execute("SELECT COUNT(*) rows, COUNT(DISTINCT GEOID20) uniq FROM census_demographics;").fetchdf())

# Ensure correct GEOID20 format: every value must be exactly 15 characters,
# all of them digits. `ok` is true only when all rows pass.
ok = con.execute("""SELECT SUM(LENGTH(GEOID20)=15 AND GEOID20 ~ '^[0-9]+$') = COUNT(*) AS ok
FROM census_demographics;
""").fetchone()[0]
print("15-digit GEOID:", ok)

# GEO_ID agreement (~1.0): fraction of rows where GEOID20 equals GEO_ID with
# its '15000US' / '1000000US' prefix removed.
print(con.execute("""SELECT AVG( (GEOID20 = REPLACE(REPLACE(GEO_ID,'15000US',''),'1000000US',''))::DOUBLE ) AS pct_match
FROM census_demographics;
""").fetchdf())

# Non-negatives and VAP <= total: the three neg_* columns count rows with any
# negative value in the respective group; pct_vap_le_total is the fraction of
# rows where the voting-age total does not exceed the total population.
print(con.execute("""SELECT
  SUM((P1_001N < 0) OR (P3_001N < 0)) AS neg_pop,
  SUM( (P3_003N < 0) OR (P3_004N < 0) OR (P3_005N < 0) OR
       (P3_006N < 0) OR (P3_007N < 0) OR (P3_008N < 0) OR (P3_009N < 0) ) AS neg_race,
  SUM( (P4_002N < 0) OR (P4_003N < 0) ) AS neg_hisp,
  AVG( (P3_001N <= P1_001N)::DOUBLE ) AS pct_vap_le_total
FROM census_demographics;
""").fetchdf())

In [None]:
# Export targets, both placed under BASE_DIR.
PARQUET_OUT = BASE_DIR / "census_demographics.parquet"
# NOTE(review): CSV_OUT is defined but not used in the code visible here —
# confirm whether a CSV export follows or is missing.
CSV_OUT     = BASE_DIR / "census_demographics.csv.gz"

# Write the final table to Parquet. The path is interpolated into the SQL
# text with forward slashes (as_posix).
con.execute(f"COPY census_demographics TO '{PARQUET_OUT.as_posix()}' (FORMAT PARQUET);")
print("Saved:", PARQUET_OUT)