## Selecting Variables to Access from API
Information on the variables that can be accessed from the 2020 Decennial Census Redistricting API PL-94-171 are available at the following link: https://api.census.gov/data/2020/dec/pl/variables.html.

The variables of interest to us from this particular API pertain to racial demographics.

<ul>
<li><b>GEO_ID</b>: This variable will be used later to match with the corresponding census-block shapefile.</li>
<li><b>P1_001N</b>: Total population (includes both voting and non-voting age population).</li>
<li><b>P3_001N</b>: Total voting age population.</li>
<li><b>P3_003N</b>: Total white alone voting age population.</li>
<li><b>P3_004N</b>: Total black or African American alone voting age population.</li>
<li><b>P3_005N</b>: Total American Indian & Alaskan Native alone voting population.</li>
<li><b>P3_006N</b>: Total Asian alone voting age population.</li>
<li><b>P3_007N</b>: Total Native Hawaiin or other Pacific Islander alone voting age population.</li>
<li><b>P3_008N</b>: Total voting age population of one race not listed.</li>
<li><b>P3_009N</b>: Total voting age population of two or more races.</li>
<li><b>P4_002N</b>: Total Hispanic or Latino voting age population.</li>
<li><b>P4_003N</b>: Total not Hispanic or Latino voting age population.</li>
</ul>

## Configuration

In [1]:
from texas_gerrymandering_hb4.config import CENSUS_API_KEY, INTERIM_DATA_DIR
import os, time, random, requests, pandas as pd
from pathlib import Path
import duckdb

API_URL = "https://api.census.gov/data/2020/dec/pl"
STATE = "48"


API_KEY = CENSUS_API_KEY

VARS = [
    "GEO_ID","P1_001N","P3_001N",
    "P3_003N","P3_004N","P3_005N","P3_006N","P3_007N","P3_008N","P3_009N",
    "P4_002N","P4_003N"
]

DB_PATH  = INTERIM_DATA_DIR/ "census_demographics.duckdb"
INTERIM_DATA_DIR.mkdir(parents=True, exist_ok=True)

con = duckdb.connect(DB_PATH.as_posix())


[32m2025-09-07 13:32:37.245[0m | [1mINFO    [0m | [36mtexas_gerrymandering_hb4.config[0m:[36m<module>[0m:[36m11[0m - [1mPROJ_ROOT path is: /home/aimlexpert/Documents/GitHub/texas-gerrymandering-HB4[0m


## Create Tables

In [2]:
cols = ", ".join([f'{c} VARCHAR' for c in VARS] + ['state VARCHAR','county VARCHAR','tract VARCHAR','block VARCHAR'])
con.execute("CREATE TABLE IF NOT EXISTS raw_pl20_tx (" + cols + ");")


con.execute("""
CREATE TABLE IF NOT EXISTS etl_progress (
  state TEXT,
  county TEXT,
  tract TEXT,
  status TEXT,
  rows   INTEGER,
  updated_at TIMESTAMP DEFAULT now()
);
""")
con.execute("CREATE INDEX IF NOT EXISTS idx_progress ON etl_progress(state, county, tract);")


<duckdb.duckdb.DuckDBPyConnection at 0x7c7724845ab0>

# Calling Census API

In [3]:
_last_call = [0.0]

def _throttle(min_interval=0.9):
    now = time.monotonic()
    delta = now - _last_call[0]
    if delta < min_interval:
        time.sleep(min_interval - delta)
    _last_call[0] = time.monotonic()

def census_get(params, max_retries=8, base_sleep=1.2):
    params = dict(params)
    if API_KEY:
        params["key"] = API_KEY

    last = None
    for attempt in range(1, max_retries + 1):
        _throttle(0.9 + random.random() * 0.4)
        try:
            r = requests.get(API_URL, params=params, timeout=60)
            if r.ok:
                return r.json()


            if r.status_code == 429:
                ra = r.headers.get("Retry-After")
                if ra:
                    try:
                        wait = float(ra)
                    except ValueError:
                        wait = base_sleep * (2 ** (attempt - 1))
                else:
                    wait = base_sleep * (2 ** (attempt - 1))
                wait += random.random() * 0.5  # jitter
                print(f"[census_get] 429 rate-limited. Waiting {wait:.1f}s then retrying…")
                time.sleep(wait)
                last = r
                continue


            body = r.text[:300].replace("\n", " ")
            print(f"[census_get] Attempt {attempt}: HTTP {r.status_code} — {body}")
            last = r

        except requests.exceptions.RequestException as e:
            print(f"[census_get] Attempt {attempt}: network error → {e}")
            last = e


        wait = base_sleep * (2 ** (attempt - 1)) + random.random() * 0.4
        print(f"[census_get] Backing off {wait:.1f}s…")
        time.sleep(wait)

    if hasattr(last, "raise_for_status"):
        last.raise_for_status()
    raise last


## Helper Functions

In [4]:
def list_counties(state=STATE):
    js = census_get({"get":"NAME","for":"county:*","in":f"state:{state}"})
    return pd.DataFrame(js[1:], columns=js[0])["county"].astype(str).str.zfill(3).tolist()

def list_tracts(state=STATE, county_fips=None):
    js = census_get({"get":"NAME","for":"tract:*","in":f"state:{state} county:{str(county_fips).zfill(3)}"})
    return pd.DataFrame(js[1:], columns=js[0])["tract"].astype(str).str.zfill(6).tolist()

def fetch_blocks_tract(state, county_fips, tract, fields=VARS):
    params = {
        "get": ",".join(fields),
        "for": "block:*",
        "in": f"state:{str(state).zfill(2)} county:{str(county_fips).zfill(3)} tract:{str(tract).zfill(6)}"
    }

    for tries in range(1, 6):
        try:
            js = census_get(params)
            break
        except requests.HTTPError as e:
            if e.response is not None and e.response.status_code == 429 and tries < 5:
                wait = 5 * tries + random.random() * 2
                print(f"[fetch_blocks_tract] 429 on tract {tract}. Sleeping {wait:.1f}s and retrying ({tries}/5)…")
                time.sleep(wait)
                continue
            raise

    if not js or len(js) <= 1:
        return pd.DataFrame(columns=fields+["state","county","tract","block"])
    df = pd.DataFrame(js[1:], columns=js[0])
    for col, width in [("state",2),("county",3),("tract",6),("block",4)]:
        df[col] = df[col].astype(str).str.zfill(width)
    return df


# Tracking ETL Progress

In [5]:
def is_done(state, county, tract):
    q = con.execute(
        "SELECT 1 FROM etl_progress WHERE state=? AND county=? AND tract=? AND status='done' LIMIT 1",
        [state, county, tract]
    ).fetchone()
    return q is not None

def mark_progress(state, county, tract, status, rows=None):
    con.execute("""
        INSERT INTO etl_progress(state, county, tract, status, rows, updated_at)
        VALUES (?, ?, ?, ?, ?, now())
    """, [state, county, tract, status, rows])

counties = list_counties(STATE)
print(f"Found {len(counties)} counties")

for i, c in enumerate(counties, 1):
    tracts = list_tracts(STATE, c)
    print(f"[{i}/{len(counties)}] county {c} → {len(tracts)} tracts")
    for t in tracts:
        if is_done(STATE, c, t):
            continue
        try:
            mark_progress(STATE, c, t, "started", 0)
            df = fetch_blocks_tract(STATE, c, t, VARS)
            if not df.empty:
                con.register("tmp_df", df)
                con.execute(f"""
                    INSERT INTO raw_census_demographics
                    SELECT {", ".join(VARS)}, state, county, tract, block
                    FROM tmp_df
                """)
                con.unregister("tmp_df")
            mark_progress(STATE, c, t, "done", int(len(df)))
            if len(df):
                print(f"  tract {t}: {len(df):,} rows")
        except Exception as e:
            mark_progress(STATE, c, t, "error", 0)
            print(f"  tract {t}: ERROR → {e}")


Found 254 counties
[1/254] county 001 → 12 tracts
  tract 950100: 348 rows
  tract 950401: 21 rows
  tract 950402: 37 rows
  tract 950500: 193 rows
  tract 950600: 239 rows
  tract 950700: 179 rows
  tract 950800: 165 rows
  tract 950901: 201 rows
  tract 950902: 265 rows
  tract 951001: 155 rows
  tract 951002: 146 rows
  tract 951100: 326 rows
[2/254] county 003 → 4 tracts
  tract 950100: 535 rows
  tract 950200: 227 rows
  tract 950300: 227 rows
  tract 950400: 415 rows
[3/254] county 005 → 21 tracts
  tract 000102: 149 rows
  tract 000103: 69 rows
  tract 000104: 73 rows
  tract 000201: 55 rows
  tract 000202: 126 rows
  tract 000301: 119 rows
  tract 000302: 128 rows
  tract 000400: 216 rows
  tract 000500: 195 rows
  tract 000600: 166 rows
  tract 000700: 163 rows
  tract 000800: 114 rows
  tract 000902: 42 rows
  tract 000903: 47 rows
  tract 000904: 79 rows
  tract 001102: 86 rows
  tract 001001: 193 rows
  tract 001002: 164 rows
  tract 001101: 85 rows
  tract 001200: 248 rows

## Building a Clean Table and Exporting

In [6]:
VAP_RACE_VARS = ["P3_003N","P3_004N","P3_005N","P3_006N","P3_007N","P3_008N","P3_009N"]
HISP_VARS     = ["P4_002N","P4_003N"]

con.execute("DROP TABLE IF EXISTS census_demographics_clean;")

race_casts = ", ".join([f"CAST({c} AS BIGINT) AS {c}" for c in VAP_RACE_VARS])
hisp_casts = ", ".join([f"CAST({c} AS BIGINT) AS {c}" for c in HISP_VARS])

con.execute(f"""
CREATE TABLE census_demographics_clean AS
WITH base AS (
  SELECT
    state, county, tract, block, GEO_ID,
    CAST(P1_001N AS BIGINT) AS P1_001N,
    CAST(P3_001N AS BIGINT) AS P3_001N,
    {race_casts},
    {hisp_casts},
    state || county || tract || block AS GEOID20,
    REPLACE(REPLACE(GEO_ID,'15000US',''),'1000000US','') AS GEOID_from_GEO_ID
  FROM raw_census_demographics
),
shares AS (
  SELECT
    *,
    {", ".join([f"CAST({c} AS DOUBLE)/NULLIF(CAST(P3_001N AS DOUBLE),0) AS {c}_share_vap" for c in VAP_RACE_VARS + HISP_VARS])}
  FROM base
)
SELECT * FROM shares;
""")


con.execute("""
CREATE TABLE census_demographics_clean_dedup AS
SELECT * FROM (
  SELECT *, ROW_NUMBER() OVER (PARTITION BY GEOID20 ORDER BY GEO_ID) rn
  FROM pl20_tx_clean
) WHERE rn=1;
""")
con.execute("DROP TABLE census_demographics_clean;")
con.execute("ALTER TABLE census_demographics_clean_dedup RENAME TO pl20_tx_clean;")

print(con.execute("SELECT COUNT(*) rows, COUNT(DISTINCT GEOID20) uniq FROM census_demographics_clean;").fetchdf())


CatalogException: Catalog Error: Table with name census_demographics_clean does not exist!
Did you mean "pg_namespace"?

## Exporting Parquet File

In [None]:
PARQUET_OUT = INTERIM_DATA_DIR / "census_demographic_blocks_clean.parquet"


con.execute(f"COPY census_demographics_clean TO '{PARQUET_OUT.as_posix()}' (FORMAT PARQUET);")
print("Saved Parquet:", PARQUET_OUT)

