In [20]:
from texas_gerrymandering_hb4.config import INTERIM_DATA_DIR
import time, requests, pandas as pd
from pathlib import Path

## Defining the URL for API Endpoint

In [21]:
api_url = "https://api.census.gov/data/2020/dec/pl"

## Selecting Variables to Access from API
Information on the variables that can be accessed from the 2020 Decennial Census Redistricting API PL-94-171 are available at the following link: https://api.census.gov/data/2020/dec/pl/variables.html.

The variables of interest to us from this particular API pertain to racial demographics.

<ul>
<li><b>GEO_ID</b>: This variable will be used later to match with the corresponding census-block shapefile.</li>
<li><b>P1_001N</b>: Total population (includes both voting and non-voting age population).</li>
<li><b>P3_001N</b>: Total voting age population.</li>
<li><b>P3_003N</b>: Total white alone voting age population.</li>
<li><b>P3_004N</b>: Total black or African American alone voting age population.</li>
<li><b>P3_005N</b>: Total American Indian & Alaskan Native alone voting population.</li>
<li><b>P3_006N</b>: Total Asian alone voting age population.</li>
<li><b>P3_007N</b>: Total Native Hawaiin or other Pacific Islander alone voting age population.</li>
<li><b>P3_008N</b>: Total voting age population of one race not listed.</li>
<li><b>P3_009N</b>: Total voting age population of two or more races.</li>
<li><b>P4_002N</b>: Total Hispanic or Latino voting age population.</li>
<li><b>P4_003N</b>: Total not Hispanic or Latino voting age population.</li>
</ul>

In [22]:
api_vars = [
    "GEO_ID", "P1_001N","P3_001N","P3_003N", "P3_004N", "P3_005N",
    "P3_006N", "P3_007N", "P3_008N","P3_009N","P4_002N","P4_003N"
]

## Specify the FIPS State Code
The FIPS state code for Texas is specified by the U.S. Census Bureau as 48 (National FIPS and GNIS Codes File, n.d.).

In [23]:
state_code = "48"

## Group Columns for Derived Values

In [24]:
vap_total = "P3_001N"
vap_race_vars = ["P3_003N", "P3_004N", "P3_005N", "P3_006N", "P3_007N", "P3_008N", "P3_009N"]
hispanic_vars = ["P4_002N", "P4_003N"]

## Wrapper Function that Fetches Census API Data and Handles Retries/Errors

In [25]:
def census_api_get(params, max_retries=5, backoff_sec=1.2):

    last_err = None
    for i in range(max_retries):
        try:
            r = requests.get(api_url, params=params, timeout=60)

            if r.ok:
                return r.json()
            else:
                # Logging every failure with the HTTP status & response
                print(f"[Attempt {i+1}/{max_retries}] HTTP {r.status_code}: {r.text[:200]}")
                last_err = r

        except requests.exceptions.RequestException as e:
            # Logging every failure with an exception in the event of a timeout, DNS failure, etc
            print(f"[Attempt {i+1}/{max_retries}] Request failed: {e}")
            last_err = e

        # Retries up to 5 times with wait times increasing after every attempt
        wait_time = backoff_sec * (i + 1)
        print(f"  → Retrying in {wait_time:.1f} seconds...")
        time.sleep(wait_time)

    # If all 5 retry attempts fail, then a final error is raised showing the status code
    if isinstance(last_err, requests.Response):
        last_err.raise_for_status()
    else:
        raise last_err

## Helper Function to List Counties

In [26]:
def list_counties(state=state_code):
    js = census_api_get({"get": "NAME", "for": "county:*", "in":f"state:{state}"})
    header, rows = js[0], js[1:]
    df = pd.DataFrame(rows, columns = header)
    return df["county"].tolist()

## Download Census-Block Level Data Paginated By County

In [27]:
def get_blocks_for_county(county_fips, state=state_code, vars=api_vars):
    params_base = {
        "get": ",".join(vars),
        "for": "block:*",
        "in": f"state:{state} county:{county_fips} tract:*",
        "page": 1
    }
    frames = []
    while True:
        js = census_api_get(params_base)
        header, rows = js[0], js[1:]
        if not rows:
            break
        frames.append(pd.DataFrame(rows, columns=header))
        params_base["page"] += 1

        if len(rows) < 2_000:

            try_next = census_api_get(params_base)
            if len(try_next) <= 1:
                break
            else:
                header2, rows2 = try_next[0], try_next[1:]
                frames.append(pd.DataFrame(rows2, columns=header2))
                params_base["page"] += 1
                break
    if frames:
        return pd.concat(frames, ignore_index=True)
    return pd.DataFrame(columns=vars + ["state","county","tract","block"])

## Obtaining Texas Census-Block Level Data Via Streaming By Counties

In [28]:
out_dir = Path(INTERIM_DATA_DIR/"2020_census_blocks_tx")
out_dir.mkdir(parents=True, exist_ok=True)

counties = list_counties(state_code)
len(counties), counties[:5]

# Download
for i, c in enumerate(counties, 1):
    out_parquet = out_dir / f"2020_census_tx_county_{c}.parquet"
    if out_parquet.exists():
        continue
    dfc = get_blocks_for_county(c)
    if dfc.empty:
        continue
    dfc.to_parquet(out_parquet, index=False)
    print(f"[{i}/{len(counties)}] saved", out_parquet)


[Attempt 1/5] HTTP 400: error: unknown predicate variable: 'page'
  → Retrying in 1.2 seconds...
[Attempt 2/5] HTTP 400: error: unknown predicate variable: 'page'
  → Retrying in 2.4 seconds...
[Attempt 3/5] HTTP 400: error: unknown predicate variable: 'page'
  → Retrying in 3.6 seconds...
[Attempt 4/5] HTTP 400: error: unknown predicate variable: 'page'
  → Retrying in 4.8 seconds...
[Attempt 5/5] HTTP 400: error: unknown predicate variable: 'page'
  → Retrying in 6.0 seconds...


HTTPError: 400 Client Error:  for url: https://api.census.gov/data/2020/dec/pl?get=GEO_ID%2CP1_001N%2CP3_001N%2CP3_003N%2CP3_004N%2CP3_005N%2CP3_006N%2CP3_007N%2CP3_008N%2CP3_009N%2CP4_002N%2CP4_003N&for=block%3A%2A&in=state%3A48+county%3A001+tract%3A%2A&page=1