## Import Needed Filepaths & Libraries

In [None]:
from texas_gerrymandering_hb4.config import INTERIM_DATA_DIR, CENSUS_DEMOGRAPHICS_TXT
import duckdb

## Start DuckDB Session

In [None]:
# Open a fresh DuckDB session. ":memory:" is DuckDB's default database, so the
# tables created below live only for the lifetime of this kernel.
con = duckdb.connect(":memory:")

## Inspect the First Few Rows

In [None]:
# Peek at the first five rows of the raw census file to confirm the header row
# and the column types DuckDB infers (sample_size is passed to read_csv_auto).
# The cell ends with the DataFrame expression instead of print() so Jupyter
# renders it as a rich table rather than truncated plain text.
con.execute(f"""
  SELECT * FROM read_csv_auto('{CENSUS_DEMOGRAPHICS_TXT}', header=True, sample_size=20000) LIMIT 5
""").df()

## Create a Cleaned Table in DuckDB
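The cleaned table contains the core columns listed below. It also carries voting-age (`*_vap`) counterparts of each race/ethnicity count, share columns (`share_*_total`, `share_*_vap`), and QA residuals (`other_pop`, `other_vap`):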

| Column    | Type    | Description                           |
|-----------|---------|---------------------------------------|
| geoid20   | VARCHAR | Primary key to join with shapefile    |
| total_pop | BIGINT  | Total population                      |
| vap_total | BIGINT  | Voting age population                 |
| nh_white  | BIGINT  | Non-Hispanic white total population   |
| nh_black  | BIGINT  | Non-Hispanic Black total population   |
| nh_asian  | BIGINT  | Non-Hispanic Asian total population   |
| hispanic  | BIGINT  | Hispanic/Latino population (any race) |

In [None]:
# Build (or rebuild) the DuckDB table `pl94_clean` straight from the raw census
# text file. CREATE OR REPLACE makes this cell safe to re-run.
#
# The query is staged as three CTEs:
#   base   - selects the needed source columns, renames them, and casts the
#            key to VARCHAR and every count to BIGINT.
#   shares - adds each group's fraction of total_pop / vap_total; NULLIF(..., 0)
#            turns a zero denominator into NULL instead of a division error.
#   qa     - adds residual "other" counts (total minus the four listed groups,
#            NULL parts treated as 0), clamped to a minimum of 0 by GREATEST.
con.execute(f"""
CREATE OR REPLACE TABLE pl94_clean AS
WITH base AS (
  SELECT
    -- ---- Join key (already full 15-digit 2020 block GEOID) ----
    CAST(SCTBKEY  AS VARCHAR) AS geoid20,

    -- ---- Totals ----
    CAST(total    AS BIGINT)  AS total_pop,
    CAST(vap      AS BIGINT)  AS vap_total,

    -- ---- Race/Ethnicity counts (TOTAL) ----
    CAST(anglo    AS BIGINT)  AS nh_white,
    CAST(black    AS BIGINT)  AS nh_black,
    CAST(asian    AS BIGINT)  AS nh_asian,
    CAST(hisp     AS BIGINT)  AS hispanic,

    -- ---- Race/Ethnicity counts (VAP) ----
    CAST(anglovap AS BIGINT)  AS nh_white_vap,
    CAST(blackvap AS BIGINT)  AS nh_black_vap,
    CAST(asianvap AS BIGINT)  AS nh_asian_vap,
    CAST(hispvap  AS BIGINT)  AS hispanic_vap

  FROM read_csv_auto('{CENSUS_DEMOGRAPHICS_TXT}', header=True)
),
shares AS (
  SELECT
    *,
    -- ---- Shares (TOTAL) ----
    (nh_white  ::DOUBLE / NULLIF(total_pop,0)) AS share_nh_white_total,
    (nh_black  ::DOUBLE / NULLIF(total_pop,0)) AS share_nh_black_total,
    (nh_asian  ::DOUBLE / NULLIF(total_pop,0)) AS share_nh_asian_total,
    (hispanic  ::DOUBLE / NULLIF(total_pop,0)) AS share_hispanic_total,

    -- ---- Shares (VAP) ----
    (nh_white_vap  ::DOUBLE / NULLIF(vap_total,0)) AS share_nh_white_vap,
    (nh_black_vap  ::DOUBLE / NULLIF(vap_total,0)) AS share_nh_black_vap,
    (nh_asian_vap  ::DOUBLE / NULLIF(vap_total,0)) AS share_nh_asian_vap,
    (hispanic_vap  ::DOUBLE / NULLIF(vap_total,0)) AS share_hispanic_vap
  FROM base
),
qa AS (
  SELECT
    *,
    -- Optional residuals for quick QA (can drop later if you want)
    GREATEST(total_pop - COALESCE(nh_white,0) - COALESCE(nh_black,0) - COALESCE(nh_asian,0) - COALESCE(hispanic,0), 0) AS other_pop,
    GREATEST(vap_total - COALESCE(nh_white_vap,0) - COALESCE(nh_black_vap,0) - COALESCE(nh_asian_vap,0) - COALESCE(hispanic_vap,0), 0) AS other_vap
  FROM shares
)
SELECT * FROM qa;
""")

## Print How Many Census Blocks Were Parsed

In [None]:
# COUNT(*) returns a single one-column row; unpack it into a named value
# before reporting how many rows landed in pl94_clean.
(block_count,) = con.execute("SELECT COUNT(*) FROM pl94_clean").fetchone()
print("Row count:", block_count)

## Comparing Total Population Versus Racial Demographics

In [None]:
# Reconciliation check: for both the total and the voting-age population,
# compare the reported total against the sum of the four named groups plus the
# residual "other" column. The two figures in each result should be compared
# by eye; a gap means the group columns do not partition the population.
total_check_sql = """
SELECT
  SUM(total_pop)                           AS sum_total,
  SUM(nh_white + nh_black + nh_asian + hispanic + other_pop) AS sum_parts
FROM pl94_clean;
"""

vap_check_sql = """
SELECT
  SUM(vap_total)                            AS sum_vap_total,
  SUM(nh_white_vap + nh_black_vap + nh_asian_vap + hispanic_vap + other_vap) AS sum_vap_parts
FROM pl94_clean;
"""

# Run the checks in order (total population first, then VAP) and print each result.
for check_sql in (total_check_sql, vap_check_sql):
    print(con.execute(check_sql).df())

## Export Cleaned Table to a Parquet File

In [None]:
# Build the destination path once so the COPY target and the confirmation
# message below can never drift apart.
parquet_path = INTERIM_DATA_DIR / "tx_pl94_clean.parquet"

# DuckDB's COPY fails if the target directory is missing, so create it first
# (a no-op when it already exists). This keeps the cell working on a fresh checkout.
parquet_path.parent.mkdir(parents=True, exist_ok=True)

# The path is embedded in a SQL string literal; double any single quotes so a
# directory name containing an apostrophe cannot break the statement.
parquet_sql_literal = str(parquet_path).replace("'", "''")
con.execute(f"COPY pl94_clean TO '{parquet_sql_literal}' (FORMAT PARQUET);")

print("Wrote:", parquet_path)