In [3]:
from texas_gerrymandering_hb4.config import INTERIM_DATA_DIR, CENSUS_DEMOGRAPHICS_TXT
import duckdb


# Shared DuckDB connection for every step below. No database path is passed,
# so this uses DuckDB's default (in-memory) database.
con = duckdb.connect()

# 1) Inspect a few rows
# Preview only: read the raw text file with header detection, sniffing column
# types from a 20,000-row sample, and show the first five rows as a DataFrame.
preview_sql = f"""
  SELECT * FROM read_csv_auto('{CENSUS_DEMOGRAPHICS_TXT}', header=True, sample_size=20000) LIMIT 5
"""
print(con.execute(preview_sql).df())

# 2) Create a cleaned table
# The statement (re)creates `pl94_clean` in two stages:
#   * `base` - renames the raw columns, casts identifiers to VARCHAR
#              (left-padding county/tract/block codes with zeros), and casts
#              population counts to BIGINT and shape metrics to DOUBLE.
#   * `calc` - adds `other_pop` / `other_vap`: total minus the four core
#              race/Hispanic counts (NULL counts treated as 0), clamped at 0.
# The SQL text is built first and executed afterwards so it can be inspected.
clean_table_sql = f"""
CREATE OR REPLACE TABLE pl94_clean AS
WITH base AS (
  SELECT
    /* --- Keys & geography --- */
    CAST(SCTBKEY AS VARCHAR)                    AS geoid20,         -- already full 15-digit key
    CAST(State AS VARCHAR)                      AS state_fips,      -- '48' for TX
    LPAD(CAST(FIPS AS VARCHAR), 3, '0')         AS county_fips,     -- '001'
    LPAD(CAST(TRT  AS VARCHAR), 6, '0')         AS tract_code,      -- '950100'
    LPAD(CAST(BLK  AS VARCHAR), 4, '0')         AS block_code,      -- '1012'
    CAST(FENAME AS VARCHAR)                     AS county_name,

    /* --- Population totals --- */
    CAST(total AS BIGINT)                       AS total_pop,
    CAST(vap   AS BIGINT)                       AS vap_total,

    /* --- Race/Hispanic counts ---
       NOTE: Based on your header, mapping likely is:
         anglo      ≈ non-Hispanic White
         black      ≈ non-Hispanic Black
         asian      ≈ non-Hispanic Asian
         hisp       ≈ Hispanic (any race)
       Fields like bh / nanglo appear to be derived categories provided by source.
       You can keep them if you need, but below we standardize core ones and compute residuals.
    */
    CAST(anglo     AS BIGINT)                   AS nh_white,
    CAST(black     AS BIGINT)                   AS nh_black,
    CAST(asian     AS BIGINT)                   AS nh_asian,
    CAST(hisp      AS BIGINT)                   AS hispanic,

    CAST(anglovap  AS BIGINT)                   AS nh_white_vap,
    CAST(blackvap  AS BIGINT)                   AS nh_black_vap,
    CAST(asianvap  AS BIGINT)                   AS nh_asian_vap,
    CAST(hispvap   AS BIGINT)                   AS hispanic_vap,

    /* Keep source-provided derived columns if you want to audit later */
    CAST(bh        AS BIGINT)                   AS bh_misc,
    CAST(bhvap     AS BIGINT)                   AS bh_misc_vap,
    CAST(nanglo    AS BIGINT)                   AS non_anglo,
    CAST(nanglovap AS BIGINT)                   AS non_anglo_vap,

    /* geometry footprint metrics (likely from an intersected layer) */
    CAST(Shape_Leng AS DOUBLE)                  AS shape_length,
    CAST(Shape_Ar_1 AS DOUBLE)                  AS shape_area
  FROM read_csv_auto('{CENSUS_DEMOGRAPHICS_TXT}', header=True)
),
calc AS (
  SELECT
    *,
    GREATEST(total_pop - COALESCE(nh_white,0) - COALESCE(nh_black,0) - COALESCE(nh_asian,0) - COALESCE(hispanic,0), 0) AS other_pop,
    GREATEST(vap_total - COALESCE(nh_white_vap,0) - COALESCE(nh_black_vap,0) - COALESCE(nh_asian_vap,0) - COALESCE(hispanic_vap,0), 0) AS other_vap
  FROM base
)
SELECT * FROM calc;
"""
con.execute(clean_table_sql)

# 3) Basic sanity checks
row_count = con.execute("SELECT COUNT(*) FROM pl94_clean").fetchone()[0]
print("Row count:", row_count)

# How to read the two comparisons below:
#   other_pop / other_vap are defined as GREATEST(total - parts, 0), so for each
#   row (parts + other) = max(total, parts). The parts sum is therefore always
#   >= the total, and the two figures are EQUAL when no row was clamped.
#   sum_parts > sum_total means some rows have core categories that add up to
#   more than the reported total (e.g. overlapping categories) - investigate.
#
# Each part is wrapped in COALESCE, mirroring how other_pop / other_vap were
# computed. Without it a single NULL component turns the whole row expression
# into NULL and SUM() silently drops that row, understating sum_parts.
print(con.execute("""
SELECT
  SUM(total_pop)                           AS sum_total,
  SUM(
    COALESCE(nh_white, 0) + COALESCE(nh_black, 0) + COALESCE(nh_asian, 0)
    + COALESCE(hispanic, 0) + COALESCE(other_pop, 0)
  )                                        AS sum_parts
FROM pl94_clean;
""").df())

# Same comparison for the voting-age population (VAP) columns.
print(con.execute("""
SELECT
  SUM(vap_total)                            AS sum_vap_total,
  SUM(
    COALESCE(nh_white_vap, 0) + COALESCE(nh_black_vap, 0) + COALESCE(nh_asian_vap, 0)
    + COALESCE(hispanic_vap, 0) + COALESCE(other_vap, 0)
  )                                         AS sum_vap_parts
FROM pl94_clean;
""").df())

# 4) Save outputs
# Build the destination once so the path written and the path reported cannot
# drift apart.
out_path = INTERIM_DATA_DIR / "tx_pl94_clean.parquet"

# COPY needs the target directory to exist already.
# NOTE(review): assumes INTERIM_DATA_DIR is a pathlib.Path (it already supports
# the `/` operator) - confirm against the config module.
out_path.parent.mkdir(parents=True, exist_ok=True)

# The path is embedded in a SQL string literal, so double any single quotes to
# keep the statement well-formed for directories such as "o'brien".
out_path_sql = str(out_path).replace("'", "''")
con.execute(f"COPY pl94_clean TO '{out_path_sql}' (FORMAT PARQUET);")

print("Wrote:", out_path)


   FID  BG  State     TRT   BLK          SCTBKEY  blkkey         CTBKEY FIPS  \
0    0   1     48  950100  1012  480019501001012      13  0019501001012  001   
1    1   1     48  950100  1013  480019501001013      14  0019501001013  001   
2    2   1     48  950100  1014  480019501001014      15  0019501001014  001   
3    3   1     48  950100  1015  480019501001015      16  0019501001015  001   
4    4   1     48  950100  1024  480019501001024      25  0019501001024  001   

     FENAME  ...  bh  nanglo  anglovap  hispvap  bhvap  blackvap  asianvap  \
0  ANDERSON  ...   1       5         7        0      1         1         0   
1  ANDERSON  ...   0       0        17        0      0         0         0   
2  ANDERSON  ...   2       3        10        0      1         1         0   
3  ANDERSON  ...   3       3         8        1      3         2         0   
4  ANDERSON  ...   0       0         0        0      0         0         0   

   nanglovap   Shape_Leng     Shape_Ar_1  
0      