## Import Needed Filepaths & Libraries

In [20]:
from texas_gerrymandering_hb4.config import INTERIM_DATA_DIR, CENSUS_DEMOGRAPHICS_TXT
import duckdb

## Start DuckDB Session

In [21]:
con = duckdb.connect()

## Registering the Raw Data as a DuckDB Views

In [22]:
con.execute(f"""
  CREATE OR REPLACE VIEW raw_view AS
  SELECT * FROM read_csv_auto('{CENSUS_DEMOGRAPHICS_TXT}', header=True)
""")

con.execute(f"""
  CREATE OR REPLACE VIEW raw_view_str AS
  SELECT * FROM read_csv_auto('{CENSUS_DEMOGRAPHICS_TXT}', header=True, ALL_VARCHAR=TRUE)
""")

<duckdb.duckdb.DuckDBPyConnection at 0x7c3e3c0765b0>

## Counting the Number of Rows in Raw Data

In [23]:
print("Row count:")
print(con.execute("SELECT COUNT(*) AS n_rows FROM raw_view").df())

Row count:
   n_rows
0  668757


## Printing Attribute Names & Rows in Raw Data

In [24]:
print("\nColumn names and types:")
print(con.execute("PRAGMA table_info('raw_view')").df())


Column names and types:
    cid        name     type  notnull dflt_value     pk
0     0         FID   BIGINT    False       None  False
1     1          BG   BIGINT    False       None  False
2     2       State   BIGINT    False       None  False
3     3         TRT  VARCHAR    False       None  False
4     4         BLK  VARCHAR    False       None  False
5     5     SCTBKEY   BIGINT    False       None  False
6     6      blkkey   BIGINT    False       None  False
7     7      CTBKEY  VARCHAR    False       None  False
8     8        FIPS  VARCHAR    False       None  False
9     9      FENAME  VARCHAR    False       None  False
10   10       anglo   BIGINT    False       None  False
11   11       asian   BIGINT    False       None  False
12   12        hisp   BIGINT    False       None  False
13   13       total   BIGINT    False       None  False
14   14         vap   BIGINT    False       None  False
15   15       black   BIGINT    False       None  False
16   16          bh   B

## Checking for Missing Values

In [25]:
# Get schema (column names/types) into a DataFrame
schema_df = con.execute("PRAGMA table_info('raw_view')").df()

# Extract column names into a Python list
cols = schema_df["name"].tolist()

cols = schema_df["name"].tolist()

exprs = [
    f"SUM(CASE WHEN \"{c}\" IS NULL OR CAST(\"{c}\" AS VARCHAR) = '' THEN 1 ELSE 0 END) AS missing_{c.lower()}"
    for c in cols
]

sql_missing = "SELECT " + ",\n       ".join(exprs) + " FROM raw_view"

missing_df = con.execute(sql_missing).df().T.reset_index()
missing_df.columns = ["column", "missing_count"]

print("\n=== RAW: Missing values per column ===")
display(missing_df.sort_values("missing_count", ascending=False))



=== RAW: Missing values per column ===


Unnamed: 0,column,missing_count
0,missing_fid,0.0
1,missing_bg,0.0
2,missing_state,0.0
3,missing_trt,0.0
4,missing_blk,0.0
5,missing_sctbkey,0.0
6,missing_blkkey,0.0
7,missing_ctbkey,0.0
8,missing_fips,0.0
9,missing_fename,0.0


## Create a Cleaned Table in DuckDB

In [26]:
con.execute(f"""
CREATE OR REPLACE TABLE pl94_clean AS
WITH base AS (
  SELECT
    -- ---- Join key (already full 15-digit 2020 block GEOID) ----
    CAST(SCTBKEY  AS VARCHAR) AS geoid20,

    -- ---- Totals ----
    CAST(total    AS BIGINT)  AS total_pop,
    CAST(vap      AS BIGINT)  AS vap_total,

    -- ---- Race/Ethnicity counts (TOTAL) ----
    CAST(anglo    AS BIGINT)  AS nh_white,
    CAST(black    AS BIGINT)  AS nh_black,
    CAST(asian    AS BIGINT)  AS nh_asian,
    CAST(hisp     AS BIGINT)  AS hispanic,

    -- ---- Race/Ethnicity counts (VAP) ----
    CAST(anglovap AS BIGINT)  AS nh_white_vap,
    CAST(blackvap AS BIGINT)  AS nh_black_vap,
    CAST(asianvap AS BIGINT)  AS nh_asian_vap,
    CAST(hispvap  AS BIGINT)  AS hispanic_vap

  FROM read_csv_auto('{CENSUS_DEMOGRAPHICS_TXT}', header=True)
),
shares AS (
  SELECT
    *,
    -- ---- Shares (TOTAL) ----
    (nh_white  ::DOUBLE / NULLIF(total_pop,0)) AS share_nh_white_total,
    (nh_black  ::DOUBLE / NULLIF(total_pop,0)) AS share_nh_black_total,
    (nh_asian  ::DOUBLE / NULLIF(total_pop,0)) AS share_nh_asian_total,
    (hispanic  ::DOUBLE / NULLIF(total_pop,0)) AS share_hispanic_total,

    -- ---- Shares (VAP) ----
    (nh_white_vap  ::DOUBLE / NULLIF(vap_total,0)) AS share_nh_white_vap,
    (nh_black_vap  ::DOUBLE / NULLIF(vap_total,0)) AS share_nh_black_vap,
    (nh_asian_vap  ::DOUBLE / NULLIF(vap_total,0)) AS share_nh_asian_vap,
    (hispanic_vap  ::DOUBLE / NULLIF(vap_total,0)) AS share_hispanic_vap
  FROM base
),
qa AS (
  SELECT
    *,
    -- Optional residuals for quick QA (can drop later if you want)
    GREATEST(total_pop - COALESCE(nh_white,0) - COALESCE(nh_black,0) - COALESCE(nh_asian,0) - COALESCE(hispanic,0), 0) AS other_pop,
    GREATEST(vap_total - COALESCE(nh_white_vap,0) - COALESCE(nh_black_vap,0) - COALESCE(nh_asian_vap,0) - COALESCE(hispanic_vap,0), 0) AS other_vap
  FROM shares
)
SELECT * FROM qa;
""")

<duckdb.duckdb.DuckDBPyConnection at 0x7c3e3c0765b0>

The cleaned DuckDB table has the following schema:

| Column               | Type    | Description                                                                            |
|----------------------|---------|----------------------------------------------------------------------------------------|
| geoid20              | VARCHAR | Primary key to join with shapefile                                                     |
| total_pop            | BIGINT  | Total population                                                                       |
| vap_total            | BIGINT  | Voting age population                                                                  |
| nh_white             | BIGINT  | Non-Hispanic white total population                                                    |
| nh_black             | BIGINT  | Non-Hispanic black total population                                                    |
| nh_asian             | BIGINT  | Non-Hispanic Asian total population                                                    |
| hispanic             | BIGINT  | Hispanic/Latino population (any race)                                                  |
| nh_white_vap         | BIGINT  | Non-Hispanic white voting age population                                               |
| nh_black_vap         | BIGINT  | Non-Hispanic black voting age population                                               |
| nh_asian_vap         | BIGINT  | Hispanic/Latino population (any race)                                                  |
| hispanic_vap         | BIGINT  | Hispanic/Latino voting age population)                                                 |
| share_nh_white_total | DOUBLE  | % non-Hispanic white population                                                        |
| share_nh_black_total | DOUBLE  | % non-Hispanic black total population                                                  |
| share_nh_asian_total | DOUBLE  | % non-Hispanic Asian total population                                                  |
| share_hispanic_total | DOUBLE  | % Hispanic total population                                                            |
| share_nh_white_vap   | DOUBLE  | % non-Hispanic white voting age population                                             |
| share_nh_asian_vap   | DOUBLE  | % non-Hispanic Asian voting age population                                             |
| share_hispanic_vap   | DOUBLE  | % Hispanic voting age population                                                       |
| other_pop            | BIGINT  | Population not accounted for in white, black, Asian, or Hispanic categories            |
| other_vap            | BIGINT  | Voting age population not accounted for in white, black, Asian, or Hispanic categories |

## Print How Many Census Blocks were Parsed

In [27]:
print("Row count:", con.execute("SELECT COUNT(*) FROM pl94_clean").fetchone()[0])

Row count: 668757


## Comparing Total Population Versus Racial Demographics

In [28]:
print(con.execute("""
SELECT
  SUM(total_pop)                           AS sum_total,
  SUM(nh_white + nh_black + nh_asian + hispanic + other_pop) AS sum_parts
FROM pl94_clean;
""").df())

print(con.execute("""
SELECT
  SUM(vap_total)                            AS sum_vap_total,
  SUM(nh_white_vap + nh_black_vap + nh_asian_vap + hispanic_vap + other_vap) AS sum_vap_parts
FROM pl94_clean;
""").df())

    sum_total   sum_parts
0  29145505.0  29324461.0
   sum_vap_total  sum_vap_parts
0     21866700.0     21956709.0


## Export Cleaned Table to Parquet Table

In [29]:
con.execute(f"COPY pl94_clean TO '{INTERIM_DATA_DIR}/tx_pl94_clean.parquet' (FORMAT PARQUET);")

print("Wrote:", INTERIM_DATA_DIR / "tx_pl94_clean.parquet")

Wrote: /home/aimlexpert/Documents/GitHub/texas-gerrymandering-HB4/data/interim/tx_pl94_clean.parquet
