## Import Needed Filepaths & Libraries

In [27]:
from texas_gerrymandering_hb4.config import INTERIM_DATA_DIR, CENSUS_DEMOGRAPHICS_TXT, DATA_DIR
import duckdb

## Start DuckDB Session

In [28]:
# Open a fresh in-memory DuckDB database; every query below runs through `con`.
con = duckdb.connect(database=":memory:")

## Registering the Raw Data as DuckDB Views

In [29]:
# The file path is spliced into the SQL text as a string literal, so double any
# single quote in it; otherwise a path containing an apostrophe would terminate
# the literal early and break both statements.
census_path_sql = str(CENSUS_DEMOGRAPHICS_TXT).replace("'", "''")

# raw_view: column types are inferred by read_csv_auto.
con.execute(f"""
  CREATE OR REPLACE VIEW raw_view AS
  SELECT * FROM read_csv_auto('{census_path_sql}', header=True)
""")

# raw_view_str: same file, but every column is read as VARCHAR (ALL_VARCHAR=TRUE).
con.execute(f"""
  CREATE OR REPLACE VIEW raw_view_str AS
  SELECT * FROM read_csv_auto('{census_path_sql}', header=True, ALL_VARCHAR=TRUE)
""")

<duckdb.duckdb.DuckDBPyConnection at 0x7acfd80b5830>

## Counting the Number of Rows in Raw Data

In [30]:
# Total number of records exposed by raw_view, printed as a one-row DataFrame.
print("Row count:")
n_rows_df = con.execute("SELECT COUNT(*) AS n_rows FROM raw_view").df()
print(n_rows_df)

Row count:
   n_rows
0  668757


## Printing Attribute Names & Types in Raw Data

In [31]:
# List each raw_view column together with its DuckDB type via PRAGMA table_info.
print("\nColumn names and types:")
raw_schema_df = con.execute("PRAGMA table_info('raw_view')").df()
print(raw_schema_df)


Column names and types:
    cid        name     type  notnull dflt_value     pk
0     0         FID   BIGINT    False       None  False
1     1          BG   BIGINT    False       None  False
2     2       State   BIGINT    False       None  False
3     3         TRT  VARCHAR    False       None  False
4     4         BLK  VARCHAR    False       None  False
5     5     SCTBKEY   BIGINT    False       None  False
6     6      blkkey   BIGINT    False       None  False
7     7      CTBKEY  VARCHAR    False       None  False
8     8        FIPS  VARCHAR    False       None  False
9     9      FENAME  VARCHAR    False       None  False
10   10       anglo   BIGINT    False       None  False
11   11       asian   BIGINT    False       None  False
12   12        hisp   BIGINT    False       None  False
13   13       total   BIGINT    False       None  False
14   14         vap   BIGINT    False       None  False
15   15       black   BIGINT    False       None  False
16   16          bh   B

## Checking for Missing Values

In [32]:
# Read the schema of raw_view and keep just the column names.
schema_df = con.execute("PRAGMA table_info('raw_view')").df()
cols = schema_df["name"].tolist()


def _quote_ident(name):
    """Return `name` as a double-quoted SQL identifier, doubling embedded quotes."""
    return '"' + str(name).replace('"', '""') + '"'


# One aggregate per column counting rows whose value is NULL or, once cast to
# text, the empty string. COUNT(*) FILTER yields an integer count and returns 0
# (not NULL) when no row matches, so the report shows whole numbers instead of
# the floats produced by converting SUM's result to pandas.
exprs = []
for c in cols:
    col_sql = _quote_ident(c)
    alias_sql = _quote_ident(f"missing_{c.lower()}")
    exprs.append(
        f"COUNT(*) FILTER (WHERE {col_sql} IS NULL OR CAST({col_sql} AS VARCHAR) = '') AS {alias_sql}"
    )

sql_missing = "SELECT " + ",\n       ".join(exprs) + " FROM raw_view"

# The query returns a single wide row; transpose it so each source column
# becomes one row of the report.
missing_df = con.execute(sql_missing).df().T.reset_index()
missing_df.columns = ["column", "missing_count"]

print("\n=== RAW: Missing values per column ===")
display(missing_df.sort_values("missing_count", ascending=False))



=== RAW: Missing values per column ===


Unnamed: 0,column,missing_count
0,missing_fid,0.0
1,missing_bg,0.0
2,missing_state,0.0
3,missing_trt,0.0
4,missing_blk,0.0
5,missing_sctbkey,0.0
6,missing_blkkey,0.0
7,missing_ctbkey,0.0
8,missing_fips,0.0
9,missing_fename,0.0
