## Import Needed Filepaths & Libraries

In [1]:
from texas_gerrymandering_hb4.config import INTERIM_DATA_DIR, GEN_ELECTION_CSV
import pandas as pd

[32m2025-09-16 13:29:25.879[0m | [1mINFO    [0m | [36mtexas_gerrymandering_hb4.config[0m:[36m<module>[0m:[36m11[0m - [1mPROJ_ROOT path is: /home/aimlexpert/Documents/GitHub/texas-gerrymandering-HB4[0m


In [2]:
# Path of the cleaned VTD election-results CSV inside INTERIM_DATA_DIR.
# Nothing is written to it in the cells shown here.
OUT_CSV = INTERIM_DATA_DIR / 'clean_vtd_election_results.csv'

In [3]:
# Load the raw general-election results; every later cell derives from this frame.
df = pd.read_csv(filepath_or_buffer=GEN_ELECTION_CSV)

In [5]:
# Report the size of the raw frame as a (rows, cols) tuple.
shape = df.shape
print("Shape (rows, cols):", shape)

Shape (rows, cols): (450357, 10)


In [6]:
# Show the column labels as a plain Python list.
print(df.columns.tolist())

['County', 'FIPS', 'VTD', 'cntyvtd', 'vtdkeyvalue', 'Office', 'Name', 'Party', 'Incumbent', 'Votes']


In [7]:
# Count missing values per column.
df.isnull().sum()

County         0
FIPS           0
VTD            0
cntyvtd        0
vtdkeyvalue    0
Office         0
Name           0
Party          0
Incumbent      0
Votes          0
dtype: int64

In [8]:
# Preview the first five raw rows.
df.head(n=5)

Unnamed: 0,County,FIPS,VTD,cntyvtd,vtdkeyvalue,Office,Name,Party,Incumbent,Votes
0,Anderson,1,1,10001,1,President,Harris,D,N,325
1,Anderson,1,2,10002,2,President,Harris,D,N,182
2,Anderson,1,3,10003,3,President,Harris,D,N,110
3,Anderson,1,4,10004,4,President,Harris,D,N,109
4,Anderson,1,5,10005,5,President,Harris,D,N,17


In [9]:
# Trim stray whitespace from every column label.
# The stripped labels replace the originals in place.
df.columns = df.columns.map(str.strip)

In [10]:
# Cast the filter/join columns to text and trim surrounding whitespace.
# "Votes" is in the list too, so it holds text after this cell.
text_columns = (
    "County", "FIPS", "VTD", "cntyvtd", "vtdkeyvalue",
    "Office", "Name", "Party", "Incumbent", "Votes",
)
for name in text_columns:
    if name not in df.columns:
        continue  # tolerate feeds that lack one of the expected columns
    as_text = df[name].astype(str)
    df[name] = as_text.str.strip()

In [11]:
# Keep only the U.S. Senate contest (this file labels it "U.S. Sen").
# A row matches when its lower-cased Office contains any pattern as a literal substring.
office_patterns = ["u.s. sen", "us senate", "u.s. senate", "u.s. sen."]  # add common variants
office_lower = df["Office"].str.lower()
senate_mask = pd.Series(False, index=df.index)
for pattern in office_patterns:
    # regex=False: the dots in the patterns are literal characters, not wildcards
    senate_mask |= office_lower.str.contains(pattern, regex=False)
df_office = df[senate_mask]

In [12]:
# Report how many rows survived the office filter, then preview them.
print("U.S. Senate rows:", df_office.shape[0])
df_office.head(n=5)

U.S. Senate rows: 48560


Unnamed: 0,County,FIPS,VTD,cntyvtd,vtdkeyvalue,Office,Name,Party,Incumbent,Votes
110,Anderson,1,1,10001,1,U.S. Sen,Allred,D,N,330
111,Anderson,1,2,10002,2,U.S. Sen,Allred,D,N,215
112,Anderson,1,3,10003,3,U.S. Sen,Allred,D,N,124
113,Anderson,1,4,10004,4,U.S. Sen,Allred,D,N,135
114,Anderson,1,5,10005,5,U.S. Sen,Allred,D,N,19


## Minimal Columns for Join & Partisan Features

In [None]:
# Reduce the Senate rows to the join keys plus candidate, party and vote count.
# .copy() detaches the result so later column assignments do not touch df_office.
needed_cols = [
    "County", "FIPS", "VTD", "cntyvtd", "vtdkeyvalue",
    "Name", "Party", "Votes",
]
df_sen = df_office.loc[:, needed_cols].copy()

In [14]:
# Convert Votes to int64. Values that cannot be parsed as numbers become NaN
# (errors="coerce") and are then stored as 0.
votes_numeric = pd.to_numeric(df_sen["Votes"], errors="coerce")

# Zero-filling would otherwise hide bad input (e.g. "1,234" or "nan") as a
# genuine 0-vote row, so report how many values were replaced.
n_unparseable = int(votes_numeric.isna().sum())
if n_unparseable:
    print(f"Warning: {n_unparseable} Votes value(s) were not numeric and were set to 0")

df_sen["Votes"] = votes_numeric.fillna(0).astype("int64")

In [15]:
# Canonicalise party codes: trim whitespace and upper-case.
# No mapping is applied, so "D"/"R" and any minor-party codes pass through as they are.
party_trimmed = df_sen["Party"].str.strip()
df_sen["Party"] = party_trimmed.str.upper()

## Standardize Join Keys

In [None]:
# Left-pad the join keys with zeros: county FIPS to 3 characters, VTD code to 4.
# (State FIPS for Texas is "48".) Longer values are left untouched by zfill.
for key, width in (("FIPS", 3), ("VTD", 4)):
    df_sen[key] = df_sen[key].str.zfill(width)

In [None]:
# Some feeds ship a ready-made "cntyvtd" key; use it when it looks valid, otherwise rebuild it.
def build_cntyvtd(row):
    """Return the county+VTD key for one row.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row or dict)
        Must provide "FIPS" and "VTD"; "cntyvtd" is optional.

    Returns
    -------
    The existing "cntyvtd" value when it is present, all digits and at least
    5 characters long; otherwise "FIPS" and "VTD" concatenated.

    NOTE(review): an accepted feed value is returned unchanged, so its layout
    may differ from a rebuilt key — confirm both forms are usable as join keys.
    """
    existing = row.get("cntyvtd")
    if pd.isna(existing) or not existing.isdigit() or len(existing) < 5:
        # missing or malformed: fall back to concatenating the two parts
        return "{}{}".format(row["FIPS"], row["VTD"])
    return existing

# Run build_cntyvtd on every row (axis=1) and overwrite the cntyvtd column with the result.
df_sen["cntyvtd"] = df_sen.apply(build_cntyvtd, axis=1)

In [None]:
# Composite key: state FIPS + 3-char county FIPS + VTD padded to at least 4 chars.
# That gives 9 characters for a 4-character VTD code.
#
# The key is assembled from FIPS and VTD rather than from "cntyvtd" because the
# feed's cntyvtd carries the county code without leading zeros (FIPS 1 with
# VTD 1 arrives as "10001"). Prefixing that with "48" gives a 7-char key, and
# its layout differs from rows whose cntyvtd was rebuilt from padded parts.
# NOTE(review): confirm the 9-char layout matches the GEOID used by the target shapefile.
STATEFP = "48"
county_fips = df_sen["FIPS"].astype(str).str.strip().str.zfill(3)
vtd_code = df_sen["VTD"].astype(str).str.strip().str.zfill(4)
df_sen["GEOID_VTD"] = STATEFP + county_fips + vtd_code


In [None]:
# Retain the feed's own vtdkeyvalue when present, stored as trimmed text.
has_vtdkey = "vtdkeyvalue" in df_sen.columns
if has_vtdkey:
    vtdkey_text = df_sen["vtdkeyvalue"].astype(str)
    df_sen["vtdkeyvalue"] = vtdkey_text.str.strip()

# Preview the standardised Senate frame.
df_sen.head(n=5)

## Pivot to Partisan Features