## Import Needed Filepaths & Libraries

In [19]:
from texas_gerrymandering_hb4.config import INTERIM_DATA_DIR, GEN_ELECTION_CSV
import pandas as pd

In [20]:
OUT_CSV = INTERIM_DATA_DIR / 'clean_vtd_election_results.csv'

In [21]:
df = pd.read_csv(GEN_ELECTION_CSV)

In [22]:
print("Shape (rows, cols):", df.shape)

Shape (rows, cols): (450357, 10)


In [23]:
print(list(df.columns))

['County', 'FIPS', 'VTD', 'cntyvtd', 'vtdkeyvalue', 'Office', 'Name', 'Party', 'Incumbent', 'Votes']


In [24]:
df.isna().sum()

County         0
FIPS           0
VTD            0
cntyvtd        0
vtdkeyvalue    0
Office         0
Name           0
Party          0
Incumbent      0
Votes          0
dtype: int64

In [25]:
df.head()

Unnamed: 0,County,FIPS,VTD,cntyvtd,vtdkeyvalue,Office,Name,Party,Incumbent,Votes
0,Anderson,1,1,10001,1,President,Harris,D,N,325
1,Anderson,1,2,10002,2,President,Harris,D,N,182
2,Anderson,1,3,10003,3,President,Harris,D,N,110
3,Anderson,1,4,10004,4,President,Harris,D,N,109
4,Anderson,1,5,10005,5,President,Harris,D,N,17


In [26]:
# normalize column names just in case (keep original names too)
df.columns = [c.strip() for c in df.columns]

In [27]:
# Basic clean-up of string columns used for filtering/join keys
for col in ["County", "FIPS", "VTD", "cntyvtd", "vtdkeyvalue", "Office", "Name", "Party", "Incumbent", "Votes"]:
    if col in df.columns:
        df[col] = df[col].astype(str).str.strip()

In [28]:
# Filter to U.S. Senate office. Your file shows "U.S. Sen"
office_patterns = ["u.s. sen", "us senate", "u.s. senate", "u.s. sen."]  # add common variants
df_office = df[df["Office"].str.lower().apply(lambda x: any(p in x for p in office_patterns))]

In [29]:
print("U.S. Senate rows:", len(df_office))
df_office.head()

U.S. Senate rows: 48560


Unnamed: 0,County,FIPS,VTD,cntyvtd,vtdkeyvalue,Office,Name,Party,Incumbent,Votes
110,Anderson,1,1,10001,1,U.S. Sen,Allred,D,N,330
111,Anderson,1,2,10002,2,U.S. Sen,Allred,D,N,215
112,Anderson,1,3,10003,3,U.S. Sen,Allred,D,N,124
113,Anderson,1,4,10004,4,U.S. Sen,Allred,D,N,135
114,Anderson,1,5,10005,5,U.S. Sen,Allred,D,N,19


## Minimal Columns for Join & Partisan Features

In [30]:
# Keep only the columns we need
needed_cols = ["County", "FIPS", "VTD", "cntyvtd", "vtdkeyvalue", "Name", "Party", "Votes"]
df_sen = df_office[needed_cols].copy()

In [31]:
# Coerce numeric votes safely
df_sen["Votes"] = pd.to_numeric(df_sen["Votes"], errors="coerce").fillna(0).astype("int64")

In [32]:
# Normalize party labels to simple codes where possible
# This keeps "D" and "R" as-is, but also passes through minor codes (e.g., "L", "G", "W", etc.)
df_sen["Party"] = df_sen["Party"].str.upper().str.strip()

## Standardize Join Keys

In [33]:
# VTDs are typically zero-padded to 4 chars in TX. County FIPS is 3 digits. State FIPS for Texas is "48".
df_sen["FIPS"] = df_sen["FIPS"].str.zfill(3)        # county FIPS
df_sen["VTD"]  = df_sen["VTD"].str.zfill(4)         # VTD code

In [34]:
# Some feeds include "cntyvtd" (countyFIPS+VTD). If missing or malformed, rebuild it.
def build_cntyvtd(row):
    if pd.notna(row.get("cntyvtd")) and row["cntyvtd"].isdigit() and len(row["cntyvtd"]) >= 5:
        return row["cntyvtd"]
    return f"{row['FIPS']}{row['VTD']}"

df_sen["cntyvtd"] = df_sen.apply(build_cntyvtd, axis=1)

In [35]:
# Helpful composite keys used by many shapefiles:
STATEFP = "48"
df_sen["GEOID_VTD"] = STATEFP + df_sen["cntyvtd"]  # "48" + county(3) + VTD(4) = 9-char state+county+vtd


In [36]:
# Keep also original vtdkeyvalue if present (often a simple VTD index within county)
if "vtdkeyvalue" in df_sen.columns:
    # make sure it's string & trimmed
    df_sen["vtdkeyvalue"] = df_sen["vtdkeyvalue"].astype(str).str.strip()

df_sen.head()

Unnamed: 0,County,FIPS,VTD,cntyvtd,vtdkeyvalue,Name,Party,Votes,GEOID_VTD
110,Anderson,1,1,10001,1,Allred,D,330,4810001
111,Anderson,1,2,10002,2,Allred,D,215,4810002
112,Anderson,1,3,10003,3,Allred,D,124,4810003
113,Anderson,1,4,10004,4,Allred,D,135,4810004
114,Anderson,1,5,10005,5,Allred,D,19,4810005


## Pivot to Partisan Features

In [37]:
# Sum votes by VTD + Party (in case multiple rows per candidate/party exist)
group_keys = ["County", "FIPS", "VTD", "cntyvtd", "vtdkeyvalue", "GEOID_VTD"]
tall = df_sen.groupby(group_keys + ["Party"], as_index=False)["Votes"].sum()

In [38]:
# Pivot: one row per VTD; columns per Party code
wide = (
    tall
    .pivot(index=group_keys, columns="Party", values="Votes")
    .fillna(0)
    .reset_index()
)

In [39]:
# Standard names for the two majors; keep any third/others as-is
rename_map = {"D": "dem_votes", "R": "rep_votes"}
wide = wide.rename(columns=rename_map)

In [40]:
# Detect third-party columns (anything not in keys + the two majors)
key_cols = set(group_keys + ["dem_votes", "rep_votes"])
third_cols = [c for c in wide.columns if c not in key_cols]

In [41]:
# Collapse all non-D/R into a single 'third_party_votes' column
if third_cols:
    wide["third_party_votes"] = wide[third_cols].sum(axis=1).astype("int64")
else:
    wide["third_party_votes"] = 0

In [42]:
# Ensure dem/rep exist even if missing in some VTDs
for col in ["dem_votes", "rep_votes"]:
    if col not in wide.columns:
        wide[col] = 0

In [43]:
# Compute totals and shares
wide["total_votes"] = (wide["dem_votes"].astype("int64")
                       + wide["rep_votes"].astype("int64")
                       + wide["third_party_votes"].astype("int64"))

In [44]:
# Avoid divide-by-zero
wide["dem_share"] = wide.apply(
    lambda r: (r["dem_votes"] / r["total_votes"]) if r["total_votes"] > 0 else pd.NA, axis=1
)

In [45]:
# Keep only minimal, model-ready columns + join keys
final_cols = [
    "County", "FIPS", "VTD", "cntyvtd", "vtdkeyvalue", "GEOID_VTD",
    "dem_votes", "rep_votes", "third_party_votes", "total_votes", "dem_share"
]
wide = wide[final_cols].copy()

print("Final shape:", wide.shape)
wide.head(10)

Final shape: (9712, 11)


Party,County,FIPS,VTD,cntyvtd,vtdkeyvalue,GEOID_VTD,dem_votes,rep_votes,third_party_votes,total_votes,dem_share
0,Anderson,1,1,10001,1,4810001,330,742,25,1097,0.30082
1,Anderson,1,2,10002,2,4810002,215,1620,25,1860,0.115591
2,Anderson,1,3,10003,3,4810003,124,712,21,857,0.144691
3,Anderson,1,4,10004,4,4810004,135,1032,18,1185,0.113924
4,Anderson,1,5,10005,5,4810005,19,198,6,223,0.085202
5,Anderson,1,6,10006,6,4810006,340,406,14,760,0.447368
6,Anderson,1,7,10007,7,4810007,322,152,14,488,0.659836
7,Anderson,1,8,10008,8,4810008,113,95,4,212,0.533019
8,Anderson,1,9,10009,9,4810009,166,741,15,922,0.180043
9,Anderson,1,10,10010,10,4810010,62,309,3,374,0.165775


In [46]:
# --- 5) Export to CSV and Parquet ---------------------------------------------
# CSV (always available)
wide.to_csv(OUT_CSV, index=False)
print(f"✅ Wrote CSV: {OUT_CSV.resolve()}")

✅ Wrote CSV: /home/aimlexpert/Documents/GitHub/texas-gerrymandering-HB4/data/interim/clean_vtd_election_results.csv
