In [1]:
# Directory that receives every artifact written by this notebook.
ART_DIR = "artifacts"

# Racial-composition columns; these can be left out of the feature matrix.
RACE_COLS = [
    "pct_white",
    "pct_black",
    "pct_asian",
    "pct_hispanic",
]

# Row identifier column; it is excluded from the feature matrix.
ID_COL = "district_id"

In [2]:
import os, json
import numpy as np
import pandas as pd
import joblib
from sklearn.preprocessing import StandardScaler
from texas_gerrymandering_hb4.config import FINAL_CSV, RACE

# Make sure the artifact directory is present; an existing one is left alone.
os.makedirs(ART_DIR, exist_ok=True)

# Read the dataset from the path the project config exposes as FINAL_CSV.
df = pd.read_csv(FINAL_CSV)

# Cell output: the (rows, columns) shape alongside the first five rows.
(df.shape, df.head())

[32m2025-09-24 18:15:18.300[0m | [1mINFO    [0m | [36mtexas_gerrymandering_hb4.config[0m:[36m<module>[0m:[36m11[0m - [1mPROJ_ROOT path is: /home/aimlexpert/Documents/GitHub/texas-gerrymandering-HB4[0m


((38, 11),
    district_id  polsby_popper  schwartzberg  convex_hull_ratio     reock  \
 0            1       0.193575      0.439971           0.722970  0.407570   
 1            2       0.165232      0.406487           0.625270  0.417351   
 2            3       0.216761      0.465576           0.805445  0.273916   
 3            4       0.103462      0.321654           0.574190  0.222752   
 4            5       0.184465      0.429494           0.823171  0.310183   
 
    pct_white  pct_black  pct_asian  pct_hispanic  dem_share  rep_share  
 0   0.629943   0.186495   0.015244      0.144035   0.308879   0.670312  
 1   0.535046   0.117933   0.073266      0.253889   0.308623   0.670572  
 2   0.578076   0.114005   0.108806      0.174107   0.308510   0.670685  
 3   0.608467   0.097559   0.156080      0.108975   0.308504   0.670681  
 4   0.515702   0.161955   0.038687      0.263356   0.308515   0.670684  )

## Building a Feature Matrix
* The helper function `build_features` builds a feature matrix from our pandas DataFrame and returns it together with the list of feature column names.
* `drop_race` is a boolean flag: when `True`, the racial composition columns (`RACE_COLS`) are excluded from the feature matrix; when `False`, they are kept.
* The `district_id` column is always dropped before the feature matrix is returned.

In [None]:
def build_features(df, drop_race: bool, id_col=None, race_cols=None):
    """Select the feature columns of ``df`` and return them with their names.

    Parameters
    ----------
    df : pandas.DataFrame
        Source table. It is not modified; the result is a copy.
    drop_race : bool
        When true, the racial-composition columns are excluded in addition
        to the identifier column.
    id_col : hashable, optional
        Identifier column to exclude. Defaults to the notebook-level
        ``ID_COL``.
    race_cols : iterable, optional
        Columns excluded when ``drop_race`` is true. Defaults to the
        notebook-level ``RACE_COLS``.

    Returns
    -------
    tuple
        ``(features, cols)``: a copy of the selected columns, and the list of
        their names in the original column order.
    """
    # The notebook constants are only looked up when no override is passed,
    # so the function can also be used without relying on global state.
    if id_col is None:
        id_col = ID_COL
    excluded = [id_col]
    if drop_race:
        # extend() copies the names, so the caller's list is never mutated.
        excluded.extend(RACE_COLS if race_cols is None else race_cols)
    cols = [c for c in df.columns if c not in excluded]
    return df[cols].copy(), cols

### Building a Feature Matrix that Includes All Features
* This matrix will have the columns `polsby_popper`, `schwartzberg`, `convex_hull_ratio`, `reock`, `pct_white`, `pct_black`, `pct_asian`, `pct_hispanic`, `dem_share`, and `rep_share`.
* Hence, the full feature matrix will have 38 rows and 10 columns.

In [None]:
# Every column except the id goes into the full matrix (race columns kept).
X_full, full_cols = build_features(df, drop_race=False)

# Fit the scaler first, then apply the fitted scaler to the same data.
scaler_full = StandardScaler().fit(X_full)
X_full_scaled = scaler_full.transform(X_full)

### Building a Feature Matrix that Does NOT Include Racial Composition Features
* This feature matrix excludes the racial composition features, so it has the columns `polsby_popper`, `schwartzberg`, `convex_hull_ratio`, `reock`, `dem_share`, and `rep_share`.
* Hence, this feature matrix has 38 rows and 6 columns.

In [3]:
# Same selection as the full matrix, minus the racial-composition columns.
X_norace, norace_cols = build_features(df, drop_race=True)

# Fit the scaler first, then apply the fitted scaler to the same data.
scaler_norace = StandardScaler().fit(X_norace)
X_norace_scaled = scaler_norace.transform(X_norace)

In [4]:
# Save artifacts
# Both feature sets are persisted the same way, so they are described once and
# written in a loop instead of repeating the three save calls per variant.
feature_sets = {
    "full": (scaler_full, X_full_scaled, full_cols),
    "norace": (scaler_norace, X_norace_scaled, norace_cols),
}
for tag, (scaler, X_scaled, cols) in feature_sets.items():
    joblib.dump(scaler, f"{ART_DIR}/scaler_{tag}.joblib")
    # The scaled matrix is stored in the archive under the key "X".
    np.savez(f"{ART_DIR}/X_{tag}_scaled.npz", X=X_scaled)
    # An explicit encoding keeps the JSON file independent of the platform default.
    with open(f"{ART_DIR}/{tag}_columns.json", "w", encoding="utf-8") as f:
        json.dump(cols, f)

# The scaled matrices contain no id column, so the ids are written on their
# own (same row order as df), followed by a snapshot of the full input table.
df[[ID_COL]].to_csv(f"{ART_DIR}/district_ids.csv", index=False)
df.to_csv(f"{ART_DIR}/dataset_snapshot.csv", index=False)
print("Artifacts saved in", ART_DIR)

Artifacts saved in artifacts
