## Import Needed Filepaths and Libraries

In [24]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

from scipy.stats import norm

from texas_gerrymandering_hb4.config import FINAL_CSV


## Load Dataset Into Pandas DataFrame
The processed dataset (the CSV at `FINAL_CSV`) is read into a pandas DataFrame.

In [25]:
# Load the processed dataset; FINAL_CSV is the path imported from
# texas_gerrymandering_hb4.config in the imports cell.
df = pd.read_csv(FINAL_CSV)


Unnamed: 0,district_id,polsby_popper,schwartzberg,convex_hull_ratio,reock,pct_white,pct_black,pct_asian,pct_hispanic,dem_share,rep_share
0,1,0.193575,0.439971,0.72297,0.40757,0.632532,0.184984,0.014875,0.14283,0.252671,0.728964
1,2,0.165232,0.406487,0.62527,0.417351,0.526751,0.12505,0.072934,0.255189,0.399379,0.577517
2,3,0.216761,0.465576,0.805445,0.273916,0.577914,0.112669,0.111327,0.172895,0.403698,0.573637
3,4,0.103462,0.321654,0.57419,0.222752,0.605524,0.099241,0.1576,0.108877,0.405732,0.575123
4,5,0.184465,0.429494,0.823171,0.310183,0.522819,0.15998,0.034848,0.26167,0.481179,0.49804


## Obtain a PCA-Weighted Average of Compactness Scores

### Specify Compactness Metric Columns in DataFrame

In [None]:
# Columns of `df` holding the four geometric compactness metrics. These are the
# inputs that get standardized and combined into a single PCA-based score.
compactness_cols = [
    "polsby_popper",
    "reock",
    "convex_hull_ratio",
    "schwartzberg",
]

### Obtain Matrix of Compactness Metrics from DataFrame

In [None]:
# Pull the compactness metrics out of the frame as a plain 2-D NumPy array
# (one row per district, one column per metric in `compactness_cols`).
X = df.loc[:, compactness_cols].to_numpy()

### Standardize Compactness Metrics

In [None]:
# Put every metric on a common scale before PCA so that no single metric
# dominates the first component merely because of its units or spread.
# StandardScaler's defaults center each column and scale it to unit variance.
scaler = StandardScaler()
X_std = scaler.fit_transform(X)

### Implement PCA

In [None]:
# Collapse the standardized compactness metrics into one composite score: the
# first principal component (PC1), returned as a 1-D array with one value per row.
pca = PCA(n_components=1)
pc1 = pca.fit_transform(X_std).flatten()

# The sign of a principal component is arbitrary, but later cells interpret a
# high score as "compact". All metrics in `compactness_cols`, as stored in this
# dataset, grow with compactness, so orient PC1 so that its loadings are
# net-positive. The fitted component is flipped together with the scores so
# that pca.transform() keeps agreeing with `pc1`.
if pca.components_[0].sum() < 0:
    pca.components_ = -pca.components_
    pc1 = -pc1

In [26]:
# Rescale PC1 to [0, 1] with a min-max transform.
pc1_min, pc1_max = pc1.min(), pc1.max()
pc1_range = pc1_max - pc1_min

# A constant (or NaN/inf-contaminated) PC1 would make the division below produce
# NaNs that only surface later as an obscure failure inside the sampler, so
# stop here with an explicit message instead.
if not np.isfinite(pc1_range) or pc1_range == 0:
    raise ValueError(
        "Cannot rescale PC1 to [0, 1]: its range is zero or not finite "
        f"(min={pc1_min}, max={pc1_max})."
    )

compactness_index = (pc1 - pc1_min) / pc1_range

df["compactness_index"] = compactness_index

# This will be the outcome for the mixture.
# Assuming PC1 is oriented so that larger means more compact:
#   values near 1 = compact / "not gerrymandered"
#   values near 0 = non-compact / "gerrymandered-ish"
y = df["compactness_index"].values
n = len(y)  # number of districts (rows)

df[["district_id", "compactness_index"]].head()



Unnamed: 0,district_id,compactness_index
0,1,0.403531
1,2,0.305874
2,3,0.417277
3,4,0.061802
4,5,0.393908


In [27]:
def rinvchisq(df, scale):
    """
    Sample once from a scaled inverse-chi-squared distribution.

    The draw is built as ``df * scale / X`` where ``X`` is a chi-squared
    variate with ``df`` degrees of freedom taken from NumPy's global
    ``np.random`` stream.

    Parameters
    ----------
    df : degrees of freedom (here a number, not the notebook's DataFrame).
    scale : scale parameter of the distribution.

    Returns
    -------
    The sampled value ``df * scale / X``.
    """
    numerator = df * scale
    chi2_draw = np.random.chisquare(df)
    return numerator / chi2_draw


In [28]:
# Seed NumPy's legacy global RNG; every np.random.* call in this cell, in
# rinvchisq, and in the sampler loop draws from this one stream.
np.random.seed(123)

# Priors for mixture weight lambda ~ Beta(alpha_1, alpha_2)
alpha_1 = 2.0
alpha_2 = 2.0

# Priors for component means mu_k ~ Normal(mu0_k, sigma0_mu_k^2)
# Compactness_index is in [0,1], so set prior means moderately separated
mu0_1 = 0.25
mu0_2 = 0.75

# Prior variances of the component means (i.e. a prior sd of 0.25 each)
sigma0_mu1_sq = 0.25**2
sigma0_mu2_sq = 0.25**2

# Priors for component variances sigma_k^2 ~ scaled Inv-chi^2(nu0_k, sigma0_sig_k^2)
nu0_1 = 2.0
nu0_2 = 2.0

# Prior scale for each component variance (0.20 on the sd scale)
sigma0_sig1_sq = 0.20**2
sigma0_sig2_sq = 0.20**2

# Initial values
# (trailing underscore-free `lambda` is a Python keyword, hence `_lambda`)
_lambda = 0.5
# Start the two means at the 30% and 70% quantiles of y so they begin apart
mu_1 = np.quantile(y, 0.3)
mu_2 = np.quantile(y, 0.7)
# Both components start with the overall sample variance of y
sigma_squared_1 = np.var(y)
sigma_squared_2 = np.var(y)

# Initial component memberships
# NOTE: the sampler loop redraws z before reading it, so these values are
# never used directly; the call still advances the RNG stream and therefore
# affects every later draw for this seed.
z = np.random.binomial(1, 0.5, size=n)  # 1 = component 1, 0 = component 2

# MCMC settings
iterations = 10_000
warmup = iterations // 2  # first half of the draws is discarded downstream

# Storage for samples: one entry per iteration for each scalar parameter,
# and one row per iteration (one column per district) for the memberships
lambda_samples = np.zeros(iterations)
mu1_samples = np.zeros(iterations)
mu2_samples = np.zeros(iterations)
sigma_squared_1_samples = np.zeros(iterations)
sigma_squared_2_samples = np.zeros(iterations)
z_samples = np.zeros((iterations, n), dtype=int)


In [29]:
# Gibbs sampler for a two-component Gaussian mixture on y. Each sweep draws, in
# this order: memberships z, mixture weight _lambda, means mu_1 and mu_2, then
# the two variances. All draws come from the global np.random stream, so the
# order of the calls below determines the sampled values for a given seed.
for i in range(iterations):
    # ---- Update z (component membership for each district) ----
    # Weighted component densities at the current parameter values; their
    # normalized ratio is each district's probability of being in component 1.
    lik_1 = _lambda * norm.pdf(y, loc=mu_1, scale=np.sqrt(sigma_squared_1))
    lik_2 = (1.0 - _lambda) * norm.pdf(y, loc=mu_2, scale=np.sqrt(sigma_squared_2))
    z_probs = lik_1 / (lik_1 + lik_2)
    z = np.random.binomial(1, z_probs)  # 1 = component 1, 0 = component 2

    # ---- Update lambda ----
    # Component sizes under the freshly drawn memberships
    n1 = np.sum(z)
    n2 = n - n1

    # Beta prior plus the membership counts gives the Beta conditional
    alpha_post = alpha_1 + n1
    beta_post = alpha_2 + n2
    _lambda = np.random.beta(alpha_post, beta_post)

    # ---- Update mu_1 ----
    if n1 > 0:
        # Precision-weighted combination of the prior mean and the mean of the
        # observations currently assigned to component 1
        y1_mean = np.mean(y[z == 1])
        mu1_post_var_inv = (1.0 / sigma0_mu1_sq) + (n1 / sigma_squared_1)
        mu1_post_mean = ((mu0_1 / sigma0_mu1_sq) + n1 * y1_mean / sigma_squared_1) / mu1_post_var_inv
        mu1_post_sd = np.sqrt(1.0 / mu1_post_var_inv)
    else:
        # Empty component: no data to condition on, so draw from the prior
        # (this also avoids taking the mean of an empty selection)
        mu1_post_mean = mu0_1
        mu1_post_sd = np.sqrt(sigma0_mu1_sq)

    mu_1 = np.random.normal(mu1_post_mean, mu1_post_sd)

    # ---- Update mu_2 ----
    if n2 > 0:
        # Same update as for mu_1, using the observations with z == 0
        y2_mean = np.mean(y[z == 0])
        mu2_post_var_inv = (1.0 / sigma0_mu2_sq) + (n2 / sigma_squared_2)
        mu2_post_mean = ((mu0_2 / sigma0_mu2_sq) + n2 * y2_mean / sigma_squared_2) / mu2_post_var_inv
        mu2_post_sd = np.sqrt(1.0 / mu2_post_var_inv)
    else:
        # Empty component: draw from the prior
        mu2_post_mean = mu0_2
        mu2_post_sd = np.sqrt(sigma0_mu2_sq)

    mu_2 = np.random.normal(mu2_post_mean, mu2_post_sd)

    # ---- Update sigma_1_sq (scaled inverse-chi-squared) ----
    nu1_post = nu0_1 + n1
    if n1 > 0:
        # Sum of squared deviations around the mean just drawn above
        ss1 = np.sum((y[z == 1] - mu_1) ** 2)
    else:
        # With no members the scale below reduces to the prior scale
        ss1 = 0.0

    sigma1_post_scale = (nu0_1 * sigma0_sig1_sq + ss1) / nu1_post
    sigma_1_sq = rinvchisq(nu1_post, sigma1_post_scale)

    # ---- Update sigma_2_sq (scaled inverse-chi-squared) ----
    nu2_post = nu0_2 + n2
    if n2 > 0:
        ss2 = np.sum((y[z == 0] - mu_2) ** 2)
    else:
        # With no members the scale below reduces to the prior scale
        ss2 = 0.0

    sigma2_post_scale = (nu0_2 * sigma0_sig2_sq + ss2) / nu2_post
    sigma_2_sq = rinvchisq(nu2_post, sigma2_post_scale)

    # ---- Feed variances back into state for next iteration ----
    # The z and mu updates read sigma_squared_1 / sigma_squared_2, so the new
    # draws must be copied into those names before the next sweep.
    sigma_squared_1 = sigma_1_sq
    sigma_squared_2 = sigma_2_sq

    # ---- Store samples ----
    # Every sweep is stored, warm-up included; the warm-up rows are dropped
    # later by slicing with `warmup`.
    lambda_samples[i] = _lambda
    mu1_samples[i] = mu_1
    mu2_samples[i] = mu_2
    sigma_squared_1_samples[i] = sigma_1_sq
    sigma_squared_2_samples[i] = sigma_2_sq
    z_samples[i, :] = z


In [30]:
# Keep only the post-warm-up portion of each chain.
lambda_post = lambda_samples[warmup:]
mu1_post = mu1_samples[warmup:]
mu2_post = mu2_samples[warmup:]
sigma1_post = sigma_squared_1_samples[warmup:]
sigma2_post = sigma_squared_2_samples[warmup:]
z_post = z_samples[warmup:, :]

# Report the posterior mean of every scalar parameter. Component labels are
# taken as-is here (no reordering by mean), so "mu_1"/"mu_2" refer to the raw
# component 1 / component 2 draws.
posterior_means = {
    "Posterior mean lambda:": lambda_post,
    "Posterior mean mu_1 (compactness):": mu1_post,
    "Posterior mean mu_2 (compactness):": mu2_post,
    "Posterior mean sigma1^2:": sigma1_post,
    "Posterior mean sigma2^2:": sigma2_post,
}
for label, draws in posterior_means.items():
    print(label, draws.mean())


Posterior mean lambda: 0.5106736871122953
Posterior mean mu_1 (compactness): 0.324092488589036
Posterior mean mu_2 (compactness): 0.4806422763749537
Posterior mean sigma1^2: 0.050157384123388174
Posterior mean sigma2^2: 0.051521638734689484


In [31]:
# Guard against label switching: in every retained draw, work out which
# component currently plays the "low-compactness" (gerrymandered-ish) role.
is_low1 = mu1_post < mu2_post                   # component 1 has the lower mean
gerr_label_per_iter = np.where(is_low1, 1, 0)   # z-value that marks the low component

# Rows = retained draws, columns = districts; True where the district's
# membership in that draw equals that draw's low-compactness label.
in_low_component = z_post == gerr_label_per_iter[:, None]

# Posterior P(geometrically gerrymandered) per district = share of draws in
# which the district sits in the low-compactness component.
post_prob_geom_gerr = in_low_component.mean(axis=0)

# Store in dataframe
df["post_prob_geom_gerrymandered"] = post_prob_geom_gerr

# Alias kept under the older column name so existing code that reads
# "post_prob_gerrymandered" continues to work.
df["post_prob_gerrymandered"] = df["post_prob_geom_gerrymandered"]

df[["district_id", "compactness_index", "post_prob_gerrymandered"]].head()


Unnamed: 0,district_id,compactness_index,post_prob_gerrymandered
0,1,0.403531,0.518
1,2,0.305874,0.6228
2,3,0.417277,0.5194
3,4,0.061802,0.7628
4,5,0.393908,0.5346


In [32]:
# Rank districts from most to least likely to be in the low-compactness group.
df_sorted = df.sort_values(by="post_prob_gerrymandered", ascending=False)

# Columns that are always present at this point in the notebook
base_cols = ["district_id", "compactness_index", "post_prob_gerrymandered"]

# Context columns that are shown only when the frame actually has them
optional_cols = ["pct_minority", "dem_share"]
cols_to_show = base_cols + [c for c in optional_cols if c in df_sorted.columns]

df_sorted[cols_to_show].head(20)



Unnamed: 0,district_id,compactness_index,post_prob_gerrymandered,dem_share
32,33,0.02825,0.7796,0.688378
6,7,0.0,0.7794,0.632036
3,4,0.061802,0.7628,0.405732
13,14,0.04007,0.7624,0.38974
14,15,0.078903,0.762,0.445939
35,36,0.134082,0.7382,0.387174
7,8,0.165469,0.7226,0.388563
9,10,0.165394,0.7172,0.393264
26,27,0.174912,0.7152,0.415234
31,32,0.222402,0.6736,0.457105
