In [None]:
import ibis
from ibis import _
import ibis.expr.datatypes as dt  
import re
from cng.utils import *
from cng.h3 import *
from minio import Minio
import altair as alt

# Connection setup: call the H3 helper first, then open a DuckDB connection
# with the spatial extension and hand it to set_secrets.
# NOTE(review): duckdb_install_h3 / set_secrets come from the cng wildcard
# imports; presumably they install the H3 extension and register S3
# credentials on the connection -- confirm in cng.
duckdb_install_h3()
con = ibis.duckdb.connect(extensions=["spatial"])
set_secrets(con)

# party_val = "Democrat"  # only used by the commented-out sensitivity analysis
# Ballot-measure table; the geometry column is dropped because nothing below uses it.
votes = con.read_parquet("s3://public-tpl/landvote/landvote_party.parquet").drop("geom")

# Named hex colors shared by the charts in this notebook.
colors = dict(
    dark_orange="#ab5601",
    light_orange="#f3d3b1",
    grey="#d3d3d3",
    light_green="#c3dbc3",
    dark_green="#417d41",
    dem_blue="#1b46c2",
    rep_red="#E81B23",
)



# Helper functions

In [None]:
def get_unique_rows(df):
    """Collapse multi-county measures to one row per ``landvote_id``.

    Every column other than ``landvote_id``, ``county`` and ``party`` keeps
    the ``first()`` value within its group (no ordering is specified here).
    ``county`` becomes "Multiple Counties" when a measure has more than one
    distinct county, and ``party`` becomes "Mixed" when its counties carry
    more than one distinct party; otherwise each keeps its first value.

    Parameters
    ----------
    df : ibis table expression
        Must contain ``landvote_id``, ``county`` and ``party`` columns.

    Returns
    -------
    ibis table expression
        One row per ``landvote_id``.
    """
    handled_separately = ("landvote_id", "county", "party")

    # Columns that are simply carried through with their first value per measure.
    carried = {}
    for name in df.schema().names:
        if name not in handled_separately:
            carried[name] = ibis._[name].first()

    # if spans multiple counties -> set different name for county
    county_label = ibis.ifelse(
        ibis._.county.nunique() > 1,
        "Multiple Counties",
        ibis._.county.first(),
    )
    # if counties differ in parties -> assign other label to party
    party_label = ibis.ifelse(
        ibis._.party.nunique() > 1,
        "Mixed",
        ibis._.party.first(),
    )

    return df.group_by("landvote_id").agg(
        **carried,
        county=county_label,
        party=party_label,
    )


def get_passed(df):
    """Print the percentage of measures in ``df`` that passed.

    A measure counts as passed when its ``status`` is "Pass" or "Pass*".
    The percentage is rounded to two decimals. Note that the year range in
    the printed message is hard-coded, not derived from ``df``.

    Parameters
    ----------
    df : ibis table expression
        Must contain a ``status`` column.

    Returns
    -------
    None
        The result is printed, not returned.
    """
    total = df.count().execute()
    if total == 0:
        # An empty table has no pass rate; avoid dividing by zero below.
        print("No measures found; pass rate is undefined.\n")
        return

    # Reference the column through the table itself so the function does not
    # depend on the module-level deferred `_`.
    passed = df.filter(df.status.isin(["Pass", "Pass*"])).count().execute()
    overall_passed = round(passed / total * 100, 2)
    print(f"{overall_passed}% Measures Passed from 1988 - 2024 \n")
    
## graphing utils
# Shared Altair color scale: "Democrat" maps to dem_blue, "Republican" to rep_red.
party_colors = alt.Scale(
    domain=["Democrat", "Republican"],
    range=[colors[key] for key in ("dem_blue", "rep_red")],
)


def year_line(df, y, group, title, y_title, percent=True):
    """Build a 600x300 line chart (with point markers) of ``y`` per year.

    Parameters
    ----------
    df : data accepted by ``alt.Chart``
        Must contain a ``year`` field plus the ``y`` and ``group`` fields.
    y : str
        Name of the quantitative field plotted on the y axis.
    group : str
        Name of the nominal field used for line color (via ``party_colors``).
    title : str
        Chart title, rendered at font size 20.
    y_title : str
        Y-axis title.
    percent : bool, default True
        Use the "%" axis format when true, otherwise the "d" format.

    Returns
    -------
    alt.Chart
        The configured chart; the color legend is disabled.
    """
    tick_format = "%" if percent else "d"
    encodings = dict(
        x="year:O",
        y=alt.Y(f"{y}:Q", title=y_title, axis=alt.Axis(format=tick_format)),
        color=alt.Color(f"{group}:N", scale=party_colors, legend=None),
    )
    base = alt.Chart(df, title=alt.TitleParams(text=title, fontSize=20))
    return base.mark_line(point=True).encode(**encodings).properties(width=600, height=300)

# Overall Pass Rate

In [None]:
# Collapse to one row per measure. For a sensitivity analysis, parties can be
# reclassified by enabling the commented-out mutate below (it needs `party_val`).
votes = get_unique_rows(votes)
# votes = votes.mutate(party=_.party.substitute({'Mixed':party_val,"None":party_val,"Other":party_val,None:party_val}))

# compute percentage passed over entire dataset
get_passed(votes)



# % of Measures Passed per Party

In [None]:
# Yearly fraction of measures that passed, for the two major parties only.
df = (
    votes
    .filter(_.party.isin(["Democrat", "Republican"]))
    .group_by(_.year, _.party)
    # mean of a 0/1 pass flag == fraction passed
    .aggregate(pass_fraction=_.status.isin(["Pass", "Pass*"]).cast("int").mean())
    .order_by(_.year)
    .execute()
)

chart = year_line(
    df,
    title="% of Measures Passed by Party",
    y="pass_fraction",
    y_title="% of measures passed",
    group="party",
)

# chart.save('percent_passed_party.png', ppi=200)
chart

### Two-tailed Z test

Let $p_D$ and $p_R$ be the ballot-measure pass rates for Democratic and Republican jurisdictions, respectively.

$H_0: p_D=p_R \\$
$H_A: p_D\neq p_R$

A two-tailed z-test shows no statistically significant difference in passage rates between Democratic and Republican measures (z = 1.01, p = 0.31, two-sided).


In [None]:
import numpy as np
from statsmodels.stats.proportion import proportions_ztest

# Measure-level outcomes (one row per landvote_id) for the two major parties,
# with a 0/1 `passed` flag.
df = votes.execute()
df = df[df["party"].isin(["Democrat", "Republican"])].assign(
    passed=lambda d: d["status"].isin(["Pass", "Pass*"]).astype(int)
)

# Per-party number passed ("sum") and number of measures ("count").
summary = df.groupby("party")["passed"].agg(["sum", "count"])

count = summary["sum"].to_numpy()    # number passed
nobs = summary["count"].to_numpy()   # total measures

z_stat, p_two_sided = proportions_ztest(count, nobs)

print("z =", z_stat)
print("two-sided p =", p_two_sided)


## Equivalence Test - two one-sided tests (TOST)

Let $\Delta = p_D - p_R$, and let $\delta$ be the equivalence margin (here $\delta = 0.05$, i.e. 5 percentage points).

$H_{0}: \Delta \leq - \delta$ or $\Delta \geq \delta\\$ 
$H_{A}: -\delta < \Delta < \delta\\$

**Short conclusion:**

- Both one-sided p-values are $<0.05$, so we reject non-equivalence.
- Pass rates are statistically equivalent within $\pm5$ percentage points.

Detailed:
The estimated difference in conservation-measure passage rates between Democratic and Republican jurisdictions is 1.6 percentage points (95% CI: $-1.6$ to $4.8$). Using the two one-sided tests (TOST) equivalence framework with a $\pm5$ percentage point margin, we reject non-equivalence ($p < 0.05$ for both bounds), indicating statistically equivalent passage rates across parties.

In [None]:
import numpy as np
from statsmodels.stats.proportion import test_proportions_2indep, confint_proportions_2indep

# Measure-level outcomes (one row per landvote_id) for the two major parties,
# with a 0/1 `passed` flag.
df = votes.execute()
df = df[df["party"].isin(["Democrat", "Republican"])].assign(
    passed=lambda d: d["status"].isin(["Pass", "Pass*"]).astype(int)
)

# Passed / total counts per party.
g = df.groupby("party")["passed"].agg(["sum", "count"])
xD = int(g.at["Democrat", "sum"])
nD = int(g.at["Democrat", "count"])
xR = int(g.at["Republican", "sum"])
nR = int(g.at["Republican", "count"])

delta = 0.05   # equivalence margin: ±5 percentage points
alpha = 0.05

# TOST, lower bound -- H0: (pD - pR) <= -delta  vs  H1: (pD - pR) > -delta
p_lo = test_proportions_2indep(
    xD, nD, xR, nR, value=-delta, alternative="larger"
).pvalue

# TOST, upper bound -- H0: (pD - pR) >= delta  vs  H1: (pD - pR) < delta
p_hi = test_proportions_2indep(
    xD, nD, xR, nR, value=delta, alternative="smaller"
).pvalue

# Equivalence is declared only when both one-sided tests reject.
equivalent = all(p < alpha for p in (p_lo, p_hi))

# effect size + CI (helpful to report)
diff = xD / nD - xR / nR
ci_lo, ci_hi = confint_proportions_2indep(xD, nD, xR, nR, method="wald")

print("diff (pD - pR) =", diff)
print("95% CI =", (ci_lo, ci_hi))
print("TOST p-values =", (p_lo, p_hi))
print("Equivalent within ±delta?", equivalent)


# Are more measures proposed in Democratic jurisdictions?

In [None]:
# Number of measures per year for each of the two major parties.
df = (
    votes
    .filter(_.party.isin(["Democrat", "Republican"]))
    .group_by("year", "party")
    .agg(freq=_.count())
    .order_by("year")
    .execute()
)

chart = year_line(
    df,
    title="Number of Conservation Ballot Measures",
    y="freq",
    y_title="# of measures",
    group="party",
    percent=False,  # counts, not fractions -> integer axis format
)

# chart.save('number_measures_party.png', ppi=200)
chart

# Do Democrats approve more conservation funding?

In [None]:
# Total conservation funds approved per year and party, counting only
# measures that passed and only the two major parties.
party_df = (
    votes
    .filter(_.party.isin(["Democrat", "Republican"]))
    .filter(_.status.isin(["Pass", "Pass*"]))
    .group_by(_.year, _.party)
    .aggregate(amount_approved=((_.conservation_funds_approved.sum())))
    .order_by('year')
    .execute()
)

chart = (
    alt.Chart(party_df)
    .mark_line(point=True)
    .encode(
        x=alt.X("year:O", title="Year"),
        y=alt.Y(
            "amount_approved:Q",
            title="Conservation funds approved ($ billions)",
            axis=alt.Axis(
                # labelExpr replaces the label text, so a separate `format`
                # would have no visible effect; convert dollars to billions
                # and apply the currency format inside the expression instead.
                labelExpr="format(datum.value / 1000000000, '$,.1f')"
            ),
        ),
        color=alt.Color(
            "party:N",
            scale=party_colors,
            legend=None
        ),
    )
    .properties(title=alt.TitleParams(text="Conservation Funds Approved by Party", fontSize=20))
)
# chart.save('amount_approved_party.png', ppi=200)
chart


### our data is heavily skewed

In [None]:

import numpy as np

# Passed measures from the two major parties with a positive approved amount,
# plus a natural-log column. Later cells reuse this `df`.
df = (
    votes.filter(_.status.isin(["Pass","Pass*"])).execute()
    .loc[lambda d: d.party.isin(["Democrat","Republican"])]
    # log-transform (drop non-positive just to be safe)
    .loc[lambda d: d["conservation_funds_approved"] > 0]
    .assign(log_amt=lambda d: np.log(d["conservation_funds_approved"]))
)

# Per-party summary of raw and log amounts.
summary_spec = dict(
    n_measures=("conservation_funds_approved", "count"),
    mean_amt=("conservation_funds_approved", "mean"),
    median_amt=("conservation_funds_approved", "median"),
    mean_log_amt=("log_amt", "mean"),
    median_log_amt=("log_amt", "median"),
    total_amt=("conservation_funds_approved", "sum"),
)
table = df.groupby("party").agg(**summary_spec)

print(table)



In [None]:
# Histogram of approved funds in raw dollars; `df` comes from the summary
# cell above (passed, major-party measures with a positive amount).
chart = alt.Chart(df).mark_bar().encode(
    x=alt.X(
        "conservation_funds_approved:Q",
        bin=alt.Bin(maxbins=10),
        title="Conservation funds approved ($ billions)",
        axis=alt.Axis(
            # labelExpr replaces the label text, so a separate `format` would
            # have no visible effect; convert dollars to billions and apply
            # the currency format inside the expression instead.
            labelExpr="format(datum.value / 1000000000, '$,.1f')"
        ),
    ),
    y=alt.Y("count()", title="Number of measures"),
).properties(
    title=alt.TitleParams(
        text="Distribution of Approved Conservation Funds (Raw Dollars)",
        fontSize=16
    ),
    width=400,
    height = 400
)
# chart.save('dist_conservation_funds.png',ppi=300)
chart

In [None]:
import numpy as np

# Add a base-10 log of the approved amount; `df` comes from the summary cell
# above, where non-positive amounts were already removed.
df = df.assign(log_funds=np.log10(df["conservation_funds_approved"]))

chart = (
    alt.Chart(df)
    .mark_bar()
    .encode(
        x=alt.X(
            "log_funds:Q",
            bin=alt.Bin(maxbins=40),
            title="log10(Conservation funds approved)",
        ),
        y=alt.Y("count()", title="Number of measures"),
    )
    .properties(
        title=alt.TitleParams(
            text="Distribution of Approved Conservation Funds (Log Scale)",
            fontSize=16,
        ),
        width=400,
        height=400,
    )
)
# chart.save('log_dist_conservation_funds.png',ppi=300)
chart

## Welch's t-test on log-transformed funds

In [None]:
import numpy as np
from scipy.stats import ttest_ind

# Passed measures from the two major parties with a known, positive approved
# amount (the log below requires positive values).
df = votes.execute()
df = df.loc[
    lambda d: d["party"].isin(["Democrat", "Republican"])
    & d["status"].isin(["Pass", "Pass*"])
    & d["conservation_funds_approved"].notna()
    & (d["conservation_funds_approved"] > 0)
].copy()

# Natural-log amounts per party.
xD = np.log(df.loc[df["party"].eq("Democrat"), "conservation_funds_approved"])
xR = np.log(df.loc[df["party"].eq("Republican"), "conservation_funds_approved"])

# equal_var=False selects Welch's unequal-variance t-test.
t, p = ttest_ind(xD, xR, equal_var=False)
print("Welch t =", t, "p =", p)
