# Predicting Support for U.S. State Ballot Measures

For this project, I scrape online elections resources for data on U.S. state ballot measures, including ballot text and campaign finance, as well as state demographic and electoral data, to create a predictive model for ballot measure support.

This project is a work in progress (data sources to be cited), but exhibits web scraping, data engineering, textual/quantitative data cleaning, data analysis, and machine learning.

**Packages used:** Beautiful Soup, Selenium, RegEx, Pandas, PySpark, Scikit-learn

### Load Packages

In [None]:
pip install pyspark
from pyspark import SparkContext, SparkConf

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488490 sha256=b31147f6961372843d09ea9906beff2ee71f93300d5f6828caf97cb346ca9d5e
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


In [None]:
import requests
import time
from bs4 import BeautifulSoup
import re
import pandas as pd
import numpy as np
import ast
import sklearn

## Initial Ballot Measure Scrape

Data found on Ballotpedia annual ballot measures pages, 2004-2024

In [None]:
def get_measure_link(columns):
    '''
    Build absolute URL of ballot measure page from ballot measure table row

    columns - list of ballot measure table columns HTML

    Returns ballot measure link as string
    '''
    href = columns[1].a['href']
    # site-relative links need the Ballotpedia domain prepended
    return 'https://ballotpedia.org' + href if href.startswith('/') else href

def get_state(slug):
    '''
    Get state from ballot measure slug

    slug - list of words from ballot measure URL slug (slug split on underscores)

    Returns state as string
    '''
    first_word = slug[0]
    # these first words always belong to two-word state names
    if first_word not in ('New', 'North', 'Rhode', 'South', 'West'):
        return first_word
    return first_word + ' ' + slug[1]

def get_subjects(columns, col_idx):
    '''
    Collect text of every link in subjects cell of ballot measure table row

    columns - list of ballot measure table columns HTML
    col_idx - integer index of subjects column

    Returns subjects as list
    '''
    subjects = []
    for anchor in columns[col_idx].find_all('a'):
        subjects.append(anchor.text.strip())
    return subjects

def get_approval(columns, col_idx):
    '''
    Determine whether ballot measure was approved from ballot measure table

    columns - list of ballot measure table columns HTML
    col_idx - integer index of approval column

    Returns approval status as Boolean (False when the cell has no link)
    '''
    outcome_link = columns[col_idx].a
    if outcome_link is None:
        return False
    # link titles that count as the measure passing
    passing_titles = ('Approved', 'Repealed, altered, or partially repealed')
    return outcome_link['title'] in passing_titles

# parallel accumulators filled in lockstep, one entry per scraped ballot measure
year, state, title, measuretype = [], [], [], []
subject, concluded, approved, link = [], [], [], []

In [None]:
# retrieve Ballotpedia ballot measure page HTML for each year in range of interest
for pageyear in range(2004, 2025):
    r = requests.get(f'https://ballotpedia.org/{pageyear}_ballot_measures')
    soup = BeautifulSoup(r.content, 'lxml')
    # find table for each state's ballot measures that year
    for table in soup.find_all('table', class_ = 'bptable', attrs = {'style' : 'width:auto;'}):
        # skip tables not representing states
        if 'sortable' in table['class']:
            if table.tbody.tr.th.text.strip() != 'Type':
                continue
        # iterate through rows, each representing one ballot measure
        for row in table.tbody.find_all('tr'):
            columns = row.find_all('td')
            if columns:
                # get ballot measure link
                measure_link = get_measure_link(columns)
                # get ballot measure data, skipping tables not representing states
                if not (pageyear == 2022 and columns[1].a['href'].startswith('/')):
                    year.append(pageyear)
                    link.append(measure_link)
                    slug = measure_link.split('/')[3].split('_')
                    state.append(get_state(slug))
                    measuretype.append(columns[0].text.strip())
                    title.append(columns[1].text.strip())
                    # get ballot measures subject(s)/conclusion status/approval status based on presence in table
                    if len(columns) in (5, 7):
                        subject.append(get_subjects(columns, 2))
                        concluded.append(True)
                        approved.append(get_approval(columns, 4))
                    elif len(columns) == 6:
                        subject.append([])
                        concluded.append(True)
                        approved.append(get_approval(columns, 3))
                    else:
                        concluded.append(False)
                        approved.append(None)
                        subject.append(get_subjects(columns, 2) if len(columns) == 4 else [])
    # pause between page requests
    time.sleep(5)

# create pandas DataFrame of all metrics; building it column-wise keeps a natural
# dtype per column and raises if the accumulator lists ever differ in length
bal_measures = pd.DataFrame({
    'Year': year,
    'State': state,
    'Title': title,
    'Type': measuretype,
    'Subject(s)': subject,
    'Concluded': concluded,
    'Approved': approved,
    'Link': link,
})
# exclude territories/districts (matched on the first word of the URL slug)
is_territory = bal_measures['State'].isin(('American', 'El', 'Puerto', 'U.S.', 'Washington,'))
# remove measures w/ ambiguous yes vs no options: 'Approved' is only ever True, False
# or None above, so "neither True nor False" means missing; 2024 rows are kept
is_ambiguous = bal_measures['Approved'].isna() & (bal_measures['Year'] != 2024)
# one combined mask replaces the chained boolean indexing that triggered pandas'
# "Boolean Series key will be reindexed" warning
bal_measures = bal_measures[~(is_territory | is_ambiguous)].copy()
# clean typos
bal_measures.loc[bal_measures['State'] == 'West Virginia,', 'State'] = 'West Virginia'
# reset index
bal_measures.reset_index(drop = True, inplace = True)

bal_measures

  ambig_row_idx = balmeasures[balmeasures['Approved'] != True][balmeasures['Approved'] != False][balmeasures['Year'] != 2024].index
  ambig_row_idx = balmeasures[balmeasures['Approved'] != True][balmeasures['Approved'] != False][balmeasures['Year'] != 2024].index


Unnamed: 0,Year,State,Title,Type,Subject(s),Concluded,Approved,Link
0,2004,Alabama,Amendment 1,LRCA,[Development],True,True,https://ballotpedia.org/Alabama_Industrial_Dev...
1,2004,Alabama,Amendment 2,LRCA,[Con Language],True,False,https://ballotpedia.org/Alabama_Separation_of_...
2,2004,Alabama,Amendment 3,LRCA,[Development],True,True,https://ballotpedia.org/Alabama_Economic_and_I...
3,2004,Alabama,Amendment 4,LRCA,[Development],True,True,https://ballotpedia.org/Alabama_Shrimp_and_Sea...
4,2004,Alabama,Amendment 5,LRCA,[Local Gov't],True,False,https://ballotpedia.org/Alabama_Trussville_Ann...
...,...,...,...,...,...,...,...,...
2179,2024,Wisconsin,Question 2,LRCA,[Elections],True,True,"https://ballotpedia.org/Wisconsin_Question_2,_..."
2180,2024,Wisconsin,Question 1,LRCA,[State legislatures],False,,"https://ballotpedia.org/Wisconsin_Question_1,_..."
2181,2024,Wisconsin,Question 2,LRCA,"[Administration, State legislatures, Budgets]",False,,"https://ballotpedia.org/Wisconsin_Question_2,_..."
2182,2024,Wisconsin,Citizenship Voting Requirement Amendment,LRCA,[Suffrage],False,,https://ballotpedia.org/Wisconsin_Citizenship_...


In [None]:
# create DataFrame copy
bal_measurescopy = bal_measures.copy()
# cast 'Subject(s)' as string to verify table has no duplicates
bal_measurescopy['Subject(s)'] = str(bal_measurescopy['Subject(s)'])
len(bal_measurescopy.drop_duplicates()) == len(bal_measures)

True

## Ballot Text Scrape

Data found on individual Ballotpedia ballot measure pages

In [None]:
def get_ballot_text(heading_text_options, tag, target_list, page_soup = None):
    '''
    Appends specified list with text of ballot measure

    heading_text_options - list of strings, each representing potential heading for text section
    tag - string representing HTML tag of heading
    target_list - list to append text to
    page_soup - optional parsed page to search; when omitted, the global soup
                set by the scraping loop is used

    Appends None when no heading option matches or no text block follows the heading
    '''
    page = soup if page_soup is None else page_soup
    # test heading options in order; each must be passed as string = ... because a
    # second positional argument is interpreted as an attribute filter, not as text
    heading = None
    for heading_text in heading_text_options:
        heading = page.find(tag, string = heading_text)
        if heading is not None:
            break
    # text section is the first table/blockquote after the heading
    text_block = heading.find_next(['table', 'blockquote']) if heading is not None else None
    if text_block is None:
        target_list.append(None)
        return
    # nested tags repeat the same text, so keep each line once in first-seen order
    text_lines = dict.fromkeys(line.text for line in text_block.find_all(['p', 'li', 'td', 'i']))
    target_list.append(' '.join(text_lines))

# parallel accumulators filled in lockstep, one entry per concluded ballot measure
ballot_title, ballot_summary, fiscal_impact = [], [], []
yes_votes, no_votes = [], []

In [None]:
# iterate through each link for ballot measures in DataFrame
for idx, row in bal_measures[bal_measures['Concluded'] == True].iterrows():
    measure_link = row['Link']
    # progress tracker
    print(idx)
    # request Ballotpedia ballot measure page
    r = requests.get(measure_link)
    soup = BeautifulSoup(r.content, 'lxml')
    # get ballot text for each text type (title/question, summary, fiscal impact statement)
    get_ballot_text(['Ballot title', 'Ballot question', 'Title', 'Ballot language'], 'h3', ballot_title)
    get_ballot_text(['Ballot summary', 'Petition summary', 'Official description'], 'h3', ballot_summary)
    get_ballot_text(['Fiscal impact statement'], 'h3', fiscal_impact)

    # if text section lacks subheading(s), get general ballot text as title
    if ballot_title[-1] == None and ballot_summary[-1] == None:
        ballot_title.pop(-1)
        get_ballot_text(['Text of measure'], 'h2', ballot_title)

    # get yes/no vote counts from outcome table
    result_rows = soup.find('table', attrs = {'border' : '1'})
    first_vote_count = result_rows.find(['td', 'b'], string = re.compile('^ *[0-9]+,?[0-9]*,?[0-9]* *$'))
    second_vote_count = first_vote_count.findNext('tr').find_all('td')[1]
    # determine which vote count represents yes vs no to append count to correct list
    if re.search('Yes', first_vote_count.findPrevious('td', attrs = {'colspan' : '2'}).text):
        yes_votes.append(float(first_vote_count.text.strip().replace(',', '')))
        no_votes.append(float(second_vote_count.text.strip().replace(',', '')))
    else:
        yes_votes.append(float(second_vote_count.text.strip().replace(',', '')))
        no_votes.append(float(first_vote_count.text.strip().replace(',', '')))

    time.sleep(5)

In [None]:
# get links of concluded ballot measures
concl_links = balmeasures[balmeasures['Concluded'] == True]['Link']

# create pandas DataFrame of ballot text and yes/no vote counts
text_scrape = pd.DataFrame([ballot_title, ballot_summary, fiscal_impact, yes_votes, no_votes, concl_links]).transpose()
text_scrape.columns = ['Title', 'Summary', 'Fiscal_impact', 'Yes_votes', 'No_votes', 'Link']

In [None]:
# drop fully identical scrape rows before merging
# (the former set_index('Link') calls discarded their result and had no effect;
# the merge below joins on the 'Link' column directly)
text_scrape.drop_duplicates(inplace = True)

# combine ballot measure info DataFrame with ballot text/vote count DataFrame;
# both frames have a 'Title' column, so the result carries Title_x and Title_y
bal_final = bal_measures.merge(text_scrape, on = 'Link', how = 'left')

In [None]:
# find measures where wrong table scraped for yes/no vote counts
bal_final[bal_final['Yes_votes'] < bal_final['No_votes']][bal_final['Approved'] == True]['Link']

# 956, 1380, 1381, 1383, 1384,1390, 1403, 1561 scraped wrong tables

In [None]:
# manually visit pages and recode vote counts
bal_final.loc[956, 'Yes_votes'], bal_final.loc[956, 'No_votes'] = 622316, 887957
bal_final.loc[1380, 'Yes_votes'], bal_final.loc[1380, 'No_votes'] = 381768, 377773
bal_final.loc[1381, 'Yes_votes'], bal_final.loc[1381, 'No_votes'] = 383428, 373848
bal_final.loc[1383, 'Yes_votes'], bal_final.loc[1383, 'No_votes'] = 420892, 337486
bal_final.loc[1384, 'Yes_votes'], bal_final.loc[1384, 'No_votes'] = 388273, 356621
bal_final.loc[1390, 'Yes_votes'], bal_final.loc[1390, 'No_votes'] = 1769328, 1528219
bal_final.loc[1403, 'Yes_votes'], bal_final.loc[1403, 'No_votes'] = 558631, 548732
bal_final.loc[1561, 'Yes_votes'], bal_final.loc[1561, 'No_votes'] = 365107, 237567

In [None]:
# convert U.S. state names to two-letter abbreviations

# lookup table: full state/territory name -> two-letter postal abbreviation
# dictionary sourced from https://gist.github.com/rogerallen/1583593
us_state_to_abbrev = {
    'Alabama': 'AL',
    'Alaska': 'AK',
    'Arizona': 'AZ',
    'Arkansas': 'AR',
    'California': 'CA',
    'Colorado': 'CO',
    'Connecticut': 'CT',
    'Delaware': 'DE',
    'Florida': 'FL',
    'Georgia': 'GA',
    'Hawaii': 'HI',
    'Idaho': 'ID',
    'Illinois': 'IL',
    'Indiana': 'IN',
    'Iowa': 'IA',
    'Kansas': 'KS',
    'Kentucky': 'KY',
    'Louisiana': 'LA',
    'Maine': 'ME',
    'Maryland': 'MD',
    'Massachusetts': 'MA',
    'Michigan': 'MI',
    'Minnesota': 'MN',
    'Mississippi': 'MS',
    'Missouri': 'MO',
    'Montana': 'MT',
    'Nebraska': 'NE',
    'Nevada': 'NV',
    'New Hampshire': 'NH',
    'New Jersey': 'NJ',
    'New Mexico': 'NM',
    'New York': 'NY',
    'North Carolina': 'NC',
    'North Dakota': 'ND',
    'Ohio': 'OH',
    'Oklahoma': 'OK',
    'Oregon': 'OR',
    'Pennsylvania': 'PA',
    'Rhode Island': 'RI',
    'South Carolina': 'SC',
    'South Dakota': 'SD',
    'Tennessee': 'TN',
    'Texas': 'TX',
    'Utah': 'UT',
    'Vermont': 'VT',
    'Virginia': 'VA',
    'Washington': 'WA',
    'West Virginia': 'WV',
    'Wisconsin': 'WI',
    'Wyoming': 'WY',
    'District of Columbia': 'DC',
    'American Samoa': 'AS',
    'Guam': 'GU',
    'Northern Mariana Islands': 'MP',
    'Puerto Rico': 'PR',
    'United States Minor Outlying Islands': 'UM',
    'U.S. Virgin Islands': 'VI',
}

# replace each state name with its abbreviation; names missing from the
# dictionary become NaN
bal_final['State'] = bal_final['State'].map(us_state_to_abbrev)

In [None]:
# save the merged table without the index column; list-valued 'Subject(s)' cells
# are written out as text and must be parsed back into lists after reloading
bal_final.to_csv('bal_final.csv', index = False)

In [None]:
# preview the first rows of the merged ballot measure table
bal_final.head()

Unnamed: 0,Year,State,Title_x,Type,Subject(s),Concluded,Approved,Link,Title_y,Summary,Fiscal_impact,Yes_votes,No_votes
0,2004,Alabama,Amendment 1,LRCA,[Development],True,True,https://ballotpedia.org/Alabama_Industrial_Dev...,\n Proposing an amendment to the Constitution ...,,,674466.0,535786.0
1,2004,Alabama,Amendment 2,LRCA,[Con Language],True,False,https://ballotpedia.org/Alabama_Separation_of_...,\n Proposing an amendment to the Constitution ...,,,689450.0,691300.0
2,2004,Alabama,Amendment 3,LRCA,[Development],True,True,https://ballotpedia.org/Alabama_Economic_and_I...,\n Proposing an amendment to the Constitution ...,,,727630.0,584014.0
3,2004,Alabama,Amendment 4,LRCA,[Development],True,True,https://ballotpedia.org/Alabama_Shrimp_and_Sea...,\n Proposing an amendment to the Constitution ...,,,815629.0,479767.0
4,2004,Alabama,Amendment 5,LRCA,[Local Gov't],True,False,https://ballotpedia.org/Alabama_Trussville_Ann...,\n Relating to the City of Trussville in Jeffe...,,,505628.0,623920.0


In [None]:
# import CSV for future use
bal_final = pd.read_csv('bal_final.csv')
# convert subject(s) from strings back to lists
bal_final['Subject(s)'] = bal_final['Subject(s)'].apply(lambda x: ast.literal_eval(x))

## Ballot Text Cleaning/Analysis (Work in Progress)

In [None]:
# bind the context to sc: later cells call sc.parallelize(...) and sc.stop(),
# which fail with a NameError if the returned context is not kept
sc = SparkContext.getOrCreate()
sc

In [None]:
# distribute the scraped ballot title strings as a Spark RDD
# (sc must be an active SparkContext)
ballots = sc.parallelize(ballot_title)

In [None]:
# clear text scraped not representing ballot text

In [None]:
def _clean_ballot_text(text):
    '''
    Strip scrape artifacts from one ballot text string

    text - string of scraped ballot text, or None if no text was found

    Returns cleaned string (None is passed through unchanged)
    '''
    # measures with no scraped text are stored as None
    if text is None:
        return None
    text = text.strip().strip('“”')
    # remove symbols + newline sequences (bracketed footnote markers such as [1])
    text = re.sub(r'\[[0-9]+\]|\n', '', text)
    # remove yes/no + approve/disapprove choices
    text = re.sub(r'(\[ \] )?(Approve|Yes) (\[ \] )?(Disapprove|No)$', '', text, flags = re.IGNORECASE)
    # remove (Proposed by _____)
    text = re.sub(r'\(Proposed by .+\)$', '', text)
    # TODO: collapse repeated whitespace, e.g. re.sub(r'\s{2,}', ' ', text)
    return text.strip()

# one pass per ballot text instead of a chain of separate map stages
ballots = ballots.map(_clean_ballot_text)
ballots.collect()

['Proposing an amendment to the Constitution of Alabama of 1901, to authorize Baldwin County and certain governmental entities within the county to have certain powers for the promotion of economic and industrial development in Baldwin County and the municipalities therein.',
 'Proposing an amendment to the Constitution of Alabama of 1901, to repeal portions of Section 256 and Amendment 111 relating to separation of schools by race and repeal portions of Amendment 111 concerning constitutional construction against the right to education, and to repeal Section 259, Amendment 90, and Amendment 109 relating to the poll tax.']

In [None]:
# fill in missing subjects with scikit-learn classification

In [None]:
# shut down the Spark context once the text cleaning is finished
sc.stop()

## State Demographics Scrape/Cleaning

ACS Demographic data tables - https://data.census.gov/table/ACSDP5Y2010.DP05?q=DP05&g=010XX00US$0400000&moe=false

ACS Selected Population data tables - https://data.census.gov/table/ACSSPP1Y2010.S0201?q=2010%20population%20profile&t=-02

In [None]:
def read_dem_df(df_name):
    '''
    Reads and cleans Excel file of state demographic data

    df_name - string representing name of Excel file (without the .xlsx extension)

    Returns pandas DataFrame of state demographic data, indexed by (State, Measure)
    '''
    # read the 'Data' sheet with no header row and flip it so sheet rows become columns
    df = pd.read_excel(f'{df_name}.xlsx', sheet_name = 'Data', header = None).transpose()
    # first row after the transpose holds the column names
    df.columns = df.iloc[0]
    # forward-fill the first column so every row carries a value
    # (presumably the state name appears only once per group of rows; verify against the sheet)
    df.iloc[1:,0] = df.iloc[1:,0].ffill(axis = 0)
    # first two columns become the (State, Measure) row index; the rest are the values
    row_mi = pd.MultiIndex.from_arrays([df.iloc[1:,0], df.iloc[1:,1]], names = ['State', 'Measure'])
    values = df.iloc[1:,2:].values
    df_multi = pd.DataFrame(values, index = row_mi, columns = df.columns[2:])

    # '(X)' marks unavailable cells; treat them as missing
    df_multi.replace('(X)', np.nan, inplace=True)
    # drop columns that contain no truthy value at all
    df_multi.drop(df_multi.columns[~df_multi.any()], axis = 1, inplace = True)

    # work on strings so percent signs and thousands separators can be stripped
    df_multi = df_multi.astype(str)
    for col_idx in range(len(df_multi.columns)):
        # errors = 'ignore' leaves a column as-is when it cannot be cast to float
        df_multi.iloc[:,col_idx] = df_multi.iloc[:,col_idx].str.rstrip('%').str.replace(',', '').astype('float', errors = 'ignore')

    # optional CSV export (disabled): df_multi.to_csv(f'{df_name}_multi.csv')

    return df_multi

def read_pop_df(df_name):
    '''
    Reads and cleans Excel file of state selected population data

    df_name - string representing name of Excel file (without the .xlsx extension)

    Returns pandas DataFrame of state selected population data with
    two-level (Measure, Submeasure) columns
    '''
    # read the 'Data' sheet with no header row and flip it so sheet rows become columns;
    # column labels are then the sheet's integer row positions
    df = pd.read_excel(f'{df_name}.xlsx', sheet_name = 'Data', header = None)
    df = df.transpose()

    # get upper level column measure indices
    # (columns with no truthy value below the first row, i.e. headings without data)
    upper_cols = df.columns[~df.iloc[1:].any()]
    # fill missing measure names
    df.iloc[0,0] = 'State'
    # get lower level column measure indices (columns that do hold data below the first row)
    lower_cols = df.columns[df.iloc[1:].any()]
    # create list of upper level column names for upper level of pandas MultiIndex
    # (starts with '' for the first column, which has no upper level heading)
    upper_names = ['']
    for upper_idx in range(len(upper_cols)):
        cur_col = upper_cols[upper_idx]
        # repeat upper level measure name appendage for number of lower level measures within
        if upper_idx != len(upper_cols) - 1:
            # columns strictly between this heading and the next heading
            for _ in range(upper_cols[upper_idx + 1] - cur_col - 1):
                upper_names.append(df.iloc[0, cur_col])
        else:
            # last heading covers every remaining column
            for _ in range(len(df.columns) - cur_col - 1):
                upper_names.append(df.iloc[0, cur_col])
    # create list of lower level names
    lower_names = [df.iloc[0, col] for col in lower_cols]
    # remove unwanted column names (the second and third entries)
    for _ in range(2):
        lower_names.pop(1)
    # create pandas MultiIndex
    col_mi = pd.MultiIndex.from_arrays([upper_names, lower_names], names = ['Measure', 'Submeasure'])

    # find and clean NAs
    df.replace('(X)', np.nan, inplace=True)
    # drop original upper level columns with only NAs
    df.drop(upper_cols, axis = 1, inplace = True)

    # create pandas DataFrame with new MultiIndex
    # (rows from the fifth onward; first column plus every column from the fourth onward)
    values = df.iloc[4:,np.r_[0,3:len(df.columns)]].values
    df_multi = pd.DataFrame(values, columns = col_mi)

    # drop unwanted columns (the second and third columns)
    df_multi.drop(df_multi.iloc[:,1:3], axis = 1, inplace = True)
    # cast all DataFrame values to string type
    df_multi = df_multi.astype(str)
    # clean values and cast back to floats
    for col_idx in range(len(df_multi.columns)):
        # errors = 'ignore' leaves a column as-is when it cannot be cast to float
        df_multi.iloc[:,col_idx] = df_multi.iloc[:,col_idx].str.rstrip('%').str.replace(',', '').astype('float', errors = 'ignore')

    # optional CSV export (disabled): df_multi.to_csv(f'{df_name}.csv')

    return df_multi

In [None]:
# read all ACS demographic tables
dem22 = read_dem_df('acsdem22')
dem21 = read_dem_df('acsdem21')
dem20 = read_dem_df('acsdem20')
dem19 = read_dem_df('acsdem19')
dem18 = read_dem_df('acsdem18')
dem17 = read_dem_df('acsdem17')
dem16 = read_dem_df('acsdem16')
dem15 = read_dem_df('acsdem15')
dem14 = read_dem_df('acsdem14')
dem13 = read_dem_df('acsdem13')
dem12 = read_dem_df('acsdem12')
dem11 = read_dem_df('acsdem11')
dem10 = read_dem_df('acsdem10')
# read all ACS selected population tables
pop22 = read_pop_df('acspop22')
pop21 = read_pop_df('acspop21')
pop19 = read_pop_df('acspop19')
pop18 = read_pop_df('acspop18')
pop17 = read_pop_df('acspop17')
pop16 = read_pop_df('acspop16')
pop15 = read_pop_df('acspop15')
pop14 = read_pop_df('acspop14')
pop13 = read_pop_df('acspop13')
pop12 = read_pop_df('acspop12')
pop11 = read_pop_df('acspop11')
pop10 = read_pop_df('acspop10')

In [None]:
# display the cleaned 2022 demographic table as a spot check
dem22

Unnamed: 0_level_0,Unnamed: 1_level_0,Total population,Male,Female,Sex ratio (males per 100 females),Under 5 years,5 to 9 years,10 to 14 years,15 to 19 years,20 to 24 years,25 to 34 years,...,Asian alone,Native Hawaiian and Other Pacific Islander alone,Some Other Race alone,Two or More Races.2,Two races including Some Other Race,"Two races excluding Some Other Race, and three or more races",Total housing units,"Citizen, 18 and over population",Male.3,Female.3
State,Measure,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Alabama,Estimate,5074296,2461248.0,2613048.0,94.2,284064.0,311592.0,318523.0,340067.0,345126.0,652152.0,...,76682.0,1954.0,19927.0,169880.0,23323.0,146557.0,2339582.0,3862490.0,1838188.0,2024302.0
Alabama,Percent,5074296,48.5,51.5,,5.6,6.1,6.3,6.7,6.8,12.9,...,1.5,0.0,0.4,3.3,0.5,2.9,,3862490.0,47.6,52.4
Alaska,Estimate,733583,385667.0,347916.0,110.9,46497.0,49637.0,51178.0,44794.0,52259.0,113900.0,...,44905.0,14539.0,4034.0,78552.0,7898.0,70654.0,329160.0,537962.0,286333.0,251629.0
Alaska,Percent,733583,52.6,47.4,,6.3,6.8,7.0,6.1,7.1,15.5,...,6.1,2.0,0.5,10.7,1.1,9.6,,537962.0,53.2,46.8
Arizona,Estimate,7359197,3678381.0,3680816.0,99.9,393413.0,438358.0,463552.0,487734.0,522484.0,1018691.0,...,257020.0,13887.0,36042.0,284994.0,54738.0,230256.0,3186554.0,5322581.0,2640533.0,2682048.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Wisconsin,Percent,5892539,50.2,49.8,,5.2,5.7,6.3,6.5,6.9,12.5,...,2.9,0.0,0.3,3.7,0.9,2.8,,4511381.0,49.7,50.3
Wyoming,Estimate,581381,297855.0,283526.0,105.1,30444.0,37271.0,38701.0,38287.0,36882.0,72434.0,...,3577.0,294.0,4997.0,22911.0,6611.0,16300.0,277106.0,442829.0,225657.0,217172.0
Wyoming,Percent,581381,51.2,48.8,,5.2,6.4,6.7,6.6,6.3,12.5,...,0.6,0.1,0.9,3.9,1.1,2.8,,442829.0,51.0,49.0
Puerto Rico,Estimate,3221789,1528789.0,1693000.0,90.3,99432.0,133195.0,170757.0,197196.0,216771.0,405834.0,...,2671.0,0.0,3573.0,3592.0,773.0,2819.0,1598570.0,2666630.0,1242825.0,1423805.0


## State Partisanship Scrape

To measure state partisanship, I operationalize a state's partisanship in a given year as the party affiliation of the officeholders most recently elected by that state (the governor, the majority of its U.S. Senators, and the majority of its U.S. Representatives), together with the state's most recent majority vote for U.S. President. Each of these four components contributes a quarter of the year's partisanship score.

Partisanship scores range from -1.0 (most Democratic) to 1.0 (most Republican). Independent officeholders and split offices (e.g., 1 Democratic and 1 Republican Senator) count as a point in neither direction.

Example:
Democratic governor (-.25) + Split Senators (.00) + Majority Republican Representatives (.25) + Last voted Republican president (.25) = .25 partisanship score for year

Presidential vote history data: https://ballotpedia.org/Presidential_voting_history_by_state

State office partisanship data: https://ballotpedia.org/Historical_and_potential_changes_in_trifectas



In [None]:
def get_data_tables(url):
    '''
    Get data table for each state from webpage

    url - string representing URL of page to scrape

    Returns list of (state name, HTML table) tuples for the first 50 state headings
    '''
    r = requests.get(url)
    soup = BeautifulSoup(r.content, 'lxml')
    # every h3 whose text contains a letter is a candidate state heading
    headings = soup.find_all('h3', string = re.compile('[A-Za-z]'))
    # exclude D.C. by name; the earlier pattern placed the name inside a character
    # class, which matched every heading and excluded nothing
    # NOTE(review): assumes the heading text is exactly 'Washington, D.C.' - confirm
    states = [heading for heading in headings if heading.text.strip() != 'Washington, D.C.'][:50]
    # pair each state name with the first table that follows its heading
    return [(state.text.strip(), state.find_next('table')) for state in states]

In [None]:
pres_tables = get_data_tables('https://ballotpedia.org/Presidential_voting_history_by_state')
trifecta_tables = get_data_tables('https://ballotpedia.org/Historical_and_potential_changes_in_trifectas')

partisanship_dic = {}
# quarter-point contribution of each vote; any other value (independent, split) adds 0
vote_points = {'R': 1/4, 'D': -1/4}

# add key-value pair of state and historical partisanship scores from 2004-2024 for each state
# (loop names deliberately differ from the module-level scrape lists `state` and `year`;
# both pages are assumed to list states in the same order)
for state_idx, (state_name, pres_table) in enumerate(pres_tables):
    trifecta_table = trifecta_tables[state_idx][1]
    votes_by_year = []

    # presidential vote tracker
    pres_votes = pres_table.find_all('tr')[1]
    pres_year_counter = 2004
    for pres_cell in pres_votes.find_all('td')[27:]:
        # append vote list with vote for number of years vote was most recent
        # (the 2020 vote also covers 2024, hence five years instead of four)
        reps = 5 if pres_year_counter == 2020 else 4
        for _ in range(reps):
            votes_by_year.append([pres_cell.text.strip()])
        pres_year_counter += 4

    # governor, senate, house officeholder trifecta
    # for each state office, append vote list with party of that year's officeholders
    for office_row in trifecta_table.find_all('tr')[1:]:
        office_votes = office_row.find_all('td')[13:]
        for year_idx, year_votes in enumerate(votes_by_year):
            year_votes.append(office_votes[year_idx].text.strip())

    # convert vote lists to partisanship scores
    partisanship_dic[state_name] = [
        sum(vote_points.get(vote, 0) for vote in year_votes)
        for year_votes in votes_by_year
    ]

partisanship = pd.DataFrame.from_dict(partisanship_dic, orient = 'index')
partisanship.columns = range(2004, 2025)

In [None]:
# display the state-by-year partisanship scores
partisanship

Unnamed: 0,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,...,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024
Alabama,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
Alaska,1.0,1.0,1.0,0.5,0.5,0.5,0.5,0.5,0.5,1.0,...,0.75,0.75,0.25,0.25,0.75,0.75,0.75,0.75,0.5,0.5
Arizona,0.5,0.5,0.5,0.5,0.5,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,0.5,0.5,0.5,0.0,0.0
Arkansas,0.0,0.0,0.0,-0.5,-0.5,-0.5,-0.5,-0.5,-0.5,0.5,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
California,-0.5,-0.5,-0.5,-0.5,-0.5,-0.5,-0.5,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
Colorado,1.0,0.0,0.0,-0.5,-1.0,-1.0,-1.0,-0.5,-0.5,-1.0,...,-0.5,-0.5,-0.5,-0.5,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
Connecticut,-0.5,-0.5,-0.5,-0.5,-0.5,-0.5,-0.5,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
Delaware,-0.5,-0.5,-0.5,-0.5,-0.5,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
Florida,1.0,1.0,1.0,1.0,0.5,0.5,0.25,0.5,0.5,0.5,...,0.5,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
Georgia,0.5,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,0.5,0.5,0.5,0.5,0.5


In [None]:
# save the scores, keeping the state names stored in the index
partisanship.to_csv('partisanship.csv')

In [None]:
# reload the saved scores, using the first CSV column (state names) as the index
partisanship = pd.read_csv('partisanship.csv', index_col = 0)

## Voter Turnout Scrape

Voter turnout data: https://ballotpedia.org/Voter_turnout_in_United_States_elections

In [None]:
r = requests.get('https://ballotpedia.org/Voter_turnout_in_United_States_elections')
soup = BeautifulSoup(r.content, 'lxml')

turnout_table = soup.find_all('table', class_ = 'wikitable')[1]

# for each state, append turnout data list with list of recorded voter turnout from 2022 to 2004
# (first kept row is the header; the last cell of every row is dropped; the loop name
# deliberately differs from the module-level scrape list `state`)
turnout_data = [
    [cell.text.strip() for cell in table_row.find_all(['td', 'th'])[:-1]]
    for table_row in turnout_table.find_all(['tr'])[1:54]
]

turnout = pd.DataFrame(turnout_data[1:], columns = turnout_data[0])
turnout.replace(['NA', 'N/A'], np.nan, inplace=True)

# clean turnout data to floating-point numbers
# assigning by column label replaces each column outright so it ends up float dtype;
# positional .iloc assignment can leave the column stored as object dtype
for turnout_col in turnout.columns[1:]:
    turnout[turnout_col] = turnout[turnout_col].astype(str).str.rstrip('%').astype('float') / 100

turnout

Unnamed: 0,State,2022,2020,2018,2016,2014,2012,2010,2008,2006,2004
0,Alabama,0.3774,0.6313,0.473,0.593,0.332,,0.433,0.61,,0.574
1,Alaska,0.5078,0.6876,0.546,0.618,0.548,0.589,0.529,0.683,0.512,0.696
2,Arizona,0.4944,0.6592,0.491,0.562,0.341,0.53,0.416,0.574,0.396,0.548
3,Arkansas,0.4195,0.5607,0.414,0.531,0.403,0.511,0.379,0.529,0.389,0.544
4,California,0.4343,0.685,0.496,0.584,0.307,0.557,0.459,0.617,0.412,0.596
5,Colorado,0.5845,0.7641,0.63,0.721,0.547,0.706,0.517,0.716,0.481,0.673
6,Connecticut,0.4888,0.7149,0.544,0.654,0.425,0.614,0.459,,0.477,0.662
7,Delaware,0.4319,0.7068,0.514,0.646,0.349,,0.49,0.658,0.429,0.645
8,District of Columbia,0.4358,0.6408,0.437,0.611,0.357,0.616,0.296,0.617,0.287,0.549
9,Florida,0.4935,0.7166,0.549,0.657,0.433,0.633,0.422,0.666,0.401,0.647


In [None]:
# save the cleaned turnout table without the index column
turnout.to_csv('turnout.csv', index = False)

## Cost of Voting Index Cleaning

Cost of Voting Index: https://costofvotingindex.com/data

In [None]:
# load the Cost of Voting Index spreadsheet
covi = pd.read_excel('covi.xlsx')
# display only the columns at positions 1, 2 and 5 (state, year, FinalCOVI)
covi.iloc[:,[1,2,5]]

Unnamed: 0,state,year,FinalCOVI
0,AL,1996,-0.399114
1,AK,1996,0.663697
2,AZ,1996,0.821447
3,AR,1996,0.346197
4,CA,1996,0.530294
...,...,...,...
395,VA,2022,-0.740231
396,WA,2022,-2.450240
397,WV,2022,-0.189976
398,WI,2022,1.231225


## Campaign Finance Scrape (Work in Progress)

Below, I use Selenium to scrape OpenSecrets (opensecrets.org) for data on money raised for ballot measure campaigns.

In [None]:
pip install google_colab_selenium

Collecting google_colab_selenium
  Downloading google_colab_selenium-1.0.14-py3-none-any.whl.metadata (2.7 kB)
Collecting selenium (from google_colab_selenium)
  Downloading selenium-4.23.1-py3-none-any.whl.metadata (7.1 kB)
Collecting trio~=0.17 (from selenium->google_colab_selenium)
  Downloading trio-0.26.0-py3-none-any.whl.metadata (8.8 kB)
Collecting trio-websocket~=0.9 (from selenium->google_colab_selenium)
  Downloading trio_websocket-0.11.1-py3-none-any.whl.metadata (4.7 kB)
Collecting outcome (from trio~=0.17->selenium->google_colab_selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium->google_colab_selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Collecting h11<1,>=0.9.0 (from wsproto>=0.14->trio-websocket~=0.9->selenium->google_colab_selenium)
  Downloading h11-0.14.0-py3-none-any.whl.metadata (8.2 kB)
Downloading google_colab_selenium-1.0.14-py3-none-any.whl (8.

In [None]:
import google_colab_selenium as gs # Copyright (c) 2023 Jacob Padilla, https://github.com/jpjacobpadilla/Google-Colab-Selenium/tree/main/google_colab_selenium
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# from selenium import webdriver
import re
import undetected_chromedriver as uc

driver = gs.UndetectedChrome()

# one [year, title, link] list per ballot measure; support/oppose totals are appended later,
# so entries must be mutable lists rather than tuples
measures = []

# try/finally guarantees the browser is closed even if a page times out
try:
    # loop names deliberately differ from the module-level scrape list `year`
    for finance_year in range(2004, 2025):
        driver.get(f'https://www.opensecrets.org/ballot-measures/{finance_year}')
        WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, '//*[@id="DataTables_Table_0_next"]')))
        # walk through every page of the measures table
        while True:
            # re-locate the table body on each page, since earlier references may go stale
            measure_table = driver.find_element(By.XPATH, '//*[@id="DataTables_Table_0"]/tbody')
            for table_row in measure_table.find_elements(By.TAG_NAME, 'tr'):
                # search within the row; searching from the driver returns the page's first link
                measure_anchor = table_row.find_element(By.TAG_NAME, 'a')
                measure_title = measure_anchor.text
                measure_link = measure_anchor.get_attribute('href')
                measures.append([finance_year, measure_title, measure_link])
            next_button = driver.find_element(By.XPATH, '//*[@id="DataTables_Table_0_next"]')
            # NOTE(review): assumes the next button gets a 'disabled' class on the last
            # page (standard DataTables markup) - confirm against the live site
            if 'disabled' in (next_button.get_attribute('class') or '').split():
                break
            next_button.click()

    for measure in measures:
        support = 0
        oppose = 0
        driver.get(measure[2])
        finance_table = driver.find_element(By.XPATH, '//*[@id="DataTables_Table_0"]/tbody')
        for table_row in finance_table.find_elements(By.TAG_NAME, 'tr'):
            # find_elements (plural) returns the list of cells that can be indexed
            cells = table_row.find_elements(By.TAG_NAME, 'td')
            sup_or_opp = cells[2].text
            raised = float(cells[3].text.strip('$').replace(',', ''))
            if sup_or_opp == 'Support':
                support += raised
            else:
                oppose += raised
        measure.append(support)
        measure.append(oppose)
finally:
    driver.quit()

finance = pd.DataFrame(measures, columns = ['Year', 'Title', 'Link', 'Support_dollars', 'Oppose_dollars'])
finance

TimeoutException: Message: 
Stacktrace:
#0 0x587b8ab4871a <unknown>
#1 0x587b8a819640 <unknown>
#2 0x587b8a868c0b <unknown>
#3 0x587b8a868ef1 <unknown>
#4 0x587b8a8acb64 <unknown>
#5 0x587b8a88b90d <unknown>
#6 0x587b8a8aa08a <unknown>
#7 0x587b8a88b683 <unknown>
#8 0x587b8a85bd71 <unknown>
#9 0x587b8a85c7de <unknown>
#10 0x587b8ab102ab <unknown>
#11 0x587b8ab14242 <unknown>
#12 0x587b8aafd665 <unknown>
#13 0x587b8ab14dd2 <unknown>
#14 0x587b8aae22af <unknown>
#15 0x587b8ab37eb8 <unknown>
#16 0x587b8ab38090 <unknown>
#17 0x587b8ab474ec <unknown>
#18 0x7c5d0358aac3 <unknown>


In [None]:
# close the Selenium browser session (cleanup for when the scrape cell raises
# before reaching its own driver.quit())
driver.quit()