In [None]:
#Data source
#Main website is https://www.sos.ca.gov/elections/prior-elections/statewide-election-results
#From there, navigate to the General Election section for the relevant year, and then
#open "Voter Registration Statistics by County"

#Notes
#1) The source files are one-page PDFs whose text I copied into plain text files (.rtf for 2014 and 2016, .txt for 2018)
#https://elections.cdn.sos.ca.gov/sov/2014-general/pdf/02-voter-reg-stats-by-county.pdf
#https://elections.cdn.sos.ca.gov/sov/2016-general/sov/02-voter-reg-stats-by-county.pdf
#https://elections.cdn.sos.ca.gov/sov/2018-general/sov/02-county-voter-reg-stats-by-county.pdf

#2) Data is only available every two years, so each file is used to cover a two-year period.
#   For example, the 2014 data is assigned to both 2014 and 2015.

In [None]:
import pandas as pd
import re

In [None]:
#Load the raw text of each registration report into memory
def _read_text(path):
    """Return the full contents of the text file at `path`."""
    with open(path, "r") as handle:
        return handle.read()

voter_data_2014 = _read_text("/work/assets/ca_county_voter_reg_2014.rtf")
voter_data_2016 = _read_text("/work/assets/ca_county_voter_reg_2016.rtf")
voter_data_2018 = _read_text("/work/assets/ca_county_voter_reg_2018.txt")


In [None]:
#Preview only the start of the raw text; displaying the whole file floods the cell output
voter_data_2014[:500]

'{\\rtf1\\ansi\\ansicpg1252\\cocoartf2580\n\\cocoatextscaling0\\cocoaplatform0{\\fonttbl\\f0\\froman\\fcharset0 Times-Roman;}\n{\\colortbl;\\red255\\green255\\blue255;\\red0\\green0\\blue0;}\n{\\*\\expandedcolortbl;;\\cssrgb\\c0\\c0\\c0;}\n\\margl1440\\margr1440\\vieww11520\\viewh8400\\viewkind0\n\\deftab720\n\\pard\\pardeftab720\\partightenfactor0\n\n\\f0\\fs24 \\cf2 \\expnd0\\expndtw0\\kerning0\n\\outl0\\strokewidth0 \\strokec2 Alameda 1,036,648 814,009 451,535 108,919 16,622 97 10,072 4,210 2,936 46,644 172,974 Alpine 878 764 280 228 30 0 13 6 1 7 199 Amador 26,593 20,798 6,467 9,185 909 1 119 224 49 58 3,786 Butte 167,802 117,503 38,793 42,484 4,165 13 1,446 1,245 502 1,711 27,144 Calaveras 35,473 27,068 8,180 11,344 1,274 4 240 369 99 234 5,324 Colusa 12,296 7,595 2,542 3,341 202 0 21 36 13 2 1,438 Contra Costa 718,685 527,521 258,862 126,134 14,029 36 3,347 3,161 1,174 1,343 119,435 Del Norte 18,253 12,750 4,153 4,548 593 1 97 118 55 176 3,009 El Dorado 135,707 106,931 30,540 45,

In [None]:
#Set up regex patterns - the column layout is different in different years.
#Each row of the source text is a county name followed by whitespace-separated
#counts. The patterns are applied with re.VERBOSE, where unescaped whitespace
#in a pattern is ignored, so a literal space is written as [ ].

def _build_row_pattern(count_fields):
    """Return a regex (raw string) that matches one county row.

    count_fields -- names of the numeric columns, in the order they appear in
                    the row. Each becomes a named group matching digits and
                    thousands separators.
    """
    #County names must start and end with a letter, so the separator that
    #precedes the name is not captured as part of it. Any extra spaces between
    #the name and the first count are consumed outside the group.
    county = r"(?P<county>[A-Za-z]+(?:[ ]+[A-Za-z]+)*)[ ]*"
    counts = [r"(?P<%s>[\d,]+)" % name for name in count_fields]
    #Fields are separated by exactly one whitespace character
    return r"\s".join([county] + counts)

#NOTE: the group name 'liberterian' (sic) is kept as-is because later cells
#use it as a column name.

#2014: includes an Americans Elect column
pattern_2014 = _build_row_pattern([
    'eligible',               #eligible voters
    'registered',             #registered voters
    'democrat',               #registered as Democratic
    'republican',             #registered as Republican
    'american_independent',   #registered as American Independent
    'americans_elect',        #registered as Americans Elect
    'green',                  #registered as Green
    'liberterian',            #registered as Libertarian
    'peace_and_freedom',      #registered as Peace and Freedom
    'other',                  #registered as other
    'no_party',               #registered as No Party Preference
])

#2016: same as 2014 without the Americans Elect column
pattern_2016 = _build_row_pattern([
    'eligible',               #eligible voters
    'registered',             #registered voters
    'democrat',               #registered as Democratic
    'republican',             #registered as Republican
    'american_independent',   #registered as American Independent
    'green',                  #registered as Green
    'liberterian',            #registered as Libertarian
    'peace_and_freedom',      #registered as Peace and Freedom
    'other',                  #registered as other
    'no_party',               #registered as No Party Preference
])

#2018: adds an "unknown" column before "other"
pattern_2018 = _build_row_pattern([
    'eligible',               #eligible voters
    'registered',             #registered voters
    'democrat',               #registered as Democratic
    'republican',             #registered as Republican
    'american_independent',   #registered as American Independent
    'green',                  #registered as Green
    'liberterian',            #registered as Libertarian
    'peace_and_freedom',      #registered as Peace and Freedom
    'unknown',                #registered but not known
    'other',                  #registered as other
    'no_party',               #registered as No Party Preference
])

In [None]:
#Run regex and put results in dataframes, remove commas and convert fields to integer

def _registration_frame(rows, year):
    """Build a county-indexed DataFrame of integer counts for one year.

    rows -- list of dicts produced by `match.groupdict()`; every dict has a
            'county' key plus string counts that may contain thousands separators
    year -- value stored in the 'year' column of the returned frame

    Raises ValueError when `rows` is empty, which means the pattern did not
    match anything in the input text.
    """
    if not rows:
        raise ValueError(f"No county rows matched for year {year}; check the pattern and the input file")
    frame = pd.DataFrame(rows)
    #The regex can capture the separator in front of the county name; strip it
    #so the index holds clean names
    frame['county'] = frame['county'].str.strip()
    frame = frame.set_index('county')
    frame = frame.apply(lambda col: col.str.replace(',', '', regex=False)).astype('int32')
    frame['year'] = year
    return frame

#Data is published every two years, so each file is reused for the following year.

#2014 and 2015
result_2014 = [item.groupdict() for item in re.finditer(pattern_2014,voter_data_2014,re.VERBOSE)]
result_2014_df = _registration_frame(result_2014, 2014)
result_2015_df = _registration_frame(result_2014, 2015)

#2016 and 2017
result_2016 = [item.groupdict() for item in re.finditer(pattern_2016,voter_data_2016,re.VERBOSE)]
result_2016_df = _registration_frame(result_2016, 2016)
result_2017_df = _registration_frame(result_2016, 2017)

#2018 and 2019
result_2018 = [item.groupdict() for item in re.finditer(pattern_2018,voter_data_2018,re.VERBOSE)]
result_2018_df = _registration_frame(result_2018, 2018)
result_2019_df = _registration_frame(result_2018, 2019)


In [None]:
#Standardize fields across the years
#Check used to confirm that all columns to the right of registered sum to registered
#(expected total difference is 0):
# result_2018_df['all_reg']= result_2018_df['democrat']+result_2018_df['republican']+result_2018_df['american_independent']+result_2018_df['green']+result_2018_df['liberterian']+result_2018_df['peace_and_freedom']+result_2018_df['unknown']+result_2018_df['other']+result_2018_df['no_party']-result_2018_df['registered']
# result_2018_df['all_reg'].sum()

def _with_other_party(frame, minor_cols):
    """Return a new frame where `minor_cols` are summed into 'other_party' and then dropped.

    Each frame is combined using only its own columns, so a year can never
    accidentally pick up another year's counts.
    """
    #Cast back to int32 so the new column has the same dtype as the count columns
    combined = frame[minor_cols].sum(axis=1).astype('int32')
    return frame.assign(other_party=combined).drop(columns=minor_cols)

#Create aggregate field for other/unknown parties and drop the columns we don't need
result_2014_df = _with_other_party(result_2014_df, ['americans_elect','other'])
result_2015_df = _with_other_party(result_2015_df, ['americans_elect','other'])
result_2016_df = _with_other_party(result_2016_df, ['other'])
result_2017_df = _with_other_party(result_2017_df, ['other'])
result_2018_df = _with_other_party(result_2018_df, ['unknown','other'])
result_2019_df = _with_other_party(result_2019_df, ['unknown','other'])

In [None]:
#Stack the six yearly frames into a single df, then show the row count per year
yearly_frames = [
    result_2014_df,
    result_2015_df,
    result_2016_df,
    result_2017_df,
    result_2018_df,
    result_2019_df,
]
all_years_df = pd.concat(yearly_frames)
all_years_df['year'].value_counts()

2014    58
2015    58
2016    58
2017    58
2018    58
2019    58
Name: year, dtype: int64

In [None]:
#Create usable features for ml analysis

# % of eligible voters who are registered
all_years_df['registered_pct'] = all_years_df['registered']/all_years_df['eligible'] * 100

# % of registered voters in each party; the list order sets the output column order
party_cols = ['democrat', 'republican', 'american_independent', 'green',
              'liberterian', 'peace_and_freedom', 'no_party', 'other_party']
for party in party_cols:
    all_years_df[f'{party}_pct'] = all_years_df[party]/all_years_df['registered'] * 100

# drop the original count columns that are no longer needed
all_years_df = all_years_df.drop(columns=['eligible', 'registered'] + party_cols)


In [None]:
#Preview the first five rows of the engineered features
all_years_df.head(n=5)

Unnamed: 0_level_0,year,registered_pct,democrat_pct,republican_pct,american_independent_pct,green_pct,liberterian_pct,peace_and_freedom_pct,no_party_pct,other_party_pct
county,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Alameda,2014,78.523182,55.470517,13.380565,2.041992,1.237333,0.517193,0.360684,21.249642,5.742074
Alpine,2014,87.015945,36.649215,29.842932,3.926702,1.701571,0.78534,0.13089,26.04712,0.91623
Amador,2014,78.208551,31.094336,44.1629,4.370613,0.57217,1.077027,0.2356,18.203673,0.283681
Butte,2014,70.024791,33.014476,36.155673,3.54459,1.230607,1.059547,0.427223,23.100687,1.467197
Calaveras,2014,76.305923,30.220186,41.909266,4.706665,0.886656,1.363233,0.365746,19.668982,0.879267


In [None]:
#Persist the features (county index included) for the supervised/unsupervised learning notebooks
output_csv_path = '/work/cleaned-csvs/ca_voter_reg_2014_2019.csv'
all_years_df.to_csv(output_csv_path)

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=f6c76417-5fde-42f3-8920-755838dec3fa' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>