# Parse Archived Statement of Votes, Orange County (CA)

## Requirements

In [1]:
#import sys
#!{sys.executable} -m pip install requests
#!{sys.executable} -m pip install beautifulsoup4

import requests
from bs4 import BeautifulSoup
import re
import zipfile
import os
import shutil
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'

import csv



## Set URL parameters

### Define function for creating temp directory

In [2]:
def create_directory(path):
    try:
        # Create the directory if it doesn't exist
        os.makedirs(path, exist_ok=True)
    except Exception as e:
        print(f"Failed to create directory '{path}'. Reason: {e}")

### Define incoming and temp paths

In [3]:
# URL to scrape
url = 'https://www.ocvote.gov/data/election-results-archives/archived-statement-of-votes'

# URL prefix
url_prefix = 'https://www.ocvote.gov'

# Local destination
local_prefix = '/Users/alisonpitt/Documents/Data Sets/oc_vote/'
create_directory(local_prefix)

## Fetch and parse index page contents

In [4]:
# Send a GET request to fetch the content of the page
response = requests.get(url)

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(response.text, 'html.parser')
section = soup.find(class_="page__body")
list_of_values = section.find_all('p')

## Parse site for headings and links

In [5]:
# List to store the results
link_list = []

# Initialize the current election, link name and file name
current_election = ''
link = ''
dl_filename = ''

# Initialize list index
index = 0

# Loop through all <p> elements
for element in list_of_values:
    # Check if the element has a hyperlink
    if 'a href=' in str(element):
        link_type = element.string
        link = url_prefix + element.find('a')['href']
        dl_filename = link.rsplit('/', 1)[-1]
        # Only record the link if it's a zip file (aka the plain-text extract)
        if dl_filename[-3:] == 'zip':
            link_list.append((index, current_election, link_type, link, dl_filename))
            index = index + 1
    
    # Otherwise
    else:
        current_election = element.string

link_list

[(0,
  'June 4, 2024 City of Anaheim District 3 Special Recall Election',
  'Plain-text extract for data analysis (media.zip).',
  'https://www.ocvote.gov/sites/default/files/elections/anarcl2024/results/media.zip',
  'media.zip'),
 (1,
  'March 5, 2024\xa0Presidential Primary Election',
  'Plain-text extract for data analysis (media.zip).',
  'https://www.ocvote.gov/fileadmin/live/PRI2024/media.zip',
  'media.zip'),
 (2,
  'November 14, 2023 City of Santa Ana Special Recall Election',
  'Plain-text extract for data analysis (media.zip).',
  'https://www.ocvote.gov/fileadmin/live/sarcl2023/media.zip',
  'media.zip'),
 (3,
  'October\xa03, 2023 City of Anaheim Special Municipal Election',
  'Plain-text extract for data analysis (media.zip).',
  'https://www.ocvote.gov/fileadmin/live/2023ANA/media.zip',
  'media.zip'),
 (4,
  'January 31, 2023 City of Seal Beach Districts 3 and 5 Municipal Run-Off Election',
  'Plain-text extract for data analysis (media.zip).',
  'https://www.ocvote.gov

## Download files one by one and append to a dataframe

To do this:
- Set up the file for download
- Download the remote zip file
- Extract zip file (throw error if more than one file)
- Union with full dataframe
- Delete temp files

### Define functions for parsing and appending

In [6]:
def delete_file(path):
    try:
        # Delete the zip file if it exists
        os.remove(path)
    except Exception as e:
        print(f"Failed to delete zip file '{path}'. Reason: {e}")

def is_first_row_value(file_path, value):
    with open(file_path, 'r', encoding='latin-1') as file:
        reader = csv.reader(file)
        first_row = next(reader)  # Get the first row
        return value in first_row

def detect_delimiter(file_path):
    comma_count = 0
    tab_count = 0
    
    with open(file_path, 'r', encoding='latin-1') as file:
        # Read the first 3 lines
        for _ in range(3):
            line = file.readline()
            if not line:
                break
            # Update the counts
            comma_count += line.count(',')
            tab_count += line.count('\t')

    # Determine the delimiter
    if comma_count > tab_count:
        delimiter = ','
    elif tab_count > comma_count:
        delimiter = '\t'
    else:
        delimiter = None  # Could not determine

    return delimiter

### Download all files

In [7]:
full_df = pd.DataFrame()

for item in link_list:
    #print(f'--- Iteration {item[0]} ---')
    # Set up the file for download
    file_url = item[3] # Specify the URL of the file to download

    # Define the temp file
    local_tempfile = local_prefix + 'temp.zip'

    # Send a GET request to the URL
    dl_file = requests.get(file_url)

    # Write the content of the response to a local file
    with open(local_tempfile, 'wb') as file:
        file.write(dl_file.content)

    #print(f'File {file_url} was written to {local_tempfile}')

    # Extract the zip file
    with zipfile.ZipFile(local_tempfile, 'r') as zip_ref:
        temp_filenames = zip_ref.namelist()
        zip_ref.extractall(local_prefix)
    
    # Read all extracted files into a pandas dataframe
    for name in temp_filenames:
        # Check for delimiter changes and whether to skip first row
        # Detect delimiter
        detected_delimiter = detect_delimiter(local_prefix + name)
        if detected_delimiter in [',', '\t']:
            # Check if '#FormatVersion 1' is in the first row
            if is_first_row_value(local_prefix + name, '#FormatVersion 1'):
                df = pd.read_csv(local_prefix + name, delimiter=detected_delimiter, skiprows=1, encoding='latin-1')
            else:
                df = pd.read_csv(local_prefix + name, delimiter=detected_delimiter, encoding='latin-1')
        else:
            print(f'Delimiter unknown. File {name} from {item[1]} was skipped.')
        df['Election ID'] = item[0]
        df['Election Name'] = item[1]
        full_df = pd.concat([df, full_df], ignore_index=True)

    # Delete the zip file
    delete_file(local_tempfile)

    # Delete the extracted file
    for name in temp_filenames:
        delete_file(local_prefix + name)

In [8]:
print(full_df)

         Precinct_Name  Split_Name  precinct_splitId  Reg_voters  Ballots  \
0               2001.0         NaN           28135.0       794.0    534.0   
1               2001.0         NaN           28135.0       794.0    534.0   
2               2001.0         NaN           28135.0       794.0    534.0   
3               2001.0         NaN           28135.0       794.0    534.0   
4               2001.0         NaN           28135.0       794.0    534.0   
...                ...         ...               ...         ...      ...   
2717209            NaN         NaN               NaN         NaN      NaN   
2717210            NaN         NaN               NaN         NaN      NaN   
2717211            NaN         NaN               NaN         NaN      NaN   
2717212            NaN         NaN               NaN         NaN      NaN   
2717213            NaN         NaN               NaN         NaN      NaN   

         Reporting  Contest_id                 Contest_title Contest_party 

## Transform the data

To do this:
- Split the dataframe into Precinct data and Election data
- Aggregate Precinct data to get to the right level of detail
- Additional cleaning as needed
- Export Precinct data and Election data to csv for Tableau ingestion

In [9]:
#print(full_df.head())
col_list = full_df.columns.to_list()
print(sorted(col_list))

['#Precinct', '.Precinct', 'AI Ballots Cast', 'AI Turnout Percentage', 'AI Voters', 'Absentee_votes', 'Ballots', 'Ballots Cast', 'Ballots_cast', 'Candidate_Type', 'Candidate_name', 'Choice ID', 'Choice Name', 'Choice Party', 'Choice_id', 'Choice_party', 'Contest ID', 'Contest Party', 'Contest Title', 'Contest_id', 'Contest_party', 'Contest_title', 'DEM Ballots Cast', 'DEM Turnout Percentage', 'DEM Voters', 'Early VC Invalid Votes', 'Early VC Overvotes', 'Early VC Undervotes', 'Early VC Votes', 'Early Vote Center Invalid Votes', 'Early Vote Center Overvotes', 'Early Vote Center Undervotes', 'Early Vote Center Votes', 'Early_votes', 'Election Day Vote Center Invalid Votes', 'Election Day Vote Center Overvotes', 'Election Day Vote Center Undervotes', 'Election Day Vote Center Votes', 'Election Days VC Invalid Votes', 'Election Days VC Overvotes', 'Election Days VC Undervotes', 'Election Days VC Votes', 'Election Days Vote Center Invalid Votes', 'Election Days Vote Center Overvotes', 'Elec

### Build Precinct data

In [10]:
precinct_data = pd.DataFrame()

# Coalesce dimensions
precinct_data['precinct'] = full_df.loc[:, ('#Precinct', '.Precinct', 'Precinct_Name')].bfill(axis=1).iloc[:, 0]
precinct_data['election_id'] = full_df.loc[:, ('Election ID')]

In [11]:
# Sum measures
## Voters
mask_voters = full_df.columns.str.contains('Voters|voters')
voters = full_df.loc[:, mask_voters]
precinct_data['registered_voters'] = voters.sum(axis=1)

## Ballots
mask_ballots = full_df.columns.str.contains('Ballots|ballots')
ballots = full_df.loc[:, mask_ballots]
precinct_data['ballots_total'] = ballots.sum(axis=1)

### Aggregate Precinct data to correct level of details (by precinct)

In [12]:
precinct_data = precinct_data.groupby(['precinct', 'election_id']).min().reset_index()

### Build Election data

In [13]:
election_data = pd.DataFrame()

# Coalese dimensions
election_data['precinct']       = precinct_data['precinct']
election_data['election_id']    = precinct_data['election_id']
election_data['election_name']  = full_df['Election Name']
election_data['contest_id']     = full_df['Contest ID'].combine_first(full_df['Contest_id'])
election_data['contest_party']  = full_df['Contest Party'].combine_first(full_df['Contest_party'])
election_data['concest_title']  = full_df['Contest Title'].combine_first(full_df['Contest_title'])
election_data['choice_id']      = full_df['Choice ID'].combine_first(full_df['Choice_id'])
election_data['choice_name']    = full_df['Choice Name'].combine_first(full_df['Candidate_name'])
election_data['choice_party']   = full_df['Choice Party'].combine_first(full_df['Choice_party'])
election_data['choice_type']    = full_df['Candidate_Type']

In [14]:
# Sum measures
## Votes
mask_votes = full_df.columns.str.contains('Votes|votes')
votes = full_df.loc[:, mask_votes]
election_data['total_votes'] = votes.sum(axis=1)

In [15]:
precinct_rows, precinct_columns = precinct_data.shape
election_rows, election_columns = election_data.shape
print(f'Precinct data has {precinct_rows} rows and {precinct_columns} columns.')
print(f'Election data has {election_rows} rows and {election_columns} columns.')

Precinct data has 25422 rows and 4 columns.
Election data has 25422 rows and 11 columns.


## Export to CSV

In [16]:
precinct_data.to_csv(local_prefix + 'precincts.csv', index=False)
election_data.to_csv(local_prefix + 'elections.csv', index=False)