In [10]:
import os
import zipfile
import pandas as pd

# Filesystem layout, anchored at the current working directory at the time
# this cell runs. DOWNLOAD_DIR is the folder the unzip/move helpers read from;
# the three *_RESULTS_DIR folders live directly under SOURCE_DIR.
DOWNLOAD_DIR = os.path.join(os.getcwd(), 'election-results-scraper/scraped_files')
SOURCE_DIR = os.path.join(os.getcwd(), 'data/source/')
COUNTY_RESULTS_DIR, PRECINCT_RESULTS_DIR, STATE_RESULTS_DIR = (
    os.path.join(SOURCE_DIR, subdir)
    for subdir in (
        'county_election_results',
        'precinct_election_results',
        'state_election_results',
    )
)


def unzipCountyResults(downloadDir=None, destDir=None):
    """Extract every county-level results archive found in the download folder.

    A file counts as a county archive when the second underscore-separated
    token of its name starts with ``County``.

    Args:
        downloadDir: Folder to scan for archives. Defaults to ``DOWNLOAD_DIR``.
        destDir: Folder to extract into. Defaults to ``COUNTY_RESULTS_DIR``.

    Raises:
        zipfile.BadZipFile: If a matching file is not a valid zip archive.
    """
    if downloadDir is None:
        downloadDir = DOWNLOAD_DIR
    if destDir is None:
        destDir = COUNTY_RESULTS_DIR

    # Sorted so archives are unpacked in the same order on every run
    # (os.listdir makes no ordering promise).
    for f in sorted(os.listdir(downloadDir)):
        tokens = f.split('_')
        # Names without an underscore have no second token; indexing [1] on
        # them would raise IndexError, so they are skipped.
        if len(tokens) < 2 or not tokens[1].startswith('County'):
            continue
        with zipfile.ZipFile(os.path.join(downloadDir, f), 'r') as z:
            z.extractall(destDir)


def unzipPrecinctResults(downloadDir=None, destDir=None):
    """Extract every precinct-level results archive found in the download folder.

    A file counts as a precinct archive when the second underscore-separated
    token of its name starts with ``Precinct``.

    Args:
        downloadDir: Folder to scan for archives. Defaults to ``DOWNLOAD_DIR``.
        destDir: Folder to extract into. Defaults to ``PRECINCT_RESULTS_DIR``.

    Raises:
        zipfile.BadZipFile: If a matching file is not a valid zip archive.
    """
    if downloadDir is None:
        downloadDir = DOWNLOAD_DIR
    if destDir is None:
        destDir = PRECINCT_RESULTS_DIR

    # Sorted so archives are unpacked in the same order on every run
    # (os.listdir makes no ordering promise).
    for f in sorted(os.listdir(downloadDir)):
        tokens = f.split('_')
        # Names without an underscore have no second token; indexing [1] on
        # them would raise IndexError, so they are skipped.
        if len(tokens) < 2 or not tokens[1].startswith('Precinct'):
            continue
        with zipfile.ZipFile(os.path.join(downloadDir, f), 'r') as z:
            z.extractall(destDir)


def moveStateResults(downloadDir=None, destDir=None):
    """Move state-level results files out of the download folder.

    A file counts as a state results file when the second
    underscore-separated token of its name starts with ``State``.

    Args:
        downloadDir: Folder to scan. Defaults to ``DOWNLOAD_DIR``.
        destDir: Folder to move the files into; created if it does not exist.
            Defaults to ``STATE_RESULTS_DIR``.
    """
    if downloadDir is None:
        downloadDir = DOWNLOAD_DIR
    if destDir is None:
        destDir = STATE_RESULTS_DIR

    def isStateFile(fileName):
        # Names without an underscore have no second token; indexing [1] on
        # them would raise IndexError, so they are treated as non-matching.
        tokens = fileName.split('_')
        return len(tokens) > 1 and tokens[1].startswith('State')

    stateResultsFiles = sorted(f for f in os.listdir(downloadDir) if isStateFile(f))
    # The message previously contained a stray "$" before the source path.
    print(
        f"Moving {len(stateResultsFiles)} files from {downloadDir} to {destDir}")
    # os.rename does not create the destination folder, so make sure it exists.
    os.makedirs(destDir, exist_ok=True)
    for f in stateResultsFiles:
        os.rename(os.path.join(downloadDir, f),
                  os.path.join(destDir, f))


In [30]:
# Load every file in STATE_RESULTS_DIR and stack them into one frame.
# os.listdir() returns names in arbitrary order, so the list is sorted to keep
# the concatenated row order (and therefore .head()) reproducible across runs.
stateResultsFiles = sorted(os.listdir(STATE_RESULTS_DIR))
# NOTE: ignore_index is not set, so each file keeps its own row labels and
# index values repeat across files.
df_state_res = pd.concat(
    [
        pd.read_csv(
            os.path.join(STATE_RESULTS_DIR, f),
            engine='python',
            delimiter=",",
            encoding='utf-8',
            index_col=False,
        )
        for f in stateResultsFiles
    ]
)
df_state_res.head()

Unnamed: 0,elec_date,entity_description,race_number,race_description,race_party,tot_race_prec,race_prec_reporting,cand_number,cand_name,cand_party,cand_absmail_votes,cand_early_votes,cand_elecday_votes,cand_tot_votes,race_county_owner
0,4/7/2015,"FEDERAL, STATE AND COUNTY",11001,FOR STATE SENATOR DISTRICT 11 (UNEXPIRED TERM),DEM,35.0,35.0,1,KEVIN MATTHEWS,DEM,88,73,1458,1619,
1,4/7/2015,"FEDERAL, STATE AND COUNTY",11001,FOR STATE SENATOR DISTRICT 11 (UNEXPIRED TERM),DEM,35.0,35.0,2,REGINA GOODWIN,DEM,73,95,1149,1317,
2,4/7/2015,"FEDERAL, STATE AND COUNTY",11001,FOR STATE SENATOR DISTRICT 11 (UNEXPIRED TERM),DEM,35.0,35.0,3,HEATHER NASH,DEM,17,12,457,486,
3,4/7/2015,BECKHAM COUNTY,30001,COUNTY QUESTION,,13.0,13.0,1,FOR THE PROPOSITION - YES,,41,34,1272,1347,BECKHAM
4,4/7/2015,BECKHAM COUNTY,30001,COUNTY QUESTION,,13.0,13.0,2,AGAINST THE PROPOSITION - NO,,15,15,919,949,BECKHAM


In [55]:
# Count the distinct values / combinations for several candidate race keys,
# to see which column set identifies a race uniquely.
for label, cols in (
    ("Race numbers", ['race_number']),
    ("Race description", ['race_description']),
    ("Race numbers + dates", ['race_number', 'elec_date']),
    ("Race numbers + dates + description",
     ['race_number', 'elec_date', 'race_description', 'race_party',
      'tot_race_prec', 'race_prec_reporting']),
):
    distinctRows = df_state_res[cols].drop_duplicates()
    print(f"{label}: {len(distinctRows.index)}")


Race numbers: 2645
Race description: 4368
Race numbers + dates: 8384
Race numbers + dates + description: 8384


In [42]:
# Number of distinct (race_number, elec_date, race_description) combinations
# per entity, shown largest first.
c = (
    df_state_res[['entity_description', 'race_number', 'elec_date', 'race_description']]
    .drop_duplicates()
    .groupby('entity_description')
    .size()
)
c.sort_values(ascending=False)

entity_description
FEDERAL, STATE AND COUNTY                                        1095
CITY OF TULSA                                                      64
CITY OF LAWTON                                                     55
CITY OF NORMAN                                                     51
CITY OF ALTUS                                                      51
                                                                 ... 
GARFIELD COUNTY (EMS)                                               1
STRINGTOWN PUBLIC SCHOOLS INDEPENDENT SCHOOL DISTRICT NO. 007       1
STRINGTOWN PUBLIC SCHOOLS                                           1
STRATFORD PUBLIC SCHOOLS INDEPENDENT SCHOOL DISTRICT NO. 002        1
ZION PUBLIC SCHOOL                                                  1
Length: 1439, dtype: int64

In [53]:
# Preview the first distinct (race_number, elec_date, race_description) rows.
(
    df_state_res
    .drop_duplicates(subset=['race_number', 'elec_date', 'race_description'])
    [['race_number', 'elec_date', 'race_description']]
    .head()
)

Unnamed: 0,race_number,elec_date,race_description
0,11001,4/7/2015,FOR STATE SENATOR DISTRICT 11 (UNEXPIRED TERM)
3,30001,4/7/2015,COUNTY QUESTION
5,30002,4/7/2015,COUNTY QUESTION
7,30004,4/7/2015,COUNTY QUESTION
9,40201,4/7/2015,FOR COMMISSIONER WARD 2 CITY OF CHEROKEE


In [50]:
def loadElectionDates(engine, jsonPath='./data/source/election_dates.json'):
    """Load the election-date lookup from JSON into the ``election_dates`` table.

    The source ``date`` field (a ``YYYYMMDD`` value) is copied to ``id`` and
    then parsed into a datetime column. An existing ``election_dates`` table
    is replaced.

    Args:
        engine: Connection handed to ``DataFrame.to_sql`` as ``con``.
        jsonPath: Location of the JSON file to read.

    Raises:
        ValueError: If a ``date`` value does not match ``%Y%m%d``.
    """
    # convert_dates=False: by default read_json may try to interpret a column
    # named "date" as epoch timestamps; the raw YYYYMMDD value must survive
    # untouched because it becomes the id.
    df_dates = pd.read_json(jsonPath, convert_dates=False)
    df_dates['id'] = df_dates['date']
    # Vectorized parse instead of a row-by-row strptime via DataFrame.apply.
    df_dates['date'] = pd.to_datetime(
        df_dates['date'].astype(str), format='%Y%m%d')
    df_dates.to_sql('election_dates', con=engine, if_exists='replace')


Unnamed: 0,date,label,id
0,2012-11-06,General Election,20121106
1,2012-08-28,Runoff Primary Election,20120828
2,2012-06-26,Primary Election and Special Elections,20120626
3,2012-04-03,Board of Education Runoff Election and Special...,20120403
4,2012-03-06,Presidential Preferential Primary and Special ...,20120306


In [52]:
from datetime import datetime  # no longer used in this cell; kept in case other cells rely on the name

# Stage the election-date lookup as CSV with columns id, date, label.
# convert_dates=False: by default read_json may try to interpret a column
# named "date" as epoch timestamps; the raw YYYYMMDD value must survive
# untouched because it becomes the id.
df_dates = pd.read_json('./data/source/election_dates.json', convert_dates=False)
df_dates['id'] = df_dates['date']
# Vectorized parse instead of a row-by-row strptime via DataFrame.apply.
df_dates['date'] = pd.to_datetime(
    df_dates['date'].astype(str), format='%Y%m%d')
df_dates[['id', 'date', 'label']].to_csv(
    './data/staging/election_dates.csv', index=False)


In [61]:
from etl.loadStateResults import loadStateResults

# Run the project's state-results loader and echo whatever it returns.
rc = loadStateResults()
print("Loaded state results", rc)

Loaded state results       race_number   elec_date  \
0            6061   6/11/2013   
1            6061   8/26/2014   
2            6141    4/1/2014   
3            6141    4/2/2013   
4            6241   10/8/2013   
...           ...         ...   
8379        60721   8/11/2015   
8380        60731   11/3/2020   
8381        60731   11/8/2016   
8382        60732   11/8/2016   
8383        60751  11/14/2017   

                                       race_description race_party  \
0                        PROPOSITION  BLAINE COUNTY EMS       None   
1                     PROPOSITION OKEENE PUBLIC SCHOOLS       None   
2             PROPOSITION  CITY OF NORMAN - PROPOSITION       None   
3            PROPOSITION   CITY OF NORMAN - PROPOSITION       None   
4                           PROPOSITION  SPECIAL COUNTY       None   
...                                                 ...        ...   
8379             PROPOSITION  BERRYHILL FIRE PROTECTION       None   
8380  FOR BOARD MEMBER

race_number
elec_date
race_description
race_party
tot_race_prec
race_prec_reporting
race_tot_votes
race_absmail_votes
race_early_votes
race_elecday_votes
race_num_candidates
