# Votes: Clean voting data

Basic cleaning: 
* Add a header row
* Comma-separate (not this stupid pipe crap)
* Capitalise all names and address fields (Street name, city)

Column-specific cleaning:
* Clean up dirt in the voting column "city"
* If street name is numeric, then swap apt and street name (affects 27000 people in ESSEX)
* If apt/unit no. is filled and street name is empty, put apt/unit no. contents into street name
* Many birthdates are 01/01/1800 (1200 of these in Essex); some are 01/01/1111 or 12/31/1899, and some are blank. All invalid birthdates (including blanks) are set to 01/01/1800.

Notes and remaining issues: 
* TODO: The last row in each votes file is the number of voters. This row is removed, but the number it gives doesn't match the number of voter ids in the file: check them?
* TODO: VAN only takes 10-digit phone numbers. Turned everything else into blanks, but will need to think about how to potentially use these other numbers too (e.g. are there dominant local area codes we can use?). Weird thing of the day is that there's a dominant 3-digit code (732): does this mean something like "do not contact", or "doesn't have phone"?
* Ignored: IF mailing address is empty, copy over the street address
* Burlington county: why is Egg Harbor in here?
* Whole pile of votes in Sussex with city as "sussex" (they're in Wantage etc; think clerk may have been lazy here... )

In [8]:
# Just run this section!!!

import pandas as pd
import csv
import zipfile
import os
import glob
import csv 
import re
import njvotes
pd.set_option("display.max_columns", 999)


# Column names, column dtypes and per-county city fixes all come from the
# project's njvotes helper module.
voteheaders = njvotes.get_voteheaders()
voteheadertypes = njvotes.get_voteheadertypes()
citycorrections = njvotes.set_citycorrections()

# One pass per county: read the raw pipe-delimited file, clean it, and write
# a comma-separated copy (with header row) alongside the original.
for county in citycorrections:
    print(county)
    votedatafile = '../2017 Voting data/' + county + '/ElectionHistory.txt'

    # The last row of the raw file is a voter-count comment rather than a
    # record, so it is cut off; the 'xx' column is discarded and missing
    # values become empty strings.
    df = (
        pd.read_csv(votedatafile, sep='|', names=voteheaders, dtype=voteheadertypes)[:-1]
        .drop('xx', axis=1)
        .fillna('')
    )

    # Normalise city names (lower case, no surrounding spaces), then apply
    # this county's known corrections one after another.
    # NOTE(review): str.replace is called without an explicit `regex=`, so
    # whether the keys are treated as patterns depends on the installed
    # pandas version -- confirm against the corrections table.
    city = df['city'].str.lower().str.strip()
    for wrong, right in citycorrections[county].items():
        city = city.str.replace(wrong, right)
    df['city'] = city

    # Address repairs: exchange street name and apt/unit no. wherever the
    # street name is blank, then again wherever the street name is numeric.
    address_pair = ['street name', 'apt/unit no.']
    reversed_pair = ['apt/unit no.', 'street name']
    idx = df['street name'] == ''
    df.loc[idx, address_pair] = df.loc[idx, reversed_pair].values
    idx = df['street name'].str.isnumeric()
    df.loc[idx, address_pair] = df.loc[idx, reversed_pair].values

    # Title-case people and place names, trimming stray whitespace
    title_columns = [
        'last name', 'first name', 'middle name', 'prefix', 'suffix',
        'street name', 'city', 'election name', 'municipality',
    ]
    for col in title_columns:
        df[col] = df[col].str.title().str.strip()

    # Collapse every placeholder birthdate (and blanks) onto 01/01/1800 so
    # that a single value marks missing data.
    placeholder_dates = ['01/01/1111', '12/31/1899', '']
    df.loc[df['birth date'].isin(placeholder_dates), 'birth date'] = '01/01/1800'

    # Write the cleaned table next to the input, replacing the extension
    df.to_csv(os.path.splitext(votedatafile)[0] + '_cleaned.csv', index=False)

WARREN
BURLINGTON
HUDSON
MONMOUTH
MORRIS
GLOUCESTER
CAMDEN
MIDDLESEX
MERCER
SUSSEX
BERGEN
CAPE MAY
OCEAN
ATLANTIC
UNION
ESSEX
SOMERSET
SALEM
PASSAIC
CUMBERLAND
HUNTERDON


# All the checks done to create the corrections lists above

In [4]:
# Get the 'official' list of towns in this county
# County under inspection, written in upper case: the same string is used
# below as the key into citycorrections and as the data directory name.
county = 'WARREN'

def capcase(x):
    """Return x with each space-separated word capitalised.

    The first character of every word is upper-cased and the rest are
    lower-cased (e.g. 'CAPE MAY' -> 'Cape May'). Splitting is on single
    spaces only, so runs of spaces are preserved as-is.
    """
    words = x.split(' ')
    return ' '.join(map(str.capitalize, words))

# Corrections already recorded for the county being checked
corrs = citycorrections[county]

# Load the statewide place-name table (tab-separated; column names are
# supplied here).
alltowns = pd.read_csv(
    '../2017 other data/localnames_nj.txt',
    sep='\t',
    names=['Local Name', 'Municipality', 'County'],
)

# The table's County column is compared against the capitalised form of the
# county name, so convert before filtering.
in_this_county = alltowns['County'] == capcase(county)
ctowns = alltowns[in_this_county]

# Optional debugging views, left disabled:
# print('{}'.format(ctowns))
# muns = ctowns['Municipality'].unique()
# muns.sort()
# print('{}'.format(muns))

# Distinct local names for this county (displayed as the cell result)
ctowns['Local Name'].unique()

array(['Allamuchy', 'Allamuchy Township', 'Allens Mills', 'Alpha',
       'Alphano', 'Anderson', 'Andover Furnace', 'Asbury', 'Bass Lake',
       'Belvidere', 'Bettystown', 'Blair Lake', 'Blairstown',
       'Blairstown Township', 'Brainards', 'Brass Castle', 'Bridgeville',
       'Broadway', 'Browning', 'Buckwood Park', 'Butlers Park',
       'Buttzville', 'Calno', 'Carpentersville', 'Catfish Pond',
       'Cedar Lake', 'Changewater', 'Columbia', 'Cooks Pond',
       'Coopersville', 'Cornish', 'Deckers Ferry', 'Delaware',
       'Delaware Park', 'Denville', 'Dunnfield', 'Ebenezer ', 'Feebletown',
       'Finesville', 'Fort Golden', 'Foul Rift', 'Franklin Grove',
       'Franklin Township', 'Frelinghuysen Township', 'Glovers Pond',
       'Great Meadows', 'Greenwich Township', 'Hackettstown', 'Hainesburg',
       'Hardwick', 'Hardwick Center', 'Hardwick Township', 'Harmony',
       'Harmony Station', 'Harmony Township', 'Hazen', 'Hope',
       'Hughesville ', 'Huntington', 'Hutchinson'

In [5]:
# List every distinct city value that appears in this county's raw voting file

votedatafile = '../2017 Voting data/' + county + '/ElectionHistory.txt'

# Read these identifier-like columns as text rather than letting pandas
# infer a numeric type for them.
text_columns = ['phone number', 'zip5', 'zip4', 'county precinct']
df = pd.read_csv(
    votedatafile,
    sep='|',
    names=voteheaders,
    index_col=None,
    low_memory=False,
    dtype=dict.fromkeys(text_columns, 'str'),
)

# Lower-case only (no stripping or corrections) so the raw variants stay visible
df['city'] = df['city'].str.lower()

# Cast to str so missing values show up as 'nan' and the array can be sorted
cits = df['city'].astype(str).unique()
cits.sort()
cits

array(['allamuchy', 'alpha', 'andover', 'asbury', 'belvidere',
       'blairstown', 'bloomsbury', 'broadway', 'columbia', 'delaware',
       'great meadows', 'hackettstown', 'hampton', 'hope', 'johnsonburg',
       'mansfield', 'milford', 'nan', 'newton', 'oxford', 'phillipsburg',
       'port murray', 'roselle park', 'stanhope', 'stewarsville',
       'stewartsville', 'stewartville', 'vienna', 'washington'], dtype=object)

In [7]:
# Show every row of the loaded voting data whose city is exactly "vienna"
df[df['city'].eq("vienna")]

Unnamed: 0,voter id,status code,party code,last name,first name,middle name,prefix,suffix,sex,street number,suffix a,suffix b,street name,apt/unit no.,address line 1,address line 2,city,state,zip5,zip4,mailing street number,mailing suffix a,mailing suffix b,mailing street name,mailing apt/unit no.,mailing address line 1,mailing address line 2,mailing city,mailing state,mailing country,mailing zip code,birth date,date registered,county precinct,municipality,ward,district,phone number,election date,election name,election type,election category,ballot type,xx
202350,118074434,A,UNA,BORBONE,DEBORAH,A,,,F,25,,,WATER ST,,PO BOX 412,,vienna,NJ,07880,,,,,,,,,,,,,08/20/1960,05/28/2004,11800115,INDEPENDENCE,0.0,1.0,908,11/03/2009,STATE GENERAL ELECTION,GEN,S,M,
202351,118074434,A,UNA,BORBONE,DEBORAH,A,,,F,25,,,WATER ST,,PO BOX 412,,vienna,NJ,07880,,,,,,,,,,,,,08/20/1960,05/28/2004,11800115,INDEPENDENCE,0.0,1.0,908,11/04/2008,GENERAL ELECTION,GEN,S,M,
202352,118074434,A,UNA,BORBONE,DEBORAH,A,,,F,25,,,WATER ST,,PO BOX 412,,vienna,NJ,07880,,,,,,,,,,,,,08/20/1960,05/28/2004,11800115,INDEPENDENCE,0.0,1.0,908,11/05/2013,GENERAL ELECTION,GEN,S,M,
202353,118074434,A,UNA,BORBONE,DEBORAH,A,,,F,25,,,WATER ST,,PO BOX 412,,vienna,NJ,07880,,,,,,,,,,,,,08/20/1960,05/28/2004,11800115,INDEPENDENCE,0.0,1.0,908,11/04/2014,GENERAL ELECTION,GEN,S,M,
202354,118074434,A,UNA,BORBONE,DEBORAH,A,,,F,25,,,WATER ST,,PO BOX 412,,vienna,NJ,07880,,,,,,,,,,,,,08/20/1960,05/28/2004,11800115,INDEPENDENCE,0.0,1.0,908,11/08/2016,GENERAL ELECTION,GEN,S,M,
202430,118033522,IF,REP,BRANDT,JASMINE,A,,,F,3,,,WATER ST,,,,vienna,NJ,07880,,,,,PO BOX 425,,,,VIENNA,NJ,,07880,07/10/1951,09/08/1992,11800115,INDEPENDENCE,0.0,1.0,,04/27/2011,STATE SCHOOL ELECTION,ANS,S,M,
202431,118033522,IF,REP,BRANDT,JASMINE,A,,,F,3,,,WATER ST,,,,vienna,NJ,07880,,,,,PO BOX 425,,,,VIENNA,NJ,,07880,07/10/1951,09/08/1992,11800115,INDEPENDENCE,0.0,1.0,,06/08/2010,STATE PRIMARY 2010,PRI,S,M,
202432,118033522,IF,REP,BRANDT,JASMINE,A,,,F,3,,,WATER ST,,,,vienna,NJ,07880,,,,,PO BOX 425,,,,VIENNA,NJ,,07880,07/10/1951,09/08/1992,11800115,INDEPENDENCE,0.0,1.0,,06/02/2009,STATE PRIMARY ELECTION,PRI,S,M,
202433,118033522,IF,REP,BRANDT,JASMINE,A,,,F,3,,,WATER ST,,,,vienna,NJ,07880,,,,,PO BOX 425,,,,VIENNA,NJ,,07880,07/10/1951,09/08/1992,11800115,INDEPENDENCE,0.0,1.0,,11/03/2009,STATE GENERAL ELECTION,GEN,S,M,
202434,118033522,IF,REP,BRANDT,JASMINE,A,,,F,3,,,WATER ST,,,,vienna,NJ,07880,,,,,PO BOX 425,,,,VIENNA,NJ,,07880,07/10/1951,09/08/1992,11800115,INDEPENDENCE,0.0,1.0,,02/05/2008,PRESIDENTIAL PRIMARY,PRI,S,M,
