In [2]:
from tabula import read_pdf
import pandas as pd

df = read_pdf('SummaryReport.pdf', pages='5-12', guess=False, pandas_options={'header': None})


In [3]:
import json

with open('lookup.json') as f:
    lookup = json.load(f)

lookup_race = pd.DataFrame(
        [{'key': key, **val} for (key, val) in lookup['races'].items()]
    ).set_index('key', drop=True)

lookup_candidate = pd.DataFrame(
        [{'key': key, **val} for (key, val) in lookup['candidates'].items()]
    ).append(
        {'cboe_results_name': 'WRITE-IN', 'chi_vote_name': 'Write-in', 'key': '0000000'},
        ignore_index=True
    ).set_index('key', drop=True)

In [4]:
import re
from collections import defaultdict
from pprint import PrettyPrinter

pp = PrettyPrinter(indent=4)

# Python program to illustrate the intersection 
# of two lists 
def intersection(lst1, lst2): 
  
    # Use of hybrid method 
    temp = set(lst2) 
    lst3 = [value for value in lst1 if value in temp] 
    return lst3 

#format df
iterator = df.iterrows()

contests = defaultdict(dict)

race_id = None

for idx, row in iterator:
    series = pd.Series(row)
    if series.str.contains('precincts counted').any():
        try:
            # get race name by matching to lookup
            race_name = intersection(series.values, lookup_race.cboe_results_name.values)[0]
        except IndexError:
            break
        
        # get race id using lookup
        race_id = lookup_race[lookup_race.cboe_results_name==race_name].index.values[0]
        
        # get precincts figures
        prs_str = series[series.str.contains('precincts counted', na=False)].values[0]
        prs_rpt, prs_tot = map(int, re.findall('\d+', prs_str))
        
        contests[race_id]['meta'] = [race_name, prs_rpt, prs_tot]
        contests[race_id]['cands'] = []
        
    if race_id and series.str.match(r'^[A-Z -\.]+$').any():
        try:
            cand_cboe_name = intersection(series.values, lookup_candidate.cboe_results_name.values)[0]
            cand_name = lookup_candidate[lookup_candidate.cboe_results_name == cand_cboe_name].chi_vote_name.values[0]
            
            vote_str = series[series.str.match('^[\d,]+$', na=False)]
            vote_cnt = pd.to_numeric(vote_str.str.replace(',', '')).values[0]
            
            pct_str = series[series.str.contains('%', na=False)].values[0]
            
            contests[race_id]['cands'].append([cand_name, vote_cnt, pct_str])
            
        except IndexError: # no match
            print(race_name, series)
            pass
        
    if race_id and series.str.contains('Total').any():
        # get total
        tot_str = series[series.str.match('[\d,]+', na=False)]
        vote_tot = pd.to_numeric(tot_str.str.replace(',', '')).values[0]
        contests[race_id]['meta'].append(vote_tot)
        
        contests[race_id]['cands'] = sorted(contests[race_id]['cands'], key=lambda x: x[1], reverse=True)
        
pp.pprint(contests)

Alderman 2nd Ward 0    PROCLAMATION
1             NaN
2             NaN
3             NaN
4    Page 5 of 18
Name: 45, dtype: object
Alderman 10th Ward 0    PROCLAMATION
1             NaN
2             NaN
3             NaN
4    Page 6 of 18
Name: 93, dtype: object
Alderman 17th Ward 0    PROCLAMATION
1             NaN
2             NaN
3             NaN
4    Page 7 of 18
Name: 139, dtype: object
Alderman 24th Ward 0    PROCLAMATION
1             NaN
2             NaN
3             NaN
4    Page 8 of 18
Name: 188, dtype: object
Alderman 31st Ward 0    PROCLAMATION
1             NaN
2             NaN
3             NaN
4    Page 9 of 18
Name: 233, dtype: object
Alderman 39th Ward 0     PROCLAMATION
1              NaN
2              NaN
3    Page 10 of 18
4              NaN
Name: 275, dtype: object
Alderman 46th Ward 0     PROCLAMATION
1              NaN
2              NaN
3    Page 11 of 18
4              NaN
Name: 324, dtype: object
defaultdict(<class 'dict'>,
            {   '0010': {  

In [5]:
contest_headers = ['name', 'prs_rpt', 'prs_tot', 'vote_tot']
cand_headers = ['name', 'vote_cnt', 'vote_pct']
cand_classes = ['', 'amt', 'amt append-bar']

idx_total_votes = contest_headers.index('vote_tot')
idx_cand_votes = cand_headers.index('vote_cnt')

def assign_winners(contests):
    from operator import itemgetter
    
    for idx, contest in contests.items():
        meta, cands = itemgetter('meta', 'cands')(contest)
        
        total_votes = meta[idx_total_votes]
        
        # does the candidate with the most votes have >50%?
        top_cand = cands[0]
        has_winner = top_cand[idx_cand_votes] / total_votes > 0.5
        
        # add empty winner column to cand row
        cands = list(map(lambda x: ['', *x], cands))

        if has_winner:
            cands[0] = ['✓', *cands[0][1:]]
        
        else:
            cands[0] = ['✓', *cands[0][1:]]
            cands[1] = ['✓', *cands[1][1:]]
        
        contests[idx] = {'meta': meta, 'cands': cands}
        
        
    return contests

assigned_contests = assign_winners(contests)
cand_headers = ['winner', *cand_headers]
cand_classes = ['', *cand_classes]

In [11]:
def validate_percents(contests):
    from operator import itemgetter
    
    for idx, contest in contests.items():
        meta, cands = itemgetter('meta', 'cands')(contest)
        
        total_votes = meta[idx_total_votes]
        
        # do the candidates add up to the total?
        votes = sum([c[2] for c in cands])
#         print(total_votes == votes)
        pp.pprint(cands)
        print(total_votes)
        print(votes)
        
        return
    
validate_percents(contests)

[   ['✓', 'Lori Lightfoot', 97667, '17.54%'],
    ['✓', 'Toni Preckwinkle', 89343, '16.04%'],
    ['', 'William M. Daley', 82294, '14.78%'],
    ['', 'Willie L. Wilson', 59072, '10.61%'],
    ['', 'Susana A. Mendoza', 50373, '9.05%'],
    ['', 'Amara Enyia', 44589, '8.01%'],
    ['', 'Jerry Joyce', 40099, '7.20%'],
    ['', 'Gery Chico', 34521, '6.20%'],
    ['', 'Paul Vallas', 30236, '5.43%'],
    ['', 'Garry McCarthy', 14784, '2.65%'],
    ['', 'La Shawn K. Ford', 5606, '1.01%'],
    ['', 'Robert "Bob" Fioretti', 4302, '0.77%'],
    ['', 'John Kenneth Kozlar', 2349, '0.42%'],
    ['', 'Write-in', 86, '0.02%']]
556844
555321


In [8]:
import datetime

results = {
    'contest_headers': contest_headers,
    'cand_headers': cand_headers,
    'contests': assigned_contests,
    'cand_classes': cand_classes,
    'datetime': datetime.date(2019, 3, 13).isoformat(),
    'isFinal': True
}

In [9]:
class MyEncoder(json.JSONEncoder):
    """
    We have to use a custom encoder because pandas uses special numpy
    object types that json doesn't like.
    Source: https://stackoverflow.com/a/27050186
    """

    def default(self, obj):
        import numpy as np

        if isinstance(obj, np.integer):
            return int(obj)
        elif isinstance(obj, np.floating):
            return float(obj)
        elif isinstance(obj, np.ndarray):
            return obj.tolist()
        else:
            return super(MyEncoder, self).default(obj)

with open('results.json', 'w') as outfile:
    json.dump(results, outfile, cls=MyEncoder)