In [2]:
import csv
import requests
from bs4 import BeautifulSoup as bs
import Levenshtein as ls
from pickle import dump, load

In [3]:
# Read the candidate table once, then keep only the (name, district)
# pairs that the Ballotpedia crawl consumes.
with open('Main_candidate_information.csv') as csvfile:
    rows = list(csv.DictReader(csvfile))
data = [(row['candidate'], row['state_dist']) for row in rows]

data[0]

('Aaron Hermes', 'TX-22')

In [None]:
def parse_html(candidate_html):
    """Pull the general-election candidate names out of a candidate page.

    The page is walked in this order: the ``span`` with ``id="Elections"``,
    the element right after it (whose text must be ``2020``), the next
    ``div.electionsectionheading`` (whose text must contain ``General``),
    and the next ``div.votebox``.

    Args:
        candidate_html: HTML text of the page to inspect.

    Returns:
        A list of stripped names, one per non-empty
        ``td.votebox-results-cell--text`` cell of the votebox, or ``None``
        when any of the landmarks above is missing or does not match.
    """
    soup = bs(candidate_html, 'html.parser')

    # look for elections
    pointer = soup.find('span', id='Elections')
    if pointer is None:
        return None

    # look for election year; find_next() gives None when nothing follows
    year = pointer.find_next()
    if year is None or year.text.strip() != '2020':
        return None

    potential_table = year.find_next('div', attrs={'class': 'votebox'})
    last_section_heading = year.find_next('div', attrs={'class': 'electionsectionheading'})

    # look for votebox; a missing heading is treated like a non-general one
    # instead of raising AttributeError on `.text`
    if last_section_heading is None or 'General' not in last_section_heading.text:
        if potential_table is not None:
            print('BUGGY TABLE')
        return None

    if potential_table is None:
        return None

    # get table data
    names = []
    for cell in potential_table.find_all('td', attrs={'class': 'votebox-results-cell--text'}):
        link = cell.find('a')
        # NOTE(review): cells without a link used to crash the whole crawl;
        # falling back to the cell's own text is an assumption - confirm
        # that such rows should be kept rather than skipped.
        name = (link.text if link is not None else cell.text).strip()
        if name:
            names.append(name)
    return names
    
    

def get_data_for_candidates(data, start_idx=0, timeout=30):
    """Fetch candidate pages and collect general-election names per district.

    For each (candidate, district) pair whose district has not produced a
    result yet, the candidate's page is requested and handed to
    ``parse_html``. The first page of a district that yields data marks the
    district as seen; later candidates of that district are skipped.

    Args:
        data: iterable of ``(candidate, district)`` pairs.
        start_idx: number of leading pairs to skip (default 0 = process all).
        timeout: seconds to wait for each HTTP request before giving up.

    Returns:
        A tuple ``(general_election, no_data, seen_dists)``: the set of names
        returned by ``parse_html``, the list of candidates whose page gave no
        data (or could not be fetched), and the set of districts that
        produced data.
    """
    from itertools import islice

    ballotpedia_prefix = 'http://ballotpedia.org/'
    seen_dists = set()
    no_data = []
    general_election = set()

    # islice honours start_idx for any iterable, not just lists
    for candidate, district in islice(data, start_idx, None):
        if district in seen_dists:
            continue

        # The URL is built from the raw candidate name.
        # NOTE(review): the previous version also built a title-cased
        # First_Last slug here but never used it - confirm which form the
        # site's page titles actually need before switching.
        url = ''.join((ballotpedia_prefix, candidate))
        try:
            page = requests.get(url, timeout=timeout)
        except requests.RequestException as err:
            # one failed request must not throw away everything collected so far
            print('Request failed:', candidate, err)
            no_data.append(candidate)
            continue
        candidate_info = parse_html(page.text)

        if candidate_info is None:
            print('No Data:', candidate)
            no_data.append(candidate)
        else:
            seen_dists.add(district)
            general_election.update(candidate_info)
    return general_election, no_data, seen_dists

# Run the crawl over every (candidate, district) pair. The result is the
# tuple (general-election names, candidates with no data, districts seen).
candidate_data = get_data_for_candidates(data)

No Data: Aaron Paul Godfrey
No Data: Abe Jamie Garcia
No Data: Adair Boroughs
No Data: Adam Bolaños Scow
No Data: Adam Smith
No Data: Adam Wynn
No Data: Aida Estrada Gray
No Data: Al Green
No Data: Al Lemmo
No Data: Alan D. Swain
No Data: Albert Lee
No Data: Albert Maxwell Goldberg
No Data: Alex B. Morse
No Data: Alix Toulme Jr.
No Data: Allen R. Davidson
No Data: Amanda Brunzell
No Data: Ameena Matthews
No Data: Andrew Saltman
No Data: Andy Kim
No Data: Andy Meehan
No Data: Andy Terrell
No Data: Angelica Maria Dueñas
No Data: Anthony Brown
No Data: Anthony Clark
No Data: Anthony Davila
No Data: Antonio C. Amador
No Data: Antonio Delgado
No Data: Arthur J. Jones
No Data: Barbara Lee
No Data: Barry Moore
No Data: Bart Gottschalk
No Data: Betsy Dirksen Londrigan
No Data: Bill Foster
No Data: Bill Johnson
No Data: Billy Hibbitts
No Data: Bob Elliott
No Data: Bob Rogers


In [16]:
# Restore the cached crawl results instead of re-crawling.
# To refresh the cache after a new crawl, run once:
#     with open('cand.data', 'wb') as f:
#         dump(candidate_data, file=f)
# NOTE: pickle can execute code on load - only open a cand.data you created.
with open('cand.data', 'rb') as cache_file:
    cand_data = load(cache_file)

# reload candidate info just in case
with open('Main_candidate_information.csv') as csvfile:
    rows = list(csv.DictReader(csvfile))

# original row order of the names, and name -> full CSV row
cand_order = [row['candidate'] for row in rows]
cand_dict = {row['candidate']: row for row in rows}

# peek at the first entry only
for first_name, first_row in cand_dict.items():
    print(first_name, first_row)
    break

Aaron Hermes {'icpsr': '492220004', 'candidate': 'Aaron Hermes', 'chamber': 'House', 'state_dist': 'TX-22', 'cand_party': 'R', 'party_code': '200', 'candidate_is_incumbent': '0', 'location': 'Texas 22', 'partisanship': 'R+10', 'incumbent_rep': 'Pete Olson', 'incumbent_party': 'Republican', 'incumbent_elected': '2008', 'incumbent_status': 'Incumbent retiring.', 'congress': 'NA', 'bioname': 'NA', 'bioguide_id': 'NA', 'born': 'NA', 'nominate_dim1': 'NA', 'nominate_dim2': 'NA', 'nominate_log_likelihood': 'NA', 'nominate_geo_mean_probability': 'NA', 'nominate_number_of_votes': 'NA', 'nominate_number_of_errors': 'NA', 'nokken_poole_dim1': 'NA', 'nokken_poole_dim2': 'NA', 'senate_class': 'NA', 'new_class_code_senate': 'NA', 'text_stem': 'TX-22-hermes_aaron-', 'screen_name': '@realAaronHermes', 'official_screen_name': 'NA', 'alternate_names': 'Aaron Hermes', 'first_name': 'Aaron', 'special_name': 'first_last', 'candidate_encoding_issues': 'NA', 'GeneralElec': '0'}


In [22]:
# group by district: state_dist -> list of candidate names
cand_dists = dict()
for cand, cand_info in cand_dict.items():
    cand_dists.setdefault(cand_info['state_dist'], []).append(cand)

In [23]:
# Unpack the cached crawl result, then list the districts present in the
# candidate table for which the crawl produced no data.
potential_candidates, unfound, seen_dists = cand_data
all_dists = set(cand_dists)
print('unseen districts:', all_dists - seen_dists)

unseen districts: {'MA-02', 'OK-04', 'SC-03', 'CT-01', 'NY-26', 'TN-06', 'AR-01', 'LA-04', 'KS-04', 'US-00', 'GA-05', 'PA-15', 'WA-09', 'ID-02'}


In [33]:
# Try to match Ballotpedia names with the names in the candidate table.
# known_names is the keys view of cand_dict, i.e. every candidate name.
known_names = cand_dict.keys()
# cand_data holds three items, unpacked here under the names used below.
successful, no_data, seen_dists = cand_data
# Filled by the exact-match pass with names found verbatim in cand_dict.
exact_matches = set()

In [34]:
# Names from the crawl that appear verbatim in the candidate table are
# flagged as general-election candidates and recorded as exact matches.
verbatim_hits = [gen_name for gen_name in successful if gen_name in cand_dict]
for hit in verbatim_hits:
    cand_dict[hit]['GeneralElec'] = '1'
exact_matches.update(verbatim_hits)

In [47]:
# Interactively resolve crawl names that had no exact match in the
# candidate table: for each one, offer the closest known names by edit
# distance and let the user pick one (or 0 for none of them).
# Interrupt the kernel to stop early; choices made so far stay recorded in
# cand_dict and indirect_matches.
MAX_OPTIONS = 5  # how many closest names to offer per prompt

remaining = successful - exact_matches
unmatched = known_names - exact_matches
indirect_matches = dict()
print(len(remaining), 'names were unable to be automatically matched.')
for indirect in remaining:
    # rank the known names that have not been claimed yet, closest first
    scored = [(name, ls.distance(indirect, name))
              for name in unmatched if name not in indirect_matches]
    best_names = sorted(scored, key=lambda x: x[1])[:MAX_OPTIONS]

    print('unmatched name:', indirect)
    print('0', 'No Match')
    for idx, option in enumerate(best_names):
        option_info = cand_dict[option[0]]
        print(idx+1, option[0].rjust(15), option_info['cand_party'].ljust(2), option_info['state_dist'])

    # keep asking until the answer is an integer within the offered range
    choice = None
    while choice is None:
        try:
            choice = int(input('Choice: ').strip())
        except ValueError:
            print('Invalid input, please enter a number')
            continue
        if not 0 <= choice <= len(best_names):
            print('Invalid input, please enter a number between 0 and ' + str(len(best_names)))
            choice = None

    # 0 means "none of these": leave this name unmatched and move on to the
    # next one rather than ending the whole session
    if choice != 0:
        selected_cand = best_names[choice-1][0]
        cand_dict[selected_cand]['GeneralElec'] = '1'
        indirect_matches[selected_cand] = indirect
    print(indirect_matches)
    print('------------------------------------')

563 names were unable to be automatically matched.
unmatched name: Ross Lynn Leone
0 No Match
1       Rob Lydon D  CA-01
2      Ross Spano R  FL-15
3 Russ Cirincione D  NJ-06
4     Katelyn Lee D  MS-03
5     Roshan Mody D  FL-20
Invalid input, please enter a number
oob
{'Rob Lydon': 'Ross Lynn Leone'}
------------------------------------
unmatched name: Sylvia Caravetta
0 No Match
1  Elisa Cardnell D  TX-02
2     Alia Ureste R  TX-23
3    David Cavell D  MA-04
4    Steve Darden R  TN-01
5     Mike Craven R  KY-03
oob
{'Rob Lydon': 'Ross Lynn Leone', 'Alia Ureste': 'Sylvia Caravetta'}
------------------------------------
unmatched name: Richard Dunn
0 No Match
1    Richard Rowe D  FL-03
2    Richard Mata AI CA-48
3   Richard Ojeda D  WV
4  Richard Herman R  TX-13
5   Richard Piwko R  MI-10
oob
{'Rob Lydon': 'Ross Lynn Leone', 'Alia Ureste': 'Sylvia Caravetta', 'Richard Ojeda': 'Richard Dunn'}
------------------------------------
unmatched name: Bobby Lyons
0 No Match
1       Rob Jones R

KeyboardInterrupt: Interrupted by user

In [41]:
def sort_cands(cand_dict, order=None):
    """Return the candidate records arranged in a given name order.

    Args:
        cand_dict: mapping of candidate name -> record.
        order: iterable of candidate names giving the output order. When
            omitted, the notebook-level ``cand_order`` list is used.

    Returns:
        A list holding ``cand_dict[name]`` for each name in the order.

    Raises:
        KeyError: if a name in the order is not a key of ``cand_dict``.
    """
    if order is None:
        # fall back to the notebook global so existing one-argument calls
        # keep working
        order = cand_order
    return [cand_dict[label] for label in order]

candidates_sorted = sort_cands(cand_dict)

# Every record not flagged '1' gets an explicit value: '0' when its
# district was covered by the crawl, 'UNSEEN_DIST' when it was not.
for candidate in candidates_sorted:
    if candidate['GeneralElec'] == '1':
        continue
    district_seen = candidate['state_dist'] in seen_dists
    candidate['GeneralElec'] = '0' if district_seen else 'UNSEEN_DIST'

In [42]:
# Write every candidate record, including the updated GeneralElec column,
# to output.csv using the first record's keys as the header.
# newline='' is what the csv module asks for on files given to a writer;
# without it, platforms that translate '\n' end up with blank rows.
with open('output.csv', 'w', newline='') as f:
    headers = list(candidates_sorted[0].keys())
    writer = csv.DictWriter(f, fieldnames=headers)
    writer.writeheader()
    for cand_line in candidates_sorted:
        try:
            writer.writerow(cand_line)
        except (ValueError, csv.Error) as err:
            # Still best-effort per row, but only for row-level problems
            # (keys outside `headers`, encoding errors, csv errors); a
            # kernel interrupt or a real bug is no longer swallowed.
            print(cand_line)
            print('  row could not be written:', err)