In [43]:
from unidecode import unidecode

from fuzzywuzzy import fuzz
from nameparser import HumanName
import pandas as pd
import urllib.request

In [2]:
class URLs:
    CANDIDATES = 'https://www.opensecrets.org/downloads/crp/CRP_IDs.xls'
    FORECAST = 'https://projects.fivethirtyeight.com/congress-model-2018/house_district_forecast.csv'

In [3]:
class ManualData:
    CANDIDATE_IDS = {}

In [4]:
def candidates_fetch():
    """Returns a file-like object of the candidates Excel file"""
    headers = {'User-Agent': 'curl/7.54.0'}
    req = urllib.request.Request(URLs.CANDIDATES, headers=headers)
    response = urllib.request.urlopen(req)
    return response

In [5]:
def candidates_read():
    candidates_file = candidates_fetch()
    candidates = pd.read_excel(candidates_file, skiprows=0, header=13, usecols='B:F')
    return candidates

In [6]:
def forecast_fetch():
    """Returns a file-like object of the 538 House Forecast csv"""
    return urllib.request.urlopen(URLs.FORECAST)

In [7]:
def forecast_read():
    forecast_file = forecast_fetch()
    forecast = pd.read_csv(forecast_file)
    latest_date = forecast['forecastdate'].max()
    latest_forecast = forecast[(forecast['forecastdate'] == latest_date)]
    classic = latest_forecast[(latest_forecast['model'] == 'classic')]
    return classic

In [8]:
def forecast_clean(forecast):
    forecast['district_code'] = forecast['state'] + forecast['district'].astype(str).str.zfill(2)
    forecast['name'] = forecast['candidate'].map(lambda c: HumanName(unidecode(c)))
    forecast['margin'] = abs(0.5 - forecast['win_probability'])

In [9]:
def candidates_clean(candidates):
    candidates['name'] = candidates['CRPName'].map(lambda c: HumanName(unidecode(c)))
    candidates['party'] = candidates['Party']
    candidates['district_code'] = candidates['DistIDRunFor']

In [34]:
def mean(nums):
    return float(sum(nums)) / len(nums)

In [44]:
def name_distance(name1, name2):
    return mean([
        fuzz.ratio(name1.first, name2.first),
        fuzz.ratio(name1.last, name2.last)
    ])

In [142]:
NAME_THRESHOLD = 92

def best_match(forecast_candidate, candidates):
    """Returns matching candidate from Opensecrets list, or None"""
    # consider only those of the same district
    district_candidates = candidates[(candidates['district_code'] == forecast_candidate['district_code'])]
    distances = district_candidates.apply(lambda c: name_distance(c['name'], forecast_candidate['name']), axis=1).sort_values(ascending=False)
        
    close_distances = distances[(distances > NAME_THRESHOLD)]
    if close_distances.size > 0:
        return candidates.loc[close_distances.index]['CID']
    else:
        return None


In [10]:
candidates = candidates_read()

In [11]:
candidates_clean(candidates)

In [12]:
forecast = forecast_read()

In [13]:
forecast_clean(forecast)

In [150]:
forecast['foo'] = forecast.apply(lambda c: best_match(c, candidates), axis=1)

ValueError: Wrong number of items passed 849, placement implies 1

In [144]:
a

3489    N00007999
Name: CID, dtype: object

In [None]:
forecast

In [None]:
candidates

In [None]:
forecast.sort_values(by=['margin'])