# Precinct Matching Framework

In [None]:
import pandas as pd
import geopandas as gpd

from reference_data import (
    geoid_to_county_name,
    state_to_fips,
    state_abbreviation_to_state_name,
    state_fip_to_county_to_geoid,
)

### Import the datasets

In [None]:
# County whose election results and precinct shapes are loaded in this cell.
county_id = 'Mercer County'

# Both input paths are derived from the county identifier.
county_results_filename = 'election_results_county_id={}.csv'.format(county_id)
county_shapefile_filename = 'shapefile_county_id={}'.format(county_id)

# Tabular results come from the CSV, precinct shapes from the shapefile.
county_results_df = pd.read_csv(county_results_filename)
county_shapefile_gdf = gpd.read_file(county_shapefile_filename)

# Shapefile column names are limited to 10 characters, so the precinct name
# column arrives truncated as 'original_p'; restore its full name.
county_shapefile_gdf.rename(columns={'original_p':'original_precinct_name'}, inplace=True)

The next cell copies `county_results_df` into `df` and `county_shapefile_gdf` into `gdf`. The shorter names are quicker to type during data exploration, and working on copies leaves the loaded originals untouched.

In [None]:
# Work on copies under short names so exploration never modifies the loaded originals.
df, gdf = county_results_df.copy(), county_shapefile_gdf.copy()

In [None]:
# Preview the first rows of the election results copy.
df.head()

In [None]:
# Preview the first rows of the precinct shapefile copy.
gdf.head()

### Check Preconditions
These should all pass - they're here to ensure that everything in `config.ipynb` worked correctly

In [None]:
# Preconditions: both the results table and the shapefile table must carry the
# 'county_id' and 'original_precinct_name' columns. If an assert fails, fix the
# inputs and re-run before going any further.
assert all('county_id' in frame.columns for frame in (df, gdf))
assert all('original_precinct_name' in frame.columns for frame in (df, gdf))

### General Modifications
It's beneficial to apply some modifications uniformly to all precincts. For example, it's good practice to make everything lower case. This modification and others are made in `edit_precinct_name` - read its specification to learn more about how to use it to make modifications.

In [19]:
# Helper Functions
# Defaults used by edit_precinct_name when the caller does not override them.
default_remove_lst = []
default_target_to_replacement = {'-':' ', '/':' '}
default_stopping_words = []

def edit_precinct_name(prec_name, 
    remove_lst=default_remove_lst, 
    target_to_replacement=default_target_to_replacement,
    stopping_words=default_stopping_words,
    prec_dict=None):
    '''
    Return a normalized, lower case version of a precinct name.

    Modifications are applied in the order of the parameters they depend on. Case is
    ignored by making prec_name lower case first, so every argument should contain
    lower case strings only (keys of the dictionaries and elements of the lists).

    Parameters:
        prec_name (str): precinct name.
        remove_lst ((str) list): every occurrence of each string in this list is removed
            from prec_name. All elements should be lower case.
        target_to_replacement ({str:str} dictionary): every occurrence of a key (target) in
            prec_name is replaced with its value (replacement). All keys should be lower case.
        stopping_words ((str) list): after the replacements above, prec_name is split on
            whitespace and every word equal to an element of stopping_words is dropped.
            The comparison happens before leading zeros are stripped. All elements should
            be lower case.
        prec_dict ({str:str} dictionary or None): after all the modifications above, if the
            edited prec_name is a key of prec_dict, that key's value is returned instead.
            All keys should be lower case. None (the default) means no overrides.

    Returns:
        prec_name (str): the lower case precinct name with substrings removed, targets
            replaced, stopping words dropped, leading zeros stripped from every word and
            words joined by single spaces - or its prec_dict override when one exists.
    '''
    prec_name = prec_name.lower()
    for word in remove_lst:
        prec_name = prec_name.replace(word, '')
    for target, replacement in target_to_replacement.items():
        prec_name = prec_name.replace(target, replacement)
    # Strip leading zeros so that '01' and '1' compare equal. A word made only of zeros
    # collapses to a single '0' rather than an empty string, which would otherwise leave
    # stray spaces in the joined name (e.g. 'precinct 0' -> 'precinct ').
    words = [word.lstrip('0') or '0' for word in prec_name.split() if word not in stopping_words]
    prec_name = " ".join(words)
    if prec_dict is None:
        return prec_name
    return prec_dict.get(prec_name, prec_name)

In [None]:
from pprint import pprint

def row_to_edited_precinct_name(row, county_id_to_precinct_modifications_dictionary_x):
    '''
    Return the edited precinct name for one row of a precinct table.

    Parameters:
        row: mapping-like row providing the 'county_id' and 'original_precinct_name' entries.
        county_id_to_precinct_modifications_dictionary_x ({county_id: {str:str}} dictionary):
            per-county overrides; the entry for the row's county is handed to
            edit_precinct_name as prec_dict.

    Returns:
        (str): result of edit_precinct_name for the row's original precinct name.
    '''
    county, raw_name = row['county_id'], row['original_precinct_name']
    # Counties without an entry get an empty override dictionary.
    overrides = county_id_to_precinct_modifications_dictionary_x.get(county, {})
    return edit_precinct_name(raw_name, prec_dict=overrides)

def enumerate_unmatched(precinct_list, unmatched_precincts, dataset_name):
    '''
    Print one "edited <-- original (dataset)" line per unmatched precinct.

    Parameters:
        precinct_list: iterable of (original_precinct_name, edited_precinct_name) pairs.
        unmatched_precincts: container of edited precinct names to report.
        dataset_name (str): label printed in parentheses at the end of each line.
    '''
    for pair in precinct_list:
        original, edited = pair
        # Only names flagged as unmatched are reported.
        if edited not in unmatched_precincts:
            continue
        print("{} <-- {} ({})".format(edited, original, dataset_name))

            
# NOTE(review): df_x, df_y, dataset_name_x, dataset_name_y and
# county_to_unmatched_precinct_lst_x / county_to_unmatched_precinct_lst_y are not
# defined in the visible cells - presumably they are set up before this cell runs; confirm.

# Per-county overrides handed to edit_precinct_name as prec_dict:
# {county_id: {edited precinct name: replacement name}}.
county_id_to_precinct_modification_dictionary_x = {}
df_x['edited_precinct_name'] = df_x.apply(lambda row: row_to_edited_precinct_name(row, county_id_to_precinct_modification_dictionary_x), axis=1)
# Keep (original, edited) together so an unmatched edited name can be traced back to its source spelling.
df_x['original_precinct_name, edited_precinct_name'] = df_x[['original_precinct_name','edited_precinct_name']].apply(tuple, axis=1)

county_id_to_precinct_modification_dictionary_y = {}
df_y['edited_precinct_name'] = df_y.apply(lambda row: row_to_edited_precinct_name(row, county_id_to_precinct_modification_dictionary_y), axis=1)
df_y['original_precinct_name, edited_precinct_name'] = df_y[['original_precinct_name','edited_precinct_name']].apply(tuple, axis=1)

# Walk the counties of df_x and stop at the first one whose edited precinct names
# do not line up between the two datasets, printing what is needed to fix it.
n_counties = df_x['county_id'].nunique()
for idx, county_id in enumerate(df_x['county_id'].unique()):
    # Unique (original, edited) pairs for this county, ordered by edited name.
    precinct_list_x = sorted(df_x[df_x['county_id'] == county_id]['original_precinct_name, edited_precinct_name'].unique(), key=lambda x: x[1])
    precinct_list_y = sorted(df_y[df_y['county_id'] == county_id]['original_precinct_name, edited_precinct_name'].unique(), key=lambda x: x[1])

    # Names already listed as unmatched for this county are left out of the comparison.
    # Looked up once per county instead of once per precinct.
    known_unmatched_x = county_to_unmatched_precinct_lst_x.get(county_id, [])
    known_unmatched_y = county_to_unmatched_precinct_lst_y.get(county_id, [])
    precinct_set_x = {edited for _, edited in precinct_list_x if edited not in known_unmatched_x}
    precinct_set_y = {edited for _, edited in precinct_list_y if edited not in known_unmatched_y}

    unmatched_precincts_x = sorted(precinct_set_x - precinct_set_y)
    unmatched_precincts_y = sorted(precinct_set_y - precinct_set_x)

    # Nothing left to reconcile for this county: move on to the next one.
    if not unmatched_precincts_x and not unmatched_precincts_y:
        continue

    print('County {}/{} | {:.2f}% complete.'.format(idx+1,n_counties, (idx+1)/n_counties*100))
    # Use the configured dataset names, consistent with the other messages in this cell.
    print("county_id: {} | {} precincts in {} | {} precincts in {}:\n".format(county_id, len(precinct_list_x), dataset_name_x, len(precinct_list_y), dataset_name_y))
    print("unmatched_precincts_x:", unmatched_precincts_x)
    print("unmatched_precincts_y:", unmatched_precincts_y)

    if len(unmatched_precincts_x) != len(unmatched_precincts_y):
        # The leftovers cannot be paired one-to-one, so no dictionary is suggested.
        # Presumably the extra names belong in county_to_unmatched_precinct_lst_x / _y - confirm.
        print("Add unmatched precincts to the unmatched precinct.")
        break

    # Equal counts: suggest a pairing by sorted position. This is only a starting
    # point - it pairs names by alphabetical rank, not by any similarity measure.
    precinct_modification_dictionary_x_to_y = dict(zip(unmatched_precincts_x, unmatched_precincts_y))
    precinct_modification_dicitonary_y_to_x = dict(zip(unmatched_precincts_y, unmatched_precincts_x))
    print("{}  to {} precinct modification dictionary: ".format(dataset_name_x, dataset_name_y))
    pprint(precinct_modification_dictionary_x_to_y)
    print("{}  to {} precinct modification dictionary: ".format(dataset_name_y, dataset_name_x))
    pprint(precinct_modification_dicitonary_y_to_x)

    enumerate_unmatched(precinct_list_x, unmatched_precincts_x, dataset_name_x)
    enumerate_unmatched(precinct_list_y, unmatched_precincts_y, dataset_name_y)
    # Handle one county per run: fix it above, then re-run the cell.
    break