# Precinct Matching Framework

In [1]:
import pandas as pd
import geopandas as gpd
from pprint import pprint

### Import the datasets

In [2]:
# County whose precinct results and precinct shapes are loaded in this cell.
county_id = 'Centre County'

# Human-readable labels for the two datasets.
dataset_name_df = 'Precinct Election Results df'
dataset_name_gdf = 'Precinct Shapefile gdf'

# Both input paths are derived from the county identifier.
county_results_filename = 'election_results_county_id={}.csv'.format(county_id)
county_shapefile_filename = 'shapefile_county_id={}'.format(county_id)

county_results_df = pd.read_csv(county_results_filename)

# Shapefile column names are limited to 10 characters, so the precinct-name
# column arrives truncated as 'original_p'; restore its full name on load.
county_shapefile_gdf = gpd.read_file(county_shapefile_filename).rename(
    columns={'original_p': 'original_precinct_name'})

The next cell copies `county_results_df` into `df` and `county_shapefile_gdf` into `gdf`. The shorter names are quicker to type during data exploration, and working on copies leaves the loaded datasets unchanged.

In [3]:
# Short working names; .copy() means edits to df / gdf never touch the loaded frames.
df, gdf = county_results_df.copy(), county_shapefile_gdf.copy()

In [4]:
# Preview the first five rows of the election results.
df.head(n=5)

Unnamed: 0,county,precinct,office,district,candidate,party,votes,election_day,absentee,county_id,original_precinct_name
0,Centre,1,U.S. Senate,,BOB CASEY JR,DEMOCRATIC,201,,,Centre County,1
1,Centre,1,U.S. Senate,,LOU BARLETTA,REPUBLICAN,145,,,Centre County,1
2,Centre,1,U.S. Senate,,NEAL GALE,GREEN,5,,,Centre County,1
3,Centre,1,U.S. Senate,,DALE R KERNS JR,LIBERTARIAN,4,,,Centre County,1
4,Centre,1,U.S. Senate,,WRITE-IN,,0,,,Centre County,1


In [5]:
# Preview the first five rows of the precinct shapefile.
gdf.head(n=5)

Unnamed: 0,STATEFP,COUNTYFP,VTDST,NAMELSAD,VTDI,LSAD,CHNG_TYPE,ORIG_NAME,ORIG_CODE,RELATE,NAME,VINTAGE,FUNCSTAT,JUSTIFY,MTFCC,county_id,original_precinct_name,geometry
0,42,27,10,BELLEFONTE DISTRICT NORTH,A,0,,,,,BELLEFONTE DISTRICT NORTH,90,N,,G5240,Centre County,BELLEFONTE DISTRICT NORTH,"POLYGON Z ((-77.77760 40.91955 0.00000, -77.77..."
1,42,27,20,BELLEFONTE DISTRICT NORTHEAST,A,0,,,,,BELLEFONTE DISTRICT NORTHEAST,90,N,,G5240,Centre County,BELLEFONTE DISTRICT NORTHEAST,"POLYGON Z ((-77.77261 40.92294 0.00000, -77.76..."
2,42,27,30,BELLEFONTE DISTRICT SOUTH,A,0,,,,,BELLEFONTE DISTRICT SOUTH,90,N,,G5240,Centre County,BELLEFONTE DISTRICT SOUTH,"POLYGON Z ((-77.78047 40.90771 0.00000, -77.78..."
3,42,27,40,BELLEFONTE DISTRICT SOUTHEAST,A,0,,,,,BELLEFONTE DISTRICT SOUTHEAST,90,N,,G5240,Centre County,BELLEFONTE DISTRICT SOUTHEAST,"POLYGON Z ((-77.77243 40.91180 0.00000, -77.77..."
4,42,27,50,BELLEFONTE DISTRICT WEST,A,0,,,,,BELLEFONTE DISTRICT WEST,90,N,,G5240,Centre County,BELLEFONTE DISTRICT WEST,"POLYGON Z ((-77.79054 40.91699 0.00000, -77.78..."


### Check Preconditions
These should all pass - they're here to ensure that everything in `config.ipynb` worked correctly

In [6]:
# Preconditions: both datasets must expose the columns checked here. Each
# assertion names the dataset and the column(s) it lacks, so a failure can be
# traced without re-inspecting the frames by hand.
required_columns = ['county_id', 'original_precinct_name']
missing_columns_df = [column for column in required_columns if column not in df.columns]
missing_columns_gdf = [column for column in required_columns if column not in gdf.columns]
assert not missing_columns_df, \
    '{} is missing required column(s): {}'.format(dataset_name_df, missing_columns_df)
assert not missing_columns_gdf, \
    '{} is missing required column(s): {}'.format(dataset_name_gdf, missing_columns_gdf)

### General Modifications
It's normally beneficial to apply some modifications uniformly to all precincts. For example, it's good practice to make everything lower case. This modification is made in `edit_precinct_name` - read its specification to learn more about how to use it to make more modifications.

In [7]:
# Lookup table from precinct number to precinct name.
centre = pd.read_csv('num_to_name.csv')

# Keys are normalised to zero-padded strings, exactly like the lookup keys
# built for the results below, so the two sides agree whether read_csv parsed
# the Precinct column as numbers or as text.
# (The original cell also ran `centre.Precinct.apply(lambda x : ())`, whose
# result was discarded; that no-op is removed.)
number_to_name_centre = dict(zip(centre.Precinct.astype(str).str.zfill(2), centre.Name))

# Hand-written overrides for individual precinct numbers.
number_to_name_centre.update({
    '24': '0024 24 SC EAST 1',
    '25': '0025 25 SC EAST 2',
    '26': '0026 26 SC EAST 3',
    '27': '0027 27 SC EAST 4',
    '28': '0028 28 SC EAST CENTRAL 1',
    '33': '0033 33 SC WEST CENTRAL 1',
    '44': 'College West',
})

# Translate from the frame loaded from disk (df starts as a copy of it) rather
# than from df itself: mapping df onto itself would, on a second run of this
# cell, look up already-translated names and replace every one of them with NaN.
precinct_keys = county_results_df['original_precinct_name'].map(lambda number: str(number).zfill(2))
df['original_precinct_name'] = precinct_keys.map(number_to_name_centre)

# Numbers without an entry in the lookup become NaN; report them instead of
# letting them pass silently into the matching step.
unmapped_precinct_keys = sorted(precinct_keys[df['original_precinct_name'].isnull()].unique())
if unmapped_precinct_keys:
    print('No precinct name found for precinct number(s):', unmapped_precinct_keys)

In [19]:
def edit_precinct_name(prec_name,
    remove_lst=(),
    target_to_replacement=None,
    stopping_words=('district', 'division'),
    prec_dict=None):
    '''
    Return a normalised, lower-case version of a precinct name.

    The name is converted to a string and lower-cased, then the steps below are
    applied in this order. Because the name is lower-cased first, every list
    element and dictionary key passed in should be lower case as well.

    1. Every string in remove_lst is deleted wherever it occurs in the name.
    2. Every key of target_to_replacement is replaced by its value wherever it
       occurs in the name.
    3. The name is split on whitespace and words equal to an element of
       stopping_words are dropped (whole-word comparison, not substring).
    4. Leading zeros are stripped from each remaining word ('0024' -> '24').
       A word made only of zeros is kept as a single '0' so that it does not
       vanish and leave stray spaces behind.
    5. The words are re-joined with single spaces.
    6. If the result is a key of prec_dict, that key's value is returned in
       its place.

    Parameters:
        prec_name (str): precinct name; non-string values are passed through str().
        remove_lst (iterable of str): substrings to delete from the name.
        target_to_replacement ({str: str} dictionary): substring -> replacement.
            None is treated as an empty dictionary.
        stopping_words (collection of str): whole words to drop from the name.
        prec_dict ({str: str} dictionary): edited name -> final name, looked up
            after all other steps. None is treated as an empty dictionary.

    Returns:
        str: the edited precinct name, or its prec_dict replacement when the
        edited name is a key of prec_dict.
    '''
    prec_name = str(prec_name).lower()
    for fragment in remove_lst:
        prec_name = prec_name.replace(fragment, '')
    for target, replacement in (target_to_replacement or {}).items():
        prec_name = prec_name.replace(target, replacement)
    # `or '0'` keeps an all-zero word as '0' instead of an empty string.
    words = [word.lstrip('0') or '0' for word in prec_name.split() if word not in stopping_words]
    prec_name = " ".join(words)
    return (prec_dict or {}).get(prec_name, prec_name)

# Edited election-results name -> name to use instead (passed as prec_dict for df).
df_to_gdf = dict([
    # Ferguson
    ('ferguson north central', 'ferguson north central ward 2'),
    ('ferguson west central', 'ferguson west central ward 1'),
    # Halfmoon
    ('halfmoon', 'halfmoon precinct proper'),
    ('halfmoon east central', 'halfmoon precinct east central'),
    # Philipsburg
    ('philipsburg 1st ward', 'philipsburg 1'),
    ('philipsburg 2nd ward', 'philipsburg 2'),
    ('philipsburg 3rd ward', 'philipsburg 3'),
    # State College ("sc") precincts whose names start with their number
    ('24 24 sc east 1', 'state college east 1'),
    ('25 25 sc east 2', 'state college east 2'),
    ('26 26 sc east 3', 'state college east 3'),
    ('27 27 sc east 4', 'state college east 4'),
    ('28 28 sc east central 1', 'state college east central 1'),
    ('33 33 sc west central 1', 'state college west central 1'),
])

# Edited shapefile name -> name to use instead (passed as prec_dict for gdf):
# the 'a' and 'b' entries both collapse onto the same name.
gdf_to_df = dict.fromkeys(
    ['ferguson northeast 1 a', 'ferguson northeast 1 b'],
    'ferguson northeast 1')

# Tune the matching by adding optional arguments to edit_precinct_name here;
# Series.apply forwards extra keyword arguments to the function it calls.
df['edited_precinct_name'] = df['original_precinct_name'].apply(edit_precinct_name, prec_dict=df_to_gdf)
gdf['edited_precinct_name'] = gdf['original_precinct_name'].apply(edit_precinct_name, prec_dict=gdf_to_df)

######## Manual Corrections ###########
# Make precinct specific corrections here like splitting one precinct into two because of new congressional districts
# NOTE(review): these only take effect if VTDST holds zero-padded strings such
# as '000290' - confirm the column's dtype.
for vtdst_code, corrected_name in [('000290', 'howard township'), ('000300', 'howard borough')]:
    gdf.loc[gdf['VTDST'] == vtdst_code, 'edited_precinct_name'] = corrected_name

# Store (original name, edited name) as one tuple-valued column per dataset
# for use by the matching code below.
paired_name_columns = ['original_precinct_name', 'edited_precinct_name']
for frame in (df, gdf):
    frame['original_precinct_name, edited_precinct_name'] = frame[paired_name_columns].apply(tuple, axis=1)

######## Matching Framework ###########
# Edited names listed here are left out of the comparison below. The original
# cell first assigned
#   df : 'ferguson northeast 1', 'howard borough', 'howard township'
#   gdf: 'ferguson northeast 1 a', 'ferguson northeast 1 b', 'howard'
# and then immediately overwrote both lists with empty ones, so only the empty
# lists were ever in effect; the dead assignments are dropped.
unmatched_precinct_lst_df = []
unmatched_precinct_lst_gdf = []

# Above this many unmatched names, only rows where BOTH sides are unmatched are printed.
MANY_UNMATCHED_THRESHOLD = 100


def _entry_at(entries, position):
    '''Return entries[position], or None when position is past the end of the list.'''
    return entries[position] if position < len(entries) else None


def _is_unmatched(entry, unmatched_names):
    '''True when entry exists and its edited name (entry[1]) is in unmatched_names.'''
    return entry is not None and entry[1] in unmatched_names


def _print_precinct_pair(entry_df, entry_gdf):
    '''Print "edited <-- original (dataset)" for each side that exists, then a blank line.

    Each entry is an (original_precinct_name, edited_precinct_name) tuple, or
    None when that dataset has no precinct at this position.
    '''
    lines = []
    if entry_df is not None:
        lines.append("{} <-- {} ({})".format(entry_df[1], entry_df[0], dataset_name_df))
    if entry_gdf is not None:
        lines.append("{} <-- {} ({})".format(entry_gdf[1], entry_gdf[0], dataset_name_gdf))
    print("\n".join(lines) + "\n")


# Unique (original, edited) name pairs for this county, sorted by edited name.
precinct_list_df = sorted(list(df[df['county_id'] == county_id]['original_precinct_name, edited_precinct_name'].unique()), key=lambda pair: pair[1])
precinct_list_gdf = sorted(list(gdf[gdf['county_id'] == county_id]['original_precinct_name, edited_precinct_name'].unique()), key=lambda pair: pair[1])

# Edited names taking part in the comparison.
precinct_set_df = {edited for _, edited in precinct_list_df if edited not in unmatched_precinct_lst_df}
precinct_set_gdf = {edited for _, edited in precinct_list_gdf if edited not in unmatched_precinct_lst_gdf}

# Names present in one dataset but not in the other.
unmatched_precincts_df = sorted(precinct_set_df - precinct_set_gdf)
unmatched_precincts_gdf = sorted(precinct_set_gdf - precinct_set_df)
n_unmatched = len(unmatched_precincts_df) + len(unmatched_precincts_gdf)

if n_unmatched > 0:
    print("county_id: '{}' | {} precincts in {} | {} precincts in {}:\n".format(county_id, len(precinct_list_df), dataset_name_df, len(precinct_list_gdf), dataset_name_gdf))
    n_precincts_total = len(precinct_list_df) + len(precinct_list_gdf)
    print(n_unmatched, " precincts are unmatched out of ", n_precincts_total)
    df_unmatched = df[(df['edited_precinct_name'].isin(unmatched_precincts_df)) & (df.county_id == county_id)]
    gdf_unmatched = gdf[(gdf['edited_precinct_name'].isin(unmatched_precincts_gdf)) & (gdf.county_id == county_id)]

    # Pair the two sorted lists by position. The shorter list is padded with
    # None: indexing the shapefile list with the results list's index raised
    # IndexError when the results list was longer, and silently skipped the
    # tail of the shapefile list when it was shorter.
    n_rows = max(len(precinct_list_df), len(precinct_list_gdf))
    paired_entries = [(_entry_at(precinct_list_df, row), _entry_at(precinct_list_gdf, row)) for row in range(n_rows)]

    if n_unmatched > MANY_UNMATCHED_THRESHOLD:
        print("\nLook for parterns and use change the parameters to edit_precinct_name accordingly.\n")
        for entry_df, entry_gdf in paired_entries:
            if _is_unmatched(entry_df, unmatched_precincts_df) and _is_unmatched(entry_gdf, unmatched_precincts_gdf):
                _print_precinct_pair(entry_df, entry_gdf)
    else:
        print("unmatched_precincts_df ({}) - len = {}| '{}':".format(dataset_name_df, len(unmatched_precincts_df), county_id), unmatched_precincts_df)
        print("\nunmatched_precincts_gdf ({}) - len = {}| '{}':".format(dataset_name_gdf, len(unmatched_precincts_gdf), county_id), unmatched_precincts_gdf)
        # Starter dictionaries pairing the i-th unmatched name of one dataset
        # with the i-th of the other ('' once the other list runs out). The
        # padding guarantees zip() is limited only by the first list.
        precinct_modification_dictionary_df_to_gdf = dict(zip(unmatched_precincts_df, unmatched_precincts_gdf + [''] * len(unmatched_precincts_df)))
        precinct_modification_dicitonary_gdf_to_df = dict(zip(unmatched_precincts_gdf, unmatched_precincts_df + [''] * len(unmatched_precincts_gdf)))
        print("{}  to {} precinct modification dictionary: ".format(dataset_name_df, dataset_name_gdf))
        print("'{}':".format(county_id))
        pprint(precinct_modification_dictionary_df_to_gdf)
        print("{}  to {} precinct modification dictionary: ".format(dataset_name_gdf, dataset_name_df))
        print("'{}':".format(county_id))
        pprint(precinct_modification_dicitonary_gdf_to_df)
        for entry_df, entry_gdf in paired_entries:
            if _is_unmatched(entry_df, unmatched_precincts_df) or _is_unmatched(entry_gdf, unmatched_precincts_gdf):
                _print_precinct_pair(entry_df, entry_gdf)
        # This was a for/else clause; the loop never breaks, so it always ran.
        print("Add unmatched precincts to the unmatched precinct.")
else:
    print("All Done! (make sure you have one to one matches)")

All Done! (make sure you have one to one matches)
