# Config

This file should be filled out once per project. It will generate the template that the rest of the project will use.

In [32]:
import pandas as pd
import geopandas as gpd
import os
from shutil import copyfile

from reference_data import (
    geoid_to_county_name,
    state_to_fips,
    state_to_state_po,
    state_po_to_state, 
    state_fip_to_county_to_geoid,
)

In [3]:
# Full state name; it must be one of the keys of `state_to_fips`.
state = "<REPLACE ME>"
assert state in state_to_fips

# Look up the FIPS code and the postal abbreviation for the chosen state.
state_fips, state_po = state_to_fips[state], state_to_state_po[state]
print("State:",state," | State Fips:",state_fips," | State PO:", state_po)

State: Pennsylvania  | State Fips: 42  | State PO: PA


Confirm that the cell above prints out the correct information for the state you're working on. 

The next two cells take the year of the election you are working on as input and then make a congressional district shapefile and a county shapefile for that year and state. The shapefiles are extracted from national shapefiles prepared by the [U.S. Census Bureau](https://www.census.gov/). The national congressional district shapefiles are stored in this repository at `./data/congressioinal_distric_shapefiles` and the national county shapefiles are stored at `./data/county_shapefiles`.

These shapefiles may come in handy as you try to match precincts for the rest of the state. It's worthwhile to make state-specific shapefiles (which the next cells do for you) because they will be faster to work with than the national shapefiles.

Presently, this codebase supports 2014, 2016, and 2018. Feel free to submit a pull request if you want to add 2020 once the Census Bureau publishes `cb_2020_us_county_500k` or `tl_2020_us_cd117`. Of course, these aren't essential for precinct matching, so if you don't want these helper files, no sweat!

In [23]:
# Supported election years, mapped to the number used in the name of the
# matching national congressional district shapefile (tl_<year>_us_cd<num>).
years_to_congress_num = {"2014":"114", "2016":"115", "2018":"116"}
year = "<REPLACE ME>"
assert year in years_to_congress_num

# Load the national county shapefile for the chosen year.
national_county_shapefile = "./data/county_shapefiles/cb_{}_us_county_500k".format(year)
national_county_gdf = gpd.read_file(national_county_shapefile)

# Keep only this state's counties and the columns needed downstream.
counties_in_state = national_county_gdf[national_county_gdf.STATEFP == str(state_fips)]
state_county_gdf = counties_in_state[["NAME", "COUNTYFP", "geometry"]].reset_index()
state_county_gdf.head()

Unnamed: 0,index,NAME,COUNTYFP,geometry
0,238,Armstrong,5,"POLYGON ((-79.69293 40.66974, -79.69252 40.673..."
1,239,Chester,29,"MULTIPOLYGON (((-75.59129 39.84044, -75.58917 ..."
2,240,Clinton,35,"POLYGON ((-78.09338 41.21693, -78.05358 41.273..."
3,241,Greene,59,"POLYGON ((-80.51942 39.80618, -80.51923 39.837..."
4,242,Juniata,67,"POLYGON ((-77.74677 40.38466, -77.74365 40.387..."


In [24]:
# The district-code column is named after the Congress the shapefile describes
# (CD114FP, CD115FP, CD116FP), so derive it from the selected year instead of
# hard-coding the 2018 name; otherwise 2014 and 2016 fail with a KeyError.
congress_num = years_to_congress_num[year]
district_fp_column = 'CD{}FP'.format(congress_num)

# Load the national congressional district shapefile for the chosen year.
national_congressional_districts_gdf = gpd.read_file("./data/congressioinal_distric_shapefiles/tl_{}_us_cd{}".format(year, congress_num))

# Keep only this state's districts, ordered by district code.
districts_in_state = national_congressional_districts_gdf[national_congressional_districts_gdf.STATEFP == str(state_fips)]
state_congressional_districts_gdf = districts_in_state[['NAMELSAD', district_fp_column, 'geometry']].sort_values(by=district_fp_column).reset_index()
state_congressional_districts_gdf.head()

Unnamed: 0,index,NAMELSAD,CD116FP,geometry
0,316,Congressional District 1,1,"POLYGON ((-75.48406 40.41845, -75.47919 40.422..."
1,319,Congressional District 2,2,"POLYGON ((-75.16130 39.96289, -75.16126 39.963..."
2,323,Congressional District 3,3,"POLYGON ((-75.28027 39.97496, -75.27951 39.975..."
3,325,Congressional District 4,4,"POLYGON ((-75.72036 40.24877, -75.71985 40.250..."
4,326,Congressional District 5,5,"POLYGON ((-75.60154 39.85596, -75.60154 39.856..."


### Persisting the state specific dataset.

This cell makes a folder and stores state data there for your future use (including `state_congressional_districts_gdf` and `state_county_gdf`). If the folder already exists, this cell will raise an exception to prevent an accidental overwrite.

In [35]:
# Root folder for the state-specific outputs. It lives under './data', next to
# the national shapefiles read earlier in this notebook (the previous value
# '.data/...' was missing the slash, and os.mkdir does not create missing
# parent directories).
path = './data/state_specific_data'
congressional_districts_file = '{}_{}_congressional_districts'.format(year,state)
counties_file = '{}_{}_counties'.format(year,state)
shapefiles_dir = "/".join([path, "shapefiles"])

# os.mkdir is used on purpose: it raises FileExistsError when the folder is
# already there, which protects previously generated data from being overwritten.
os.mkdir(path)
os.mkdir(shapefiles_dir)

# Generate Shapefiles (for GIS inspection purposes)
state_congressional_districts_gdf.to_file("/".join([shapefiles_dir, congressional_districts_file]))
state_county_gdf.to_file("/".join([shapefiles_dir, counties_file]))

### Import the datasets

If you will be using any other statewide datasets (precinct shapefiles or election results) import them below. 

* `gdf` denotes "GeoDataFrame" which is the data structure that will be used to hold shapefiles
* `df` denotes "DataFrame" which is the data structure that will be used to hold election results

In [None]:
# Path to the statewide precinct shapefile; replace the placeholder before running.
statewide_shapefile_file_path = '<REPLACE ME>'
# Load the shapefile into a GeoDataFrame and preview the first rows.
statewide_shapefile_gdf = gpd.read_file(statewide_shapefile_file_path)
statewide_shapefile_gdf.head()

In [None]:
# Path to the statewide election results CSV; replace the placeholder before running.
statewide_election_results_file_path = '<REPLACE ME>'
# Load the results into a DataFrame and preview the first rows.
statewide_results_df = pd.read_csv(statewide_election_results_file_path)
statewide_results_df.head()

### Precondition: County ID

Make a column in `statewide_shapefile_gdf` and `statewide_results_df` called `county_id` with a common naming scheme. County name or county FIPS code would work. I highly recommend using county name, e.g. "Essex County", for readability. You can use the helper functions below to convert between county FIPS and county name.

In [None]:
def get_county_name(county_fips):
    """Return the county name for a county FIPS code in the current state.

    The lookup key is the 5-character GEOID: the 2-digit state FIPS code
    followed by the 3-digit county FIPS code. Both parts are zero-padded so
    that codes held as integers (e.g. state 1, county 5) still produce a
    valid key ("01005").

    Args:
        county_fips: County FIPS code as a string or integer, with or
            without leading zeros.

    Returns:
        The entry of `geoid_to_county_name` for the resulting GEOID.

    Raises:
        AssertionError: If the assembled GEOID is not 5 characters long.
        KeyError: If the GEOID is not present in `geoid_to_county_name`.
    """
    geoid = str(state_fips).zfill(2) + str(county_fips).zfill(3)
    assert len(geoid) == 5
    return geoid_to_county_name[geoid]

# County-name -> GEOID lookup for the state being processed; the reference
# table is indexed by the state FIPS code converted to an integer.
county_to_geoid = state_fip_to_county_to_geoid[int(state_fips)]
def get_geoid(county_name):
    """Return the GEOID stored in `county_to_geoid` for `county_name`.

    Raises:
        KeyError: If `county_name` is not a key of `county_to_geoid`.
    """
    geoid = county_to_geoid[county_name]
    return geoid

# TODO: Pass the precondition described above which takes the form of an assert statement in this cell.
statewide_shapefile_gdf['county_id'] = '<REPLACE ME>'
statewide_results_df['county_id'] = '<REPLACE ME>'
n_counties = '<REPLACE ME>'

# Both datasets must expose the shared county key.
assert 'county_id' in statewide_shapefile_gdf.columns
assert 'county_id' in statewide_results_df.columns

# Distinct county ids present in each dataset.
county_set_statewide_shapefile_gdf = set(statewide_shapefile_gdf['county_id'].unique())
county_set_statewide_results_df = set(statewide_results_df['county_id'].unique())

# Report the county ids that appear in only one of the two datasets.
print("statewide_shapefile_gdf unmatched counties: ", county_set_statewide_shapefile_gdf - county_set_statewide_results_df)
print("statewide_results_df unmatched counties: ", county_set_statewide_results_df - county_set_statewide_shapefile_gdf)

# At least one county must be shared, and together the two datasets must
# account for exactly `n_counties` distinct county ids.
assert county_set_statewide_shapefile_gdf & county_set_statewide_results_df
assert len(county_set_statewide_shapefile_gdf | county_set_statewide_results_df) == int(n_counties)

### Precondition: Precinct Name

Identify the column that contains the precinct names to be matched by naming it `original_precinct_name`.

In [None]:
# TODO: Pass the precondition described above which takes the form of an assert statement in this cell.
statewide_shapefile_gdf['original_precinct_name'] = '<REPLACE ME>'
statewide_results_df['original_precinct_name'] = '<REPLACE ME>'

# Both datasets must carry the precinct-name column used for matching.
assert 'original_precinct_name' in statewide_shapefile_gdf.columns
assert 'original_precinct_name' in statewide_results_df.columns

The next cell makes a directory where all the county-specific matching will take place and initializes a matching folder for each county.

In [None]:
# Root folder for all county-level matching work. os.mkdir raises
# FileExistsError if it already exists, so earlier work is never overwritten.
path = './matching'
os.mkdir(path)

# Read the notebook template once, outside the loop; it is the same for every
# county. Notebooks are UTF-8 JSON, so the encoding is stated explicitly
# rather than relying on the platform default.
with open('precinct_matching_framework.ipynb', 'r', encoding='utf-8') as template_file:
    notebook_template = template_file.read()

for county_id in county_set_statewide_shapefile_gdf.union(county_set_statewide_results_df):
    # Paths and text substitution need a string, but county ids may be
    # numeric (e.g. FIPS codes). The original value is still used for the
    # dataframe filters below.
    county_label = str(county_id)

    # Make a folder for this county
    county_dir = '/'.join([path, county_label])
    os.mkdir(county_dir)

    # Make the Matching Notebook: the template with the county id filled in.
    # Writing the customized text directly creates the file, so no separate
    # copy of the template is needed first.
    notebook_filename = 'precinct_matching_county_id={}.ipynb'.format(county_label)
    notebook_filepath = '/'.join([county_dir, notebook_filename])
    with open(notebook_filepath, 'w', encoding='utf-8') as notebook_file:
        notebook_file.write(notebook_template.replace('<$COUNTY_ID$>', county_label))

    # Initialize a README.md file
    with open('/'.join([county_dir, "README.md"]), "a", encoding='utf-8') as readme_file:
        readme_file.write("## Documentation for matching in `county_id` = {}".format(county_label))

    # Generate CSV for the election results for this county
    if county_id in county_set_statewide_results_df:
        county_results_filename = 'election_results_county_id={}.csv'.format(county_label)
        county_results_df = statewide_results_df[statewide_results_df.county_id == county_id]
        county_results_df.to_csv('/'.join([county_dir, county_results_filename]), index=False)

    # Generate Shapefile (for GIS inspection purposes)
    if county_id in county_set_statewide_shapefile_gdf:
        county_shapefile_filename = 'shapefile_county_id={}'.format(county_label)
        county_shapefile_gdf = statewide_shapefile_gdf[statewide_shapefile_gdf.county_id == county_id]
        county_shapefile_gdf.to_file('/'.join([county_dir, county_shapefile_filename]))