In [1]:
%matplotlib inline
import sys, os, time, math, csv
import itertools
import collections

import numpy as np
import pandas as pd

import matplotlib
import matplotlib.pyplot as plt

import fiona
import shapely.geometry
import scipy.spatial
import haversine

from MigrationData import IRSMigrationData

In [2]:
# IRSMigrationData instance pointed at the raw migration directory; later cells
# call get_raw_data(year) and get_processed_data(year, fips_list) on it.
migration_data = IRSMigrationData(data_dir="data/raw/migration/")

# Years processed by this notebook: 2004 through 2014 inclusive.
FIRST_YEAR, LAST_YEAR = 2004, 2014
years = range(FIRST_YEAR, LAST_YEAR + 1)

## Calculate largest county intersection from migration data

In [3]:
# Two-digit state prefixes of the FIPS codes for the 48 continental US states.
# Codes are kept as zero-padded strings so they can be compared directly with
# the first two characters of a county FIPS code.
CONTINENTAL_STATE_FIPS = [
    "01", "04", "05", "06", "08", "09", "10", "12",
    "13", "16", "17", "18", "19", "20", "21", "22",
    "23", "24", "25", "26", "27", "28", "29", "30",
    "31", "32", "33", "34", "35", "36", "37", "38",
    "39", "40", "41", "42", "44", "45", "46", "47",
    "48", "49", "50", "51", "53", "54", "55", "56",
]
# Guard against an accidentally dropped or duplicated entry when editing the list.
assert len(CONTINENTAL_STATE_FIPS) == 48

In [4]:
# fips_sets[k] holds *all* FIPS codes (origins and destinations, from both the
# inflow and outflow records) that appear in the raw data for years[k].
fips_sets = []
for year in years:
    in_records, out_records = migration_data.get_raw_data(year)
    fips_sets.append({
        code
        for origin, destination, _ in itertools.chain(in_records, out_records)
        for code in (origin, destination)
    })

# new_fips_sets[k] is fips_sets[k] restricted to codes whose state prefix is in
# CONTINENTAL_STATE_FIPS and whose county part is not the reserved "000".
new_fips_sets = []
for year, all_codes in zip(years, fips_sets):
    county_codes = {
        code
        for code in all_codes
        if code[:2] in CONTINENTAL_STATE_FIPS and code[2:] != "000"
    }
    print("%d -- %d counties" % (year, len(county_codes)))
    new_fips_sets.append(county_codes)

2004 -- 3108 counties
2005 -- 3108 counties
2006 -- 3108 counties
2007 -- 3108 counties
2008 -- 3108 counties
2009 -- 3108 counties
2010 -- 3108 counties
2011 -- 3108 counties
2012 -- 3108 counties
2013 -- 3107 counties
2014 -- 3107 counties


In [5]:
# Keep only the FIPS codes that are present in every year's filtered set.
joined_set = set(new_fips_sets[0]).intersection(*new_fips_sets[1:])
print("Total of %d locations in continental states that are common to all years of data." % (len(joined_set)))

# Order the FIPS strings by their integer value. NOTE: despite its name,
# joined_list is a dict mapping each FIPS string to that integer sort key.
joined_list = {}
for fips in joined_set:
    joined_list[fips] = int(fips)
sorted_joined_list = sorted(joined_list.keys(), key=lambda fips: joined_list[fips])

Total of 3106 locations in continental states that are common to all years of data.


In [6]:
# Persist the sorted county list, one FIPS code per line (no trailing newline).
output_fn = "data/processed/county_intersection_list_2004_2014.txt"
# `with` guarantees the file is flushed and closed even if the write raises.
with open(output_fn, "w") as f:
    f.write("\n".join(sorted_joined_list))

## Calculate distance matrix

In [7]:
# Collect a (geoid, lon, lat) centroid tuple for every shapefile record whose
# GEOID is in joined_set; records with any other GEOID are reported and skipped.
used_geoids = set()
data = []
# The context manager closes the shapefile even if reading or parsing a row fails.
with fiona.open("data/intermediate/boundary_shapefiles/cb_2015_us_county_500k.shp", "r") as f:
    for row in f:
        geoid = row["properties"]["GEOID"]
        if geoid in joined_set:
            used_geoids.add(geoid)
            geom = shapely.geometry.shape(row['geometry'])
            # centroid.x / centroid.y are stored as lon / lat respectively
            lon, lat = geom.centroid.x, geom.centroid.y
            data.append((geoid, lon, lat))
        else:
            print("GEOID %s not in accepted list" % (geoid))

# Every code in joined_set must have been matched by a shapefile record;
# on failure, name the unmatched codes so they can be investigated.
missing_geoids = joined_set - used_geoids
assert len(missing_geoids) == 0, "No shapefile record found for GEOIDs: %s" % (sorted(missing_geoids),)

GEOID 11001 not in accepted list
GEOID 46102 not in accepted list


In [8]:
# Order the (geoid, lon, lat) tuples; tuple comparison sorts by geoid first.
data.sort()

In [9]:
# Write the county centroids as CSV: a header row, then one "geoid,lon,lat"
# line per entry of `data` (coordinates formatted with %f, i.e. 6 decimals).
# `with` guarantees the file is flushed and closed even if a write raises.
with open("data/processed/county_centroid_list.csv", "w") as f:
    f.write("geoid,lon,lat\n")
    for fipsCode, lon, lat in data:
        f.write("%s,%f,%f\n" % (fipsCode, lon, lat))

In [10]:
# Build (lat, lon) pairs from the (geoid, lon, lat) rows, in the same order as `data`.
coords = [(lat, lon) for _, lon, lat in data]

# Pairwise distance between all centroids, using haversine.haversine as the
# metric (units are that function's default — presumably kilometres; confirm).
distance_matrix = scipy.spatial.distance.cdist(coords, coords, metric=haversine.haversine)
np.save("data/processed/county_distance_matrix.npy", distance_matrix)

## Save migration matrices to file

In [26]:
# One .npy file per year; the year is substituted into the file name.
MATRIX_PATH_TEMPLATE = "data/processed/migration/migration_matrix_%d.npy"

for year in years:
    print(year)
    # presumably the matrix rows/columns follow the order of sorted_joined_list — verify in MigrationData
    migration_matrix = migration_data.get_processed_data(year, sorted_joined_list)
    np.save(MATRIX_PATH_TEMPLATE % (year), migration_matrix)

2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
