In [1]:
import pandas as pd
import numpy as np
import geopandas as gpd
import sqlite3
import os
from fractions import Fraction

def get_ratio(ins,outs):
    """Format an inflow/outflow pair as a ratio string such as ``1:7``.

    Parameters
    ----------
    ins, outs : int or float
        Inflow and outflow amounts. Missing values must be replaced
        (e.g. with 0) before calling; NaN is not handled here.

    Returns
    -------
    str
        ``'0:<outs>'`` when ``ins`` is 0, ``'<ins>:0'`` when ``outs`` is 0,
        ``'1:1'`` when both are equal, otherwise ``'<num>:<den>'`` where
        ``num/den`` is the closest fraction to ``ins/outs`` whose denominator
        does not exceed 1000.
    """
    if ins == 0:
        return '0:' + str(int(outs))
    if outs == 0:
        return str(int(ins)) + ':0'
    if ins == outs:
        return '1:1'
    # Build the approximation once and read its parts directly.
    # str(Fraction) contains no '/' for whole-number quotients (4/2 -> '2'),
    # so splitting the string on '/' raised IndexError for such inputs.
    approx = Fraction(ins / outs).limit_denominator(1000)
    return '{}:{}'.format(approx.numerator, approx.denominator)

In [2]:
# Folder holding the project data. The original workstation path stays the
# default; set the IRS_MIGRATION_DATA environment variable to run elsewhere.
data_path=os.environ.get('IRS_MIGRATION_DATA', r'C:\Users\Ernest\Desktop\Anastasia\irs-migration\irs_migration\data')
db='irsmig_county_database'
db_file=os.path.join(data_path,db,"irs_migration_county.sqlite")
# sqlite3.connect() creates a new empty database when the file does not exist,
# which would only show up later as a "no such table" error. Fail early instead.
if not os.path.isfile(db_file):
    raise FileNotFoundError('SQLite database not found: {}'.format(db_file))
con = sqlite3.connect(db_file)

In [3]:
# Tax-year pairs covered by the project
years = ['2011_12', '2012_13', '2013_14', '2014_15']

# A single year is processed per run (index 3 -> '2014_15'); looping over
# every entry in `years` is not enabled here.
year = years[3]
table1, table2 = ('{}_{}'.format(direction, year) for direction in ('outflow', 'inflow'))

In [4]:
# Read every record from the outflow and inflow tables, leaving out rows whose
# origin equals their destination.
query = "SELECT * from {} where {}.origin!={}.destination"
df_out = pd.read_sql_query(query.format(table1, table1, table1), con)
df_in = pd.read_sql_query(query.format(table2, table2, table2), con)
# Both reads are done, so release the database connection.
con.close()
df_in.head(n=5)

Unnamed: 0,uid,st_dest_abbrv,destination,origin,st_orig_abbrv,co_orig_name,returns,exemptions,income,disclosure
0,01001_01051,AL,1001,1051,AL,Elmore County,304.0,681.0,12150.0,
1,01001_01101,AL,1001,1101,AL,Montgomery County,279.0,642.0,9803.0,
2,01001_01021,AL,1001,1021,AL,Chilton County,68.0,177.0,2376.0,
3,01001_01047,AL,1001,1047,AL,Dallas County,49.0,113.0,1581.0,
4,01001_01073,AL,1001,1073,AL,Jefferson County,37.0,72.0,1422.0,


In [5]:
# Preview the first rows of the outflow table
df_out.head(n=5)

Unnamed: 0,uid,st_orig_abbrv,origin,destination,st_dest_abbrv,co_dest_name,returns,exemptions,income,disclosure
0,01001_01051,AL,1001,1051,AL,Elmore County,319.0,692.0,12952.0,
1,01001_01101,AL,1001,1101,AL,Montgomery County,288.0,577.0,10053.0,
2,01001_01021,AL,1001,1021,AL,Chilton County,60.0,140.0,2121.0,
3,01001_01073,AL,1001,1073,AL,Jefferson County,29.0,47.0,960.0,
4,01001_01117,AL,1001,1117,AL,Shelby County,26.0,37.0,1150.0,


In [6]:
# Not strictly necessary, but conceptually it is easier to think about the
# flows as a single table. To combine them, the inflow table needs a
# co_dest_name column and the outflow table needs a co_orig_name column.
counties_file = os.path.join(data_path, 'irs_counties.json')
counties = pd.read_json(counties_file, dtype=False)
counties.tail(n=5)

Unnamed: 0,co_fips,co_name
995,6039,Madera County
996,53005,Benton County
997,53021,Franklin County
998,37167,Stanly County
999,27137,St. Louis County


In [7]:
# Index both tables by "<origin>_<destination>" so that a record describing
# the same flow carries the same label in either table.
for frame in (df_out, df_in):
    frame['uid'] = frame['origin'] + "_" + frame['destination']
    frame.set_index('uid', inplace=True)

In [8]:
# Merge the county lookup to add the county name each table is missing:
# the inflow table gets co_dest_name (matched on destination) and the outflow
# table gets co_orig_name (matched on origin).
# `axis` is passed by keyword because the positional form drop('co_fips', 1)
# is rejected by pandas 2.0 and later.
t_in = (
    df_in.reset_index()
    .merge(counties, left_on='destination', right_on='co_fips', how='left')
    .drop('co_fips', axis=1)
    .rename(columns={'co_name': 'co_dest_name'})
    .set_index('uid')
)
t_out = (
    df_out.reset_index()
    .merge(counties, left_on='origin', right_on='co_fips', how='left')
    .drop('co_fips', axis=1)
    .rename(columns={'co_name': 'co_orig_name'})
    .set_index('uid')
)
# The original cell also bound the outflow result to `t` (t_out=t=...).
# NOTE(review): looks like a leftover; kept in case another cell reads `t`.
t = t_out

In [9]:
# Both tables now have the same columns, so stack them into one flows table.
# Many records exist in both tables; rows that are exact repeats are dropped.
stacked = pd.concat([t_in, t_out], axis=0)
flows = stacked.drop_duplicates()
flows.head(n=5)

Unnamed: 0_level_0,co_dest_name,co_orig_name,destination,disclosure,exemptions,income,origin,returns,st_dest_abbrv,st_orig_abbrv
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
01051_01001,Autauga County,Elmore County,1001,,681.0,12150.0,1051,304.0,AL,AL
01101_01001,Autauga County,Montgomery County,1001,,642.0,9803.0,1101,279.0,AL,AL
01021_01001,Autauga County,Chilton County,1001,,177.0,2376.0,1021,68.0,AL,AL
01047_01001,Autauga County,Dallas County,1001,,113.0,1581.0,1047,49.0,AL,AL
01073_01001,Autauga County,Jefferson County,1001,,72.0,1422.0,1073,37.0,AL,AL


In [10]:
# Some counties are named differently in the inflow and outflow tables
# (e.g. "Orleans Parish" vs "Orleans County"), so those records were not
# dropped as exact duplicates. Keep only the first row for each uid.
print(flows.shape)
repeated_uid = flows.index.duplicated(keep='first')
flows = flows[~repeated_uid]
print(flows.shape)

(56950, 10)
(54479, 10)


In [11]:
# FIPS codes of the five NYC counties: state prefix 36 followed by the county code
nyc = ['36' + county_code for county_code in ('005', '047', '061', '081', '085')]

In [12]:
# Keep the flows that touch one of the 5 NYC counties on at least one end
starts_in_nyc = flows['origin'].isin(nyc)
ends_in_nyc = flows['destination'].isin(nyc)
nyc_flows = flows[starts_in_nyc | ends_in_nyc]

In [13]:
# Split into flows that cross the NYC boundary. Moves between two NYC counties
# match neither condition and are left out of both tables.
origin_in_nyc = nyc_flows['origin'].isin(nyc)
dest_in_nyc = nyc_flows['destination'].isin(nyc)
nyc_in = nyc_flows[dest_in_nyc & ~origin_in_nyc]
nyc_out = nyc_flows[~dest_in_nyc & origin_in_nyc]

In [14]:
# Total exemptions moving into NYC, per county of origin, largest first
to_nyc = (
    nyc_in[['origin', 'co_orig_name', 'exemptions', 'st_orig_abbrv']]
    .groupby(['origin', 'co_orig_name', 'st_orig_abbrv'])
    .sum()
    .sort_values('exemptions', ascending=False)
    .reset_index()
    .rename(columns={'origin': 'co_fips', 'co_orig_name': 'co_name',
                     'exemptions': 'inflow', 'st_orig_abbrv': 'state'})
)
# Total exemptions moving out of NYC, per destination county, largest first
from_nyc = (
    nyc_out[['destination', 'co_dest_name', 'st_dest_abbrv', 'exemptions']]
    .groupby(['destination', 'co_dest_name', 'st_dest_abbrv'])
    .sum()
    .sort_values('exemptions', ascending=False)
    .reset_index()
    .rename(columns={'destination': 'co_fips', 'co_dest_name': 'co_name',
                     'st_dest_abbrv': 'state', 'exemptions': 'outflow'})
)
# top 5 inflows to NYC counties
to_nyc.head(n=5)

Unnamed: 0,co_fips,co_name,state,inflow
0,59000,Other flows - Different State,DS,19743.0
1,36059,Nassau County,NY,11900.0
2,36119,Westchester County,NY,9048.0
3,36103,Suffolk County,NY,5837.0
4,34017,Hudson County,NJ,4709.0


In [15]:
# top 5 outflows from NYC counties
from_nyc.head(n=5)

Unnamed: 0,co_fips,co_name,state,outflow
0,59000,Other flows - Different State,DS,23807.0
1,36059,Nassau County,NY,20316.0
2,36119,Westchester County,NY,13986.0
3,36103,Suffolk County,NY,8171.0
4,34017,Hudson County,NJ,7273.0


In [17]:
# Put inflow and outflow side by side for each county. The outer join keeps
# counties that appear in only one of the two tables.
merge_keys = ['co_fips', 'co_name', 'state']
flows_nyc = pd.merge(to_nyc, from_nyc, how='outer', on=merge_keys)
# ten largest senders to NYC
flows_nyc.sort_values('inflow', ascending=False).head(n=10)

Unnamed: 0,co_fips,co_name,state,inflow,outflow
0,59000,Other flows - Different State,DS,19743.0,23807.0
1,36059,Nassau County,NY,11900.0,20316.0
2,36119,Westchester County,NY,9048.0,13986.0
3,36103,Suffolk County,NY,5837.0,8171.0
4,34017,Hudson County,NJ,4709.0,7273.0
5,34003,Bergen County,NJ,3860.0,5816.0
6,6037,Los Angeles County,CA,3465.0,5365.0
7,48453,Travis County,TX,3067.0,3308.0
8,9001,Fairfield County,CT,2966.0,4283.0
9,57001,Foreign - Overseas,FR,2872.0,6666.0


In [18]:
# Top senders to NYC are also the top receivers of migrants from NYC; the first
# 4 counties are ranked the same for receiving and sending.
flows_nyc.sort_values('outflow', ascending=False).head(n=10)

Unnamed: 0,co_fips,co_name,state,inflow,outflow
0,59000,Other flows - Different State,DS,19743.0,23807.0
1,36059,Nassau County,NY,11900.0,20316.0
2,36119,Westchester County,NY,9048.0,13986.0
3,36103,Suffolk County,NY,5837.0,8171.0
4,34017,Hudson County,NJ,4709.0,7273.0
9,57001,Foreign - Overseas,FR,2872.0,6666.0
5,34003,Bergen County,NJ,3860.0,5816.0
6,6037,Los Angeles County,CA,3465.0,5365.0
10,34013,Essex County,NJ,2861.0,4791.0
8,9001,Fairfield County,CT,2966.0,4283.0


In [19]:
# Calculate net flow and the in/out ratios. These are computed while missing
# values are still NaN, so a county present in only one direction gets NaN here.
# NOTE(review): confirm NaN (rather than inflow - 0) is the intended net_flow
# for such counties.
flows_nyc['net_flow'] = flows_nyc.inflow - flows_nyc.outflow
flows_nyc['in_ratio'] = flows_nyc.inflow / flows_nyc.outflow
flows_nyc['out_ratio'] = flows_nyc.outflow / flows_nyc.inflow
# get_ratio needs numeric input, so nulls in inflow/outflow are filled with 0.
# The filled columns are assigned back to the frame: calling
# fillna(0, inplace=True) on flows_nyc.inflow acts on an intermediate Series
# and does not update flows_nyc when pandas Copy-on-Write is active.
flows_nyc[['inflow', 'outflow']] = flows_nyc[['inflow', 'outflow']].fillna(0)
flows_nyc['in_to_out_ratio'] = flows_nyc.apply(lambda x: get_ratio(x['inflow'], x['outflow']), axis=1)
flows_nyc.head()

Unnamed: 0,co_fips,co_name,state,inflow,outflow,net_flow,in_ratio,out_ratio,in_to_out_ratio
0,59000,Other flows - Different State,DS,19743.0,23807.0,-4064.0,0.829294,1.205845,787:949
1,36059,Nassau County,NY,11900.0,20316.0,-8416.0,0.585745,1.707227,263:449
2,36119,Westchester County,NY,9048.0,13986.0,-4938.0,0.646933,1.545756,601:929
3,36103,Suffolk County,NY,5837.0,8171.0,-2334.0,0.714356,1.399863,5:7
4,34017,Hudson County,NJ,4709.0,7273.0,-2564.0,0.647463,1.544489,472:729


In [16]:
# The metro area information attached to each county is based on the 2016 CBSA
# boundaries. Class M1 is for metropolitan statistical areas and M2 is for
# micropolitan statistical areas.
county_file = os.path.join(data_path, 'counties.json')
county = pd.read_json(county_file, dtype=False)
county.head(n=5)

Unnamed: 0,cbsa_fips,co_fips,co_name,csa_fips,lsad,metro_name,st_fips
0,,19107,Keokuk County,,,,19
1,,19189,Winnebago County,,,,19
10,,1025,Clarke County,,,,1
100,,37095,Hyde County,,,,37
1000,48900.0,37141,Pender County,,M1,"Wilmington, NC",37


In [20]:
# Attach the metro area name to flows_nyc; the left join keeps every
# flows_nyc row, including those with no match in `county`.
metro_lookup = county[['co_fips', 'metro_name']]
flows_nyc = flows_nyc.merge(metro_lookup, on='co_fips', how='left')

In [21]:
# Sum the in and out flows for each metro area, then order the result by
# inflow first and outflow second, both descending.
metro_totals = flows_nyc[['metro_name', 'inflow', 'outflow']].groupby('metro_name').sum()
by_metro = metro_totals.sort_values(['inflow', 'outflow'], ascending=False)
# top 10 metro areas that send/receive to/from NYC counties
by_metro.head(n=10)

Unnamed: 0_level_0,inflow,outflow
metro_name,Unnamed: 1_level_1,Unnamed: 2_level_1
"New York-Newark-Jersey City, NY-NJ-PA",50825.0,79873.0
"Miami-Fort Lauderdale-West Palm Beach, FL",5011.0,8347.0
"Washington-Arlington-Alexandria, DC-VA-MD-WV",4503.0,4399.0
"Philadelphia-Camden-Wilmington, PA-NJ-DE-MD",4433.0,5401.0
"Los Angeles-Long Beach-Anaheim, CA",4096.0,5996.0
"Boston-Cambridge-Newton, MA-NH",3655.0,3802.0
"Austin-Round Rock, TX",3067.0,3362.0
"Bridgeport-Stamford-Norwalk, CT",2966.0,4283.0
"San Francisco-Oakland-Hayward, CA",2470.0,3565.0
"Chicago-Naperville-Elgin, IL-IN-WI",2206.0,2232.0
