In [95]:
import pandas as pd
import numpy as np
import geopandas as gpd
import sqlite3
import os
from fractions import Fraction

def get_ratio(ins, outs):
    """Format an inflow/outflow pair as an ``'in:out'`` ratio string, e.g. ``'1:7'``.

    Parameters
    ----------
    ins : number
        Inflow count.
    outs : number
        Outflow count.

    Returns
    -------
    str
        * ``'0:<outs>'`` when ``ins`` is 0 (the other side is shown as-is, not reduced);
        * ``'<ins>:0'`` when ``outs`` is 0;
        * ``'1:1'`` when both sides are equal;
        * otherwise ``ins / outs`` approximated by the closest fraction whose
          denominator is at most 1000, written as ``'numerator:denominator'``.
          Extremely lopsided pairs can therefore round to a 0 on the left side.

    Raises
    ------
    ValueError
        If either value is NaN (it cannot be converted to an int or a Fraction),
        so missing values must be filled before calling this function.
    """
    if ins == 0:
        return '0:' + str(int(outs))
    if outs == 0:
        return str(int(ins)) + ':0'
    if ins == outs:
        return '1:1'
    # Build the approximation once and read its parts directly. Splitting
    # str(Fraction) on '/' fails for whole-number ratios such as 10:5, because
    # Fraction(2) prints as '2' with no '/' in it.
    approx = Fraction(ins / outs).limit_denominator(1000)
    return str(approx.numerator) + ':' + str(approx.denominator)

In [96]:
# Location of the project data. Set the IRS_MIGRATION_DATA environment variable to point at
# another copy of the data; when it is not set, the original local path is used.
data_path = os.environ.get('IRS_MIGRATION_DATA',
                           '/Users/anastasiaclark/irs_migration/irs_nyc_migration/data')
db = 'irsmig_county_database'
db_file = os.path.join(data_path, db, "irs_migration_county.sqlite")
# sqlite3.connect() silently creates an empty database when the file is missing, which only
# shows up later as a "no such table" error, so fail early with a clear message instead.
if not os.path.isfile(db_file):
    raise IOError('SQLite database not found: ' + db_file)
con = sqlite3.connect(db_file)

In [97]:
# tax-year pairs covered by the project
years = ['2011_12', '2012_13', '2013_14', '2014_15']

# only a single year is processed for now; the loop over all years is still disabled
#for year in years:
year = years[3]
table1 = 'outflow_' + year
table2 = 'inflow_' + year

In [98]:
# Read every row of the outflow and inflow tables for the selected year, leaving out rows
# whose origin and destination are the same.
flow_query = "SELECT * from {0} where {0}.origin!={0}.destination"
df_out = pd.read_sql_query(flow_query.format(table1), con)
df_in = pd.read_sql_query(flow_query.format(table2), con)

In [99]:
# Key each record by "<origin>_<destination>" so that the same flow carries the same
# index label (uid) in both tables
df_out = df_out.assign(uid=df_out.origin + "_" + df_out.destination).set_index('uid')
df_in = df_in.assign(uid=df_in.origin + "_" + df_in.destination).set_index('uid')

In [100]:
# Load the county-to-metro-area lookup table.
metros = pd.read_csv(
    os.path.join(data_path, 'metros', 'metros_basic.csv'),
    # read the code columns as text rather than numbers
    # (presumably to keep leading zeros in the codes -- confirm against the CSV)
    converters={'fips': str, 'co_code': str, 'cbsa_code': str},
)
# A bare metros.head() in the middle of a cell is never displayed (only the last
# expression of a cell is rendered), so only the column summary is requested here.
metros.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1825 entries, 0 to 1824
Data columns (total 8 columns):
cbsa_code      1825 non-null object
cbsa_name      1825 non-null object
metro_micro    1825 non-null object
co_name        1825 non-null object
st_name        1825 non-null object
st_code        1825 non-null int64
co_code        1825 non-null object
fips           1825 non-null object
dtypes: int64(1), object(7)
memory usage: 114.1+ KB


In [101]:
# FIPS codes of the five NYC counties
nyc = [
    '36005',  # Bronx County
    '36047',  # Kings County
    '36061',  # New York County
    '36081',  # Queens County
    '36085',  # Richmond County
]

In [102]:
# inflow records whose destination is an NYC county
arrives_in_nyc = df_in['destination'].isin(nyc)
nyc_in = df_in.loc[arrives_in_nyc]

# outflow records whose origin is an NYC county
leaves_nyc = df_out['origin'].isin(nyc)
nyc_out = df_out.loc[leaves_nyc]

In [103]:
# Total exemptions arriving in NYC, summed per county of origin, largest first.
# Columns are renamed so both tables share the same key columns.
to_nyc = (
    nyc_in[['origin', 'co_orig_name', 'exemptions', 'st_orig_abbrv']]
    .groupby(['origin', 'co_orig_name', 'st_orig_abbrv'])
    .sum()
    .sort_values('exemptions', ascending=False)
    .reset_index()
    .rename(columns={
        'origin': 'co_fips',
        'co_orig_name': 'co_name',
        'exemptions': 'inflow',
        'st_orig_abbrv': 'state',
    })
)

# Total exemptions leaving NYC, summed per destination county, largest first.
from_nyc = (
    nyc_out[['destination', 'co_dest_name', 'st_dest_abbrv', 'exemptions']]
    .groupby(['destination', 'co_dest_name', 'st_dest_abbrv'])
    .sum()
    .sort_values('exemptions', ascending=False)
    .reset_index()
    .rename(columns={
        'destination': 'co_fips',
        'co_dest_name': 'co_name',
        'st_dest_abbrv': 'state',
        'exemptions': 'outflow',
    })
)

In [104]:
# One row per county with both directions side by side; the outer join keeps counties
# that appear in only one of the two tables (their missing side is NaN).
county_keys = ['co_fips', 'co_name', 'state']
flows_nyc = pd.merge(to_nyc, from_nyc, how='outer', on=county_keys)

# ten largest senders to NYC
flows_nyc.sort_values(by='inflow', ascending=False).head(10)

Unnamed: 0,co_fips,co_name,state,inflow,outflow
0,36047,Kings County,NY,41886.0,36031.0
1,36061,New York County,NY,38830.0,28352.0
2,36081,Queens County,NY,27061.0,33000.0
3,59000,Other flows - Different State,DS,19743.0,23807.0
4,36005,Bronx County,NY,19504.0,26424.0
5,36059,Nassau County,NY,11900.0,20316.0
6,36119,Westchester County,NY,9048.0,13986.0
7,36103,Suffolk County,NY,5837.0,8171.0
8,34017,Hudson County,NJ,4709.0,7273.0
9,36085,Richmond County,NY,4234.0,7708.0


In [105]:
# Ten largest receivers of migrants from NYC.
# The top senders to NYC are also the top receivers of migrants from NYC: in the 2014_15
# output the same ten counties lead both the sending and the receiving rankings, with
# some differences in their order.
flows_nyc.sort_values(by='outflow', ascending=False).head(10)

Unnamed: 0,co_fips,co_name,state,inflow,outflow
0,36047,Kings County,NY,41886.0,36031.0
2,36081,Queens County,NY,27061.0,33000.0
1,36061,New York County,NY,38830.0,28352.0
4,36005,Bronx County,NY,19504.0,26424.0
3,59000,Other flows - Different State,DS,19743.0,23807.0
5,36059,Nassau County,NY,11900.0,20316.0
6,36119,Westchester County,NY,9048.0,13986.0
7,36103,Suffolk County,NY,5837.0,8171.0
9,36085,Richmond County,NY,4234.0,7708.0
8,34017,Hudson County,NJ,4709.0,7273.0


In [106]:
# Calculate net flow and the in/out ratios for every county.
# These are computed before missing values are filled, so a county that appears in only
# one direction gets NaN for net_flow, in_ratio and out_ratio.
# NOTE(review): confirm NaN (rather than a 0-filled difference) is wanted for net_flow.
flows_nyc['net_flow'] = flows_nyc['inflow'] - flows_nyc['outflow']
flows_nyc['in_ratio'] = flows_nyc['inflow'] / flows_nyc['outflow']
flows_nyc['out_ratio'] = flows_nyc['outflow'] / flows_nyc['inflow']

# get_ratio cannot handle NaN, so missing inflow/outflow values are replaced with 0 first.
# The filled columns are assigned back to the frame: calling fillna(..., inplace=True) on
# a column accessor (flows_nyc.inflow) may act on a temporary copy and leave the frame
# unchanged under pandas Copy-on-Write.
flows_nyc['inflow'] = flows_nyc['inflow'].fillna(0)
flows_nyc['outflow'] = flows_nyc['outflow'].fillna(0)
flows_nyc['in_to_out_ratio'] = flows_nyc.apply(
    lambda row: get_ratio(row['inflow'], row['outflow']), axis=1
)
flows_nyc.head()

Unnamed: 0,co_fips,co_name,state,inflow,outflow,net_flow,in_ratio,out_ratio,in_to_out_ratio
0,36047,Kings County,NY,41886.0,36031.0,5855.0,1.162499,0.860216,93:80
1,36061,New York County,NY,38830.0,28352.0,10478.0,1.369568,0.730157,63:46
2,36081,Queens County,NY,27061.0,33000.0,-5939.0,0.82003,1.219467,524:639
3,59000,Other flows - Different State,DS,19743.0,23807.0,-4064.0,0.829294,1.205845,787:949
4,36005,Bronx County,NY,19504.0,26424.0,-6920.0,0.738117,1.354799,730:989


In [107]:
# Drop the county-name column that exists in only one of the two tables, so that both
# tables end up with the same columns.
# `axis` is passed by keyword: the positional form drop(label, 1) was removed in pandas 2.0.
# New frames are bound instead of mutating in place, so subsets taken earlier are unaffected.
df_in = df_in.drop('co_orig_name', axis=1)
df_out = df_out.drop('co_dest_name', axis=1)

In [108]:
# Stack the inflow and outflow tables into one. Most records exist in both tables, so
# rows that agree on origin, destination, returns and exemptions are kept only once.
stacked_flows = pd.concat([df_in, df_out], axis=0)
flows = stacked_flows.drop_duplicates(subset=['origin', 'destination', 'returns', 'exemptions'])

In [109]:
# List records whose index label (uid) repeats an earlier one; none are expected.
# Any row shown here shares its origin/destination with another row but differs from it
# in returns or exemptions, which is why de-duplication did not remove it.
flows[flows.index.duplicated()]

Unnamed: 0_level_0,destination,disclosure,exemptions,income,origin,returns,st_dest_abbrv,st_orig_abbrv
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
06001_06083,06083,,112.0,2469.0,06001,74.0,CA,CA
06013_06085,06085,,1359.0,70854.0,06013,788.0,CA,CA
06013_32031,32031,,261.0,22906.0,06013,141.0,NV,CA
06013_06019,06019,,186.0,3576.0,06013,84.0,CA,CA
06013_48113,48113,,101.0,5569.0,06013,51.0,TX,CA
06017_06071,06071,,35.0,2632.0,06017,22.0,CA,CA
06029_06075,06075,,59.0,4878.0,06029,40.0,CA,CA
06033_06013,06013,,51.0,2110.0,06033,27.0,CA,CA
06037_12061,12061,,41.0,10657.0,06037,23.0,FL,CA
06071_53061,53061,,106.0,2530.0,06071,59.0,WA,CA


In [110]:
# Examine some of the records that might be errors.

# Why do some records have different exemptions/income numbers in the inflow and
# outflow tables, where they should be the same?
# Running the same check directly in the SQLite database gives the same results:
# SELECT * FROM inflow_2014_15 where origin="06001" and destination ="06083"
# SELECT * FROM outflow_2014_15 where origin="06001" and destination ="06083"
flows.loc['06013_06085']

Unnamed: 0_level_0,destination,disclosure,exemptions,income,origin,returns,st_dest_abbrv,st_orig_abbrv
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
06013_06085,6085,,1363.0,256538.0,6013,789.0,CA,CA
06013_06085,6085,,1359.0,70854.0,6013,788.0,CA,CA


# Continue from here after the above issue is resolved

In [111]:
# metro_in=df_in[(df_in.cbsa_name.notnull) & (df_in.cbsa_name.str.contains('New York'))].copy()
# metro_in.rename(columns={'cbsa_name':'cbsa_dest'},inplace=True)

# metro_out=df_out[(df_out.cbsa_name.notnull) & (df_out.cbsa_name.str.contains('New York'))].copy()
# metro_out.rename(columns={'cbsa_name':'cbsa_orig'},inplace=True)

In [112]:
# merge metro area information
# df_in = df_in.merge(metros[['fips', 'cbsa_name']],left_on='destination', right_on='fips', how='left').drop('fips',1).rename(columns={'cbsa_name':'dest_cbsa'})
# df_in = df_in.merge(metros[['fips', 'cbsa_name']],left_on='origin', right_on='fips', how='left').rename(columns={'cbsa_name':'orig_cbsa'})

# df_out = df_out.merge(metros[['fips', 'cbsa_name']],left_on='destination', right_on='fips', how='left').drop('fips',1).rename(columns={'cbsa_name':'dest_cbsa'})
# df_out = df_out.merge(metros[['fips', 'cbsa_name']],left_on='origin', right_on='fips', how='left').rename(columns={'cbsa_name':'orig_cbsa'})

In [113]:
# group in and out flows by the metro area
# sort in the descending order of inflow first, then outflow
# by_metro = flows_nyc[['cbsa_name', 'inflow', 'outflow']].groupby('cbsa_name').sum().sort_values(['inflow', 'outflow'], ascending=False)
# top 10 metro areas that send/receive to/from NYC counties
# by_metro.head(n=10)

In [114]:
# close the SQLite connection opened near the top of the notebook
con.close()