# analyze_ny_irs_mig_flows
The script analyzes the migration flows for NYC and NY metro area <br />
The data used is from the IRS Migration SQLite database compiled by Baruch College <br />
The most current version of the irs migration database is available at 
https://www.baruch.cuny.edu/confluence/display/geoportal/IRS+Migration+Database

In [35]:
%matplotlib inline
import pandas as pd
from functools import reduce
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np
import sqlite3
import os
import pandas.io.formats.format as pf
plt.style.use ('ggplot')

# display numbers (floats) with thousand separator
pd.options.display.float_format = '{:,.0f}'.format

# found this hack to format integers for display with thousand separator
# https://stackoverflow.com/questions/29663252/format-pandas-integers-for-display?answertab=active#tab-top
# NOTE(review): newer pandas releases appear to expose the array formatter
# classes under underscore-prefixed names; look up whichever name exists so the
# hack keeps working either way (verify against the installed pandas version)
_GenericFormatterBase = (getattr(pf, 'GenericArrayFormatter', None)
                         or getattr(pf, '_GenericArrayFormatter'))


class IntArrayFormatter(_GenericFormatterBase):
    """Format integer values for display with a thousand separator."""

    def _format_strings(self):
        # use an explicitly supplied formatter if there is one,
        # otherwise group digits with commas
        formatter = self.formatter or '{:,d}'.format
        return [formatter(x) for x in self.values]


# replace the integer formatter pandas looks up in its formatting module
pf.IntArrayFormatter = IntArrayFormatter
if hasattr(pf, '_IntArrayFormatter'):
    pf._IntArrayFormatter = IntArrayFormatter

In [37]:
# define paths and variables
# sqlite3.connect() hands the path to SQLite as-is, so a leading '~' has to be
# expanded to the user's home directory before it is used
project_path = os.path.expanduser('~/irs_nyc_migration')
data_path = os.path.join(project_path, 'data')
db = 'irsmig_county_database'

# connection and cursor used by every query in this notebook
con = sqlite3.connect(os.path.join(data_path, db, "irs_migration_county.sqlite"))
cur = con.cursor()

# county-to-metro lookup table; the code columns are read as strings
# (the FIPS codes used below, e.g. '06037', start with a zero)
metros = pd.read_csv(os.path.join(data_path, 'metros', 'metros_basic.csv'),
                     converters={'fips': str, 'co_code': str, 'cbsa_code': str})

# project years (suffixes of the inflow_/outflow_ table names)
years = (
    '2011_12',
    '2012_13',
    '2013_14',
    '2014_15',
)

# NYC counties
nyc = ('36005', '36047', '36061', '36081', '36085')

# NY Metro Counties
nyma = (
    '34003', '34013', '34017', '34019', '34023',
    '34025', '34027', '34029', '34031', '34035',
    '34037', '34039', '36005', '36027', '36047',
    '36059', '36061', '36071', '36079', '36081',
    '36085', '36087', '36103', '36119', '42103',
)

# county fips and city names
big_cities = {
    '06037': 'Los Angeles Co, CA',
    '17031': 'Cook Co, IL (Chicago)',
    '48201': 'Harris Co, TX (Houston)',
    '04013': 'Maricopa Co, AZ (Phoenix)',
    '42101': 'Philadelphia Co, PA',
    '48029': 'Bexar Co, TX (San Antonio)',
    '06073': 'San Diego Co, CA',
    '48113': 'Dallas Co, TX',
    '06085': 'Santa Clara Co, CA (San Jose)',
    '36005': 'New York City, NY',
    '36047': 'New York City, NY',
    '36061': 'New York City, NY',
    '36081': 'New York City, NY',
    '36085': 'New York City, NY',
}

# cbsa codes for 14 biggest metros
big_metros = [
    '35620', '31080', '16980', '19100', '26420', '47900', '37980',
    '33100', '12060', '14460', '41860', '38060', '40140', '19820',
]

# codes for foreign migration
foreign = ('57005', '57009', '57001', '57003', '57007')

# codes for suppressed flows
suppressed = ('58000', '59000')

# plots colors
cbrew = ['#f1a340', '#f7f7f7', '#998ec3']
cbrew2 = ['#fc8d59', '#ffffbf', '#99d594']
cbrew3 = ['#99d594', '#ffffbf', '#fc8d59']
flowcolor3 = ['#7FC97F', '#666666', '#00BFBF']
flowcolor2 = ['#7FC97F', '#666666']

In [None]:
# look at one year of the inflow data in the database:
# load the whole 2011-12 inflow table and show its first rows
inflow_temp = pd.read_sql_query ("SELECT * from inflow_2011_12", con)
inflow_temp.head()

In [None]:
# look at one year of the outflow data in the database:
# load the whole 2011-12 outflow table and show its first rows
# NOTE(review): the variable is still called inflow_temp although it now holds
# outflow data; it overwrites the inflow preview loaded in the previous cell
inflow_temp = pd.read_sql_query ("SELECT * from outflow_2011_12", con)
inflow_temp.head()

In [None]:
def get_flows_by_city(year, city):
    """Read one year of flows from the database and combine them for a place of interest.

    :param year: suffix of the inflow_/outflow_ table names, e.g. '2011_12'
    :param city: collection of county FIPS codes that together form the place
    :return: one row per county (or other origin/destination code) exchanging
        migrants with the place, with columns co_fips, co_name, state,
        inflow<year>, outflow<year>, cbsa_code, cbsa_name and net_flow<year>
    :rtype: dataframe"""

    # read in inflow & outflow data and store it in a pandas dataframe
    table1 = 'outflow_{}'.format(year)
    table2 = 'inflow_{}'.format(year)

    # SQL query to select flows between, but not within, counties
    query = "SELECT * from {0} where {0}.origin!={0}.destination"
    df_out = pd.read_sql_query(query.format(table1), con)
    df_in = pd.read_sql_query(query.format(table2), con)

    # most of the records for inflow will appear in the outflow table as well
    # and be same in both tables;
    # make the uid be same for same records and set it as dataframe's index
    df_out['uid'] = df_out.origin + "_" + df_out.destination
    df_in['uid'] = df_in.origin + "_" + df_in.destination

    df_in.set_index('uid', inplace=True)
    df_out.set_index('uid', inplace=True)

    # keep only flows that cross the boundary of the place:
    # one end inside `city`, the other end outside of it
    city_in = df_in[(df_in['destination'].isin(city)) & (~df_in['origin'].isin(city))]
    city_out = df_out[(df_out['origin'].isin(city)) & (~df_out['destination'].isin(city))]

    # exemptions arriving from each origin county, largest first
    to_city = city_in[['origin', 'co_orig_name', 'exemptions', 'st_orig_abbrv']].groupby(
        ['origin', 'co_orig_name', 'st_orig_abbrv']).sum().sort_values(
        'exemptions', ascending=False).reset_index()

    # exemptions leaving to each destination county, largest first
    from_city = city_out[['destination', 'co_dest_name', 'st_dest_abbrv', 'exemptions']].groupby(
        ['destination', 'co_dest_name', 'st_dest_abbrv']).sum().sort_values(
        'exemptions', ascending=False).reset_index()

    # give both tables the same key columns so they can be merged
    to_city.rename(columns={'origin': 'co_fips', 'co_orig_name': 'co_name', 'exemptions': 'inflow' + year,
                            'st_orig_abbrv': 'state'}, inplace=True)

    from_city.rename(columns={'destination': 'co_fips', 'co_dest_name': 'co_name', 'st_dest_abbrv': 'state',
                              'exemptions': 'outflow' + year}, inplace=True)

    # a county present in only one direction gets 0 for the other direction
    flows_city = to_city.merge(from_city, on=['co_fips', 'co_name', 'state'], how='outer')
    flows_city.fillna(0, inplace=True)

    # merge metro areas info to selected counties flows to determine which counties from these flows are urban
    # (drop the duplicate key by keyword: pandas 2.0 no longer accepts the axis positionally)
    flows_city = flows_city.merge(metros[['cbsa_code', 'cbsa_name', 'fips']], left_on='co_fips',
                                  right_on='fips', how='left').drop(columns='fips')

    # label counties that are not part of a metro area and are not suppressed or foreign as 'non-metro' counties
    is_non_metro = ((~flows_city['co_fips'].isin(suppressed))
                    & (~flows_city['co_fips'].isin(foreign))
                    & (flows_city['cbsa_code'].isnull()))
    flows_city.loc[is_non_metro, ['cbsa_name']] = 'non-metro'

    # add calculated columns
    flows_city['net_flow' + year] = flows_city['inflow' + year] - flows_city['outflow' + year]

    return flows_city

# NYC & NY metro

In [None]:
# empty lists to hold dataframes for each year
city_flows_dfs = []
metro_flows_dfs = []

In [None]:
# get inflow/outflow data for New York city and for the NY metro area,
# one dataframe per year appended to the respective list
for year in years:
    city_flows_dfs.append(get_flows_by_city(year, nyc))
    metro_flows_dfs.append(get_flows_by_city(year, nyma))

# columns that identify a place and are shared by every yearly dataframe
flow_keys = ['co_fips', 'co_name', 'state', 'cbsa_code', 'cbsa_name']


def _outer_merge(left, right):
    """Join two yearly flow tables, keeping places present in either one."""
    return pd.merge(left, right, on=flow_keys, how='outer')


# merge all years dfs for NYC (Metro) from the list into a single df
city_flows = reduce(_outer_merge, city_flows_dfs)
metro_flows = reduce(_outer_merge, metro_flows_dfs)

In [None]:
# county-level migration flows for NYC 
city_flows.head()

In [None]:
# county-level migration flows for NYMA
metro_flows.head()

In [None]:
city_flows.inflow2011_12.dtype

In [None]:
# pandas originally assigned float datatype to the columns containing number of migrants
# change datatype to integers
cols_to_int=['inflow2011_12','outflow2011_12','net_flow2011_12','inflow2012_13','outflow2012_13',
             'net_flow2012_13','inflow2013_14','outflow2013_14','net_flow2013_14',
             'inflow2014_15','outflow2014_15','net_flow2014_15']

In [None]:
# before converting to integers, we need to fill nulls with 0 in numerical columns only
# the filled column is assigned back to the frame: calling fillna(inplace=True) on a
# column selection may act on a temporary object and leave the frame unchanged
for frame in (city_flows, metro_flows):
    for c in frame.columns:
        if frame[c].dtype.kind in 'if':
            frame[c] = frame[c].fillna(0)

In [None]:
# cast the migrant-count columns to 64-bit integers in one vectorized step
# (DataFrame.applymap is deprecated in recent pandas releases)
city_flows[cols_to_int] = city_flows[cols_to_int].astype(np.int64)
metro_flows[cols_to_int] = metro_flows[cols_to_int].astype(np.int64)

In [None]:
city_flows.info()

In [None]:
# split the flows into domestic and foreign subsets;
# a row is foreign when its code is one of the codes in `foreign`
city_is_foreign = city_flows['co_fips'].isin(foreign)
domestic_city_flows = city_flows[~city_is_foreign].copy()
foreign_city_flows = city_flows[city_is_foreign].copy()

metro_is_foreign = metro_flows['co_fips'].isin(foreign)
domestic_metro_flows = metro_flows[~metro_is_foreign].copy()
foreign_metro_flows = metro_flows[metro_is_foreign].copy()

In [None]:
# group NYMA county-level flows by metro area to get metro-level flows for NYMA
# numeric_only=True keeps just the flow counts; without it newer pandas also
# tries to "sum" the text columns (county fips, name and state)
grouped_by_metro = metro_flows.groupby(['cbsa_name', 'cbsa_code']).sum(numeric_only=True).reset_index()
grouped_by_metro.head()

In [None]:
# create ranks for inflow and outflow for each year for both city and metro-level flows to identify top places
# only the yearly columns are ranked (names starting with 'inflow'/'outflow'),
# so re-running the cell after total columns exist does not create stray rank columns
for col in [c for c in domestic_city_flows.columns if c.startswith(('inflow', 'outflow'))]:
    yr = col[-7:]       # year suffix, e.g. '2011_12'
    in_out = col[0:2]   # 'in' for inflow columns, 'ou' for outflow columns
    rank_col = '{}_rank{}'.format(in_out, yr)
    # rank the domestic rows among themselves; ranking city_flows would let the
    # foreign rows take up rank positions in the domestic table
    domestic_city_flows[rank_col] = domestic_city_flows[col].rank(method='dense', ascending=False)
    grouped_by_metro[rank_col] = grouped_by_metro[col].rank(method='dense', ascending=False)

In [None]:
# add cumulative numbers for inflow, outflow, and netflow for the city
# these are totals for all 4 years
city_inflow_cols = ['inflow2011_12', 'inflow2012_13', 'inflow2013_14', 'inflow2014_15']
city_outflow_cols = ['outflow2011_12', 'outflow2012_13', 'outflow2013_14', 'outflow2014_15']

domestic_city_flows['tot_inflow'] = domestic_city_flows[city_inflow_cols].sum(axis=1)
domestic_city_flows['tot_outflow'] = domestic_city_flows[city_outflow_cols].sum(axis=1)
domestic_city_flows['tot_net_flow'] = (domestic_city_flows['tot_inflow']
                                       - domestic_city_flows['tot_outflow'])

In [None]:
# same 4-year totals for the metro-level flows
metro_inflow_cols = ['inflow2011_12', 'inflow2012_13', 'inflow2013_14', 'inflow2014_15']
metro_outflow_cols = ['outflow2011_12', 'outflow2012_13', 'outflow2013_14', 'outflow2014_15']

grouped_by_metro['tot_inflow'] = grouped_by_metro[metro_inflow_cols].sum(axis=1)
grouped_by_metro['tot_outflow'] = grouped_by_metro[metro_outflow_cols].sum(axis=1)
grouped_by_metro['tot_net_flow'] = (grouped_by_metro['tot_inflow']
                                    - grouped_by_metro['tot_outflow'])

In [None]:
# add net change between each pair of consecutive periods
for prev_year, next_year in zip(years, years[1:]):
    # column name is built from the ending year of both periods, e.g. 'net_change12_13'
    yr_change = 'net_change' + prev_year.split('_')[1] + '_' + next_year.split('_')[1]
    prev_col = 'net_flow' + prev_year
    next_col = 'net_flow' + next_year

    domestic_city_flows[yr_change] = domestic_city_flows[next_col] - domestic_city_flows[prev_col]
    grouped_by_metro[yr_change] = grouped_by_metro[next_col] - grouped_by_metro[prev_col]

In [None]:
# write the resulting data out to use for mapping in QGIS 
#domestic_city_flows.to_csv('yrs_2011_2015_nyc_mig_by_county.csv')
#grouped_by_metro.to_csv('yrs_2011_2015_ny_mig_by_metro.csv')

Write out a subset of columns for each dataframe as supplemental tables

In [None]:
# county-level supplemental table for NYC: identifiers, yearly flows and 4-year totals
nyc_table_cols = ['co_fips', 'co_name', 'state',
                  'inflow2011_12', 'outflow2011_12', 'net_flow2011_12',
                  'inflow2012_13', 'outflow2012_13', 'net_flow2012_13',
                  'inflow2013_14', 'outflow2013_14', 'net_flow2013_14',
                  'inflow2014_15', 'outflow2014_15', 'net_flow2014_15',
                  'tot_inflow', 'tot_outflow', 'tot_net_flow']
nyc_table_path = os.path.join(project_path, 'data', 'irs_mig_flows_nyc_2011_15.csv')
domestic_city_flows[nyc_table_cols].to_csv(nyc_table_path)

In [None]:
# metro-level supplemental table for NYMA: identifiers, yearly flows and 4-year totals
nyma_table_cols = ['cbsa_code', 'cbsa_name',
                   'inflow2011_12', 'outflow2011_12', 'net_flow2011_12',
                   'inflow2012_13', 'outflow2012_13', 'net_flow2012_13',
                   'inflow2013_14', 'outflow2013_14', 'net_flow2013_14',
                   'inflow2014_15', 'outflow2014_15', 'net_flow2014_15',
                   'tot_inflow', 'tot_outflow', 'tot_net_flow']
nyma_table_path = os.path.join(project_path, 'data', 'irs_mig_flows_nyma_2011_15.csv')
grouped_by_metro[nyma_table_cols].to_csv(nyma_table_path)

##  Summary Tables and Plots

In [None]:
def get_total_flows(df):
    """Sum the yearly inflow and outflow columns of `df` over all rows.

    :param df: dataframe with inflow<year> and outflow<year> columns for the
        four periods 2011_12 through 2014_15
    :return: dataframe indexed by 'years' with columns in_migration,
        out_migration and net_migration (in minus out)
    """
    periods = ('2011_12', '2012_13', '2013_14', '2014_15')

    def _yearly_sum(prefix, out_col):
        # one total per yearly column, then turn the column names into a 'years' column
        totals = df[[prefix + p for p in periods]].sum(axis=0).to_frame(out_col)
        totals = totals.reset_index().rename(columns={'index': 'years'})
        # keep only the year suffix of the column name, e.g. '2011_12'
        totals['years'] = totals['years'].str[-7:]
        return totals

    df_total = pd.merge(_yearly_sum('inflow', 'in_migration'),
                        _yearly_sum('outflow', 'out_migration'), on='years')
    df_total['net_migration'] = df_total['in_migration'] - df_total['out_migration']
    return df_total.set_index('years')

In [None]:
# yearly totals (in, out, net) of domestic and foreign migration for NYC
total_city_foreign=get_total_flows(foreign_city_flows)
total_city_dom=get_total_flows(domestic_city_flows)

# yearly totals for NYMA computed from the county-level rows,
# domestic and foreign separately
total_metro_dom=get_total_flows(domestic_metro_flows)
total_metro_foreign=get_total_flows(foreign_metro_flows)

# yearly totals for NYMA computed from the rows grouped by metro area
total_by_metro_dom=get_total_flows(grouped_by_metro)

In [None]:
total_metro_dom

In [None]:
total_metro_foreign

In [None]:
total_by_metro_dom

In [None]:
total_city_dom

In [None]:
total_city_foreign

In [None]:
# year by year plots
def plot_migration(df, colors):
    """Draw year-by-year bars of in- and out-migration with net migration on top.

    :param df: dataframe with in_migration, out_migration and net_migration columns
    :param colors: sequence of at least three colors (inflow, outflow, net flow)
    :return: the matplotlib axes holding the chart
    """
    def _with_thousands(value, _pos):
        # tick label as a whole number with thousand separators
        return "{:,}".format(int(value))

    flow_bars = df[['in_migration', 'out_migration']]
    ax = flow_bars.plot(kind='bar', rot=0, legend=False, color=colors[0:2])
    # narrower bars for the net flow, drawn over the same axes
    df['net_migration'].plot(kind='bar', ax=ax, rot=0, color=colors[2], width=0.2)

    ax.legend(['Inflows', 'Outflows', 'Net Flow'], loc=8, bbox_to_anchor=(0.5, -0.3),
              ncol=3, fontsize=12, frameon=False)
    ax.set_xlabel('')
    ax.get_yaxis().set_major_formatter(plt.FuncFormatter(_with_thousands))

    # ticks labels size
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)
    plt.show()
    return ax

In [None]:
# plot yearly domestic migration for NYC and save the chart under <project_path>/images
pl1=plot_migration(total_city_dom, flowcolor3)
f=pl1.get_figure()
f.savefig(os.path.join(project_path, 'images','nyc_dom_yrly.png'),bbox_inches='tight')

# foreign migration chart for NYC (currently disabled)
#pl2=plot_migration(total_city_foreign, flowcolor3)
#f2=pl2.get_figure()
#f2.savefig(os.path.join(project_path, 'images','nyc_forgn_yrly.png'),bbox_inches='tight')

# same chart for NYMA, based on the totals of the flows grouped by metro area
pl3=plot_migration(total_by_metro_dom, flowcolor3)
f3=pl3.get_figure()
f3.savefig(os.path.join(project_path, 'images','nyma_dom_yrly.png'),bbox_inches='tight')

In [None]:
# for cummulative, just sum the rows 
total_city_dom.sum()

In [None]:
# put cummulative in one table 
# 4-year sums of the yearly totals, one column per place and migration type
cum_mig = pd.concat(
    [total_city_dom.sum(), total_city_foreign.sum(),
     total_metro_dom.sum(), total_metro_foreign.sum()],
    axis=1,
    keys=['domestic_mig_nyc', 'foreign_mig_nyc', 'domestic_mig_nyma', 'foreign_mig_nyma'])
cum_mig

In [None]:
# format the table for output into latex document
# domestic columns only, transposed so each place is a row, with display labels
place_labels = {'domestic_mig_nyc': 'New York City', 'domestic_mig_nyma': 'New York Metro'}
flow_labels = {'in_migration': 'In Flows', 'out_migration': 'Out Flows', 'net_migration': 'Net Flow'}
cum_mig_latex = cum_mig[['domestic_mig_nyc', 'domestic_mig_nyma']].T.rename(
    index=place_labels, columns=flow_labels)

cum_mig_latex

In [None]:
cum_mig_latex.to_latex()

## Top Places 

In [None]:
# number of top places
top = 10

In [None]:
# not all top places in one year remain top in other years
# these functions select places that were ranked 1-top in any of the 4-year period
def get_top_senders(df):
    """Return a copy of the rows whose inflow rank was within `top` in any of the four periods."""
    rank_cols = ['in_rank2011_12', 'in_rank2012_13', 'in_rank2013_14', 'in_rank2014_15']
    # True for a row as soon as one yearly rank is at or below the cut-off
    made_top = (df[rank_cols] <= top).any(axis=1)
    return df[made_top].copy()
    
def get_top_receivers(df):
    """Return a copy of the rows whose outflow rank was within `top` in any of the four periods."""
    rank_cols = ['ou_rank2011_12', 'ou_rank2012_13', 'ou_rank2013_14', 'ou_rank2014_15']
    # True for a row as soon as one yearly rank is at or below the cut-off
    made_top = (df[rank_cols] <= top).any(axis=1)
    return df[made_top].copy()

In [None]:
# get top places for city, indexed by a "name,state" label
# (columns are dropped by keyword: pandas 2.0 no longer accepts the axis positionally)
top_senders_to_nyc = get_top_senders(domestic_city_flows)
top_senders_to_nyc['county'] = top_senders_to_nyc['co_name'] + "," + top_senders_to_nyc['state']
top_senders_to_nyc = top_senders_to_nyc.set_index('county').drop(columns=['co_name', 'state'])

top_receivers_from_nyc = get_top_receivers(domestic_city_flows)
top_receivers_from_nyc['county'] = top_receivers_from_nyc['co_name'] + "," + top_receivers_from_nyc['state']
top_receivers_from_nyc = top_receivers_from_nyc.set_index('county').drop(columns=['co_name', 'state'])

In [None]:
# get top senders/receivers for metro area
top_senders_to_nyma=get_top_senders(grouped_by_metro)
top_receivers_from_nyma=get_top_receivers(grouped_by_metro)

In [None]:
# look at the yearly migration from NYC to top receiving counties
top_receivers_from_nyc[['outflow2011_12','outflow2012_13','outflow2013_14','outflow2014_15']].sort_values(['outflow2011_12','outflow2012_13','outflow2013_14','outflow2014_15'], ascending=False)

In [None]:
# look at the yearly migration to NYC from top sending counties
top_senders_to_nyc[['inflow2011_12','inflow2012_13','inflow2013_14','inflow2014_15']].sort_values(['inflow2011_12','inflow2012_13','inflow2013_14','inflow2014_15'], ascending=False)

In [None]:
# look at the yearly migration to NYC from top sending counties
ax=top_senders_to_nyc[['inflow2011_12','inflow2012_13','inflow2013_14','inflow2014_15']].T.plot(kind='bar', rot=0, colormap='Paired')
ax.legend (bbox_to_anchor=(1.07, 1), loc='upper left')
ax.get_yaxis().set_major_formatter(plt.FuncFormatter(lambda x, loc: "{:,}".format(int(x))))
ax.set_xticklabels(years)
plt.show()

In [None]:
# yearly migration from NYMA to the top receiving metro areas
# (rows come from the flows grouped by metro area, not from counties)
top_receivers_from_nyma[['cbsa_name','outflow2011_12','outflow2012_13','outflow2013_14','outflow2014_15']].sort_values(['outflow2011_12','outflow2012_13','outflow2013_14','outflow2014_15'], ascending=False)

In [None]:
# yearly migration to NYMA from the top sending metro areas
# (rows come from the flows grouped by metro area, not from counties)
top_senders_to_nyma[['cbsa_name','inflow2011_12','inflow2012_13','inflow2013_14','inflow2014_15']].sort_values(['inflow2011_12','inflow2012_13','inflow2013_14','inflow2014_15'], ascending=False)

In [None]:
def plot_ranks(df, cols, title):
    """Plot how the ranks of places change over time, one line per place.

    :param df: dataframe to plot; its index supplies the legend labels
    :param cols: a list of rank columns to plot
    :param title: title to display
    """
    # label each rank column by its year suffix only
    df_ranks = df[cols].rename(columns=lambda name: name[-7:] if 'rank' in name else name)

    ax = df_ranks.T.plot(colormap='Paired')
    ax.invert_yaxis()  # rank 1 at the top
    ax.yaxis.set_major_locator(ticker.MaxNLocator(integer=True))  # display only whole numbers
    ax.legend(bbox_to_anchor=(1.07, 1), loc='upper left')
    ax.set_xlabel('Year')
    ax.set_ylabel('Rank')

    # mirror the rank axis on the right-hand side with the same limits
    right_axis = ax.twinx()
    right_axis.yaxis.set_major_locator(ticker.MaxNLocator(integer=True))
    right_axis.set_ylim(ax.get_ylim())

    plt.title(title)
    plt.show()

In [None]:
# plot change in ranks for top places that send migrants to NYC
plot_ranks (top_senders_to_nyc,['in_rank2011_12', 'in_rank2012_13', 'in_rank2013_14', 'in_rank2014_15'], 'Change in Ranks for Top Migrant Senders to NYC')

In [None]:
# plot change in ranks for top places that receive migrants from NYC
plot_ranks (top_receivers_from_nyc,['ou_rank2011_12', 'ou_rank2012_13', 'ou_rank2013_14', 'ou_rank2014_15'], 'Change in Ranks for Top Migrant Receivers from NYC')

In [None]:
# same for NYMA
plot_ranks (top_senders_to_nyma.set_index('cbsa_name'),['in_rank2011_12', 'in_rank2012_13', 'in_rank2013_14', 'in_rank2014_15'], 'Change in Ranks for Top Migrant Senders to New York Metro Area')

In [None]:
plot_ranks (top_receivers_from_nyma.set_index('cbsa_name'),['ou_rank2011_12', 'ou_rank2012_13', 'ou_rank2013_14', 'ou_rank2014_15'], 'Change in Ranks for Top Migrant Receivers from New York Metro Area')

## Deficit areas (negative net change) and Surplus Areas (positive net change)

The top deficit and surplus areas aren't same over the years. We can look at them one year at a time

In [None]:
# top surplus (send more to NYC than recieve from NYC) counties for 2011-2012
domestic_city_flows[['co_name','state','net_flow2011_12']].sort_values('net_flow2011_12', ascending=False).head(n=20)

In [None]:
# top deficit (receive more from NYC than send to NYC) counties for 2011-2012
domestic_city_flows[['co_name','state','net_flow2011_12']].sort_values('net_flow2011_12', ascending=True).head(n=20)

We can look at top places for cumulative (all 4 years) flows

In [None]:
domestic_city_flows['label']=domestic_city_flows['co_name']+', '+domestic_city_flows['state']

In [None]:
def plot_cum_top_places(df, col, label, color, title, n=20, high_to_low=True):
    """Plot a horizontal bar chart of the first `n` places ordered by `col`.

    :param df: dataframe holding the value and label columns
    :param col: name of the column to order and plot
    :param label: name of the column used for the bar labels
    :param color: bar color
    :param title: chart title
    :param n: number of places to show
    :param high_to_low: take the largest values when True, the smallest when False
    """
    def _with_thousands(value, _pos):
        # tick label as a whole number with thousand separators
        return "{:,}".format(int(value))

    ordered = df[[col, label]].sort_values(col, ascending=not high_to_low)
    top_n = ordered.head(n=n)

    ax = top_n.set_index(label).plot(kind='barh', color=color, figsize=(8, 6))
    ax.get_xaxis().set_major_formatter(plt.FuncFormatter(_with_thousands))
    ax.set_title(title)
    ax.set_ylabel('')
    plt.show()

In [None]:
plot_cum_top_places(domestic_city_flows,'tot_inflow','label','green','Domestic Migration to NYC (2011-2015): Top 20 Migrant Senders')

In [None]:
plot_cum_top_places(domestic_city_flows,'tot_outflow','label','purple','Domestic Migration to NYC (2011-2015): Top 20 Migrant Receivers')

In [None]:
plot_cum_top_places(domestic_city_flows,'tot_net_flow','label','red','Domestic Migration to NYC (2011-2015): Top 20 Deficit Counties', high_to_low=False)

In [None]:
plot_cum_top_places(domestic_city_flows,'tot_net_flow','label','blue','Domestic Migration to NYC (2011-2015): Top 20 Surplus Counties')

In [None]:
plot_cum_top_places(grouped_by_metro.reset_index(),'tot_inflow','cbsa_name','green','Domestic Migration to NYMA: Top 20 Senders')

In [None]:
plot_cum_top_places(grouped_by_metro.reset_index(),'tot_outflow','cbsa_name','purple','Domestic Migration to NYMA (2011-2015): Top 20 Migrant Receivers')

In [None]:
plot_cum_top_places(grouped_by_metro.reset_index(),'tot_net_flow','cbsa_name','red','Domestic Migration to NYMA (2011-2015): Top 20 Deficit Metro Areas', high_to_low=False)

In [None]:
# rows are metro areas, so the title says "Metro Areas" like the deficit chart
plot_cum_top_places(grouped_by_metro.reset_index(),'tot_net_flow','cbsa_name','blue','Domestic Migration to NYMA (2011-2015): Top 20 Surplus Metro Areas')

## Net Totals Tables

In [None]:
# order the counties by cumulative net flow, lowest first; the frame stays sorted
# for the cells that follow
domestic_city_flows.sort_values(by='tot_net_flow', inplace=True)

# first 20 rows, leaving out the row with code '59000'
not_59000 = domestic_city_flows.co_fips != '59000'
citytop20net = (domestic_city_flows[not_59000][['label', 'tot_net_flow']]
                .head(n=20)
                .set_index('label'))
print(citytop20net.to_latex())

In [None]:
# last 10 rows of the frame in its current order, shown from highest to lowest net flow
citybot10net = (domestic_city_flows[['label', 'tot_net_flow']]
                .tail(n=10)
                .set_index('label')
                .sort_values(by='tot_net_flow', ascending=False))
print(citybot10net.to_latex())

In [None]:
# order the metro areas by cumulative net flow, lowest first; the frame stays sorted
# for the cells that follow
grouped_by_metro.sort_values(by='tot_net_flow', inplace=True)
metrotop20net = (grouped_by_metro[['cbsa_name', 'tot_net_flow']]
                 .head(n=20)
                 .set_index('cbsa_name'))
print(metrotop20net.to_latex())

In [None]:
# last 10 rows of the frame in its current order, shown from highest to lowest net flow
metrobot10net = (grouped_by_metro[['cbsa_name', 'tot_net_flow']]
                 .tail(n=10)
                 .set_index('cbsa_name')
                 .sort_values(by='tot_net_flow', ascending=False))
print(metrobot10net.to_latex())

## Urban Suburban Status

In [None]:
# function to label counties in metro as 'suburban' and others as 'distant'
def is_suburban(col, metro):
    """Return 'suburban' when `col` is a member of `metro`, otherwise 'distant'."""
    return 'suburban' if col in metro else 'distant'

In [None]:
domestic_city_flows['cnty_status']=domestic_city_flows.co_fips.apply(lambda x: is_suburban(x, nyma))

In [None]:
# total yearly inflows and outflows per county status
status_flow_cols = ['inflow2011_12', 'inflow2012_13', 'inflow2013_14', 'inflow2014_15',
                    'outflow2011_12', 'outflow2012_13', 'outflow2013_14', 'outflow2014_15']
by_status = domestic_city_flows[['cnty_status'] + status_flow_cols].groupby('cnty_status').sum()

In [None]:
# percent change between the first (2011_12) and the last (2014_15) period
first_in, last_in = by_status['inflow2011_12'], by_status['inflow2014_15']
first_out, last_out = by_status['outflow2011_12'], by_status['outflow2014_15']
by_status['pct_dif_inflow'] = (last_in - first_in) / first_in * 100
by_status['pct_dif_outflow'] = (last_out - first_out) / first_out * 100

# totals over all four periods
status_inflow_cols = ['inflow2011_12', 'inflow2012_13', 'inflow2013_14', 'inflow2014_15']
status_outflow_cols = ['outflow2011_12', 'outflow2012_13', 'outflow2013_14', 'outflow2014_15']
by_status['tot_inflow'] = by_status[status_inflow_cols].sum(axis=1)
by_status['tot_outflow'] = by_status[status_outflow_cols].sum(axis=1)

In [None]:
by_status

In [None]:
# inflows and outflows to NYC from suburban counties accounted for more than third of all domestic NYC flows 
ax=by_status[['tot_inflow','tot_outflow']].plot.pie(subplots=True, legend=False, autopct='%1.1f%%',figsize=(8,4),colormap='Paired')

In [None]:
# yearly change in inflows to NYC for suburban and distant counties
by_status[['inflow2011_12','inflow2012_13',
                               'inflow2013_14','inflow2014_15']].T.plot(kind='bar', rot=30, colormap='Paired')

## Top Places in one table

Most of the top senders to NYC/NYMA are also top receivers from NYC/NYMA. We can look at them in a single table/chart.

In [None]:
domestic_city_flows.set_index('co_fips', inplace=True)

In [None]:
# get the 16 largest cumulative inflows and outflows and list the counties behind them;
# a county can show up twice here when it is in both top lists
top_inflows = domestic_city_flows['tot_inflow'].sort_values(ascending=False).head(n=16)
top_outflows = domestic_city_flows['tot_outflow'].sort_values(ascending=False).head(n=16)
counties_for_plot = top_inflows.index.tolist() + top_outflows.index.tolist()

In [None]:
counties_for_plot=set(counties_for_plot)
# some places only appear in only one of the tables--we end up with more than 16 places
len(counties_for_plot)

In [None]:
# select the top counties; the collection is turned into a list because
# pandas 2.0 no longer accepts a set as a .loc indexer
df_to_plot = domestic_city_flows.loc[list(counties_for_plot), ['label', 'tot_inflow', 'tot_outflow']]
df_to_plot = df_to_plot.sort_values(by='tot_outflow', ascending=False)
df_to_plot

In [None]:
# drop Other flows-Different State (code '59000', one of the codes in `suppressed`) and plot
# cumulative inflow and outflow side by side for each remaining place
axc=df_to_plot.drop('59000').set_index('label').plot(kind='barh',figsize=(8,7), color=flowcolor2)
axc.legend(['Coming to NYC','Leaving from NYC'], loc='best',fontsize=12, frameon=False)
# first row of the (sorted) table at the top of the chart
axc.invert_yaxis()
axc.set_ylabel('')
axc.set_xlabel('Migrants')
# x tick labels as whole numbers with thousand separators
axc.get_xaxis().set_major_formatter(plt.FuncFormatter(lambda x, loc: "{:,}".format(int(x))))

# ticks labels size
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.show()
# save the chart under <project_path>/images
nyc_fig=axc.get_figure()
nyc_fig.savefig(os.path.join(project_path, 'images','nyc_top_flow.png'),bbox_inches='tight')

In [None]:
# index the metro table by cbsa code, then collect the codes of the 15 largest
# cumulative inflows and the 15 largest cumulative outflows
grouped_by_metro.set_index('cbsa_code', inplace=True)
top_m_inflows = grouped_by_metro['tot_inflow'].sort_values(ascending=False).head(n=15)
top_m_outflows = grouped_by_metro['tot_outflow'].sort_values(ascending=False).head(n=15)
metros_for_plot = set(top_m_inflows.index.tolist()) | set(top_m_outflows.index.tolist())
# some places only appear in only one of the tables--we end up with more than 15 places
len(metros_for_plot)

In [None]:
# select the top metro areas; the collection is turned into a list because
# pandas 2.0 no longer accepts a set as a .loc indexer
dfm_to_plot = grouped_by_metro.loc[list(metros_for_plot), ['cbsa_name', 'tot_inflow', 'tot_outflow']]
dfm_to_plot = dfm_to_plot.sort_values(by='tot_outflow', ascending=False)
dfm_to_plot

In [None]:
# horizontal bars of cumulative inflow and outflow for the selected metro areas
axm=dfm_to_plot.set_index('cbsa_name').plot(kind='barh',figsize=(8,7),color=flowcolor2)
axm.legend(['Coming to NYMA','Leaving from NYMA',], loc='best',fontsize=12, frameon=False)
axm.set_ylabel('')
# first row of the (sorted) table at the top of the chart
axm.invert_yaxis()
axm.set_xlabel('Migrants')
# x tick labels as whole numbers with thousand separators
axm.get_xaxis().set_major_formatter(plt.FuncFormatter(lambda x, loc: "{:,}".format(int(x))))

# ticks labels size
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.show()
# save the chart under <project_path>/images
nyma_fig=axm.get_figure()
nyma_fig.savefig(os.path.join(project_path, 'images','nyma_top_flow.png'),bbox_inches='tight')

## Comparison with other big cities and metros

Compare cumulative flows for selected big (population over 1M ) cities

In [None]:
# empty list to hold the records of the dataframe we'll build from querying the database
df_lines=[]

In [None]:
# get cumulative total domestic inflows and outflows for big cities
# one '?' placeholder per foreign code, so the values are bound by sqlite3
# instead of being formatted into the SQL text
foreign_marks = ','.join('?' * len(foreign))

for cnty in big_cities:

    # exclude NYC: it consists of multiple counties and we already have the data
    if cnty in nyc:
        continue

    # running totals over all years
    total_out = 0
    total_in = 0
    for year in years:
        # table names cannot be bound as parameters, so they are still formatted in
        table1 = 'outflow_{}'.format(year)
        table2 = 'inflow_{}'.format(year)
        params = (cnty, cnty) + tuple(foreign)

        # people leaving the county for any other domestic county
        cur.execute("SELECT SUM (exemptions) from {0} where origin=? and destination!=? "
                    "and destination not in ({1})".format(table1, foreign_marks), params)
        outs = cur.fetchone()[0]

        # people arriving in the county from any other domestic county
        cur.execute("SELECT SUM (exemptions) from {0} where destination=? and origin!=? "
                    "and origin not in ({1})".format(table2, foreign_marks), params)
        ins = cur.fetchone()[0]

        # SUM() returns NULL (None) when no row matches; count that year as 0
        total_out += outs or 0
        total_in += ins or 0

    df_line = (cnty, total_in, total_out)
    df_lines.append(df_line)

In [None]:
# create a dataframe from the list 
df_cities=pd.DataFrame(df_lines, columns=['county','tot_inflow','tot_outflow'])
df_cities['tot_net_flow']=df_cities['tot_inflow']-df_cities['tot_outflow']

In [None]:
# add city names column
city_names=pd.DataFrame.from_dict(big_cities, orient='index')
df_cities=df_cities.merge(city_names, left_on='county', right_index=True, how='left').rename(columns={0:'city_name'})

In [None]:
# append NYC records: one row with the sums of the domestic NYC totals
nyc_tot = pd.DataFrame(domestic_city_flows[['tot_inflow', 'tot_outflow', 'tot_net_flow']].sum()).T
nyc_tot['city_name'] = 'New York City, NY'
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the same way
df_cities = pd.concat([df_cities, nyc_tot])

In [None]:
# keep the display columns (this drops the county column) and order the cities
# from highest to lowest cumulative net flow
df_cities = (df_cities[['city_name', 'tot_inflow', 'tot_outflow', 'tot_net_flow']]
             .sort_values(by='tot_net_flow', ascending=False))
df_cities

In [None]:
# plot domestic migration for 10 big cities:
# horizontal bars of cumulative inflow and outflow, one pair per city
axbc=df_cities[['city_name','tot_inflow','tot_outflow']].set_index('city_name').plot(kind='barh', figsize=(8,5), color=flowcolor2)
axbc.legend(['In Flows','Out Flows'], fontsize=12, frameon=False)
# first row of the (sorted) table at the top of the chart
axbc.invert_yaxis()
# x tick labels as whole numbers with thousand separators
axbc.get_xaxis().set_major_formatter(plt.FuncFormatter(lambda x, loc: "{:,}".format(int(x))))
axbc.set_ylabel('')
# ticks labels size
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.show()
# save the chart under <project_path>/images
big_c_fig=axbc.get_figure()
big_c_fig.savefig(os.path.join(project_path, 'images','big_cities_dom.png'),bbox_inches='tight')

Compare cumulative flows for selected largest metros

In [None]:
mdf_lines=[]

In [None]:
# Running this cell takes a while: every yearly inflow/outflow table is read in
# full and joined to the metro lookup. The tables do not depend on the metro
# being summarised, so each one is loaded once per year and reused for every
# metro rather than being re-read for each metro.
import time
start = time.time()

# metro lookup columns used to tag each county with its CBSA code and name
cbsa_lookup = metros[['cbsa_code', 'cbsa_name', 'fips']]


def _load_flows_with_metros(table):
    """Read county-to-county flows from `table` and attach CBSA columns.

    The query excludes rows whose origin and destination are the same county.
    The returned frame gains `cbsa_orig` / `cbsa_orig_name` (joined on
    `origin`) and `cbsa_dest` / `cbsa_dest_name` (joined on `destination`).
    Both joins are left joins, so counties missing from `metros` keep their
    rows with NaN in the CBSA columns.
    """
    flows = pd.read_sql_query(
        "SELECT * from {0} where {0}.origin!={0}.destination".format(table), con)
    for side, key in (('orig', 'origin'), ('dest', 'destination')):
        flows = (
            flows
            .merge(cbsa_lookup, left_on=key, right_on='fips', how='left')
            # keyword form: the positional axis argument is rejected by pandas 2.0
            .drop(columns='fips')
            .rename(columns={'cbsa_name': 'cbsa_{}_name'.format(side),
                             'cbsa_code': 'cbsa_{}'.format(side)})
        )
    return flows


metro_codes = list(big_metros)
# one list of yearly sums per metro, kept by position so the output order
# (and any repeated code) matches big_metros exactly
yearly_outs = [[] for _ in metro_codes]
yearly_ins = [[] for _ in metro_codes]

for year in years:
    outflows = _load_flows_with_metros('outflow_{}'.format(year))
    inflows = _load_flows_with_metros('inflow_{}'.format(year))

    # drop foreign counterparts once per year instead of once per metro
    dom_out = outflows[~outflows['destination'].isin(foreign)]
    dom_in = inflows[~inflows['origin'].isin(foreign)]

    for i, m in enumerate(metro_codes):
        # select & sum flows leaving metro m for counties that are not in the same metro
        leaving = (dom_out['cbsa_orig'] == m) & (dom_out['cbsa_dest'] != m)
        yearly_outs[i].append(dom_out.loc[leaving, 'exemptions'].sum())

        # select & sum flows arriving in metro m from counties that are not in the same metro
        arriving = (dom_in['cbsa_orig'] != m) & (dom_in['cbsa_dest'] == m)
        yearly_ins[i].append(dom_in.loc[arriving, 'exemptions'].sum())

# sum yearly data: one (cbsa, total_in, total_out) tuple per metro
for m, ins, outs in zip(metro_codes, yearly_ins, yearly_outs):
    mdf_lines.append((m, sum(ins), sum(outs)))

end = time.time()
print(int(end - start), 'seconds passed')

In [None]:
# inspect the raw (cbsa, total_in, total_out) tuples before tabulating them
mdf_lines

In [None]:
# tabulate the per-metro totals and derive net flow (inflow minus outflow)
df_metros = (
    pd.DataFrame(mdf_lines, columns=['cbsa', 'tot_inflow', 'tot_outflow'])
    .assign(tot_net_flow=lambda flows: flows['tot_inflow'] - flows['tot_outflow'])
)
df_metros

In [None]:
# add metro names to cbsa codes and merge
# reduce the metros lookup to one row per distinct (name, code) pair first,
# so the merge does not multiply the rows of df_metros
metro_names = metros[['cbsa_name', 'cbsa_code']].drop_duplicates()
# drop(columns=...) replaces the positional axis argument, which pandas 2.0
# no longer accepts; the sort is chained instead of done in place
df_metros = (
    df_metros
    .merge(metro_names, left_on='cbsa', right_on='cbsa_code')
    .drop(columns='cbsa_code')
    .sort_values(by='tot_net_flow', ascending=False)
)
df_metros

In [None]:
# plot domestic migration for big metros
# one pair of horizontal bars (in / out) per metro, in the table's current row order
axbm = (
    df_metros[['cbsa_name', 'tot_inflow', 'tot_outflow']]
    .set_index('cbsa_name')
    .plot(kind='barh', figsize=(8, 6), color=flowcolor2)
)
axbm.set_ylabel('')
# barh puts the first row at the bottom; flip the axis so it ends up on top
axbm.invert_yaxis()
axbm.legend(['In Flows', 'Out Flows'], frameon=False, fontsize=12)
# x tick values as whole numbers with thousands separators
axbm.xaxis.set_major_formatter(
    plt.FuncFormatter(lambda value, pos: "{:,}".format(int(value))))
# tick label size
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.show()
# write the chart into the project's images folder
big_m_fig = axbm.get_figure()
big_m_fig.savefig(os.path.join(project_path, 'images', 'big_metros_dom.png'), bbox_inches='tight')

## Los Angeles

Quick look at Los Angeles City and LA metro area flows

In [None]:
# county FIPS codes passed to get_flows_by_city in the cells below
# 06037 is labelled 'Los Angeles Co, CA' in big_cities
# NOTE(review): `la` is a list while `lama` is a tuple - confirm get_flows_by_city handles both
la=['06037']
# LA metro counties: 06037 plus 06059 (presumably Orange Co, CA - confirm)
lama=('06037','06059')

In [None]:
# per-year flow tables for `la` and for `lama`, filled by the next cell
la_flows_dfs, lama_flows_dfs = [], []

In [None]:
# fetch one flow table per project year, first for `la`, then for `lama`
la_flows_dfs.extend(get_flows_by_city(year, la) for year in years)
lama_flows_dfs.extend(get_flows_by_city(year, lama) for year in years)


def _merge_years(left, right):
    """Outer-join two yearly flow tables on the county/metro identifier columns."""
    return pd.merge(left, right, how='outer',
                    on=['co_fips', 'co_name', 'state', 'cbsa_code', 'cbsa_name'])


# merge all years dfs for LA (LAMA) from the list into a single df
la_flows = reduce(_merge_years, la_flows_dfs)
lama_flows = reduce(_merge_years, lama_flows_dfs)

In [None]:
# preview the first rows of the merged multi-year table for `la`
la_flows.head()

In [None]:
# get subsets of domestic flows
# rows whose county code is listed in `foreign` are excluded; .copy() gives an
# independent frame so the columns added in later cells do not touch la_flows
domestic_la_flows = la_flows[~la_flows['co_fips'].isin(foreign)].copy()

# group by metro area to get metro to metro flows
# numeric_only=True restricts the sum to numeric columns; since pandas 2.0 the
# default no longer drops non-numeric columns, so the text columns
# (co_fips, co_name, state) would otherwise be "summed" as well
lama_by_metro = (
    lama_flows
    .groupby(['cbsa_name', 'cbsa_code'])
    .sum(numeric_only=True)
    .reset_index()
)

In [None]:
# add cumulative numbers for inflow, outflow, netflow for city and metro
# these are totals across every year in `years`; the column names are derived
# from that tuple so they cannot drift from the years the flow tables were
# fetched for
domestic_la_flows['tot_inflow'] = domestic_la_flows[
    ['inflow{}'.format(y) for y in years]].sum(axis=1)

domestic_la_flows['tot_outflow'] = domestic_la_flows[
    ['outflow{}'.format(y) for y in years]].sum(axis=1)

# positive net flow = more arrivals than departures
domestic_la_flows['tot_net_flow'] = domestic_la_flows['tot_inflow'] - domestic_la_flows['tot_outflow']

In [None]:
# display label in the form "<county name>, <state>"
domestic_la_flows['label']=domestic_la_flows['co_name']+', '+domestic_la_flows['state']

In [None]:
# cumulative inflow, outflow and net flow per metro across every year in `years`;
# the column names are derived from that tuple rather than hard-coded
lama_by_metro['tot_inflow'] = lama_by_metro[
    ['inflow{}'.format(y) for y in years]].sum(axis=1)

lama_by_metro['tot_outflow'] = lama_by_metro[
    ['outflow{}'.format(y) for y in years]].sum(axis=1)

# positive net flow = more arrivals than departures
lama_by_metro['tot_net_flow'] = lama_by_metro['tot_inflow'] - lama_by_metro['tot_outflow']

In [None]:
# first 30 rows when ranked by cumulative net flow, largest first
(
    domestic_la_flows[['label', 'tot_inflow', 'tot_outflow', 'tot_net_flow']]
    .sort_values(by='tot_net_flow', ascending=False)
    .head(n=30)
)

In [None]:
# first 30 metros when ranked by cumulative net flow, largest first
(
    lama_by_metro[['cbsa_name', 'tot_inflow', 'tot_outflow', 'tot_net_flow']]
    .sort_values(by='tot_net_flow', ascending=False)
    .head(n=30)
)