In [2]:
# load data files
# https://www.irs.gov/statistics/soi-tax-stats-migration-data

from io import BytesIO
import re
import requests
import requests_cache
import pandas as pd

# Install requests_cache's global cache so repeated requests.get calls for the
# same URL can be answered from the cache on later runs.
requests_cache.install_cache()

# One key per consecutive two-digit year pair: '1112', '1213', ..., '1516'.
# Each key selects one IRS migration file and doubles as the 'timeframe' label.
data_keys = [f'{first_year}{first_year + 1}' for first_year in range(11, 16)]

# Extra keyword arguments forwarded to pd.read_csv (none at the moment).
pd_options = dict()

def get_data_url(data_key):
    """Return the IRS SOI download URL for the in-migration CSV of `data_key`.

    `data_key` is inserted verbatim in front of 'inmigall.csv', e.g. '1112'
    yields '.../irs-soi/1112inmigall.csv'.
    """
    csv_name = f'{data_key}inmigall.csv'
    return 'https://www.irs.gov/pub/irs-soi/' + csv_name

# Download one CSV per timeframe key and collect the parsed frames.
dfs = []

for key in data_keys:
    url = get_data_url(key)
    r = requests.get(url)
    # Stop on an HTTP error instead of handing an error page to read_csv.
    r.raise_for_status()

    with BytesIO(r.content) as fh:
        df = pd.read_csv(fh, **pd_options)
        # Tag every row with the file it came from so the frames can be stacked.
        df['timeframe'] = key
        dfs.append(df)

mig_df = (
    pd.concat(dfs)
    .set_index(['statefips', 'state', 'state_name', 'agi_stub', 'timeframe'])
    .sort_index()
)

# Relabel the agi_stub level positionally: the first existing level value
# becomes "Total" and the rest become income-bracket tuples (the trailing
# comments give the human-readable label used for each bracket when exporting).
# set_levels returns a new MultiIndex; its `inplace` flag was removed in
# pandas 2.0, so the result is assigned back instead.
mig_df.index = mig_df.index.set_levels([
    "Total",
    (1, 1e4),       # "$1 under $10,000"
    (1e4, 2.5e4),   # "$10,000 under $25,000"
    (2.5e4, 5e4),   # "$25,000 under $50,000"
    (5e4, 7.5e4),   # "$50,000 under $75,000"
    (7.5e4, 1e5),   # "$75,000 under $100,000"
    (1e5, 2e5),     # "$100,000 under $200,000"
    (2e5,),         # "$200,000 or more"
], level='agi_stub')

display(mig_df.head(n=10))

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,total_n1_0,total_n2_0,total_y1_agi_0,total_y2_agi_0,total_n1_1,total_n2_1,total_y1_agi_1,total_y2_agi_1,total_n1_2,total_n2_2,...,samest_y1_agi_4,samest_y2_agi_4,samest_n1_5,samest_n2_5,samest_y1_agi_5,samest_y2_agi_5,samest_n1_6,samest_n2_6,samest_y1_agi_6,samest_y2_agi_6
statefips,state,state_name,agi_stub,timeframe,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
1,AL,ALABAMA,Total,1112,1679103,3807951,90691917,93398378,172506,364898,3140327,3636450,303586,718498,...,431216,432999,4771,8596,315632,292890,3503,5225,193900,189261
1,AL,ALABAMA,Total,1213,1673858,3788019,92432250,98657104,172504,363798,3236937,3789283,299910,708172,...,411898,472254,4328,7779,280558,283677,3440,5100,186519,190366
1,AL,ALABAMA,Total,1314,1663655,3748068,97120991,97625193,171347,353586,3327271,3864916,297613,699666,...,411939,412056,4112,7340,297847,282473,3178,4662,194808,178900
1,AL,ALABAMA,Total,1415,1660788,3712732,95926666,101230618,172947,349905,3417966,4034640,296254,691544,...,319300,345282,3174,5609,221760,240787,2586,3843,162699,155858
1,AL,ALABAMA,Total,1516,1668163,3703054,100346717,105018157,173605,343526,3538976,4178813,297405,688471,...,406614,447939,4303,7555,322722,338471,3577,5343,237844,224165
1,AL,ALABAMA,"(1, 10000.0)",1112,160174,266204,1819941,1017090,34539,54396,325485,231739,32246,58044,...,13120,6626,520,747,9102,3229,609,791,8575,3516
1,AL,ALABAMA,"(1, 10000.0)",1213,151289,249878,1707255,957335,31838,49595,307675,214368,29902,53793,...,10102,4896,429,625,6083,2581,526,677,7241,2817
1,AL,ALABAMA,"(1, 10000.0)",1314,143816,237017,1677861,907684,30561,47026,291201,204912,28125,50348,...,8842,3950,371,524,5258,2119,491,632,8399,2787
1,AL,ALABAMA,"(1, 10000.0)",1415,137619,225593,1608564,867298,29319,44733,278283,196529,26786,47716,...,7000,3202,304,429,4749,1812,352,457,5846,1910
1,AL,ALABAMA,"(1, 10000.0)",1516,133267,215805,1580537,834010,27743,42043,262349,184653,25536,44791,...,7778,3541,379,536,5697,2144,491,635,9408,2834


In [3]:
# single line chart output: Illinois net flow per income bracket, one column per bracket

targets = ['outflow_n1_0', 'inflow_n1_0']

# .assign builds net_flow on a fresh frame; assigning a column directly onto
# the query()[...] slice of mig_df can trigger SettingWithCopyWarning.
df = (
    mig_df
    .query('state == "IL" and agi_stub != "Total"')[targets]
    .assign(net_flow=lambda flows: flows.inflow_n1_0 - flows.outflow_n1_0)
)

# One row per timeframe, one column per agi_stub value.
df = df.reset_index().pivot(index='timeframe', columns='agi_stub', values='net_flow')

# '1112' -> '1/1/2012': the characters after the first two become the year.
df = df.rename(index=lambda x: f'1/1/20{x[2:]}')

df.index.names = ['date']

# Positional relabel: relies on the pivoted columns coming out in ascending
# bracket order, and raises if the number of columns is not exactly seven.
df.columns = [
    "$1 under $10,000",
    "$10,000 under $25,000",
    "$25,000 under $50,000",
    "$50,000 under $75,000",
    "$75,000 under $100,000",
    "$100,000 under $200,000",
    "$200,000 or more",
]

In [4]:
# small mults output: Illinois net flow in long form, indexed by (group, date)

targets = ['outflow_n1_0', 'inflow_n1_0']

# Built as one chain so no intermediate slice of mig_df is mutated
# (column assignment on the query()[...] result can trigger
# SettingWithCopyWarning).
df = (
    mig_df
    .query('state == "IL" and agi_stub != "Total"')[targets]
    .assign(net_flow=lambda flows: flows.inflow_n1_0 - flows.outflow_n1_0)
    .reset_index()
    .rename(columns={'timeframe': 'date', 'agi_stub': 'group'})
    [['group', 'date', 'net_flow']]
    .set_index(['group', 'date'])
)

# set_levels returns a new MultiIndex; its `inplace` flag was removed in
# pandas 2.0, so the result is assigned back. Labels are applied positionally
# to the existing level values.
df.index = df.index.set_levels([
    "$1 under $10,000",
    "$10,000 under $25,000",
    "$25,000 under $50,000",
    "$50,000 under $75,000",
    "$75,000 under $100,000",
    "$100,000 under $200,000",
    "$200,000 or more",
], level='group')

df.index = df.index.set_levels([
    '1/1/2012',
    '1/1/2013',
    '1/1/2014',
    '1/1/2015',
    '1/1/2016',
], level='date')

# NOTE(review): the by-state line-chart cell also writes 'sample.csv';
# whichever cell runs last determines the file's contents.
df.to_csv('sample.csv')

In [5]:
# single line chart output by state: total net flow, one column per state

targets = ['outflow_n1_0', 'inflow_n1_0']

# .assign builds net_flow on a fresh frame; assigning a column directly onto
# the query()[...] slice of mig_df can trigger SettingWithCopyWarning.
df = (
    mig_df
    .query('agi_stub == "Total"')[targets]
    .assign(net_flow=lambda flows: flows.inflow_n1_0 - flows.outflow_n1_0)
)

# One row per timeframe, one column per state.
df = df.reset_index().pivot(index='timeframe', columns='state', values='net_flow')

# '1112' -> '1/1/2012': the characters after the first two become the year.
df = df.rename(index=lambda x: f'1/1/20{x[2:]}')

df.index.names = ['date']

# NOTE(review): the Illinois small-multiples cell also writes 'sample.csv';
# whichever cell runs last determines the file's contents.
df.to_csv('sample.csv')

In [14]:
# small mults output by income, many states: net flow with one column per
# state and rows indexed by (group, date)

targets = ['outflow_n1_0', 'inflow_n1_0']

# Built as one chain so no intermediate slice of mig_df is mutated
# (column assignment on the query()[...] result can trigger
# SettingWithCopyWarning).
df = (
    mig_df
    .query('agi_stub != "Total"')[targets]
    .assign(net_flow=lambda flows: flows.inflow_n1_0 - flows.outflow_n1_0)
    .reset_index()
    .rename(columns={'timeframe': 'date', 'agi_stub': 'group'})
)

# pivot_table aggregates with its default aggfunc for each
# (group, date, state) cell.
pivot = pd.pivot_table(df, values='net_flow', columns='state', index=['group', 'date'])

# set_levels returns a new MultiIndex; its `inplace` flag was removed in
# pandas 2.0, so the result is assigned back. Labels are applied positionally
# to the existing level values.
pivot.index = pivot.index.set_levels([
    "$1 under $10,000",
    "$10,000 under $25,000",
    "$25,000 under $50,000",
    "$50,000 under $75,000",
    "$75,000 under $100,000",
    "$100,000 under $200,000",
    "$200,000 or more",
], level='group')

pivot.index = pivot.index.set_levels([
    '1/1/2012',
    '1/1/2013',
    '1/1/2014',
    '1/1/2015',
    '1/1/2016',
], level='date')

# pivot.to_csv('sample.csv')

In [16]:
# inflow / outflow by state, income group and date (long form), then show IL

targets = ['outflow_n1_0', 'inflow_n1_0']

# Built as one chain so no intermediate slice of mig_df is mutated
# (column assignment on the query()[...] result can trigger
# SettingWithCopyWarning). net_flow stays on df as before, although the pivot
# below only uses the two target columns.
df = (
    mig_df
    .query('agi_stub != "Total"')[targets]
    .assign(net_flow=lambda flows: flows.inflow_n1_0 - flows.outflow_n1_0)
    .reset_index()
    .rename(columns={'timeframe': 'date', 'agi_stub': 'group'})
)

# pivot_table aggregates with its default aggfunc for each
# (state, group, date) row.
pivot = pd.pivot_table(df, values=targets, index=['state', 'group', 'date'])

# set_levels returns a new MultiIndex; its `inplace` flag was removed in
# pandas 2.0, so the result is assigned back. Labels are applied positionally
# to the existing level values.
pivot.index = pivot.index.set_levels([
    "$1 under $10,000",
    "$10,000 under $25,000",
    "$25,000 under $50,000",
    "$50,000 under $75,000",
    "$75,000 under $100,000",
    "$100,000 under $200,000",
    "$200,000 or more",
], level='group')

pivot.index = pivot.index.set_levels([
    '1/1/2012',
    '1/1/2013',
    '1/1/2014',
    '1/1/2015',
    '1/1/2016',
], level='date')

# pivot.to_csv('sample.csv')
# pivot['IL']
pivot.query('state == "IL"')

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,inflow_n1_0,outflow_n1_0
state,group,date,Unnamed: 3_level_1,Unnamed: 4_level_1
IL,"$1 under $10,000",1/1/2012,16358,17985
IL,"$1 under $10,000",1/1/2013,14743,17328
IL,"$1 under $10,000",1/1/2014,12617,16250
IL,"$1 under $10,000",1/1/2015,10641,12971
IL,"$1 under $10,000",1/1/2016,11161,14358
IL,"$10,000 under $25,000",1/1/2012,29942,35656
IL,"$10,000 under $25,000",1/1/2013,28665,35960
IL,"$10,000 under $25,000",1/1/2014,25091,34028
IL,"$10,000 under $25,000",1/1/2015,20936,27578
IL,"$10,000 under $25,000",1/1/2016,24369,33150
