In [1]:
# load dependencies and prepare cache

import re
from io import BytesIO

import numpy as np
import pandas as pd
import qgrid
import requests
import requests_cache
# from bs4 import BeautifulSoup

# Install requests_cache's cache for `requests`, so re-running the download cell
# should not need to fetch the workbook again (default backend/location — see the
# requests_cache docs; TODO confirm where the cache file ends up).
requests_cache.install_cache()

In [2]:
# Download the IRS SOI 2015-16 migration workbook for Illinois.
r = requests.get('https://www.irs.gov/pub/irs-soi/1516il.xls')
# Stop on an HTTP error instead of handing an error page to the Excel parser below.
r.raise_for_status()

# Parsed sheets keyed by flow direction: 'in' -> "County Inflow", 'out' -> "County Outflow".
dfs = {}

for direction in ['in', 'out']:
    pd_options = {
        'sheet_name': f'County {direction.capitalize()}flow',
        'header': None,     # the header rows are handled manually below
        'dtype': 'object'   # keep cell values as read; no dtype coercion
    }

    with BytesIO(r.content) as fh:
        df = pd.read_excel(fh, **pd_options)

    # Row 3 holds the top-level column headings. Blank cells (presumably from
    # merged heading cells in the sheet) inherit the heading to their left.
    df.iloc[3] = df.iloc[3].ffill()

    # Promote rows 3 and 4 to a two-level column header: transpose so the rows
    # become columns, index on them, and transpose back.
    df = df.transpose().set_index([3, 4]).transpose()
    # Keep only rows with a value in every column, dropping any partially
    # filled rows (e.g. leading title rows or trailing notes).
    df = df[df.notnull().all(axis=1)].reset_index(drop=True)

    # The first six columns identify origin/destination; the rest are measures.
    df = df.set_index(list(df.columns)[:6])

    # The measure columns only need their top-level heading.
    df.columns = df.columns.droplevel(1)

    # list(map(lambda x, i: i, df.index.names)
    # [i for i, x in enumerate(df.index.names)]
    def rename_index(div, subdiv):
        """Collapse a two-level heading into one snake_case index name.

        div:    top-level heading; yields 'from' when it contains "from"
                (case-insensitive), otherwise 'to'.
        subdiv: second-level heading; yields 'state' when it contains
                "state", otherwise 'county', plus a trailing 'code' when it
                contains "code".

        Returns the parts joined with underscores, e.g. 'from_state_code'.
        """
        heading = div.lower()
        label = subdiv.lower()

        parts = [
            'from' if 'from' in heading else 'to',
            'state' if 'state' in label else 'county',
        ]
        if 'code' in label:
            parts.append('code')

        return '_'.join(parts)

    # Each index level is still named by its (heading, sub-heading) pair;
    # flatten those pairs into names such as 'to_state_code'.
    df.index.names = [rename_index(*levels) for levels in df.index.names]

    # Turn the identifying index levels back into ordinary columns and keep
    # the finished frame under its flow direction.
    df = df.reset_index()
    dfs[direction] = df

In [3]:
# Prefix each sheet's measure columns with its flow direction so the inflow
# and outflow measures stay distinguishable once the frames are combined.
for direction in ('in', 'out'):
    dfs[direction].rename(columns={
        'Number of returns': f'{direction}_returns',
        'Number of exemptions': f'{direction}_exemptions',
        'Adjusted gross income (AGI)': f'{direction}_agi',
    }, inplace=True)

In [4]:
# Combine inflow and outflow on the origin/destination codes. The outer join
# keeps pairs that appear in only one of the two sheets.
df = pd.merge(
    dfs['in'],
    dfs['out'],
    on=['to_state_code', 'to_county_code', 'from_state_code', 'from_county_code'],
    how='outer',
)

In [5]:
# Preview only: show the first rows of the merged frame keyed by the four
# origin/destination codes instead of rendering the whole frame.
df.set_index(['to_state_code', 'to_county_code', 'from_state_code', 'from_county_code']).head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,3,from_state,from_county,in_returns,in_exemptions,in_agi,to_state,to_county,out_returns,out_exemptions,out_agi
to_state_code,to_county_code,from_state_code,from_county_code,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
17,0,96,0,IL,Total Migration-US and Foreign,237975,435119,15066576,,,,,
17,0,97,0,IL,Total Migration-US,236184,431390,14873842,,,,,
17,0,97,1,IL,Total Migration-Same State,141774,266101,8525584,,,,,
17,0,97,3,IL,Total Migration-Different State,94410,165289,6348258,,,,,
17,0,98,0,IL,Total Migration-Foreign,1791,3729,192734,,,,,
17,1,96,0,IL,Adams County Total Migration-US and Foreign,910,1698,39398,,,,,
17,1,97,0,IL,Adams County Total Migration-US,910,1698,39398,,,,,
17,1,97,1,IL,Adams County Total Migration-Same State,359,666,15388,,,,,
17,1,97,3,IL,Adams County Total Migration-Different State,551,1032,24010,,,,,
17,1,98,0,IL,Adams County Total Migration-Foreign,d,d,d,,,,,


In [6]:
# Rows whose origin and destination codes are identical are used here as the
# non-migrant counts for that county.
same_place = (
    df['to_state_code'].eq(df['from_state_code'])
    & df['to_county_code'].eq(df['from_county_code'])
)

# Keep the identifying columns plus the inflow measures, renamed to generic
# key names and a 'nom_' prefix.
non_migrants = (
    df.loc[
        same_place,
        ['to_state_code', 'to_county_code', 'from_county', 'in_returns', 'in_exemptions', 'in_agi']
    ]
    .rename(columns={
        'to_state_code': 'state_code',
        'to_county_code': 'county_code',
        'from_county': 'county',
        'in_returns': 'nom_returns',
        'in_exemptions': 'nom_exemptions',
        'in_agi': 'nom_agi',
    })
    .reset_index(drop=True)
)

In [7]:
# In the inflow rows shown above, origin state code 96 marks the
# "Total Migration-US and Foreign" summary line for each destination.
total_in = df.loc[df['from_state_code'].eq(96)].reset_index(drop=True)

# Presumably the outflow sheet uses 96 the same way on the destination side.
total_out = df.loc[df['to_state_code'].eq(96)].reset_index(drop=True)

In [8]:
# Inflow totals: the county being described is the destination ('to_*' codes);
# its label is carried in 'from_county'.
prepped_in = total_in[
    ['to_state_code', 'to_county_code', 'from_county', 'in_returns', 'in_exemptions', 'in_agi']
].rename(columns={
    'to_state_code': 'state_code',
    'to_county_code': 'county_code',
    'from_county': 'county',
})

# Outflow totals: the county being described is the origin ('from_*' codes);
# its label is carried in 'to_county'.
prepped_out = total_out[
    ['from_state_code', 'from_county_code', 'to_county', 'out_returns', 'out_exemptions', 'out_agi']
].rename(columns={
    'from_state_code': 'state_code',
    'from_county_code': 'county_code',
    'to_county': 'county',
})

# One row per county with both its inflow and outflow totals (inner join, so
# the codes and the label text must match on both sides).
prepped_net = pd.merge(prepped_in, prepped_out, on=['state_code', 'county_code', 'county'])


In [9]:
# Net flow per measure = inflow total minus outflow total.
for measure in ('returns', 'exemptions', 'agi'):
    prepped_net[f'net_{measure}'] = prepped_net[f'in_{measure}'] - prepped_net[f'out_{measure}']

prepped_net

3,state_code,county_code,county,in_returns,in_exemptions,in_agi,out_returns,out_exemptions,out_agi,net_returns,net_exemptions,net_agi
0,17,0,Total Migration-US and Foreign,237975,435119,15066576,281076,522956,19908754,-43101,-87837,-4842178
1,17,1,Adams County Total Migration-US and Foreign,910,1698,39398,1103,1956,51541,-193,-258,-12143
2,17,3,Alexander County Total Migration-US and Foreign,85,172,2183,183,381,5007,-98,-209,-2824
3,17,5,Bond County Total Migration-US and Foreign,327,655,12915,352,668,14841,-25,-13,-1926
4,17,7,Boone County Total Migration-US and Foreign,1373,2861,65794,1497,2836,76680,-124,25,-10886
5,17,9,Brown County Total Migration-US and Foreign,68,118,2779,91,165,3487,-23,-47,-708
6,17,11,Bureau County Total Migration-US and Foreign,712,1394,28413,745,1376,35301,-33,18,-6888
7,17,13,Calhoun County Total Migration-US and Foreign,51,101,1902,50,97,2097,1,4,-195
8,17,15,Carroll County Total Migration-US and Foreign,270,509,11229,319,532,12635,-49,-23,-1406
9,17,17,Cass County Total Migration-US and Foreign,229,449,8362,378,745,13575,-149,-296,-5213


In [10]:
# Spot check for county code 1 (Adams County in the table above): ratio of
# non-migrant returns to net migrating returns.
nom_returns = non_migrants.loc[non_migrants['county_code'] == 1, 'nom_returns'].iloc[0]
net_returns = prepped_net.loc[prepped_net['county_code'] == 1, 'net_returns'].iloc[0]

nom_returns / net_returns

-134.86528497409327

In [11]:
# Attach the non-migrant measures to each county. The left join keeps every
# county row even when no non-migrant row matches. The non-migrant label
# column is dropped so it does not clash with the existing 'county' column.
prepped_net = pd.merge(
    prepped_net,
    non_migrants.drop(columns='county'),
    how='left',
    on=['state_code', 'county_code'],
)

In [12]:
# Strip the summary-row suffix so only the county name remains. The match is
# literal (regex=False), and the vectorised string method leaves missing
# labels as missing instead of raising.
prepped_net['county'] = prepped_net['county'].str.replace(
    " Total Migration-US and Foreign", "", regex=False
)

# Net migrating returns as a percentage of the county's non-migrant returns.
prepped_net['weighted_net_returns'] = (prepped_net['net_returns'] / prepped_net['nom_returns'] * 100)

In [13]:
# Show the net-migration table in a qgrid widget for interactive browsing.
qgrid_widget = qgrid.show_grid(prepped_net)
qgrid_widget

QgridWidget(grid_options={'fullWidthRows': True, 'syncColumnCellResize': True, 'forceFitColumns': True, 'defau…

In [14]:
# Rank counties by net returns relative to non-migrant returns, with the
# largest relative losses first.
prepped_net.sort_values(by='weighted_net_returns', ascending=True)

3,state_code,county_code,county,in_returns,in_exemptions,in_agi,out_returns,out_exemptions,out_agi,net_returns,net_exemptions,net_agi,nom_returns,nom_exemptions,nom_agi,weighted_net_returns
2,17,3,Alexander County,85,172,2183,183,381,5007,-98,-209,-2824,2024,4390,83515,-4.8419
39,17,77,Jackson County,1468,2499,44523,2064,3328,71228,-596,-829,-26705,17540,35293,1003589,-3.39795
9,17,17,Cass County,229,449,8362,378,745,13575,-149,-296,-5213,4996,11155,260440,-2.98239
55,17,109,McDonough County,620,1069,20680,818,1370,30219,-198,-301,-9539,9715,19595,551449,-2.03809
33,17,65,Hamilton County,134,297,5860,192,383,8012,-58,-86,-2152,2902,6240,160693,-1.99862
77,17,153,Pulaski County,99,214,3262,131,261,3879,-32,-47,-617,1899,4038,86454,-1.6851
10,17,19,Champaign County,4266,7222,219947,5386,9388,310869,-1120,-2166,-90922,66528,137239,4834261,-1.6835
93,17,185,Wabash County,160,331,5675,230,415,9036,-70,-84,-3361,4298,9298,272568,-1.62866
85,17,169,Schuyler County,110,222,4344,149,301,5217,-39,-79,-873,2583,5311,143888,-1.50987
70,17,139,Moultrie County,289,570,10393,368,684,15004,-79,-114,-4611,5263,11711,339529,-1.50105
