In [1]:
# load dependencies and prepare cache

import re
from io import BytesIO

import numpy as np
import pandas as pd
import requests
import requests_cache
from bs4 import BeautifulSoup

# Install requests_cache using its defaults (no arguments are passed here).
# NOTE(review): presumably this lets the requests.get() calls below reuse
# cached responses when the notebook is re-run -- confirm against the
# requests_cache documentation.
requests_cache.install_cache()

## 'total' data (1991-2016)

In [2]:
# load data files
# https://www.irs.gov/statistics/soi-tax-stats-migration-data

result = requests.get('https://www.irs.gov/statistics/soi-tax-stats-migration-data')
assert result.status_code == 200
c = result.content

# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed, so the resulting tree can differ between machines.
soup = BeautifulSoup(c, 'html.parser')


def my_target(tag):
    """Return True for the <h3> whose text contains 'State-to-State Migration Data'."""
    return tag.name == 'h3' and 'State-to-State Migration Data' in tag.get_text()


heading = soup.find(my_target)
if heading is None:
    # The page layout changed; fail with a message instead of an
    # AttributeError on None further down.
    raise RuntimeError("could not find the 'State-to-State Migration Data' heading")

# get 1990 to 2011 data urls
# The element holding the links is two siblings after the heading.
# NOTE(review): presumably the first sibling is a whitespace text node -- confirm.
data_links = heading.next_sibling.next_sibling.find_all('a')

urls = [{'url': a['href'], 'date_range': a.string} for a in data_links]

In [3]:
def transform_df(df):
    """
    Reduce a raw migration table to its last six columns, give them standard
    names, and keep only rows whose 'Number of returns' is a positive integer.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table with at least six columns; only the last six are used.

    Returns
    -------
    pandas.DataFrame
        An independent copy (safe to add columns to) with the six renamed
        columns and the original row index of the rows that were kept.

    Raises
    ------
    ValueError
        If `df` has fewer than six columns.
    """
    column_names = [
        "Origin from",
        "State",
        "State Name",
        "Number of returns",
        "Number of exemptions",
        "Aggregate adjusted gross income (AGI)",
    ]

    if df.shape[1] < len(column_names):
        raise ValueError(
            f'expected at least {len(column_names)} columns, got {df.shape[1]}'
        )

    # Select by position and copy, so renaming never touches the caller's frame.
    df = df.iloc[:, -len(column_names):].copy()
    df.columns = column_names

    def test_returns(x):
        # Anything int() cannot convert (header text, blanks, NaN, None)
        # is mapped to -1 so the row is dropped by the `> 0` filter below.
        try:
            return int(x)
        except (TypeError, ValueError, OverflowError):
            return -1

    keep = df['Number of returns'].apply(test_returns) > 0

    # Copy the filtered rows: callers add columns to the result, and doing
    # that on a boolean-filtered slice is chained assignment.
    return df[keep].copy()

dfs = []

# One local spreadsheet per direction and per year pair, 2010-11 back to 1990-91.
for direction in ('in', 'out'):
    for x in range(2010, 1989, -1):
        filename = f'il_{direction}_{x}_{x+1}.xls'
        df = pd.read_excel(f'working/{filename}')

        # The direction and years are already known from the loop, so use them
        # directly instead of parsing them back out of the filename (which also
        # rebound the loop variable). assign() returns a new frame, so nothing
        # is written into a filtered slice.
        df = transform_df(df).assign(
            direction=direction,
            date_from=str(x),
            date_to=str(x + 1),
        )

        dfs.append(df)
        
# now 2012 thru 2016...

data_keys = ['1112', '1213', '1314', '1415', '1516']

pd_options = {}

outflow_files = [('out', f'https://www.irs.gov/pub/irs-soi/stateoutflow{key}.csv') for key in data_keys]
inflow_files = [('in', f'https://www.irs.gov/pub/irs-soi/stateinflow{key}.csv') for key in data_keys]

for direction, url in (inflow_files + outflow_files):
    # The four digits after 'flow' are the two 2-digit years, e.g. '1112'.
    key = url.split('flow')[1][:4]
    date_from = '20' + key[:2]
    date_to = '20' + key[2:]

    r = requests.get(url)
    # Stop on an HTTP error instead of handing an error page to read_csv.
    r.raise_for_status()

    with BytesIO(r.content) as fh:
        df = pd.read_csv(fh, **pd_options)

    # Keep only the rows where Illinois (state FIPS 17) is the origin (year 1)
    # for outflows or the destination (year 2) for inflows.
    if direction == 'out':
        df = df.query('y1_statefips == 17')
    elif direction == 'in':
        df = df.query('y2_statefips == 17')

    # assign() returns a new frame, so nothing is written into a filtered slice.
    df = transform_df(df).assign(
        direction=direction,
        date_from=date_from,
        date_to=date_to,
    )

    dfs.append(df)

big_df = pd.concat(dfs)

In [4]:
# Text columns: two-character origin code, upper-case abbreviation,
# title-case state name (with a lower-case "of").
big_df['Origin from'] = big_df['Origin from'].map(lambda code: str(code).zfill(2))
big_df['State'] = big_df['State'].str.upper()
state_names = big_df['State Name'].str.title()
big_df['State Name'] = state_names.str.replace(' Of ', ' of ')

# Counts and years become integers.
num_cols = ['Number of returns', 'Number of exemptions', 'date_to', 'date_from']
big_df[num_cols] = big_df[num_cols].astype(int)

# Index by direction, origin code and end year, sorted for lookups.
index_cols = ['direction', 'Origin from', 'date_to']
my_df = big_df.set_index(index_cols).sort_index()

In [5]:
# total migration: only the rows whose origin code is '96'
this_df = my_df.copy()
is_total_row = this_df.index.get_level_values('Origin from') == '96'
this_df = this_df.loc[is_total_row]

# one row per end year, one column per direction
pivot = (
    this_df
    .reset_index()
    .pivot(index='date_to', columns='direction', values='Number of returns')
)
pivot['group'] = 'total'

pivot.loc[:, 'in':'out'] = pivot.loc[:, 'in':'out'].astype(int)
pivot = pivot.reset_index()

# columns at this point are, in order: date_to, in, out, group
pivot.columns = ['label', 'inflow', 'outflow', 'group']
DF_TOTAL = pivot.loc[:, ['group', 'label', 'inflow', 'outflow']]

display(DF_TOTAL)

Unnamed: 0,group,label,inflow,outflow
0,total,1991,97133,114303
1,total,1992,97177,113789
2,total,1993,96732,114691
3,total,1994,93362,115103
4,total,1995,98259,118129
5,total,1996,97167,116823
6,total,1997,97206,122718
7,total,1998,98765,124806
8,total,1999,102659,125262
9,total,2000,102905,128306


## data by income bracket (2012-2016)

In [6]:
# load data files
# https://www.irs.gov/statistics/soi-tax-stats-migration-data

# consecutive two-digit year pairs: '1112' through '1516'
data_keys = [f'{start}{start + 1}' for start in range(11, 16)]

# extra keyword arguments for pd.read_csv (none at the moment)
pd_options = dict()

def get_data_url(data_key):
    """Return the URL of the `<data_key>inmigall.csv` file under irs.gov/pub/irs-soi/."""
    filename = f'{data_key}inmigall.csv'
    return f'https://www.irs.gov/pub/irs-soi/{filename}'

dfs = []

for key in data_keys:
    url = get_data_url(key)
    r = requests.get(url)
    # Stop on an HTTP error instead of handing an error page to read_csv.
    r.raise_for_status()

    with BytesIO(r.content) as fh:
        df = pd.read_csv(fh, **pd_options)

    # remember which year pair each row came from
    df['timeframe'] = key
    dfs.append(df)

mig_df = pd.concat(dfs).set_index(['statefips', 'state', 'state_name', 'agi_stub', 'timeframe']).sort_index()

# Replacement labels for the agi_stub level: "Total", then the lower bound of
# each income bracket. set_levels() maps these positionally onto the existing
# (sorted) level values.
# NOTE(review): assumes the raw level holds exactly eight codes in this order -- confirm.
agi_stub_labels = [
    "Total",
    1,      # $1 under $10,000
    1e4,    # $10,000 under $25,000
    2.5e4,  # $25,000 under $50,000
    5e4,    # $50,000 under $75,000
    7.5e4,  # $75,000 under $100,000
    1e5,    # $100,000 under $200,000
    2e5,    # $200,000 or more
]

# set_levels() returns a new MultiIndex; assign it back rather than passing
# inplace=True, which pandas 2.0 removed.
mig_df.index = mig_df.index.set_levels(agi_stub_labels, level='agi_stub')

mig_df = mig_df.query('state == "IL"')

display(mig_df.head(n=10))

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,total_n1_0,total_n2_0,total_y1_agi_0,total_y2_agi_0,total_n1_1,total_n2_1,total_y1_agi_1,total_y2_agi_1,total_n1_2,total_n2_2,...,samest_y1_agi_4,samest_y2_agi_4,samest_n1_5,samest_n2_5,samest_y1_agi_5,samest_y2_agi_5,samest_n1_6,samest_n2_6,samest_y1_agi_6,samest_y2_agi_6
statefips,state,state_name,agi_stub,timeframe,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
17,IL,ILLINOIS,Total,1112,4972588,11045520,341597483,357454967,554824,1197084,11468029,13369644,916392,1868165,...,1191428,1236325,10230,17714,767862,745352,11088,15136,678470,698436
17,IL,ILLINOIS,Total,1213,4972846,10999459,348909406,386626118,557992,1202041,11826281,13988470,913491,1845024,...,1232911,1322917,11030,18975,900452,984768,12082,16554,670238,729831
17,IL,ILLINOIS,Total,1314,4961995,10899672,377423619,379017923,558833,1184187,12421757,14538189,912473,1823770,...,1211765,1251723,10442,18052,895025,871693,11384,15733,702016,666946
17,IL,ILLINOIS,Total,1415,4984464,10839876,371457397,400267045,573896,1193414,13099928,15645616,918096,1811288,...,871945,907119,7611,12849,641866,700515,8422,11403,1552187,654723
17,IL,ILLINOIS,Total,1516,5016616,10820739,397104639,419845630,587173,1199072,14127329,16831074,922297,1797704,...,1385137,1470059,10970,18373,976427,1055519,11783,16245,796116,967088
17,IL,ILLINOIS,1,1112,439244,662184,4978362,2611230,96640,145364,866800,611775,79743,126725,...,23548,9587,1028,1358,18765,5708,2176,2563,32478,11822
17,IL,ILLINOIS,1,1213,418625,627287,3983640,2487859,90675,136119,831509,579225,75244,118569,...,21089,8954,1075,1456,15392,5992,2234,2627,29680,11916
17,IL,ILLINOIS,1,1314,399284,597869,4488644,2372933,86515,128889,793909,552121,71591,113038,...,16445,7348,978,1299,13537,5325,2026,2410,31263,10956
17,IL,ILLINOIS,1,1415,385948,578029,4388293,2291767,83332,123330,761512,531554,69095,109215,...,14259,6090,734,938,10824,3942,1440,1688,23555,7347
17,IL,ILLINOIS,1,1516,368022,543137,4298446,2170523,77884,112865,723887,492694,64609,100871,...,16874,6866,1023,1359,16018,5660,2000,2369,34017,10336


In [7]:
# columns summed for each bracket: total outflow and total inflow per index
TARGET_COLS = ['outflow_n1_0', 'inflow_n1_0']

# Bracket label -> (inclusive lower bound[, exclusive upper bound]).
# A one-element tuple means the bracket has no upper bound.
RANGES = dict([
    ('Under $50k', (0, 50_000.0)),              #    0 <= ... <  50k
    ('$50-$100k', (50_000.0, 100_000.0)),       #  50k <= ... < 100k
    ('$100-$200k', (100_000.0, 200_000.0)),     # 100k <= ... < 200k
    ('$200k and greater', (200_000.0,)),        # 200k <= ...
])

def build_query(lte, gt=None):
    """
    Build a DataFrame.query() expression selecting one AGI bracket.

    Despite the parameter names, `lte` is the inclusive LOWER bound and `gt`
    is the exclusive UPPER bound: the expression reads
    `lte <= agi_stub < gt`. Rows labelled "Total" are always excluded.

    Parameters
    ----------
    lte : number
        Inclusive lower bound for agi_stub.
    gt : number, optional
        Exclusive upper bound for agi_stub. When None (the default) the
        bracket has no upper bound.

    Returns
    -------
    str
        The query expression.
    """
    my_query = 'agi_stub != "Total" and '

    my_query += f'{lte} <= agi_stub'

    # Compare against None so an upper bound of 0 is still applied.
    if gt is not None:
        my_query += f' < {gt} '

    return my_query

dfs = []

for label, RANGE in RANGES.items():
    # Use a name of its own here: rebinding `my_df` would clobber the indexed
    # frame the 'total' section reads, breaking that cell on a re-run.
    #
    # Drop the "Total" rows first.
    # NOTE(review): presumably this has to happen before the numeric
    # comparison so it never sees the "Total" string -- confirm.
    bracket_df = mig_df[TARGET_COLS].query('agi_stub != "Total"')

    bracket_df = bracket_df.query(build_query(*RANGE))

    # sum every stub inside the bracket, per timeframe
    bracket_df = (
        bracket_df
        .reset_index()[['timeframe', *TARGET_COLS]]
        .groupby('timeframe')
        .sum()
    )

    bracket_df['group'] = label

    dfs.append(bracket_df.reset_index())

income_df = pd.concat(dfs)
display(income_df)

Unnamed: 0,timeframe,outflow_n1_0,inflow_n1_0,group
0,1112,84541,72252,Under $50k
1,1213,85465,69506,Under $50k
2,1314,81562,61409,Under $50k
3,1415,63590,49386,Under $50k
4,1516,79183,59519,Under $50k
0,1112,25494,19695,$50-$100k
1,1213,28786,21347,$50-$100k
2,1314,28694,19370,$50-$100k
3,1415,19063,14043,$50-$100k
4,1516,30853,20783,$50-$100k


In [8]:
this_df = income_df.copy()

# '1112' -> '2012': prefix '20' to the characters after the first two
this_df['timeframe'] = this_df['timeframe'].map(lambda key: '20' + str(key)[2:])

# positional rename: four columns, in their existing order
this_df.columns = ['label', 'outflow', 'inflow', 'group']
DF_INCOMES = this_df.loc[:, ['group', 'label', 'inflow', 'outflow']]

display(DF_INCOMES)

Unnamed: 0,group,label,inflow,outflow
0,Under $50k,2012,72252,84541
1,Under $50k,2013,69506,85465
2,Under $50k,2014,61409,81562
3,Under $50k,2015,49386,63590
4,Under $50k,2016,59519,79183
0,$50-$100k,2012,19695,25494
1,$50-$100k,2013,21347,28786
2,$50-$100k,2014,19370,28694
3,$50-$100k,2015,14043,19063
4,$50-$100k,2016,20783,30853


## save to google sheet

In [9]:
import os

import gspread
from oauth2client.service_account import ServiceAccountCredentials
from gspread_dataframe import get_as_dataframe, set_with_dataframe

scope = ['https://spreadsheets.google.com/feeds',
         'https://www.googleapis.com/auth/drive']

# Service-account key file. Set GSHEET_CREDENTIALS_FILE to run this on another
# machine; the previous hard-coded location remains the fallback.
credentials_file = os.environ.get(
    'GSHEET_CREDENTIALS_FILE',
    '/Users/pjudge/.credentials/BGA Graphics-3edf4552f3a5.json',
)

credentials = ServiceAccountCredentials.from_json_keyfile_name(credentials_file, scope)

gc = gspread.authorize(credentials)

# the 'data' tab of the target spreadsheet
worksheet = gc.open_by_key('1a1Udx7Bt222Bg5dFpeol84-wwlT0iXYwq17KvelGy88').worksheet('data')

def blank_out_worksheet(worksheet):
    """
    Overwrite the worksheet with an all-NaN frame of the same shape.

    Reads the worksheet into a DataFrame, replaces every value and every
    column label with NaN, and writes the result back.
    NOTE(review): presumably NaN is written out as empty cells -- confirm
    against gspread_dataframe.
    """
    from gspread_dataframe import get_as_dataframe, set_with_dataframe

    blank_df = get_as_dataframe(worksheet)

    # wipe the cell values
    blank_df[:] = np.nan

    # wipe the header row
    blank_df = blank_df.rename(columns=lambda _label: np.nan)

    # push the emptied frame back to the sheet
    set_with_dataframe(worksheet, blank_df)
    
df_out = pd.concat([DF_TOTAL, DF_INCOMES], ignore_index=True)

# outflow is stored as a negative number
df_out['outflow'] = -df_out['outflow']

# The two flow columns are about to hold both numbers and '' placeholders.
# Cast them to object first so the assignment below does not rely on silent
# upcasting of an integer column, which newer pandas deprecates.
flow_cols = ['inflow', 'outflow']
df_out[flow_cols] = df_out[flow_cols].astype(object)

# Blank the 2015 rows. `label` is an int in DF_TOTAL and a str in DF_INCOMES,
# so match both forms.
# NOTE(review): presumably the 2014-15 figures are withheld on purpose --
# confirm the reason and record it here.
is_2015 = df_out['label'].isin(['2015', 2015])
df_out.loc[is_2015, flow_cols] = ''

blank_out_worksheet(worksheet)
set_with_dataframe(worksheet, df_out)