In [1]:
# load data files
# https://www.irs.gov/statistics/soi-tax-stats-migration-data

import os
import re

import requests
import requests_cache
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

# Cache HTTP responses so that re-running the notebook reuses earlier downloads.
requests_cache.install_cache()

result = requests.get('https://www.irs.gov/statistics/soi-tax-stats-migration-data')
# Raise requests.HTTPError (message includes URL and status) on a 4xx/5xx
# response. Unlike `assert`, this check is not stripped under `python -O`.
result.raise_for_status()
c = result.content

# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed (and warns), so the tree can differ between machines.
soup = BeautifulSoup(c, 'html.parser')


def my_target(tag):
    """Return True for the <h3> heading that introduces the state-to-state data."""
    return tag.name == 'h3' and 'State-to-State Migration Data' in tag.get_text()


heading = soup.find(my_target)
if heading is None:
    # Fail with a readable message instead of an AttributeError on None.
    raise ValueError("Could not find the 'State-to-State Migration Data' heading on the IRS page")

# get 1990 to 2011 data urls
# The links are taken from the second sibling node after the heading
# (NOTE(review): this depends on the page layout - confirm if the page changes).
data_links = heading.next_sibling.next_sibling.find_all('a')

urls = [{'url': a['href'], 'date_range': a.string} for a in data_links]

In [2]:
def transform_df(df):
    """
    Reduce a raw migration table to six named columns and its countable rows.

    The last six columns of `df` are kept and renamed, by position, to the
    headings used throughout this notebook. Rows are then kept only when the
    "Number of returns" cell converts with ``int()`` to a value greater than
    zero; cells that cannot be converted (text, missing values, infinities)
    and zero or negative counts are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table whose last six columns hold, in order, the code, state
        abbreviation, state name, returns, exemptions and AGI fields.

    Returns
    -------
    pandas.DataFrame
        A new frame, independent of `df`, so callers can add columns to it
        without touching (or getting chained-assignment warnings about) the
        input.

    Raises
    ------
    ValueError
        If `df` has fewer than six columns (the rename cannot be applied).
    """
    # Positional slice plus copy: works on column position only and leaves
    # the caller's frame untouched when the headings are replaced below.
    df = df.iloc[:, -6:].copy()
    df.columns = [
        "Origin from",
        "State",
        "State Name",
        "Number of returns",
        "Number of exemptions",
        "Aggregate adjusted gross income (AGI)",
    ]

    def test_returns(x):
        """Return ``int(x)``, or -1 when `x` cannot be converted to an integer."""
        try:
            return int(x)
        except (TypeError, ValueError, OverflowError):
            # Only conversion failures are swallowed; a bare `except` would
            # also hide KeyboardInterrupt and genuine programming errors.
            return -1

    is_data_row = df['Number of returns'].apply(test_returns) > 0

    return df[is_data_row].copy()

# Collect one cleaned frame per (direction, year pair) from the local `il_*.xls`
# files, newest year first, inflow before outflow. Each frame is tagged with
# its direction and year span so the pieces can be concatenated later.
dfs = []

for direction in ('in', 'out'):
    for year in range(2010, 1989, -1):
        filename = f'il_{direction}_{year}_{year+1}.xls'
        df = transform_df(pd.read_excel(f'working/{filename}'))

        # Take the tags straight from the loop variables rather than parsing
        # them back out of the filename (which rebound `direction` mid-loop).
        # assign() returns a new frame, so nothing is written onto a slice.
        # The years are kept as strings, exactly as the filename parse produced.
        df = df.assign(
            direction=direction,
            date_from=str(year),
            date_to=str(year + 1),
        )

        dfs.append(df)
        
# Now the 2011-12 through 2015-16 year pairs (date_to 2012 thru 2016), downloaded as CSV.

from io import BytesIO

# Two-digit year pairs that appear in the IRS CSV file names.
data_keys = ['1112', '1213', '1314', '1415', '1516']

# Extra keyword arguments forwarded to pd.read_csv (none needed at present).
pd_options = {}

outflow_files = [('out', f'https://www.irs.gov/pub/irs-soi/stateoutflow{key}.csv') for key in data_keys]
inflow_files = [('in', f'https://www.irs.gov/pub/irs-soi/stateinflow{key}.csv') for key in data_keys]

# Row filter per direction: outflow rows are selected on the y1 FIPS column,
# inflow rows on the y2 FIPS column.
# NOTE(review): FIPS 17 is presumably the same state as the `il_*.xls` files - confirm.
state_filters = {
    'out': 'y1_statefips == 17',
    'in': 'y2_statefips == 17',
}

for direction, url in (inflow_files + outflow_files):
    # The four digits after "flow" in the URL are the two-digit start and end years.
    year_key = url.split('flow')[1][:4]
    date_from = '20' + year_key[:2]
    date_to = '20' + year_key[2:]

    r = requests.get(url)
    # Stop on an HTTP error instead of handing an error page to the CSV parser.
    r.raise_for_status()

    with BytesIO(r.content) as fh:
        df = pd.read_csv(fh, **pd_options)

    df = transform_df(df.query(state_filters[direction]))

    # assign() returns a new frame, so the tags are never written onto a slice.
    df = df.assign(direction=direction, date_from=date_from, date_to=date_to)

    dfs.append(df)

big_df = pd.concat(dfs)

In [3]:
# Normalise the label columns: codes as text left-padded with zeros to two
# characters, abbreviations in upper case, names in title case with a
# lower-case "of".
big_df['Origin from'] = big_df['Origin from'].map(lambda code: str(code).zfill(2))
big_df['State'] = big_df['State'].str.upper()
title_cased_names = big_df['State Name'].str.title()
big_df['State Name'] = title_cased_names.str.replace(' Of ', ' of ')

# Counts and year bounds are cast to integers; the AGI column is left as loaded.
num_cols = ['Number of returns', 'Number of exemptions', 'date_to', 'date_from']
big_df[num_cols] = big_df[num_cols].astype(int)

# Index by direction, code and end year, then sort for label-based lookups.
index_levels = ['direction', 'Origin from', 'date_to']
my_df = big_df.set_index(index_levels).sort_index()

In [4]:
# Preview the first rows only instead of rendering the entire frame.
my_df.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,State,State Name,Number of returns,Number of exemptions,Aggregate adjusted gross income (AGI),date_from
direction,Origin from,date_to,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
in,01,1991,AL,Alabama,780,1528,,1990
in,01,1992,AL,Alabama,793,1671,,1991
in,01,1993,AL,Alabama,716,1472,18056,1992
in,01,1994,AL,Alabama,755,1597,19786,1993
in,01,1995,AL,Alabama,742,1508,20025,1994
in,01,1996,AL,Alabama,822,1721,23063,1995
in,01,1997,AL,Alabama,836,1744,28264,1996
in,01,1998,AL,Alabama,835,1723,28773,1997
in,01,1999,AL,Alabama,841,1758,32799,1998
in,01,2000,AL,Alabama,803,1634,29510,1999


In [5]:
# Keep only the rows whose 'Origin from' code is '96'
# (NOTE(review): presumably the total-migration summary rows - confirm).
has_code_96 = my_df.index.get_level_values('Origin from') == '96'
this_df = my_df.loc[has_code_96].copy()

# One row per end year, one column per direction, valued by number of returns.
pivot = (
    this_df
    .reset_index()
    .pivot(index='date_to', columns='direction', values='Number of returns')
)
display(pivot)

direction,in,out
date_to,Unnamed: 1_level_1,Unnamed: 2_level_1
1991,97133,114303
1992,97177,113789
1993,96732,114691
1994,93362,115103
1995,98259,118129
1996,97167,116823
1997,97206,122718
1998,98765,124806
1999,102659,125262
2000,102905,128306


In [8]:
import gspread
from oauth2client.service_account import ServiceAccountCredentials
from gspread_dataframe import get_as_dataframe, set_with_dataframe

# OAuth scopes requested for the service account.
scope = ['https://spreadsheets.google.com/feeds',
         'https://www.googleapis.com/auth/drive']

# Path to the service-account key file. Set GSHEETS_KEYFILE to run this on
# another machine; the original author's local path is kept as the fallback so
# existing setups keep working unchanged.
keyfile_path = os.environ.get(
    'GSHEETS_KEYFILE',
    '/Users/pjudge/.credentials/BGA Graphics-3edf4552f3a5.json',
)

credentials = ServiceAccountCredentials.from_json_keyfile_name(keyfile_path, scope)

gc = gspread.authorize(credentials)

# Target spreadsheet (by key) and the tab that receives the data.
worksheet = gc.open_by_key('1a1Udx7Bt222Bg5dFpeol84-wwlT0iXYwq17KvelGy88').worksheet('data')

def blank_out_worksheet(worksheet):
    """
    Totally blank out a worksheet.

    Parameters
    ----------
    worksheet : gspread worksheet
        The sheet whose cell values should all be removed.

    Returns
    -------
    None
    """
    # gspread's own clear() empties every cell in one request. The earlier
    # approach downloaded the whole sheet into a DataFrame, overwrote values
    # and headers with NaN, and uploaded it again to reach the same end state.
    worksheet.clear()
    
# Build the upload table: one row per end year, with the 'out' counts negated
# and the columns relabelled for the sheet.
# Columns are selected and renamed by name, so the result does not depend on
# the order in which the pivot happened to emit 'in' and 'out'.
df_out = (
    pivot[['in', 'out']]
    .assign(out=lambda frame: -frame['out'])
    .astype(int)
    .reset_index()
    .rename(columns={'date_to': 'label', 'in': 'inflow', 'out': 'outflow'})
    # Drop the leftover 'direction' name on the column axis.
    .rename_axis(None, axis='columns')
)

# Clear old content first so that stale cells outside the new table's extent
# do not survive the upload.
blank_out_worksheet(worksheet)
set_with_dataframe(worksheet, df_out)