In [1]:
# load data files
# https://www.irs.gov/statistics/soi-tax-stats-migration-data

import re
import requests
import requests_cache
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

requests_cache.install_cache()

result = requests.get('https://www.irs.gov/statistics/soi-tax-stats-migration-data')
assert result.status_code == 200
c = result.content

soup = BeautifulSoup(c)

my_target = lambda tag: tag.name == 'h3' and 'State-to-State Migration Data' in tag.get_text()

# get 1990 to 2011 data urls
data_links = soup.find(my_target).next_sibling.next_sibling.find_all('a')

urls = [{'url': a['href'], 'date_range': a.string} for a in data_links]

In [2]:
from requests import Session
from urllib.parse import urljoin

class LiveServerSession(Session):
    def __init__(self, prefix_url=None, *args, **kwargs):
        super(LiveServerSession, self).__init__(*args, **kwargs)
        self.prefix_url = prefix_url

    def request(self, method, url, *args, **kwargs):
        url = urljoin(self.prefix_url, url)
        return super(LiveServerSession, self).request(method, url, *args, **kwargs)
    
def get_soi_data(url):
    baseUrl = 'https://www.irs.gov'
        
    try:
        with LiveServerSession(baseUrl) as s:
            r = s.get(url)
        assert r.status_code == 200
    except:
        raise
        
    return r

In [3]:
def get_soi_details(url):
    '''
    get cumulative data as df given lookup and year
    '''
    from io import BytesIO
    from zipfile import ZipFile
    import itertools
    
    r = get_soi_data(url)
    
    pd_options = {
        'header': None
    }
    
    if '.zip' in r.url:
        with ZipFile(BytesIO(r.content)) as my_zipfile:
            my_details = {}
            
            # filter down to Illinois data
            zip_list = list(filter(lambda x: re.match('(.*/)?il|.*il\wr|.*mig(in|out)il', x, re.IGNORECASE), my_zipfile.namelist()))
            
            for file in zip_list:
                chunk = file.split('/')[-1]
                
                if re.search('ou?(t|r)', chunk, flags=re.IGNORECASE):
                    my_details['file_out'] = file
                else:
                    my_details['file_in'] = file
                    
            return my_details

for entry in urls:
    entry.update(get_soi_details(entry['url']))

In [4]:
def save_soi_file(url, file, date_range, direction):
    from io import BytesIO
    from zipfile import ZipFile
    import itertools
    
    r = get_soi_data(url)
    
    pd_options = {
        'header': None
    }
    
    with ZipFile(BytesIO(r.content)) as zf:        
        target_path = 'original/il_{}_{}_{}.{}'.format(
            direction,
            date_range.split(' to ')[0],
            date_range.split(' to ')[1],
            file.split('.')[-1]
        )
        try:
            with open(target_path, 'wb') as f:
                f.write(zf.read(file))
        except:
            print(f'Skip writing {target_path} because file already exists!')
        

entry = urls[0]

for direction in ('in', 'out'):
    for entry in urls:
        save_soi_file(entry['url'], entry[f'file_{direction}'], entry['date_range'], direction)

Skip writing original/il_in_1990_1991.xls because file already exists!
Skip writing original/il_in_1991_1992.xls because file already exists!
Skip writing original/il_in_1992_1993.xls because file already exists!
Skip writing original/il_in_1993_1994.xls because file already exists!
Skip writing original/il_in_1994_1995.xls because file already exists!
Skip writing original/il_in_1995_1996.xls because file already exists!
Skip writing original/il_in_1996_1997.xls because file already exists!
Skip writing original/il_in_1997_1998.xls because file already exists!
Skip writing original/il_in_1998_1999.xls because file already exists!
Skip writing original/il_in_1999_2000.xls because file already exists!
Skip writing original/il_in_2000_2001.xls because file already exists!
Skip writing original/il_in_2001_2002.xls because file already exists!
Skip writing original/il_in_2002_2003.xls because file already exists!
Skip writing original/il_in_2003_2004.xls because file already exists!
Skip w

In [5]:
def transform_df(df):
    
    df = df[df.columns[-6:]]
    df.columns = [
        "Origin from",
        "State",
        "State Name",
        "Number of returns",
        "Number of exemptions",
        "Aggregate adjusted gross income (AGI)",
    ]

    def test_returns(x):
        try:
            return int(x)
        except:
            return -1

    df = df[df['Number of returns'].apply(test_returns) > 0]
    
    return df

dfs = []

for direction in ('in', 'out'):
    for x in range(2010, 1989, -1):
        filename = f'il_{direction}_{x}_{x+1}.xls'
        df = pd.read_excel(f'working/{filename}')
        df = transform_df(df)

        state, direction, date_from, date_to = filename.split('.')[0].split('_')
        df['direction'] = direction
        df['date_from'] = date_from
        df['date_to'] = date_to

        dfs.append(df)
    
big_df = pd.concat(dfs)

In [6]:
big_df['Origin from'] = big_df['Origin from'].apply(lambda x: str(x).zfill(2))
big_df['State'] = big_df['State'].str.upper()
big_df['State Name'] = big_df['State Name'].str.title().str.replace(' Of ', ' of ')

num_cols = ['Number of returns', 'Number of exemptions', 'date_to', 'date_from']
big_df[num_cols] = big_df[num_cols].astype(int)

my_df = big_df.set_index(['direction', 'Origin from', 'date_to']).sort_index()

# for key in my_df.index.get_level_values('Origin from').unique():
#     this_df = my_df.loc[my_df.index.get_level_values('Origin from') == key]
#     display(this_df)

In [7]:
# this_df = my_df.query('date_to == 1991 and direction == "in"')

# this_df.loc[~this_df.index.get_level_values('Origin from').isin(['17', '96'])].iloc[:, 2:5].sum().values

# this_df.loc[this_df.index.get_level_values('Origin from') == '96'].iloc[:, 2:5].values

In [11]:
this_df = my_df.copy()
this_df = this_df.loc[this_df.index.get_level_values('Origin from') == '96']

pivot = this_df.reset_index().pivot(index='date_to', columns='direction', values='Number of returns')

In [18]:
pivot

direction,in,out
date_to,Unnamed: 1_level_1,Unnamed: 2_level_1
1991,97133,114303
1992,97177,113789
1993,96732,114691
1994,93362,115103
1995,98259,118129
1996,97167,116823
1997,97206,122718
1998,98765,124806
1999,102659,125262
2000,102905,128306


In [9]:
import gspread
from oauth2client.service_account import ServiceAccountCredentials

scope = ['https://spreadsheets.google.com/feeds',
         'https://www.googleapis.com/auth/drive']

credentials = ServiceAccountCredentials.from_json_keyfile_name('/Users/pjudge/.credentials/BGA Graphics-3edf4552f3a5.json', scope)

gc = gspread.authorize(credentials)

In [17]:
from gspread_dataframe import get_as_dataframe, set_with_dataframe

# worksheet = gc.open_by_key('1bUClFkz2bTx9moNSy6fiXmpVQZPeXGYQPb598iix8TA').worksheet('data')
worksheet = gc.open_by_key('1Y1jrpI2hqB1wK3taXPC3TCuyhWmZmWkFq1gEqh3e2MI').worksheet('data')

def blank_out_worksheet(worksheet):
    """
    totally blank out worksheet
    """
    from gspread_dataframe import get_as_dataframe, set_with_dataframe
    
    zeroed_df = get_as_dataframe(worksheet)
    
    # set vals to null
    zeroed_df[:] = np.nan
    
    # set cols to null
    zeroed_df.rename(columns=lambda x: np.nan, inplace=True)
    
    # set worksheet to blank dataframe
    set_with_dataframe(worksheet, zeroed_df)
    
df_out = pivot.copy()
df_out.out = df_out.out.apply(lambda x: -x)
df_out = df_out.astype(int).reset_index()

df_out.columns = ['label', 'inflow', 'outflow']

blank_out_worksheet(worksheet)
set_with_dataframe(worksheet, df_out)
