In [1]:
# compile structure for irs soi data
# https://www.irs.gov/statistics/soi-tax-stats-historic-table-2

import re
import requests
import requests_cache
from bs4 import BeautifulSoup
import pandas as pd

requests_cache.install_cache()

result = requests.get('https://www.irs.gov/statistics/soi-tax-stats-historic-table-2')
assert result.status_code == 200
c = result.content

soup = BeautifulSoup(c)
state_links = soup.find('table').find_all('a')

state_refs = []

for a in state_links:
    state_dict = {}
    
    state_dict['name'] = a.string.strip()
    
    groups = re.search('\d\din(?P<id>\d\d)(?P<abbrev>\w\w)', a.attrs['href'])
    state_dict.update(groups.groupdict())
    
    state_refs.append(state_dict)
    
df_state_refs = pd.DataFrame(state_refs).sort_values('id').reset_index(drop=True)

display(df_state_refs)

Unnamed: 0,abbrev,id,name
0,al,1,Alabama
1,ak,2,Alaska
2,az,3,Arizona
3,ar,4,Arkansas
4,ca,5,California
5,co,6,Colorado
6,ct,7,Connecticut
7,de,8,Delaware
8,dc,9,District of Columbia
9,fl,10,Florida


In [2]:
def lookup_state(key, val):
    df = df_state_refs
    condition = df[key] == val
    state_info = df[condition].to_dict(orient='records')[0]
    return state_info

def get_soi_data(lookup, year):
    state_info = lookup_state(*lookup)
    filename = "{}in{}{}".format(
        str(year)[2:],
        state_info['id'],
        state_info['abbrev']
    )
    url = 'https://www.irs.gov/pub/irs-soi/{}.xls'.format(filename)
        
    try:
        r = requests.get(url)
        assert r.status_code == 200
    except AssertionError:
        r = requests.get(url + 'x')
        assert r.status_code == 200
    except:
        raise
        
    return r

def get_soi_df(lookup, year):
    '''
    get state data as df given lookup and year
    '''
    import io
    import itertools
    
    r = get_soi_data(lookup, year)
    
    with io.BytesIO(r.content) as fh:
        df = pd.read_excel(fh)
        
    header_idx = [idx for idx, row in df.astype(str).iterrows() if row.str.contains('100,?000').any()][0]
    headers = df.iloc[header_idx]
    df.rename(columns = headers, inplace=True)
        
    df.columns = ['Item', 'All returns', *df.columns.values[2:]]
    
    
    def format_column(x):
        if isinstance(x, str):
            pass
        else:
            x = '${:,}'.format(x)
            
        return x.split()[0]
    
    df.rename(columns = format_column, inplace=True)
    
    # drop rows with null values (footnotes)
    df = df[~df.isnull().any(axis=1)]
    
    df['year'] = year
        
    return df

In [3]:
def val_between(val, lte, gt=float("inf")):
    if val == 'Under' and lte == 0:
        return True
    
    try:
        val = val.replace(',', '').replace('$', '')
        val = float(val)
        return lte <= val < gt
    except ValueError:
        return False
    except:
        raise

def get_filtered_soi_data(lookup, year, items, money_range=None):
    df = get_soi_df(lookup, year)
    
    # get row where first column matches the item
    row = df.loc[df.index[df.iloc[:, 0].isin(items)].values]
    
    if (money_range):
        target_cols = list(filter(lambda x: val_between(x, *money_range), row.columns.values))
        row['sum'] = row[target_cols].sum(axis=1)
        
        target_cols = ['Item', 'year', *target_cols, 'sum']
        my_row = row[target_cols]
    else:
        my_row = row
    
    return my_row
    

def keyfunction(x):
    v = x.replace(',', '').replace('$', '')
    
    try:
        return float(v)
    except:
        if v == 'Under':
            return -1
        if v == 'All':
            return -2
        else:
            return -3

RANGES = {
    '->50k': (0, 5e4),         #    0 <= ... <  50k
#     '50k->75k': (5e4, 7.5e4),  #  50k <= ... <  75k
#     '75k->100k': (7.5e4, 1e5), #  75k <= ... < 100k
    '50k->100k': (5e4, 1e5),   #  50k <= ... < 100k
    '100k->200k': (1e5, 2e5),  # 100k <= ... < 200k
    '200k->': (2e5,)           # 200k <= ...
}

def get_soi_computed(place, items):
    df_out = pd.DataFrame()

    for range_name, RANGE in RANGES.items():
        rows = []

        for year in range(2000, 2017):
            if year == 2007:
                pass
            else:
                df_row = get_filtered_soi_data(('abbrev', place), year, items, RANGE)
                if len(df_row) == 1:
                    rows.append(df_row)

        df = pd.concat(rows, ignore_index=True)
        df_sorted = df[sorted(list(df.columns.values), key=keyfunction)]
        df_renamed = df_sorted[['Item', 'year', 'sum']].rename(columns={'sum': 'sum_{}'.format(range_name)}) 

        if len(df_out) == 0:
            df_out = df_renamed
        else:
            df_out = pd.merge(df_out, df_renamed)

    df_out['place'] = place
    
    return df_out

In [4]:
for idx, row in df_state_refs.iloc[:53].iterrows():
    print(row)
    get_soi_computed(row.abbrev, ['Returns Count', 'Number of returns'])

abbrev         al
id             01
name      Alabama
Name: 0, dtype: object


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




abbrev        ak
id            02
name      Alaska
Name: 1, dtype: object
abbrev         az
id             03
name      Arizona
Name: 2, dtype: object
abbrev          ar
id              04
name      Arkansas
Name: 3, dtype: object
abbrev            ca
id                05
name      California
Name: 4, dtype: object
abbrev          co
id              06
name      Colorado
Name: 5, dtype: object
abbrev             ct
id                 07
name      Connecticut
Name: 6, dtype: object
abbrev          de
id              08
name      Delaware
Name: 7, dtype: object
abbrev                      dc
id                          09
name      District of Columbia
Name: 8, dtype: object
abbrev         fl
id             10
name      Florida
Name: 9, dtype: object
abbrev         ga
id             11
name      Georgia
Name: 10, dtype: object
abbrev        hi
id            12
name      Hawaii
Name: 11, dtype: object
abbrev       id
id           13
name      Idaho
Name: 12, dtype: object
abbrev          

AssertionError: 

In [5]:
get_soi_computed('in', ['Returns Count', 'Number of returns'])

AssertionError: 