In [1]:
# compile structure for irs soi data
# https://www.irs.gov/statistics/soi-tax-stats-historic-table-2

import re
import requests
import requests_cache
from bs4 import BeautifulSoup
import pandas as pd

requests_cache.install_cache()

result = requests.get('https://www.irs.gov/statistics/soi-tax-stats-historic-table-2')
assert result.status_code == 200
c = result.content

soup = BeautifulSoup(c)
state_links = soup.find('table').find_all('a')

state_refs = []

for a in state_links:
    state_dict = {}
    
    state_dict['name'] = a.string.strip()
    
    groups = re.search('\d\din(?P<id>\d\d)(?P<abbrev>\w\w)', a.attrs['href'])
    state_dict.update(groups.groupdict())
    
    state_refs.append(state_dict)
    
df_state_refs = pd.DataFrame(state_refs).sort_values('id').reset_index(drop=True)

display(df_state_refs)

Unnamed: 0,abbrev,id,name
0,al,1,Alabama
1,ak,2,Alaska
2,az,3,Arizona
3,ar,4,Arkansas
4,ca,5,California
5,co,6,Colorado
6,ct,7,Connecticut
7,de,8,Delaware
8,dc,9,District of Columbia
9,fl,10,Florida


In [94]:
from requests import Session
from urllib.parse import urljoin

class LiveServerSession(Session):
    def __init__(self, prefix_url=None, *args, **kwargs):
        super(LiveServerSession, self).__init__(*args, **kwargs)
        self.prefix_url = prefix_url

    def request(self, method, url, *args, **kwargs):
        url = urljoin(self.prefix_url, url)
        return super(LiveServerSession, self).request(method, url, *args, **kwargs)

def lookup_state(key, val):
    df = df_state_refs
    condition = df[key] == val
    state_info = df[condition].to_dict(orient='records')[0]
    return state_info

def get_soi_data(lookup, year):
    state_info = lookup_state(*lookup)
    filename = "{}in{}{}".format(
        str(year)[2:],
        state_info['id'],
        state_info['abbrev']
    )
    pattern = r'{}\.(xlsx?|zip)'.format(filename)
    url = soup.find('a', href=re.compile(pattern)).attrs['href']
    baseUrl = 'https://www.irs.gov'
        
    try:
        with LiveServerSession(baseUrl) as s:
            r = s.get(url)
        assert r.status_code == 200
    except:
        raise
        
    return r

def get_soi_df(lookup, year):
    '''
    get cumulative data as df given lookup and year
    '''
    from io import BytesIO
    from zipfile import ZipFile
    import itertools
    
    r = get_soi_data(lookup, year)
    
    pd_options = {
        'header': None
    }
    
    if '.xls' in r.url:
        with BytesIO(r.content) as fh:
            df = pd.read_excel(fh, **pd_options)
    
    elif '.zip' in r.url:
        with ZipFile(BytesIO(r.content)) as my_zipfile:
            for file in my_zipfile.namelist():
                with my_zipfile.open(file) as fh:
                    df = pd.read_excel(fh, **pd_options)
    
    return df

# get_soi_state_df(state, lookup, year):

def cleanup_df(df):
    header_idx = [idx for idx, row in df.astype(str).iterrows() if row.str.contains('100,?000').any()][0]
    headers = df.loc[header_idx]
    df.rename(columns = headers, inplace=True)
        
    df.columns = ['Item', 'All returns', *df.columns.values[2:]]
    
    
    def format_column(x):
        if isinstance(x, str):
            pass
        else:
            x = '${:,}'.format(x)
            
        return x.split()[0]
    
    df.rename(columns = format_column, inplace=True)
    
    # drop rows with null values (footnotes)
    df = df[~df.isnull().any(axis=1)]
        
    return df

# for year in range(2016, 2003, -1):
#     df = get_soi_df(('abbrev', 'cm'), year)
#     display(df.head())

In [98]:
place = 'Illinois'
year = 2016
df = get_soi_df(('abbrev', 'cm'), year)

# get rows to index by
state_row_idx = None
header_row_idx = None
iterator = df.iterrows()

while not (state_row_idx and header_row_idx):
    idx, row = next(iterator)
    if row.str.contains(place, flags=re.IGNORECASE).any():
        state_row_idx = idx
    if row.str.contains('100,?000').any():
        header_row_idx = idx

# get cols to limit to
series = df.loc[state_row_idx]
series = series[series.notnull()]
target_idx = series[series.str.contains(place, flags=re.IGNORECASE)].index[0]

iterator = series.loc[target_idx:].iteritems()
target_cols = range(next(iterator)[0], next(iterator)[0])

state_df = pd.concat(
    [
        df.loc[header_row_idx:, 0:0],
        df.loc[header_row_idx:, target_cols]
    ],
    axis=1
)

df = cleanup_df(state_df)
df['year'] = year
df['place'] = place
df


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,Item,All,Under,$1,"$10,000","$25,000","$50,000","$75,000","$100,000","$200,000","$500,000","$1,000,000",year,place
7,,1,2,3,4,5,6,7,8,9,10,11,2016,Illinois
9,Number of returns,6100090,72090,868390,1248050,1380690,840020,557660,827510,247350,39160,19180,2016,Illinois
10,Number of single returns,2937940,45790,727340,735370,728080,374320,164210,130620,26420,3850,1960,2016,Illinois
11,Number of joint returns,2153740,21190,55360,164050,316690,335890,337640,658180,213930,34230,16580,2016,Illinois
12,Number of head of household returns,904510,2990,77210,330790,302670,108240,45530,30620,5300,790,380,2016,Illinois
13,Number with paid preparer's signature,3384380,47830,412060,648620,743040,480770,333050,500420,168290,32690,17600,2016,Illinois
14,Number of exemptions,11740380,105370,734940,2124090,2632110,1710090,1292940,2231930,729320,121280,58320,2016,Illinois
15,Number of dependent exemptions,3941650,17710,160900,801840,943340,534920,397870,746440,268130,47920,22580,2016,Illinois
16,Total number of volunteer prepared returns [2],108210,920,31460,39140,25060,7320,2560,1640,110,0,0,2016,Illinois
17,Number of volunteer income tax assistance (VIT...,52080,540,15870,20720,12140,1940,500,350,30,0,0,2016,Illinois


In [3]:
def val_between(val, lte, gt=float("inf")):
    if val == 'Under' and lte == 0:
        return True
    
    try:
        val = val.replace(',', '').replace('$', '')
        val = float(val)
        return lte <= val < gt
    except ValueError:
        return False
    except:
        raise

def get_filtered_soi_data(lookup, year, items, money_range=None):
    df = get_soi_df(lookup, year)
    
    # get row where first column matches the item
    row = df.loc[df.index[df.iloc[:, 0].isin(items)].values]
    
    if (money_range):
        target_cols = list(filter(lambda x: val_between(x, *money_range), row.columns.values))
        row['sum'] = row[target_cols].sum(axis=1)
        
        target_cols = ['Item', 'year', *target_cols, 'sum']
        my_row = row[target_cols]
    else:
        my_row = row
    
    return my_row
    

def keyfunction(x):
    v = x.replace(',', '').replace('$', '')
    
    try:
        return float(v)
    except:
        if v == 'Under':
            return -1
        if v == 'All':
            return -2
        else:
            return -3

RANGES = {
    '->50k': (0, 5e4),         #    0 <= ... <  50k
#     '50k->75k': (5e4, 7.5e4),  #  50k <= ... <  75k
#     '75k->100k': (7.5e4, 1e5), #  75k <= ... < 100k
    '50k->100k': (5e4, 1e5),   #  50k <= ... < 100k
    '100k->200k': (1e5, 2e5),  # 100k <= ... < 200k
    '200k->': (2e5,)           # 200k <= ...
}

def get_soi_computed(place, items):
    df_out = pd.DataFrame()

    for range_name, RANGE in RANGES.items():
        rows = []

        for year in range(2000, 2017):
            if year == 2007:
                pass
            else:
                df_row = get_filtered_soi_data(('abbrev', place), year, items, RANGE)
                if len(df_row) == 1:
                    rows.append(df_row)

        df = pd.concat(rows, ignore_index=True)
        df_sorted = df[sorted(list(df.columns.values), key=keyfunction)]
        df_renamed = df_sorted[['Item', 'year', 'sum']].rename(columns={'sum': 'sum_{}'.format(range_name)}) 

        if len(df_out) == 0:
            df_out = df_renamed
        else:
            df_out = pd.merge(df_out, df_renamed)

    df_out['place'] = place
    
    return df_out

In [4]:
for idx, row in df_state_refs.iloc[:53].iterrows():
    print(row)
    get_soi_computed(row.abbrev, ['Returns Count', 'Number of returns'])

abbrev         al
id             01
name      Alabama
Name: 0, dtype: object


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




abbrev        ak
id            02
name      Alaska
Name: 1, dtype: object
abbrev         az
id             03
name      Arizona
Name: 2, dtype: object
abbrev          ar
id              04
name      Arkansas
Name: 3, dtype: object
abbrev            ca
id                05
name      California
Name: 4, dtype: object
abbrev          co
id              06
name      Colorado
Name: 5, dtype: object
abbrev             ct
id                 07
name      Connecticut
Name: 6, dtype: object
abbrev          de
id              08
name      Delaware
Name: 7, dtype: object
abbrev                      dc
id                          09
name      District of Columbia
Name: 8, dtype: object
abbrev         fl
id             10
name      Florida
Name: 9, dtype: object
abbrev         ga
id             11
name      Georgia
Name: 10, dtype: object
abbrev        hi
id            12
name      Hawaii
Name: 11, dtype: object
abbrev       id
id           13
name      Idaho
Name: 12, dtype: object
abbrev          

AssertionError: 

In [5]:
get_soi_computed('in', ['Returns Count', 'Number of returns'])

AssertionError: 