In [1]:
# compile structure for irs soi data
# https://www.irs.gov/statistics/soi-tax-stats-historic-table-2

import re
import requests
import requests_cache
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

requests_cache.install_cache()

result = requests.get('https://www.irs.gov/statistics/soi-tax-stats-historic-table-2')
assert result.status_code == 200
c = result.content

soup = BeautifulSoup(c)
state_links = soup.find('table').find_all('a')

CACHED_DFS = {}

state_refs = []

for a in state_links:
    state_dict = {}
    
    state_dict['name'] = a.string.strip()
    
    groups = re.search('\d\din(?P<id>\d\d)(?P<abbrev>\w\w)', a.attrs['href'])
    state_dict.update(groups.groupdict())
    
    state_refs.append(state_dict)
    
df_state_refs = pd.DataFrame(state_refs).sort_values('id').reset_index(drop=True)

display(df_state_refs)

Unnamed: 0,abbrev,id,name
0,al,1,Alabama
1,ak,2,Alaska
2,az,3,Arizona
3,ar,4,Arkansas
4,ca,5,California
5,co,6,Colorado
6,ct,7,Connecticut
7,de,8,Delaware
8,dc,9,District of Columbia
9,fl,10,Florida


In [152]:
from requests import Session
from urllib.parse import urljoin

class LiveServerSession(Session):
    def __init__(self, prefix_url=None, *args, **kwargs):
        super(LiveServerSession, self).__init__(*args, **kwargs)
        self.prefix_url = prefix_url

    def request(self, method, url, *args, **kwargs):
        url = urljoin(self.prefix_url, url)
        return super(LiveServerSession, self).request(method, url, *args, **kwargs)

def lookup_state(key, val):
    df = df_state_refs
    condition = df[key] == val
    state_info = df[condition].to_dict(orient='records')[0]
    return state_info

def get_soi_data(lookup, year):
    state_info = lookup_state(*lookup)
    filename = "{}in{}{}".format(
        str(year)[2:],
        state_info['id'],
        state_info['abbrev']
    )
    pattern = r'{}\.(xlsx?|zip)'.format(filename)
    url = soup.find('a', href=re.compile(pattern)).attrs['href']
    baseUrl = 'https://www.irs.gov'
        
    try:
        with LiveServerSession(baseUrl) as s:
            r = s.get(url)
        assert r.status_code == 200
    except:
        raise
        
    return r

def get_soi_df(lookup, year):
    '''
    get cumulative data as df given lookup and year
    '''
    from io import BytesIO
    from zipfile import ZipFile
    import itertools
    
    r = get_soi_data(lookup, year)
    
    print(r.url)
    
    if ((lookup, year) in CACHED_DFS):
        return CACHED_DFS[(lookup, year)]
    
    pd_options = {
        'header': None
    }
    
    if '.xls' in r.url:
        with BytesIO(r.content) as fh:
            df = pd.read_excel(fh, **pd_options)
    
    elif '.zip' in r.url:
        with ZipFile(BytesIO(r.content)) as my_zipfile:
            for file in my_zipfile.namelist():
                with my_zipfile.open(file) as fh:
                    df = pd.read_excel(fh, **pd_options)
    
    CACHED_DFS[(lookup, year)] = df
    
    return df

In [286]:
def get_indices(df):
    from collections import defaultdict

    iterator = df.iterrows()
    indices = defaultdict(lambda: None)

    while not np.all([indices[k] for k in ['place', 'amt']]):
        idx, row = next(iterator)

        if row.str.contains('alabama', flags=re.IGNORECASE).any() and not indices['place']:
            indices['place'] = idx

        if row.apply(str).str.contains('100,?000').any() and not indices['amt']:
            indices['amt'] = idx

    return dict(indices)

def format_soi_df(df):
    import numpy as np

    my_df = df.copy()
        
    # identity target rows for indexing
    indices = get_indices(my_df)

    # fill null holes in columns for multiindexing
    my_df.loc[indices['place']].fillna(method='ffill', inplace=True)
    my_df.loc[indices['place']].fillna('', inplace=True)
    my_df.loc[indices['place']] = my_df.loc[indices['place']].str.strip()
    my_df.loc[indices['amt']].fillna('All returns', inplace=True)
    
    def format_column(x):
        if isinstance(x, float) and np.isnan(x):
            return x
        elif isinstance(x, str) and 'Under' in x:
            x = '0'
        elif isinstance(x, str) and 'Breakeven' in x:
            x = '0'
        elif isinstance(x, str) and 'All' in x:
            x = '-1'
        else:
            x = str(x)

        try:
            return float(x.split()[0].replace('$', '').replace(',', ''))
        except:
            return x.split()[0]

    my_df.loc[indices['amt']] = my_df.loc[indices['amt']].apply(format_column)

    # drop rows with any null values in arbitrary range – we don't need these anymore
    my_df = my_df[~my_df.iloc[:, 1:4].isnull().any(axis=1)]
    
    # get rid of footnotes in data
    my_df.replace('\s*\[\d+\]', '', regex=True, inplace=True)

    # set indices
    my_df = my_df.transpose().set_index([*indices.values()])
    my_df.index.set_names([*indices.keys()], inplace=True)

    # set columns
    my_df.columns = my_df.iloc[0]
    my_df = my_df.iloc[1:]
    my_df.columns.rename('Item', inplace=True)
    
    my_df = my_df[my_df.iloc[:, 3] != my_df.columns[3]] # drop rows that match header

    return my_df

In [247]:
def realign_tables(df):
    
    
    new_starts = df[df.iloc[:, 0].str.contains('^Table 2.', flags=re.IGNORECASE, na=False)].index
    
    if not new_starts.any():
        new_starts = df[df.iloc[:, 0].str.contains('^Tax Year 200', flags=re.IGNORECASE, na=False)].index
    
    new_starts = new_starts.drop(0)
    
    if(new_starts.any()):
        dfs = []
        iterator = iter(new_starts.values)
        
        curr = 0
        
#         dfs.append(df.loc[:(new_starts.values[0] - 1), 0])
        
        for next_val in new_starts.values:
            start = curr
            curr = next_val
            
            dfs.append(df.loc[start:(curr-1), :].reset_index(drop=True))
        
        dfs.append(df.loc[curr:, :].reset_index(drop=True))
        
        concat = pd.concat(dfs, axis=1, ignore_index=True)
        
        # begone null columns
        concat = concat.loc[:, ~concat.isnull().all()]
        
        return concat
        
    else:
        
        return df


In [289]:
import os

for year in range(2005, 1999, -1):
    print(datetime.datetime.now(), year, 'Fetching data')
    df = get_soi_df(('abbrev', 'cm'), year)
    print(datetime.datetime.now(), year, 'Realigning data')
    df = realign_tables(df)
    
#     display(df.head(n=20))
    
#     df.to_csv('sample.csv')
    
    print(datetime.datetime.now(), year, 'Formatting data')
    
    if os.path.isfile(f'./working/{year}.csv'):
        df = pd.read_csv(f'./working/{year}.csv', index_col=0, header=0)
    
    my_df = format_soi_df(df)
#     print()
    display(my_df.head(n=10))

2019-03-25 18:54:45.552109 2005 Fetching data
https://www.irs.gov/pub/irs-soi/05in54cm.xls
2019-03-25 18:54:45.576384 2005 Realigning data
2019-03-25 18:54:45.585121 2005 Formatting data


Unnamed: 0_level_0,Item,nan,Unnamed: 3_level_0,Number of returns,Number of joint returns,Number with paid preparer's signature,Number of exemptions,Adjusted gross income (AGI),Salaries and wages in AGI: Number,Amount,Taxable interest: Number,...,Alternative minimum tax: Number,Amount,Income tax: Number,Amount,Total tax liability: Number,Amount,Tax due at time of filing: Number,Amount,Overpayments: Number,Amount
place,amt,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
UNITED STATES,-1.0,,1,135257620,52607676,80455243,273738434,7364640131,114060887,5161583318,59553985,...,4067599,17269976,92646159,938184168,100222174,989191350,25495750,117979278,102297909,-235357330
UNITED STATES,0.0,,2,92150166,20743943,53622647,159649737,1797097083,75422766,1541276272,28527550,...,71712,184961,50596829,88375052,57868419,105786470,12612333,12413009,74147484,-130883069
UNITED STATES,50000.0,Size of...,3,18221115,11329459,11025624,44189517,1119634632,16299827,896339313,10891905,...,121078,146147,17341394,92532758,17587865,99360874,4249962,8724802,13466925,-36846281
UNITED STATES,75000.0,,4,10499106,8296546,6260725,28555195,905336768,9520214,721137490,7636612,...,192041,270330,10368748,87117677,10410537,92573567,2643935,7544529,7498704,-25452383
UNITED STATES,100000.0,,5,10797979,9193700,6678965,30919226,1429575727,9782173,1083175205,9092673,...,1481073,2742842,10757520,190475859,10771544,200393602,4014472,20657387,6184721,-26730572
UNITED STATES,200000.0,,6,3589254,3044028,2867282,10424759,2112995921,3035907,919655038,3405245,...,2201695,13925696,3581668,479682823,3583809,491076838,1975048,68639552,1000075,-15445026
ALABAMA,-1.0,,1,1955914,774753,1272190,4093805,88628735,1681957,63539055,673334,...,20554,82862,1232291,9842713,1351998,10434751,317518,1516820,1556618,-3682094
ALABAMA,0.0,,2,1428663,348843,930445,2670264,27817732,1206816,23832322,330827,...,457,2183,714132,1152021,831236,1383653,156445,151001,1211557,-2526388
ALABAMA,50000.0,Size of adjusted g...,3,240191,172740,155834,608134,14753559,216466,11901075,125656,...,1084,1253,232485,1187182,234585,1269166,56051,111451,178681,-471338
ALABAMA,75000.0,,4,134090,115859,82891,374372,11540510,122474,9271611,89402,...,1316,1999,133112,1120709,133456,1182492,35136,95606,94990,-301912


2019-03-25 18:54:45.691183 2004 Fetching data
https://www.irs.gov/pub/irs-soi/04in54cm.xls
2019-03-25 18:54:45.716889 2004 Realigning data
2019-03-25 18:54:45.725197 2004 Formatting data


Unnamed: 0_level_0,Item,nan,Unnamed: 3_level_0,Number of returns,Number of joint returns,Number with paid preparer's signature,Number of exemptions,Adjusted gross income (AGI),Salaries and wages in AGI: Number,Amount,Taxable interest: Number,...,Alternative Minimum Tax: Number,Amount,Income tax: Number,Amount,Total tax liability: Number,Amount,Tax due at time of filing: Number,Amount,Overpayments: Number,Amount
place,amt,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
UNITED STATES,-1.0,,1,133092565,52225721,78125181,269662776,6745101602,112534497,4917551535,57737605,...,3146323,12895393,91150187,832385003,98477636,879938123,24058400,99126978,101423316,-223591997
UNITED STATES,0.0,,2,92277037,21593939,53216912,160217108,1788706363,75627439,1533846536,28707688,...,47856,136863,51183952,90089999,58262887,107208602,12230696,11921055,74446473,-127204417
UNITED STATES,50000.0,Size of...,3,17922902,11513644,10700352,44303111,1100740167,16166931,891303503,10674598,...,98603,117781,17195137,92285650,17401619,98951188,4093296,8099606,13285164,-35418444
UNITED STATES,75000.0,,4,10015637,8063363,5872471,27732581,862762850,9169619,699009194,7253908,...,152080,217788,9922579,84826602,9953464,89983669,2477947,6838542,7174722,-23768796
UNITED STATES,100000.0,,5,9815321,8438738,5946200,28429329,1294760029,8962650,999027855,8210418,...,1112316,2085226,9789959,175957351,9799984,184981959,3619280,17889342,5626313,-24012692
UNITED STATES,200000.0,,6,3061668,2616037,2389246,8980647,1698132194,2607858,794364445,2890993,...,1735468,10337735,3058560,389225401,3059682,398812705,1637181,54378433,890644,-13187648
ALABAMA,-1.0,,1,1910403,768809,1224390,4006258,80884836,1644807,60102429,660359,...,14056,56239,1200871,8606520,1317118,9153744,300911,1206589,1525679,-3454587
ALABAMA,0.0,,2,1416612,364035,909501,2658350,27332223,1196770,23361825,338944,...,367,938,713835,1153204,828044,1378721,153981,144024,1199921,-2402582
ALABAMA,50000.0,Size of adjusted g...,3,234243,173791,150198,603145,14380699,212474,11724325,125223,...,792,990,228362,1174718,230087,1252918,53960,102904,174169,-447497
ALABAMA,75000.0,,4,125827,110195,76742,355931,10814201,115699,8797915,85036,...,913,1527,125195,1076719,125410,1134103,32890,85963,88886,-274469


2019-03-25 18:54:45.833942 2003 Fetching data
https://www.irs.gov/pub/irs-soi/03in54cm.xls
2019-03-25 18:54:45.856234 2003 Realigning data
2019-03-25 18:54:45.888276 2003 Formatting data


Unnamed: 0_level_0,Item,Unnamed: 2_level_0,Number of returns,Number of joint returns,Number with paid preparer's signature,Number of exemptions,Adjusted gross income (AGI),Salaries and wages in AGI: Number,Amount,Taxable interest: Number,Amount,...,Number,Amount,Income tax: Number,Amount,Total tax liability: Number,Amount,Tax due at time of filing: Number,Amount,Overpayments: Number,Amount
place,amt,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
ALABAMA,-1.0,1,1883765,765108,1233389,3955727,74842665,1623767,57175727,678245,1403119,...,441153,891981,1211116,7896942,1317119,8396882,263440,868834,1533477,-3340011
ALABAMA,0.0,2,1088495,216612,714354,1933049,14335594,903091,12369967,233923,338816,...,429515,886806,452676,397106,550457,540628,95776,67139,936546,-1738737
ALABAMA,30000.0,3,332057,163006,218737,746319,12931107,298511,10783225,129238,178162,...,11638,5175,298585,816275,305745,888101,52542,69690,272033,-543134
ALABAMA,50000.0,4,229168,175583,149489,600134,14052671,208667,11511838,131522,189683,...,0,0,226341,1201958,227209,1274943,46969,87992,175114,-430889
ALABAMA,75000.0,5,117246,103678,72346,333786,10063119,108151,8228188,83768,136549,...,0,0,116895,1048888,117021,1099886,25796,70692,86772,-265545
ALABAMA,100000.0,6,92505,84501,57900,270018,12021949,84694,9070679,76751,198829,...,0,0,92359,1709619,92415,1795590,31238,166562,54760,-223577
ALABAMA,200000.0,7,24294,21728,20563,72421,11438224,20653,5211831,23043,361080,...,0,0,24260,2723095,24272,2797732,11119,406759,8252,-138129
ALASKA,-1.0,1,343032,120116,170354,644384,14832589,276580,10998285,157975,215482,...,32712,50657,277302,1795993,288599,1916777,90408,190153,234629,-489812
ALASKA,0.0,2,178298,19740,81895,233781,1893925,126336,1578062,53300,32628,...,30913,49864,119726,82847,129011,103071,52339,16243,114797,-150068
ALASKA,30000.0,3,59411,21401,29659,124363,2333676,53989,1920018,27297,23195,...,1799,791,53273,175855,54921,192910,10726,17342,47323,-92666


2019-03-25 18:54:46.049644 2002 Fetching data
https://www.irs.gov/pub/irs-soi/02in54cm.xls
2019-03-25 18:54:46.075779 2002 Realigning data
2019-03-25 18:54:46.115405 2002 Formatting data


Unnamed: 0_level_0,Item,nan,Unnamed: 3_level_0,Number of returns,Number of joint returns,Number with paid preparer's signature,Number of exemptions,Adjusted gross income (AGI),Salaries and wages in AGI: Number,Amount,Taxable interest: Number,...,Number,Amount,Income tax: Number,Amount,Total tax liability: Number,Amount,Tax due at time of filing: Number,Amount,Overpayments: Number,Amount
place,amt,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
UNITED STATES,-1.0,,1,130836098,51477158,74832886,263875135,6015047033,111008171,4562427437,63533259,...,18667041,33257814,93923947,798633935,99895281,840519869,22645999,81530859,100550368,207677688
UNITED STATES,0.0,,2,50248794,7387934,27881024,73922834,385367487,38751335,373560036,15306787,...,13805496,27148419,19685790,10536472,24306193,17009070,5284474,3064407,40543492,53992712
UNITED STATES,20000.0,,3,18650546,5090957,10746359,36043469,461857601,16246786,376820992,6728241,...,4330873,5897804,14549855,22809368,15326616,26099638,2460327,2495173,15751987,28168945
UNITED STATES,30000.0,,4,24319476,10095791,13864118,51514627,950261330,21797281,785529009,12259018,...,530672,211596,22383829,70512619,22856763,76312341,4196138,5896366,19448909,36781025
UNITED STATES,50000.0,Size of adjusted gross income,5,17628351,11977409,10272065,44818405,1081036334,16000889,881900609,12082373,...,0,0,17371314,103015925,17450543,109322716,3919342,7765562,13065684,33160763
UNITED STATES,75000.0,,6,9129894,7511701,5203512,25737602,784946802,8400593,642597210,7340021,...,0,0,9093056,92759077,9107062,97351271,2658071,7300421,6070570,19331885
UNITED STATES,100000.0,,7,8391951,7285992,4962840,24515756,1103520609,7681868,857977162,7452556,...,0,0,8376355,172994973,8383039,180947338,3026112,15660093,4777095,21497631
UNITED STATES,200000.0,,8,2467086,2127374,1902968,7322442,1248056881,2129419,644042424,2364263,...,0,0,2463748,326005498,2465065,333477492,1101535,39348828,892631,14744724
ALABAMA,-1.0,,1,1882572,766190,1176133,3938787,72426176,1626700,55649850,736849,...,431673,860611,1246062,8272612,1342295,8748726,287372,901096,1509132,3125097
ALABAMA,0.0,,2,816584,132892,504101,1357965,7373588,666992,6607426,179627,...,332712,732049,276439,142252,355359,235911,65796,35990,700440,1201343


2019-03-25 18:54:46.248549 2001 Fetching data
https://www.irs.gov/pub/irs-soi/01in54cm.xls
2019-03-25 18:54:46.283633 2001 Realigning data
2019-03-25 18:54:46.348346 2001 Formatting data


Unnamed: 0_level_0,Item,nan,nan,Returns Count,Joint Returns Count,Single Returns Count,Head of Household Count,Number of Farm Returns,Paid Preparer Returns Count,Number of Returns,Amount,...,Number of Returns,Amount,Number of Returns,Amount,Number of Returns,Amount,Number of Returns,Amount,Number of Returns,Amount
place,amt,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
United States,-1.0,Total,Returns,130977219,51193206,58376029,18598611,2048843,73973374,111521029,4555746889,...,37379429,199410093,39025354,103218020,44416103,315005855,37113170,356571897,39430635,137754032
United States,0.0,and,Loss,1752721,516701,997706,114269,126134,1132623,443166,11347668,...,176422,1152591,260376,1060546,318157,2665590,258232,4845145,0,0
United States,0.01,Under,10000,25148294,1929894,19376052,3408644,193078,12506368,20093979,101024299,...,498342,957213,577822,1176046,846926,2287255,498441,3617596,541090,572450
United States,10000.0,Under,20000,23493788,4606740,12736825,5646151,261883,13762391,18975517,265221497,...,1442165,1498222,1696971,3161480,2169924,5334919,1432090,9641526,1730996,2800889
United States,20000.0,Under,30000,18741155,5064275,8949946,4161891,250445,10617934,16336925,378995122,...,2713904,3099384,2727530,4662127,3438966,8613852,2569464,17116348,2834566,5035448
United States,30000.0,Under,50000,24343572,10240186,9893521,3510065,430112,13741400,21781241,781856685,...,7798528,13724990,7724827,13586883,9197046,29091289,7551114,52442036,7891349,15097821
United States,50000.0,Under,75000,17639483,12083689,4034341,1225130,377334,10228017,15942274,871919105,...,9649317,26952123,9941572,20259144,11119425,49473335,9662138,75516125,10009362,22132085
United States,75000.0,Under,100000,8943249,7349674,1205447,298626,177581,5077444,8176707,618192064,...,6336732,25723600,6713599,16897085,7292272,44505157,6441869,58393453,6817788,18300843
United States,100000.0,Under,150000,6372267,5491853,687723,143756,119318,3670706,5807557,591114972,...,5018024,30721249,5378621,17706373,5771654,50099951,5057283,56866795,5501402,19195961
United States,150000.0,Under,200000,1966655,1702026,208036,40182,40979,1254165,1755012,241136772,...,1612453,15592451,1727601,7588321,1847712,23940692,1594574,23273703,1770690,8509195


2019-03-25 18:54:46.620346 2000 Fetching data
https://www.irs.gov/pub/irs-soi/00in54cm.xls
2019-03-25 18:54:46.653848 2000 Realigning data
2019-03-25 18:54:46.710350 2000 Formatting data


Unnamed: 0_level_0,Item,nan,nan,Returns Count,Dependent Exemptions Count,Joint Returns Count,Single Returns Count,Head of Household Count,Number of Farm Returns,Paid Preparer Returns Count,Number of Returns,...,Number of Returns,Amount,Number of Returns,Amount,Number of Returns,Amount,Number of Returns,Amount,Number of Returns,Amount
place,amt,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Alabama,-1.0,Total,Returns,1904150,1366454,772772,705643,393607,51896,1125294,1654590,...,522970,1592985,474459,323608,554316,2030899,468164,3452849,505351,2021578
Alabama,0.0,and,Loss,13320,4871,5888,6298,550,2097,10381,4232,...,2060,7581,2459,3196,3028,12327,2616,32241,0,0
Alabama,0.01,Under,10000,415090,178778,38420,277191,94147,4559,215780,345606,...,6286,8061,6429,3080,9981,12133,6573,35943,7998,10889
Alabama,10000.0,Under,20000,423565,314916,87935,172825,154939,6701,257889,359949,...,26336,18224,22849,9501,33768,32123,22663,119710,29138,59128
Alabama,20000.0,Under,30000,286275,210288,92563,107537,78640,6305,168833,254341,...,51230,48998,41170,16727,56608,74352,42747,215681,49158,109100
Alabama,30000.0,Under,50000,331729,253883,179897,95104,49412,11121,201511,299445,...,125489,187481,106800,44422,131771,254009,109500,600968,117563,294628
Alabama,50000.0,Under,75000,228193,201719,183234,30663,12060,10220,140640,207254,...,138821,317198,128672,63606,143218,408156,128179,817210,132639,401953
Alabama,75000.0,Under,100000,102628,99754,91826,7997,2245,5016,60657,93693,...,81039,256860,77577,50748,82837,326309,75296,560095,78975,302532
Alabama,100000.0,Under,150000,62364,60887,56546,4505,979,3020,37562,55753,...,54226,237834,52397,50484,55216,303098,49088,459936,53174,271238
Alabama,150000.0,Under,200000,17429,17109,15627,1408,280,1038,12152,14904,...,15731,99188,15185,22390,15946,126469,13597,171786,15402,111711


In [285]:
# my_df[my_df['Returns Count'] != 'Returns Count']
# my_df.columns[3]


Unnamed: 0_level_0,Item,nan,nan,Returns Count,Dependent Exemptions Count,Joint Returns Count,Single Returns Count,Head of Household Count,Number of Farm Returns,Paid Preparer Returns Count,Number of Returns,...,Number of Returns,Amount,Number of Returns,Amount,Number of Returns,Amount,Number of Returns,Amount,Number of Returns,Amount
place,amt,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Alabama,-1.00,Total,Returns,1904150,1366454,772772,705643,393607,51896,1125294,1654590,...,522970,1592985,474459,323608,554316,2030899,468164,3452849,505351,2021578
Alabama,0.00,and,Loss,13320,4871,5888,6298,550,2097,10381,4232,...,2060,7581,2459,3196,3028,12327,2616,32241,0,0
Alabama,0.01,Under,10000,415090,178778,38420,277191,94147,4559,215780,345606,...,6286,8061,6429,3080,9981,12133,6573,35943,7998,10889
Alabama,10000.00,Under,20000,423565,314916,87935,172825,154939,6701,257889,359949,...,26336,18224,22849,9501,33768,32123,22663,119710,29138,59128
Alabama,20000.00,Under,30000,286275,210288,92563,107537,78640,6305,168833,254341,...,51230,48998,41170,16727,56608,74352,42747,215681,49158,109100
Alabama,30000.00,Under,50000,331729,253883,179897,95104,49412,11121,201511,299445,...,125489,187481,106800,44422,131771,254009,109500,600968,117563,294628
Alabama,50000.00,Under,75000,228193,201719,183234,30663,12060,10220,140640,207254,...,138821,317198,128672,63606,143218,408156,128179,817210,132639,401953
Alabama,75000.00,Under,100000,102628,99754,91826,7997,2245,5016,60657,93693,...,81039,256860,77577,50748,82837,326309,75296,560095,78975,302532
Alabama,100000.00,Under,150000,62364,60887,56546,4505,979,3020,37562,55753,...,54226,237834,52397,50484,55216,303098,49088,459936,53174,271238
Alabama,150000.00,Under,200000,17429,17109,15627,1408,280,1038,12152,14904,...,15731,99188,15185,22390,15946,126469,13597,171786,15402,111711


In [144]:
import datetime

RANGES = {
    'Under $50k': (0, 5e4),     #    0 <= ... <  50k
    '$50-$100k': (5e4, 1e5),   #  50k <= ... < 100k
    '$100-$200k': (1e5, 2e5),  # 100k <= ... < 200k
    '$200k and greater': (2e5,)    # 200k <= ...
}

DIMENSIONS = [
    'Number of returns',
    'Adjusted gross income (AGI)',
]

def build_query(my_place, lte, gt=None):
    my_query = ''
    
    my_query += f'{lte} <= amt'
    
    if (gt):
        my_query += f' < {gt} '
    
    my_query += f' and place == "{my_place}"'
    
    return my_query

data = []

def format_place(x):
    df = df_state_refs[df_state_refs.name.str.contains(x, case=False)]
    return df.abbrev.iloc[0]

for year in range(2016, 2004, -1):
    print(datetime.datetime.now(), year, 'Fetching data')
    df = get_soi_df(('abbrev', 'cm'), year)
    print(datetime.datetime.now(), year, 'Realigning data')
    df = realign_tables(df)
    print(datetime.datetime.now(), year, 'Formatting data')
    my_df = format_soi_df(df)
    print()
    
    for place in df_state_refs.name:
        my_place = place.upper()
        
        for range_name, my_range in RANGES.items():
            my_query = build_query(my_place, *my_range)
#             my_value = my_df.query(my_query)['Number of returns'].sum()
            for my_dimension in DIMENSIONS:
                my_value = my_df.query(my_query)[my_dimension].sum()

                data.append((
                    my_dimension,
                    f'1/1/{year}',
                    format_place(my_place),
                    range_name,
                    my_value
                ))
        
df = pd.DataFrame(data)

2019-03-25 17:12:24.163891 2016 Fetching data
2019-03-25 17:12:24.164969 2016 Realigning data
2019-03-25 17:12:24.166643 2016 Formatting data

2019-03-25 17:12:27.219924 2015 Fetching data
2019-03-25 17:12:27.220027 2015 Realigning data
2019-03-25 17:12:27.220725 2015 Formatting data

2019-03-25 17:12:29.985256 2014 Fetching data
2019-03-25 17:12:29.985361 2014 Realigning data
2019-03-25 17:12:29.986052 2014 Formatting data

2019-03-25 17:12:32.655898 2013 Fetching data
2019-03-25 17:12:32.656002 2013 Realigning data
2019-03-25 17:12:32.656682 2013 Formatting data

2019-03-25 17:12:35.167298 2012 Fetching data
2019-03-25 17:12:35.167407 2012 Realigning data
2019-03-25 17:12:35.168090 2012 Formatting data

2019-03-25 17:12:37.782952 2011 Fetching data
2019-03-25 17:12:37.783055 2011 Realigning data
2019-03-25 17:12:37.783741 2011 Formatting data

2019-03-25 17:12:40.294166 2010 Fetching data
2019-03-25 17:12:40.294279 2010 Realigning data
2019-03-25 17:12:40.307614 2010 Formatting data


In [145]:
df.columns = ('dimension', 'date', 'group', 'range', 'val')

pivot = pd.pivot_table(df, values='val', columns='range', index=['dimension', 'group', 'date'])

pivot = pivot[['Under $50k', '$50-$100k', '$100-$200k', '$200k and greater']]

In [138]:
subset = pivot.xs('1/1/2007', level=1, drop_level=False)
subset[:] = -1
pivot.update(subset)
pivot.query('group not in ["cm", "oa"]').query('date in ["1/1/2006", "1/1/2016"]').to_csv('sample.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.loc._setitem_with_indexer(key, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return self._setitem_slice(indexer, value)


In [146]:
pivot.query('group in ["il"]')

Unnamed: 0_level_0,Unnamed: 1_level_0,range,Under $50k,$50-$100k,$100-$200k,$200k and greater
dimension,group,date,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Adjusted gross income (AGI),il,1/1/2005,75252964.0,93382961.0,67811056.0,98874476.0
Adjusted gross income (AGI),il,1/1/2006,75009056.0,96364475.0,75895156.0,114966917.0
Adjusted gross income (AGI),il,1/1/2007,76096097.0,99868100.0,84171574.0,132529144.0
Adjusted gross income (AGI),il,1/1/2008,73365620.0,98937338.0,86444393.0,115801241.0
Adjusted gross income (AGI),il,1/1/2009,77801110.0,95598997.0,83605198.0,93433098.0
Adjusted gross income (AGI),il,1/1/2010,72224644.0,96752757.0,86919503.0,106365303.0
Adjusted gross income (AGI),il,1/1/2011,70649498.0,96883967.0,91664666.0,112216060.0
Adjusted gross income (AGI),il,1/1/2012,70008538.0,97152886.0,96566785.0,139391854.0
Adjusted gross income (AGI),il,1/1/2013,69667300.0,98076995.0,101512794.0,129944416.0
Adjusted gross income (AGI),il,1/1/2014,69498881.0,99289262.0,106932210.0,148468300.0
