In [2]:
import pandas as pd
import requests, zipfile, io, sys
sys.path.insert(1, '../../../scripts/')
from s3_support import *

IRS data publicly available at [link](https://www.irs.gov/statistics/soi-tax-stats-annual-extract-of-tax-exempt-organization-financial-data) with extract documentation, 990 extract, 990-EZ extract, and 990-PF extract

- [2018](https://www.irs.gov/pub/irs-soi/18eoextract990.xlsx)
- [2017](https://www.irs.gov/pub/irs-soi/17eofinextract990.dat)
- [2016](https://www.irs.gov/pub/irs-soi/16eofinextract990.dat)
- [2015](https://www.irs.gov/pub/irs-soi/15eofinextract990.dat.dat)
- [2014](https://www.irs.gov/pub/irs-soi/14eofinextract990.zip)
- [2013](https://www.irs.gov/pub/irs-soi/13eofinextract990.zip)
- [2012](https://www.irs.gov/pub/irs-soi/12eofinextract990.zip)

In [4]:
# Global Helper Variables 

years = range(12, 19)

# Loading the data

In [97]:
# This will load the data from the irl website and save it in this directory.
# Turns out that the irs likes to keep people on their toes and this only works for 2012-1014

irs_990_url = 'https://www.irs.gov/pub/irs-soi/{}eofinextract990.zip'

for year in years:
    # Fetch the zip file from the IRS website and extract it
    file = zipfile.ZipFile(
        io.BytesIO(
            requests.get(irs_990_url.format(year)).content
        )
    ).extractall()

# Exploring the data

In [98]:
year_2012_data_frame = pd.read_csv('./py13_990.dat', sep=' ')

In [99]:
year_2012_data = year_2012_data_frame[['EIN', 'totrevenue']]

In [100]:
year_2012_data['year'] = 2012
year_2012_data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,EIN,totrevenue,year
0,751756215,2806191,2012
1,521601960,11187816,2012
2,752538384,9359542,2012
3,226104478,1500480,2012
4,412176501,527178,2012
...,...,...,...
289598,870334471,316041,2012
289599,46151880,16546216,2012
289600,135655186,8439362,2012
289601,113277669,19007325,2012


# Gathering all of the data

In [107]:
# combined_data = pd.DataFrame(columns=['ein', 'year', 'revenue'])

data_frames = []

for year in years:
    # Load the data from 
    if year == 18:
        df = pd.read_csv('./py18_990.csv') # Apparently microsoft doesnt let you export .dat anymore
        df['EIN'] = df['ein']
    else:
        df = pd.read_csv('./py{}_990.dat'.format(year), sep=' ')
        
    columns = ['EIN', 'totrevenue']
        
    df = df[columns]
    df['year'] = year + 2000
    df = df.rename(columns={'EIN': 'ein', 'totrevenue': 'revenue'})
    
    data_frames.append(df)
    print('Finished processing year {}.'.format(year))

Finished processing year 12.
Finished processing year 13.
Finished processing year 14.
Finished processing year 15.
Finished processing year 16.
Finished processing year 17.
Finished processing year 18.


In [109]:
#Combine the data
combined_data = pd.concat(data_frames)
combined_data['ein'] = combined_data['ein'].astype(str)

In [110]:
combined_data

Unnamed: 0,ein,revenue,year
0,331151592,384423,2012
1,850403577,233545,2012
2,412200005,351941,2012
3,381581795,251602,2012
4,546052431,264264,2012
...,...,...,...
313334,520595110,6473856000,2018
313335,135562308,6607886919,2018
313336,231352685,6710820000,2018
313337,340714585,5631825986,2018


# Filtering orgs that arent our clients

In [113]:
# Fetch all the client tax ids
client_eins = get_dataframe_from_file("qgiv-stats-data", "organizations.csv")['Tax ID'].dropna().tolist()

In [116]:
# Filter the combined data 
mask = combined_data.ein.isin(client_eins)
filted_irs_data = combined_data[mask]
len(filted_irs_data)

10384

In [117]:
filted_irs_data

Unnamed: 0,ein,revenue,year
7979,840186760,7545267,2012
9519,900843840,311712,2012
23857,263403018,779096,2012
24403,271482731,1135769,2012
36141,912043423,296853,2012
...,...,...,...
312908,930728816,94927004,2018
312955,362179782,94842673,2018
312963,941744108,100955209,2018
313063,350893506,114617345,2018


In [118]:
# Save the final dataset to S3
save_dataframe_to_file('tax-info', 'irs_990.csv', filted_irs_data)

uploading to S3
Done


In [1]:
# Storing the raw IRS files

In [5]:
for year in years:
    # Load the data from 
    if year == 18:
        df = pd.read_csv('./py18_990.csv') # Apparently microsoft doesnt let you export .dat anymore
        df['EIN'] = df['ein']
    else:
        df = pd.read_csv('./py{}_990.dat'.format(year), sep=' ')
        
    save_dataframe_to_file('tax-info', 'original_irs_990_{}.csv'.format(year), df)

uploading to S3
Done
uploading to S3
Done
uploading to S3
Done
uploading to S3
Done
uploading to S3
Done
uploading to S3
Done
uploading to S3
Done
