In [1]:
import requests, json, sys
import pandas as pd

sys.path.insert(1, '../../../scripts/')
from s3_support import *
import time

KeyboardInterrupt: 

# Get client list with EINs

In [None]:
# Look for the organizations export in the "qgiv-stats-data" bucket.
# list_files comes from s3_support (presumably it prints the matching keys -- confirm).
# Binding the result to `_` keeps the return value from being echoed as cell output.
_ = list_files("qgiv-stats-data", search_key="organizations")

In [None]:
# Load organizations.csv from the "qgiv-stats-data" bucket via the s3_support helper.
# The result is used below as a pandas DataFrame (column selection, .head(), .dropna()).
orgs = get_dataframe_from_file("qgiv-stats-data", "organizations.csv")

In [None]:
# Preview the identifying columns; 'Tax ID' is the column used as the EIN further down.
orgs[['Id', 'Org Name', 'Tax ID', 'Go Live Date']].head()

# Query for all data

In [None]:
# Candidate filing years: 2000 through 2019 inclusive.
years = list(range(2000, 2020))
# Tax IDs (EINs) of every organization that has one on record; missing values are dropped.
# NOTE(review): the dtype/format of 'Tax ID' is not visible here -- confirm it matches
# what the lookups below expect (e.g. no dashes, comparable to the IRS index 'EIN' column).
client_eins = orgs['Tax ID'].dropna()

## ProPublica

In [None]:
# ProPublica nonprofits API (v2) endpoint for a single organization; {} is filled with the EIN.
url = "https://projects.propublica.org/nonprofits/api/v2/organizations/{}.json"

In [None]:
def _fetch_propublica_filings(ein, timeout=30):
    """Fetch the ProPublica record for one EIN and return its filings as flat records.

    Args:
        ein: value substituted into the ProPublica organization URL.
        timeout: seconds to wait for the HTTP response before giving up, so a
            stalled connection cannot hang the whole crawl.

    Returns:
        A list of ``{'ein', 'year', 'revenue'}`` dicts, one per entry in the
        response's ``filings_with_data`` list. Empty when the organization is
        unknown (404), the request fails, or the body is not valid JSON.
    """
    try:
        rsp = requests.get(url.format(ein), timeout=timeout)
    except requests.RequestException as err:
        print("\terror requesting EIN {}: {}".format(ein, err))
        return []

    if rsp.status_code != 200:
        # 404 simply means ProPublica has no record; anything else (429, 5xx, ...)
        # is worth surfacing but must not be parsed as if it were a JSON payload.
        if rsp.status_code != 404:
            print("\tunexpected status {} for EIN {}".format(rsp.status_code, ein))
        return []

    try:
        data = json.loads(rsp.content)
    except ValueError as err:
        print("\tinvalid JSON for EIN {}: {}".format(ein, err))
        return []

    return [
        {
            'ein': ein,
            'year': filing['tax_prd_yr'],
            'revenue': filing['totrevenue']
        }
        for filing in data.get('filings_with_data', [])
    ]


# One record per (EIN, filing year) found on ProPublica.
ein_data = []

print("checking on {} EINs".format(len(client_eins)))
counter = 0
for counter, ein in enumerate(client_eins, start=1):
    # throttle to about one request per second
    time.sleep(1)
    ein_data.extend(_fetch_propublica_filings(ein))

    # progress heartbeat every 250 EINs
    if counter % 250 == 0:
        print("done with {} EINs".format(counter))

# Persist the collected records through the s3_support helper.
data_frame = pd.DataFrame(ein_data)
save_dataframe_to_file('tax-info', 'propublica_990.csv', data_frame)

In [None]:
# Rebuild the frame from ein_data (the records collected by the ProPublica loop)
# and display it as the cell output.
data_frame = pd.DataFrame(ein_data)
data_frame


## OpenData

In [None]:
# IRS Form 990 files in the irs-form-990 S3 bucket:
#   index_url  - per-year CSV index; {} is filled with the year
#   filing_url - XML of a single filing; {} is meant to take an OBJECT_ID from the index
index_url = "https://s3.amazonaws.com/irs-form-990/index_{}.csv"
filing_url = "https://s3.amazonaws.com/irs-form-990/{}_public.xml"

In [None]:
from lxml import etree
from lxml import etree

def xml_to_dict(xml_str):
    """Parse an XML document and return it as a nested dict.

    Args:
        xml_str: the XML document, in any form ``etree.fromstring`` accepts.

    Returns:
        ``{root_tag: contents}`` as produced by ``etree2dict``.
    """
    root = etree.fromstring(xml_str)
    return etree2dict(root)

def etree2dict(tree):
    """Convert an element (and everything below it) into a single-key dict.

    Args:
        tree: the root element to convert.

    Returns:
        ``{tag: contents}`` where ``tag`` and ``contents`` are the pair
        returned by ``recursive_dict`` for ``tree``.
    """
    tag, body = recursive_dict(tree)
    return {tag: body}

def recursive_dict(element):
    """Return ``(tag, value)`` for ``element``, recursing into its children.

    The IRS e-file namespace prefix is removed from the tag. The value is:
      * for an element whose ``type`` attribute is ``"array"``: a list with one
        entry per child (the child's own dict of children, or its leaf value);
      * otherwise: a dict of the element's children keyed by tag, or, when that
        dict is empty, the leaf value from ``getElementValue``.

    Because children are collected into a dict, siblings sharing a tag collapse
    to a single entry (the last one wins).
    """
    tag = element.tag.replace('{http://www.irs.gov/efile}', '')

    if element.attrib.get('type') == "array":
        items = []
        for child in element:
            items.append(dict(map(recursive_dict, child)) or getElementValue(child))
        return tag, items

    return tag, dict(map(recursive_dict, element)) or getElementValue(element)

def getElementValue(element):
    """Return the Python value carried by a leaf XML element.

    Args:
        element: an object exposing ``.text`` and ``.attrib`` (an etree element).

    Returns:
        * If the element has text, the text converted according to its ``type``
          attribute: ``integer`` -> int, ``float`` -> float, ``boolean`` -> bool
          (True only for "true", case-insensitive), ``datetime`` -> stripped
          string. With no ``type`` attribute, or a ``type`` that is not one of
          these, the text is returned unchanged.
        * If the element has no text but has attributes: ``None`` when a ``nil``
          attribute is present, otherwise the attribute mapping itself.
        * ``None`` when it has neither text nor attributes.

    Raises:
        ValueError: if the text cannot be parsed as the declared integer/float.
    """
    text = element.text
    if text:
        attr_type = element.attrib.get('type')
        stripped = text.strip()
        if attr_type == 'integer':
            return int(stripped)
        if attr_type == 'float':
            return float(stripped)
        if attr_type == 'boolean':
            return stripped.lower() == 'true'
        if attr_type == 'datetime':
            return stripped
        # Untyped text, or a type we do not convert: keep the text rather than
        # silently dropping it.
        return text

    if element.attrib:
        if 'nil' in element.attrib:
            return None
        return element.attrib

    return None

In [None]:
# Iterate through each year: load that year's IRS index, keep the rows whose EIN
# belongs to a client, then download each matching filing and record its revenue.

# Element names under which total revenue may appear in a filing.
# NOTE(review): the original looked up 'CYTotalRevenue'; the other names are the
# variants believed to be used by different e-file schema versions -- confirm
# against real filings.
_REVENUE_KEYS = ('CYTotalRevenue', 'CYTotalRevenueAmt', 'TotalRevenueCurrentYear')


def _find_first(node, names):
    """Depth-first search of nested dicts/lists for the first key found in `names`.

    Returns the value stored under that key, or None when no key matches.
    """
    if isinstance(node, dict):
        for name in names:
            if name in node:
                return node[name]
        children = node.values()
    elif isinstance(node, list):
        children = node
    else:
        return None

    for child in children:
        found = _find_first(child, names)
        if found is not None:
            return found
    return None


ein_opendata = []

for year in years:
    # check if index exists
    try:
        year_index = pd.read_csv(index_url.format(year))
    except Exception as err:
        print("{} index not found ({})".format(year, err))
        continue

    # check for all clients in index
    # NOTE(review): isin() only matches when 'EIN' and client_eins share a dtype/format -- confirm.
    clients_in_index = year_index[year_index['EIN'].isin(client_eins)]

    print("{} clients in {} index".format(len(clients_in_index), year))

    # request all filings
    start_filing_count = len(ein_opendata)
    for _, r in clients_in_index.iterrows():
        try:
            rsp = requests.get(filing_url.format(r['OBJECT_ID']), timeout=60)
            rsp.raise_for_status()
            return_data = xml_to_dict(rsp.content)

            # xml_to_dict returns {root_tag: ...}, so the revenue element is nested
            # somewhere below the top level and has to be searched for.
            revenue = _find_first(return_data, _REVENUE_KEYS)
            if revenue is None:
                print("\tno revenue found in {} {} filing".format(r['EIN'], r['TAX_PERIOD']))
                continue

            ein_opendata.append({
                'client': r['EIN'],
                'year': year,
                'revenue': revenue
            })
        except Exception as err:
            # keep crawling, but show what actually went wrong
            print("\terror requesting {} {} filing: {!r}".format(r['EIN'], r['TAX_PERIOD'], err))

    end_filing_count = len(ein_opendata)
    print("\t{} EIN records found".format(end_filing_count - start_filing_count))