In [None]:
import requests, json, sys
import pandas as pd

sys.path.insert(1, '../../../scripts/')
from s3_support import *

In [None]:
# EINs of the organizations looked up in the filing index below.
# Kept as strings here; the search cell converts them with int() before
# comparing against the index's EIN column.
victory = '592954281'
mercyships = '262414132'
propublica = '142007220'
other = '453578215'

In [None]:
# Template for a per-year filing index CSV; the placeholder is filled with the year.
url = "https://s3.amazonaws.com/irs-form-990/index_{}.csv"

In [None]:
# Load the 2013 index directly from the URL (requires network access).
df_2013_idx = pd.read_csv(url.format("2013"))

In [None]:
# Quick sanity check of the 2013 index: row count and available columns.
print("{} rows".format(len(df_2013_idx)))
print(df_2013_idx.columns)

## Search

In [None]:
# For each year, load that year's index and report whether each EIN of
# interest appears in it.
# The int() conversions are loop-invariant, so they are done once up front.
victory_ein, mercyships_ein, propublica_ein = int(victory), int(mercyships), int(propublica)

for yr in [2013, 2014, 2015, 2016, 2017, 2018]:
    try:
        # NOTE: yr_idx intentionally stays at module level; a later cell reads
        # the frame left over from the last year that loaded successfully.
        yr_idx = pd.read_csv(url.format(yr))
        orgs_lst = yr_idx['EIN'].tolist()
        # Set membership is O(1) per lookup instead of scanning the whole list.
        known_eins = set(orgs_lst)

        print("{} ({} orgs):".format(yr, len(yr_idx)))
        print("victory {}; mercyships: {}; propublica: {}".format(
            victory_ein in known_eins,
            mercyships_ein in known_eins,
            propublica_ein in known_eins,
        ))
    except Exception as exc:
        # Catch Exception (not a bare except) so interrupting the kernel still
        # works, and report the cause instead of hiding it.
        print("error for year {}: {}".format(yr, exc))

## Filings

Use the `OBJECT_ID` from the yearly index to look up the corresponding filing.

In [None]:
# This was here already but caused an undefined error
# rsp.content[:500]

In [None]:
from lxml import etree
from lxml import etree

def xml_to_dict(xml_str):
    """Parse an XML document and return it as a nested dict.

    The document is parsed with ``etree.fromstring`` and the resulting root
    element is handed to ``etree2dict``.
    """
    root_element = etree.fromstring(xml_str)
    return etree2dict(root_element)

def etree2dict(tree):
    """Convert an element tree into a single-key dict.

    The key is the root tag (as cleaned by ``recursive_dict``) and the value
    is the converted contents of the root element.
    """
    # recursive_dict yields one (tag, contents) pair; wrap it as a mapping.
    return dict([recursive_dict(tree)])

def recursive_dict(element):
    """Return ``(tag, value)`` for ``element``, recursing into its children.

    The ``{http://www.irs.gov/efile}`` namespace prefix is removed from the
    tag. An element whose ``type`` attribute is ``"array"`` becomes a list with
    one entry per child; any other element becomes a dict of its children, or,
    when that dict is empty, the result of ``getElementValue``.
    """
    def convert(node):
        # An empty dict is falsy, so childless nodes fall back to their
        # scalar value.
        return dict(map(recursive_dict, node)) or getElementValue(node)

    tag_name = element.tag.replace('{http://www.irs.gov/efile}', '')
    attributes = element.attrib
    if attributes and attributes.get('type') == "array":
        return tag_name, [convert(child) for child in element]
    return tag_name, convert(element)

def getElementValue(element):
    """Convert a leaf element into a Python value.

    When the element has text, the optional ``type`` attribute selects the
    conversion:

    * ``integer``  -> ``int``
    * ``float``    -> ``float``
    * ``boolean``  -> ``True`` only for the (case-insensitive) text ``true``
    * ``datetime`` -> stripped text
    * no ``type``, or any other ``type`` -> the raw text, unchanged

    When the element has no text, ``None`` is returned if it carries a ``nil``
    attribute or has no attributes at all; otherwise its attribute mapping is
    returned.

    Raises:
        ValueError: if the text cannot be parsed as the declared
            ``integer``/``float`` type.
    """
    text = element.text
    if text:
        declared_type = element.attrib.get('type') if element.attrib else None
        if declared_type == 'integer':
            return int(text.strip())
        if declared_type == 'float':
            return float(text.strip())
        if declared_type == 'boolean':
            return text.lower().strip() == 'true'
        if declared_type == 'datetime':
            return text.strip()
        # Untyped text, or a type this function has no conversion for: keep the
        # text rather than silently dropping the value.
        return text

    if element.attrib:
        if 'nil' in element.attrib:
            return None
        return element.attrib
    return None

In [None]:
# Index rows for the mercyships EIN.
# NOTE: yr_idx is whatever the Search loop above last loaded successfully
# (the 2018 index if every year loaded), so this cell depends on that cell having run.
yr_idx[yr_idx['EIN']==int(mercyships)]

In [None]:
# Template for a single filing's XML. The one placeholder takes the filing's
# OBJECT_ID from the index, e.g. url_filing.format("201812439349301301").
url_filing = "https://s3.amazonaws.com/irs-form-990/{}_public.xml"

In [None]:
# Fetch one filing by its OBJECT_ID and parse the XML into a nested dict.
rsp = requests.get(url_filing.format("201812439349301301"))
# Fail loudly on an HTTP error instead of trying to parse an error response
# body as if it were a filing.
rsp.raise_for_status()
return_data = xml_to_dict(rsp.content)

# [ReturnData][IRS990][PYTotalRevenueAmt] and [ReturnData][IRS990][CYTotalRevenueAmt]; not sure what these represent

In [None]:
# Show the two total-revenue fields from the parsed filing's IRS990 section.
irs990_section = return_data['Return']['ReturnData']['IRS990']
print("PYTotalRevenueAmt: {}".format(irs990_section['PYTotalRevenueAmt']))
print("CYTotalRevenueAmt: {}".format(irs990_section['CYTotalRevenueAmt']))

# Download Data for each org

# Get client list with EINs

In [None]:
# Load the organizations table. get_dataframe_from_file is not defined in this
# notebook — presumably it comes from the s3_support star import and takes
# (bucket, key); TODO confirm.
orgs = get_dataframe_from_file("qgiv-stats-data", "organizations.csv")

In [None]:
# Peek at the first row to see which columns are available.
orgs.head(1)

In [None]:
def _normalize_ein(value):
    """Return `value` as a zero-padded 9-digit EIN string, or None if missing.

    Handles the shapes an EIN column can take after a CSV load: ints, floats
    (a numeric column containing NaNs is read as float, so a plain str() would
    give '262414132.0'), and strings with separators such as '26-2414132'.
    """
    if pd.isna(value):
        return None
    if isinstance(value, float) and value.is_integer():
        value = int(value)
    digits = ''.join(ch for ch in str(value) if ch.isdigit())
    return digits.zfill(9) if digits else None


def _read_year_index(year):
    """Load the filing index for `year`, skipping malformed rows.

    Line 39569 of one of the files contains 10 elements instead of the
    expected 9, so bad lines are skipped rather than raising.
    """
    index_url = "https://s3.amazonaws.com/irs-form-990/index_{}.csv".format(year)
    try:
        # Spelling used by pandas >= 1.3; `error_bad_lines` is gone in pandas 2.x.
        return pd.read_csv(index_url, on_bad_lines="skip")
    except TypeError:
        # Older pandas rejects the unknown keyword before doing any I/O.
        return pd.read_csv(index_url, error_bad_lines=False)


ein_data = []
missing_counter = 0

# Years to collect data (2011 seems to be the earliest year they have)
years = list(range(2011, 2020))

# Qgiv clients to collect data for.
# .copy() so that adding a column does not write into a slice of the loaded frame.
clients = get_dataframe_from_file("qgiv-stats-data", "organizations.csv")[['Id', 'Org Name', 'Tax ID', 'Go Live Date']].copy()
clients['EIN_clean'] = clients['Tax ID'].map(_normalize_ein)

for year in years:
    # The index containing the `OBJECT_ID` needed to fetch the orgs data
    year_index = _read_year_index(year)
    year_index['EIN_clean'] = year_index['EIN'].map(_normalize_ein)

    # Build the EIN -> OBJECT_ID lookup once per year instead of filtering the
    # whole index for every client. drop_duplicates keeps the first row per EIN,
    # which is the row the per-client filter previously selected.
    object_id_by_ein = (
        year_index.dropna(subset=['EIN_clean'])
        .drop_duplicates(subset='EIN_clean')
        .set_index('EIN_clean')['OBJECT_ID']
        .to_dict()
    )

    for _, client in clients.iterrows():
        # Skip rows that do not have a tax ID
        if pd.isna(client['Tax ID']):
            continue

        # Get the identifier from the index; skip clients with no filing this year
        identifier = object_id_by_ein.get(client['EIN_clean'])
        if identifier is None:
            continue

        try:
            response = requests.get("https://s3.amazonaws.com/irs-form-990/{}_public.xml".format(identifier))
            response.raise_for_status()
            tax_info = xml_to_dict(response.content)

            # NOTE(review): an earlier cell reads 'CYTotalRevenueAmt' from this
            # same section; confirm that a 'Revenue' key actually exists. A
            # missing key is counted as an error below.
            ein_data.append({
                'ein': client['Tax ID'],
                'year': year,
                'revenue': tax_info['Return']['ReturnData']['IRS990']['Revenue']
            })
        except Exception as error:
            # Best-effort download: report the failure and move on. Catching
            # Exception (not a bare except) keeps kernel interrupts working.
            print('Error requesting tax information for {} in year {}: {}'.format(client['Tax ID'], year, error))
            missing_counter += 1

# The counter is incremented once per failed (EIN, year) request.
print('There are {} EINs that could not be loaded.'.format(missing_counter))

data_frame = pd.DataFrame(ein_data)

In [None]:
# Number of (ein, year, revenue) records collected by the download cell.
len(ein_data)

In [None]:
# Build the result table from the collected records, one row per record.
data_frame = pd.DataFrame(ein_data)

In [None]:
# Preview the first rows of the result table.
data_frame.head()


In [None]:
# Persist the results. save_dataframe_to_file is not defined in this notebook —
# presumably it comes from s3_support and takes (bucket, key, frame); TODO confirm.
save_dataframe_to_file('tax-info', 'opendata_990.csv', data_frame)

In [None]:
# List what is stored under 'tax-info', presumably to check that the save
# above succeeded (list_files is not defined in this notebook; TODO confirm).
list_files('tax-info')