In [121]:
import requests, json, sys
import pandas as pd
import datetime
import numpy as np

sys.path.insert(1, '../../../scripts/')
from s3_support import *

# Fetch and cleanup organization data

In [122]:
def load_and_process_organizations():
    # Load the organizations and subset to just the fields we need
    orgs = get_dataframe_from_file('qgiv-stats-data', 'organizations.csv')
    orgs = orgs[['Id', 'Org Name', 'Tax ID', 'Go Live Date']]

    # Drop any orgs that dont have a Tax ID
    orgs = orgs.dropna(subset=['Tax ID'])

    # Drop any orgs that use a default or test Tax ID
    orgs = orgs.drop(orgs[orgs['Tax ID'] == '999999999'].index)
    orgs = orgs.drop(orgs[orgs['Tax ID'] == '99999999'].index)

    return orgs

orgs = load_and_process_organizations()

In [123]:
# There are a lot of orgs here with bad `Go Live Dates` so the transactions are a better indicator of the actual go live date.

orgs.sort_values('Go Live Date')

Unnamed: 0,Id,Org Name,Tax ID,Go Live Date
927,9,Cipher Beta,123456789,01/01/2000
1814,441416,"FULL CIRCLE FUSION, LLC",814959476,01/01/2000
3607,441500,Post Migration,123456789,01/01/2000
1019,15308,"College Park Baptist Church, Inc.",560713878,01/02/2013
2619,38927,Kingdom Nations Church,050573933,01/02/2014
...,...,...,...,...
1424,29754,Elect Sir Palmdale City Council,463268684,12/31/1969
4095,441537,Silent No Longer Foundation,821968148,12/31/1969
4092,1884,Signup QA Test5,123123,12/31/1969
4177,36848,Special Operations Warrior Foundation,521183585,12/31/1969


# Load Revenue Data

In [124]:
orgs

Unnamed: 0,Id,Org Name,Tax ID,Go Live Date
2,442134,Camp Laurelwood,060693092,11/06/2017
3,442293,Cass Community Social Services,383429921,01/29/2018
5,439357,Funeral Consumers Alliance of Western MA,061689862,01/10/2017
6,441842,Illinois Association of Municipal Management ...,363117888,09/18/2017
7,29731,Lincoln Theater,222491739,07/29/2013
...,...,...,...,...
5201,438377,YWCA Glendale,951644057,01/19/2017
5202,441708,YWCA York,231360889,07/28/2017
5207,442509,Zeno,205570858,12/31/1969
5209,442543,Zion Lutheran Church,846033364,12/31/1969


In [125]:
def load_and_process_revenue_data(orgs: pd.DataFrame):
    revenue = get_dataframe_from_file('tax-info', 'org_revenue_data.csv')
    # revenue = get_dataframe_from_file('tax-info', 'propublica_990.csv')


    # Setting these data types will be helpful later on
    revenue['ein'] = revenue['ein'].astype(str)
    revenue['revenue'] = revenue['revenue'].astype(int)

    # We are only interested years with a revenue larger than zero
    revenue = revenue[revenue['revenue'] > 0]

    # We are adding the organizations ID to the revenue for convenience later on

    # Returns the organization id for a given tax id
    def get_org_id(row):
        org_data = orgs.loc[orgs['Tax ID'] == row['ein']]
        if org_data.empty == True:
            return 'None'
        else:
            return org_data['Id'].values[0]

    revenue['Org ID'] = revenue.apply(get_org_id, axis=1)

    return revenue

revenue = load_and_process_revenue_data(orgs=orgs)

In [126]:
# We have some missing values here
revenue['Org ID'].value_counts()

None      750
441879     28
31823      21
161795     18
685        16
         ... 
431199      1
442040      1
442041      1
442042      1
441811      1
Name: Org ID, Length: 1961, dtype: int64

# Add the `Go Live` date

In [127]:
# The `Go Live Date` is either missing or incorrect for some orgs so we need clean that property up here

# Load transaction data in order to fill as many live dates as possible
transactions = get_dataframe_from_file('trans-records', 'trans-dates.csv')
transactions['date'] = pd.to_datetime(transactions['date'])

In [128]:
# Attempts to set the `Go Live Date` for each org based on either the current `Go Live Date` on the organization or by finding the earliest transaction processed.
def get_go_live_date(row):
    org_data = orgs.loc[orgs['Tax ID'] == row['ein']]
    
    if org_data.empty == False and pd.to_datetime(org_data['Go Live Date'].values[0]).year >= 2007:
        return org_data['Go Live Date'].values[0]
    else:
        org_transactions = transactions[transactions['id'] == row['Org ID']]
        return org_transactions['date'].min()

revenue['Go Live Date'] = revenue.apply(get_go_live_date, axis=1)

In [129]:
# Dropping any rows missing a `Go Live Date` and the col to datetime
revenue = revenue.dropna(subset=['Go Live Date'])
revenue['Go Live Date'] = pd.to_datetime(revenue['Go Live Date'])

In [130]:
revenue

Unnamed: 0,ein,year,revenue,Org ID,Go Live Date
7,383429921,2017,6631599,442293,2018-01-29
8,383429921,2016,5266543,442293,2018-01-29
9,383429921,2015,5377085,442293,2018-01-29
10,383429921,2014,6120592,442293,2018-01-29
11,383429921,2013,5576486,442293,2018-01-29
...,...,...,...,...,...
13005,205570858,2015,815357,442509,2012-04-13
13006,205570858,2014,875616,442509,2012-04-13
13007,205570858,2013,839319,442509,2012-04-13
13008,205570858,2012,847372,442509,2012-04-13


# Get YOY for each org

In [148]:
# This shows the duplicated rows that need to be dropped
# revenue[revenue.duplicated(subset=['ein', 'year'])].sort_values(['ein', 'year']).head(30)

In [132]:
# revenue.sort_values(['ein', 'year', 'revenue']).drop_duplicates(subset=['ein', 'year']).sort_values(['ein', 'year']).head(30)

In [133]:
# Dropping the duplicated years for the same ein
revenue = revenue.sort_values(['ein', 'year', 'revenue'], ascending=False).drop_duplicates(subset=['ein', 'year'])

# Sort by the ein and year 
revenue = revenue.sort_values(['ein', 'year'])

# Calculate the percent change yoy for each org
revenue['Percent Change'] = revenue.groupby('ein')['revenue'].pct_change() * 100

revenue['Amount Change'] = revenue.groupby('ein')['revenue'].diff()

revenue = revenue.dropna()

In [134]:
revenue.sort_values('Percent Change', ascending=False)

Unnamed: 0,ein,year,revenue,Org ID,Go Live Date,Percent Change,Amount Change
10723,752861429,2012,82720,29720,2013-06-24,8271900.000,82719.000
6699,460566566,2014,10906,425996,2015-08-03,1090500.000,10905.000
3959,272568814,2012,776508,442021,2017-10-24,776408.000,776408.000
1686,473845032,2017,79434,442286,2012-04-13,62942.857,79308.000
4290,201794347,2015,494742,440341,2016-12-20,43643.767,493611.000
...,...,...,...,...,...,...,...
8524,462600505,2015,33423,38908,2013-12-06,-99.174,-4010576.000
12755,133858323,2015,987,430165,2016-04-26,-99.437,-174337.000
1586,237427232,2017,59969,442275,2018-01-17,-99.533,-12778477.000
2004,264380050,2015,1,884,2010-11-16,-99.925,-1327.000


In [136]:
# Get means before and after the org started using Qgiv

In [137]:
# Returns the mean of percent changes before the org started using Qgiv
def mean_before_qgiv_live_date(group):
    go_live_date = group['Go Live Date'].values[0]
    go_live_year = pd.to_datetime(go_live_date).year
    return (group[group['year'] < go_live_year]['Percent Change']).mean()


# revenue.groupby('ein').apply(mean_before_qgiv_live_date)

In [138]:
# Returns the mean of percent changes after the org started using Qgiv
def mean_after_qgiv_live_date(group):
    go_live_date = group['Go Live Date'].values[0]
    go_live_year = pd.to_datetime(go_live_date).year
    return (group[group['year'] >= go_live_year]['Percent Change']).mean()

# revenue.groupby('ein').apply(mean_after_qgiv_live_date)

# Make the final dataset.

In [139]:
# Gets the means before and after going with Qgiv and creates a data frame.

mean_pct_change_before_qgiv = revenue.groupby('ein').apply(mean_before_qgiv_live_date).reset_index()
mean_pct_change_after_qgiv = revenue.groupby('ein').apply(mean_after_qgiv_live_date).reset_index()

percent_changes = pd.DataFrame({
    'ein': mean_pct_change_before_qgiv['ein'],
    'Mean Percent Change Before Qgiv': mean_pct_change_before_qgiv[0],
    'Mean Percent Change After Qgiv': mean_pct_change_after_qgiv[0]
})

In [151]:
percent_changes.sort_values('Mean Percent Change After Qgiv', ascending=False).dropna()

Unnamed: 0,ein,Mean Percent Change Before Qgiv,Mean Percent Change After Qgiv
41,133858323,33.110,5700.099
373,262497802,2215.202,2832.142
909,460566566,545200.010,813.324
379,262744380,38.154,722.918
180,208989213,-27.080,658.871
...,...,...,...
332,260111954,-8.796,-61.526
675,364415624,28.402,-64.974
95,201410413,14.311,-65.669
768,391676894,6.726,-90.048


In [141]:
# save_dataframe_to_file('tax-info', 'percent_changes.csv', percent_changes)

In [154]:
revenue[revenue['ein'] == '262497802'].style.format({"revenue": "{:,.0f}"})

Unnamed: 0,ein,year,revenue,Org ID,Go Live Date,Percent Change,Amount Change
9979,262497802,2011,1876257,430131,2016-04-08 00:00:00,-13.1828,-284901.0
9978,262497802,2012,1086141,430131,2016-04-08 00:00:00,-42.1113,-790116.0
9977,262497802,2013,42102,430131,2016-04-08 00:00:00,-96.1237,-1044040.0
9976,262497802,2014,4810448,430131,2016-04-08 00:00:00,11325.7,4768350.0
9975,262497802,2015,83104,430131,2016-04-08 00:00:00,-98.2724,-4727340.0
9974,262497802,2016,4867791,430131,2016-04-08 00:00:00,5757.47,4784690.0
9973,262497802,2017,331708,430131,2016-04-08 00:00:00,-93.1857,-4536080.0


In [144]:
pd.set_option('display.float_format', lambda x: '%.3f' % x)
percent_changes.mean()

ein                                    inf
Mean Percent Change Before Qgiv   7548.015
Mean Percent Change After Qgiv      84.006
dtype: float64

In [145]:
len(percent_changes.dropna())

1078

In [146]:
percent_changes.head(60)

Unnamed: 0,ein,Mean Percent Change Before Qgiv,Mean Percent Change After Qgiv
0,111630780,0.154,15.585
1,111987282,6.913,-6.324
2,112729585,-4.223,-3.39
3,112925751,6.468,8.271
4,112981112,,1.941
5,113028366,-8.871,-16.31
6,113186856,9.12,9.344
7,113298776,-8.235,516.797
8,113305406,43.931,11.563
9,113348359,,49.316


In [147]:
revenue.drop_duplicates(subset=['ein', 'Go Live Date'])['Go Live Date'].map(lambda date: date.year).value_counts().sort_index()

2007     11
2008     45
2009     97
2010     80
2011    113
2012    154
2013    229
2014    197
2015    212
2016    230
2017    368
2018    110
Name: Go Live Date, dtype: int64