In [1]:
import pandas as pd

import sys
sys.path.insert(1, '../../scripts/')
from s3_support import *

# identify top 100 forms & orgs w/ receipt edits

In [2]:
q = "select * from syslog_logs where message like '%receipt%' and message like '%update%'"
df = redshift_query_read(q, schema='production')

In [3]:
forms = df[['form', 'created']]
orgs = df[['org', 'created']]

print("{:,} entries".format(len(forms)))
print("{:,} forms".format(len(forms['form'].unique())))
print("{:,} orgs".format(len(orgs['org'].unique())))
print("{} earliest date".format(df['created'].min()))

221,307 entries
6,141 forms
5,532 orgs
2021-07-06 13:48:32 earliest date


In [4]:
forms = forms.groupby('form')['created'].count().reset_index()
forms.columns = ['form', 'edits']
forms = forms[forms['form']!=0]

orgs = orgs.groupby('org')['created'].count().reset_index()
orgs.columns = ['org', 'edits']
orgs = orgs[orgs['org']!=0]

In [5]:
top_100_forms = pd.read_csv("form_download.csv")
top_100_orgs = pd.read_csv("org_download.csv")

In [6]:
len(top_100_forms), len(top_100_orgs)

(9150, 100)

In [7]:
inc_orgs = top_100_orgs['Id'].isin(orgs['org'].tolist()).sum()

print("{} orgs; {:.1f}%".format(inc_orgs, (inc_orgs / len(orgs)) * 100.))

90 orgs; 1.6%


In [8]:
inc_forms = top_100_forms['Form ID'].isin(forms['form'].tolist()).sum()

print("{} of {:,} forms; {:.1f}%".format(inc_forms, len(top_100_forms), (inc_forms / len(forms)) * 100.))

67 of 9,150 forms; 1.1%


# compare to fundraising

## orgs

In [26]:
q = '''select
            org,
            date_part('year', date) as year,
            sum(amount) as vol
        from transactions
        group by org, date_part('year', date)'''
df = redshift_query_read(q, schema='production')

In [27]:
org_annuals = df.groupby('org')['vol'].mean().reset_index()
org_annuals['is_top_100'] = org_annuals['org'].isin(top_100_orgs['Id'].tolist())
org_annuals['has_receipt_edits'] = org_annuals['org'].isin(orgs['org'].tolist())

In [28]:
org_annuals.tail(3)

Unnamed: 0,org,vol,is_top_100,has_receipt_edits
11621,456035,1.0,False,False
11622,456068,2.0,False,False
11623,456091,1.0,False,False


In [29]:
org_annuals.groupby(['is_top_100', 'has_receipt_edits'])['vol'].mean()

is_top_100  has_receipt_edits
False       False                6.225637e+05
            True                 5.686534e+05
True        False                5.028755e+06
            True                 1.420911e+07
Name: vol, dtype: float64

In [30]:
print("Top 100:")
print("All: ${:,.2f} annual vol".format(org_annuals[org_annuals['is_top_100']]['vol'].median()))
print("No receipt edits: ${:,.2f} annual vol".format(org_annuals[org_annuals['is_top_100']&~org_annuals['has_receipt_edits']]['vol'].median()))
print("Has receipt edits: ${:,.2f} annual vol".format(org_annuals[org_annuals['is_top_100']&org_annuals['has_receipt_edits']]['vol'].median()))
print()
print("Non Top 100:")
print("All: ${:,.2f} annual vol".format(org_annuals[~org_annuals['is_top_100']]['vol'].median()))
print("No receipt edits: ${:,.2f} annual vol".format(org_annuals[~org_annuals['is_top_100']&~org_annuals['has_receipt_edits']]['vol'].median()))
print("Has receipt edits: ${:,.2f} annual vol".format(org_annuals[~org_annuals['is_top_100']&org_annuals['has_receipt_edits']]['vol'].median()))

Top 100:
All: $1,320,802.14 annual vol
No receipt edits: $927,492.35 annual vol
Has receipt edits: $1,423,942.09 annual vol

Non Top 100:
All: $10,099.92 annual vol
No receipt edits: $3,072.85 annual vol
Has receipt edits: $31,324.63 annual vol


## orgs < $300k per year

In [34]:
sub300 = df[(df['year']==2023)&(df['vol']<300000)].copy()

In [35]:
print("{:,} entries".format(len(sub300)))
print("${:,.2f} mean vol".format(sub300['vol'].mean()))
print("${:,.2f} median vol".format(sub300['vol'].median()))

4,108 entries
$49,338.00 mean vol
$18,249.72 median vol


In [37]:
sub300['has_receipt_edits'] = sub300['org'].isin(orgs['org'].tolist())

sub300.groupby('has_receipt_edits')['vol'].agg(['mean', 'median']).reset_index()

Unnamed: 0,has_receipt_edits,mean,median
0,False,21066.508839,3915.0
1,True,63468.583611,32165.0


## forms

In [14]:
q = '''select
            form,
            date_part('year', date) as year,
            sum(amount) as vol
        from transactions
        group by form, date_part('year', date)'''
df = redshift_query_read(q, schema='production')

In [15]:
form_annuals = df.groupby('form')['vol'].mean().reset_index()
form_annuals['is_top_100'] = form_annuals['form'].isin(top_100_forms['Form ID'].tolist())
form_annuals['has_receipt_edits'] = form_annuals['form'].isin(forms['form'].tolist())

In [16]:
form_annuals.tail(3)

Unnamed: 0,form,vol,is_top_100,has_receipt_edits
72761,1053965,5.0,False,False
72762,1053971,5.0,False,False
72763,1054046,10.0,False,False


In [17]:
print("Top 100:")
print("All: ${:,.2f} annual vol".format(form_annuals[form_annuals['is_top_100']]['vol'].median()))
print("No receipt edits: ${:,.2f} annual vol".format(form_annuals[form_annuals['is_top_100']&~form_annuals['has_receipt_edits']]['vol'].median()))
print("Has receipt edits: ${:,.2f} annual vol".format(form_annuals[form_annuals['is_top_100']&form_annuals['has_receipt_edits']]['vol'].median()))
print()
print("Non Top 100:")
print("All: ${:,.2f} annual vol".format(form_annuals[~form_annuals['is_top_100']]['vol'].median()))
print("No receipt edits: ${:,.2f} annual vol".format(form_annuals[~form_annuals['is_top_100']&~form_annuals['has_receipt_edits']]['vol'].median()))
print("Has receipt edits: ${:,.2f} annual vol".format(form_annuals[~form_annuals['is_top_100']&form_annuals['has_receipt_edits']]['vol'].median()))

Top 100:
All: $2,688.26 annual vol
No receipt edits: $2,674.92 annual vol
Has receipt edits: $5,431.50 annual vol

Non Top 100:
All: $2,534.01 annual vol
No receipt edits: $2,506.48 annual vol
Has receipt edits: $8,613.57 annual vol
