In [3]:
import sys
sys.path.insert(1, '../../scripts/')
from s3_support import *
%matplotlib inline

# data load

## processing

In [4]:
q = '''select
            t.org,
            t.form,
            date_trunc('week', t.date) as week,
            count(t.id) as trans_count,
            sum(t.amount) as trans_vol,
            sum(t.donations_count) as donations_count,
            sum(t.donations_amt) as donations_vol,
            sum(t.purchases_count) as purchases_count,
            sum(t.purchases_amt) as purchases_vol,
            sum(t.registrations_count) as registrations_count,
            sum(t.registrations_amt) as registrations_vol,
            sum(t.events_amt) as events_vol,
            sum(t.events_count) as events_count
        from transactions as t
            left join form as f on f.id=t.form
        where
            t.status='A' and
            f.type=3
        group by date_trunc('week', t.date), t.form, t.org'''
trans = redshift_query_read(q, schema='production')

## p2p analytics

In [5]:
q = "select * from analyticsp2p_weekly"
df = redshift_query_read(q, schema='public')

## merging

In [6]:
# $41k floor for top 10 volume
top_10_vol_floor = 41000

form_totals = trans.groupby(['form', 'org'])[['trans_vol', 'donations_vol', 'purchases_count', 'purchases_vol', 'registrations_count', 'registrations_vol']].sum().reset_index()
form_totals['is_top_performer'] = form_totals['trans_vol']>top_10_vol_floor
top_performers = form_totals[form_totals['is_top_performer']]['form'].tolist()

In [9]:
len_all = len(form_totals)
bottom_90perc = len_all - len(top_performers)

not_top_10_total_vol = form_totals.sort_values('trans_vol', ascending=False).tail(bottom_90perc)
bottom_performers = not_top_10_total_vol['form'].tolist()

In [26]:
trans['is_top_performer'] = trans['form'].isin(top_performers)
df['is_top_performer'] = df['form'].isin(top_performers)

trans['is_bottom_performer'] = trans['form'].isin(bottom_performers)
df['is_bottom_performer'] = df['form'].isin(bottom_performers)
form_totals['is_bottom_performer'] = form_totals['form'].isin(bottom_performers)

# categories

In [11]:
df.groupby(['is_top_performer', 'is_bottom_performer'])['cat_count'].agg(['mean', 'median']).reset_index()

Unnamed: 0,is_top_performer,is_bottom_performer,mean,median
0,False,False,1.022415,1.0
1,False,True,1.812933,1.0
2,True,False,2.886146,2.0


In [40]:
cat_groups = [(0, 1), (1, 2), (3, 5), (5, 10), (10)]
cat_data = []
for g in cat_groups:
    try:
        these_forms = df[(df['cat_count']>g[0])&(df['cat_count']<=g[1])]['form'].to_list()
    except:
        these_forms = df[df['cat_count']>g]['form'].to_list()
        
    these_totals = form_totals[form_totals['form'].isin(these_forms)]
    
    print("{} categories ({:,} forms):".format(g, len(these_forms)))
    
    this_data = {
        'categories': g,
        'sample_size': len(these_forms),
        'vol_mean': these_totals['trans_vol'].mean(), 
        'vol_median': these_totals['trans_vol'].median(),
        'vol_min': these_totals['trans_vol'].min(), 
        'vol_max': these_totals['trans_vol'].max(),
        'outperformer': these_totals['is_top_performer'].mean(), 
        'underperformer': these_totals['is_bottom_performer'].mean(),
        'registrations_count_mean': these_totals['registrations_count'].mean(), 
        'registrations_count_median': these_totals['registrations_count'].median(),
        'registrations_vol_mean': these_totals['registrations_vol'].mean(), 
        'registrations_vol_median': these_totals['registrations_vol'].median(),
        'donations_vol_mean': these_totals['donations_vol'].mean(), 
        'donations_vol_median': these_totals['donations_vol'].median()
    }
    
    cat_data.append(this_data)
    
    print("\tprocessing:")
    print("\t\tvol: mean ${:,.2f}, median ${:,.2f}".format(this_data['vol_mean'], this_data['vol_median']))
    print("\t\tvol range: ${:,.2f} to ${:,.2f}".format(this_data['vol_min'], this_data['vol_max']))
    print("\t\toutperformer: {:.2f}; underperformer: {:.2f}".format(this_data['outperformer'], this_data['underperformer']))
    
    print("\tregistrations")
    print("\t\tcount: mean {:,.2f}, median {:,.2f}".format(this_data['registrations_count_mean'], this_data['registrations_count_median']))
    print("\t\tvol: mean ${:,.2f}, median ${:,.2f}".format(this_data['registrations_vol_mean'], this_data['registrations_vol_median']))
    
    print("\tdonations")
    print("\t\tvol: mean ${:,.2f}, median ${:,.2f}".format(this_data['donations_vol_mean'], this_data['donations_vol_median']))
    

(0, 1) categories (644,716 forms):
	processing:
		vol: mean $16,223.00, median $4,293.34
		vol range: $0.00 to $1,938,575.28
		outperformer: 0.09; underperformer: 0.91
	registrations
		count: mean 55.22, median 12.00
		vol: mean $386.40, median $0.00
	donations
		vol: mean $15,393.35, median $3,927.50
(1, 2) categories (212,405 forms):
	processing:
		vol: mean $26,862.46, median $8,238.32
		vol range: $0.00 to $1,974,183.55
		outperformer: 0.14; underperformer: 0.86
	registrations
		count: mean 135.62, median 46.00
		vol: mean $1,558.72, median $0.00
	donations
		vol: mean $24,519.34, median $6,896.00
(3, 5) categories (86,637 forms):
	processing:
		vol: mean $31,922.62, median $11,040.00
		vol range: $0.00 to $1,268,624.86
		outperformer: 0.22; underperformer: 0.78
	registrations
		count: mean 247.44, median 74.00
		vol: mean $4,432.48, median $775.00
	donations
		vol: mean $26,388.47, median $8,033.76
(5, 10) categories (52,087 forms):
	processing:
		vol: mean $26,555.32, median $12,

# classifications

In [12]:
df.groupby(['is_top_performer', 'is_bottom_performer'])['class_count'].agg(['mean', 'median']).reset_index()

Unnamed: 0,is_top_performer,is_bottom_performer,mean,median
0,False,False,0.499065,0.0
1,False,True,1.406926,0.0
2,True,False,5.945115,0.0


In [41]:
class_groups = [(0, 1), (1, 2), (3, 5), (5, 10), (10)]
class_data = []
for g in class_groups:
    try:
        these_forms = df[(df['class_count']>g[0])&(df['class_count']<=g[1])]['form'].to_list()
    except:
        these_forms = df[df['class_count']>g]['form'].to_list()
        
    these_totals = form_totals[form_totals['form'].isin(these_forms)]
    
    print("{} classifications ({:,} forms):".format(g, len(these_forms)))
    
    this_data = {
        'categories': g,
        'sample_size': len(these_forms),
        'vol_mean': these_totals['trans_vol'].mean(), 
        'vol_median': these_totals['trans_vol'].median(),
        'vol_min': these_totals['trans_vol'].min(), 
        'vol_max': these_totals['trans_vol'].max(),
        'outperformer': these_totals['is_top_performer'].mean(), 
        'underperformer': these_totals['is_bottom_performer'].mean(),
        'registrations_count_mean': these_totals['registrations_count'].mean(), 
        'registrations_count_median': these_totals['registrations_count'].median(),
        'registrations_vol_mean': these_totals['registrations_vol'].mean(), 
        'registrations_vol_median': these_totals['registrations_vol'].median(),
        'donations_vol_mean': these_totals['donations_vol'].mean(), 
        'donations_vol_median': these_totals['donations_vol'].median()
    }
    
    class_data.append(this_data)
    
    print("\tprocessing:")
    print("\t\tvol: mean ${:,.2f}, median ${:,.2f}".format(this_data['vol_mean'], this_data['vol_median']))
    print("\t\tvol range: ${:,.2f} to ${:,.2f}".format(this_data['vol_min'], this_data['vol_max']))
    print("\t\toutperformer: {:.2f}; underperformer: {:.2f}".format(this_data['outperformer'], this_data['underperformer']))
    
    print("\tregistrations")
    print("\t\tcount: mean {:,.2f}, median {:,.2f}".format(this_data['registrations_count_mean'], this_data['registrations_count_median']))
    print("\t\tvol: mean ${:,.2f}, median ${:,.2f}".format(this_data['registrations_vol_mean'], this_data['registrations_vol_median']))
    
    print("\tdonations")
    print("\t\tvol: mean ${:,.2f}, median ${:,.2f}".format(this_data['donations_vol_mean'], this_data['donations_vol_median']))
    

(0, 1) classifications (153,258 forms):
	processing:
		vol: mean $18,146.36, median $5,655.00
		vol range: $0.00 to $1,268,624.86
		outperformer: 0.10; underperformer: 0.90
	registrations
		count: mean 107.87, median 34.00
		vol: mean $1,400.64, median $0.00
	donations
		vol: mean $16,322.95, median $4,969.70
(1, 2) classifications (69,533 forms):
	processing:
		vol: mean $27,292.04, median $8,616.00
		vol range: $0.00 to $1,268,624.86
		outperformer: 0.17; underperformer: 0.83
	registrations
		count: mean 181.82, median 61.00
		vol: mean $2,563.86, median $0.00
	donations
		vol: mean $23,989.59, median $7,109.25
(3, 5) classifications (29,953 forms):
	processing:
		vol: mean $39,582.20, median $14,071.06
		vol range: $0.00 to $741,000.60
		outperformer: 0.26; underperformer: 0.74
	registrations
		count: mean 233.98, median 70.00
		vol: mean $3,537.85, median $0.00
	donations
		vol: mean $34,922.59, median $11,117.19
(5, 10) classifications (26,816 forms):
	processing:
		vol: mean $55,