In [18]:
import sys
sys.path.insert(1, '../../scripts/')
from s3_support import *
%matplotlib inline

# data load

## form categories & isvirtual

In [19]:
q = '''select * from form where type=3'''
forms_table = redshift_query_read(q, schema='production')

## processing

In [20]:
q = '''select
            t.org,
            t.form,
            date_trunc('week', t.date) as week,
            count(t.id) as trans_count,
            sum(t.amount) as trans_vol,
            sum(t.donations_count) as donations_count,
            sum(t.donations_amt) as donations_vol,
            sum(t.purchases_count) as purchases_count,
            sum(t.purchases_amt) as purchases_vol,
            sum(t.registrations_count) as registrations_count,
            sum(t.registrations_amt) as registrations_vol,
            sum(t.events_amt) as events_vol,
            sum(t.events_count) as events_count
        from transactions as t
            left join form as f on f.id=t.form
        where
            t.status='A' and
            f.type=3
        group by date_trunc('week', t.date), t.form, t.org'''
trans = redshift_query_read(q, schema='production')

## p2p analytics

In [21]:
q = "select * from analyticsp2p_weekly"
df = redshift_query_read(q, schema='public')

## merging

In [22]:
# $41k floor for top 10 volume
top_10_vol_floor = 41000

form_totals = trans.groupby(['form', 'org'])[['trans_vol', 'donations_vol', 'purchases_count', 'purchases_vol', 'registrations_count', 'registrations_vol']].sum().reset_index()
form_totals['is_top_performer'] = form_totals['trans_vol']>top_10_vol_floor
top_performers = form_totals[form_totals['is_top_performer']]['form'].tolist()

In [23]:
len_all = len(form_totals)
bottom_90perc = len_all - len(top_performers)

not_top_10_total_vol = form_totals.sort_values('trans_vol', ascending=False).tail(bottom_90perc)
bottom_performers = not_top_10_total_vol['form'].tolist()

In [24]:
trans['is_top_performer'] = trans['form'].isin(top_performers)
df['is_top_performer'] = df['form'].isin(top_performers)

trans['is_bottom_performer'] = trans['form'].isin(bottom_performers)
df['is_bottom_performer'] = df['form'].isin(bottom_performers)
form_totals['is_bottom_performer'] = form_totals['form'].isin(bottom_performers)

In [25]:
def form_cat(form):
    if form in forms_table['id'].tolist():
        return forms_table[forms_table['id']==form]['categorization'].iloc[0]
    return 0

form_totals['category'] = form_totals['form'].apply(form_cat)

In [26]:
def form_isvirtual(form):
    if form in forms_table['id'].tolist():
        return forms_table[forms_table['id']==form]['isvirtual'].iloc[0]
    return 0

form_totals['isvirtual'] = form_totals['form'].apply(form_isvirtual)

In [27]:
form_totals = form_totals.merge(df[['form', 'allows_teams', 'teams_count', 'allows_reg_team_create']], on='form')

# teams

In [39]:
print("All forms mean teams: {:,.2f}".format(form_totals['teams_count'].mean()))
print("Allows create team during registration mean teams: {:,.2f}".format(form_totals[form_totals['allows_reg_team_create']==1]['teams_count'].mean()))
print("Does not allow create team during registration mean teams: {:,.2f}".format(form_totals[form_totals['allows_reg_team_create']==0]['teams_count'].mean()))

All forms mean teams: 0.06
Allows create team during registration mean teams: 0.08
Does not allow create team during registration mean teams: 0.00


In [29]:
form_totals[['allows_teams', 'trans_vol', 'donations_vol', 'registrations_count']].corr().iloc[0]

allows_teams           1.000000
trans_vol              0.038582
donations_vol          0.032992
registrations_count    0.097309
Name: allows_teams, dtype: float64

In [30]:
form_totals.groupby('allows_teams')[['trans_vol', 'donations_vol', 'registrations_count']].agg(['mean', 'median']).reset_index()

Unnamed: 0_level_0,allows_teams,trans_vol,trans_vol,donations_vol,donations_vol,registrations_count,registrations_count
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,median,mean,median,mean,median
0,0,12264.841787,3000.8,11208.169058,2545.55,29.118293,1.0
1,1,20896.207279,6798.85,18677.176427,5930.0,142.401207,40.0
2,2,20416.849412,9969.1,18265.863529,8256.0,112.24183,22.0
3,3,27197.64629,9243.7,24392.293065,7601.62,119.580645,16.5
4,4,15151.73125,5867.42,14739.66625,5366.86,76.5,47.5
5,5,15520.303681,3140.0,13042.321748,3140.0,84.780124,0.0


In [31]:
form_totals[['teams_count', 'trans_vol', 'donations_vol', 'registrations_count']].corr().iloc[0]

teams_count            1.000000
trans_vol              0.089709
donations_vol          0.087041
registrations_count    0.125659
Name: teams_count, dtype: float64

__Unsurprisingly, teams count has the strongest correlation to registrations count__

In [38]:
team_groups = [(0, 3), (3, 5), (5, 10), (10, 20),
              (20)]
team_data = []
for g in team_groups:
    try:
        these_forms = df[(df['teams_count']>g[0])&(df['teams_count']<=g[1])]['form'].to_list()
    except:
        these_forms = df[df['teams_count']>g]['form'].to_list()
        
    these_totals = form_totals[form_totals['form'].isin(these_forms)]
    
    print("{} teams ({:,} forms):".format(g, len(set(these_forms))))
    
    this_data = {
        'teams': g,
        'sample_size': len(set(these_forms)),
        'vol_mean': these_totals['trans_vol'].mean(), 
        'vol_median': these_totals['trans_vol'].median(),
        'vol_min': these_totals['trans_vol'].min(), 
        'vol_max': these_totals['trans_vol'].max(),
        'outperformer': these_totals['is_top_performer'].mean(), 
        'underperformer': these_totals['is_bottom_performer'].mean(),
        'registrations_count_mean': these_totals['registrations_count'].mean(), 
        'registrations_count_median': these_totals['registrations_count'].median(),
        'registrations_vol_mean': these_totals['registrations_vol'].mean(), 
        'registrations_vol_median': these_totals['registrations_vol'].median(),
        'donations_vol_mean': these_totals['donations_vol'].mean(), 
        'donations_vol_median': these_totals['donations_vol'].median()
    }
    
    team_data.append(this_data)
    
    print("\tprocessing:")
    print("\t\tvol: mean ${:,.2f}, median ${:,.2f}".format(this_data['vol_mean'], this_data['vol_median']))
    print("\t\tvol range: ${:,.2f} to ${:,.2f}".format(this_data['vol_min'], this_data['vol_max']))
    print("\t\toutperformer: {:.2f}; underperformer: {:.2f}".format(this_data['outperformer'], this_data['underperformer']))
    
    print("\tregistrations")
    print("\t\tcount: mean {:,.2f}, median {:,.2f}".format(this_data['registrations_count_mean'], this_data['registrations_count_median']))
    print("\t\tvol: mean ${:,.2f}, median ${:,.2f}".format(this_data['registrations_vol_mean'], this_data['registrations_vol_median']))
    
    print("\tdonations")
    print("\t\tvol: mean ${:,.2f}, median ${:,.2f}".format(this_data['donations_vol_mean'], this_data['donations_vol_median']))
    

(0, 3) teams (3,133 forms):
	processing:
		vol: mean $26,563.35, median $8,976.00
		vol range: $0.00 to $1,940,727.06
		outperformer: 0.15; underperformer: 0.85
	registrations
		count: mean 172.17, median 59.00
		vol: mean $2,128.06, median $0.00
	donations
		vol: mean $23,715.93, median $7,634.00
(3, 5) teams (1,236 forms):
	processing:
		vol: mean $37,238.94, median $16,019.25
		vol range: $0.00 to $1,842,666.49
		outperformer: 0.24; underperformer: 0.76
	registrations
		count: mean 283.88, median 121.00
		vol: mean $3,846.10, median $0.00
	donations
		vol: mean $32,332.30, median $14,196.00
(5, 10) teams (934 forms):
	processing:
		vol: mean $44,711.33, median $20,011.98
		vol range: $0.00 to $1,268,645.86
		outperformer: 0.29; underperformer: 0.71
	registrations
		count: mean 383.71, median 175.00
		vol: mean $4,514.24, median $252.00
	donations
		vol: mean $39,142.16, median $17,120.00
(10, 20) teams (414 forms):
	processing:
		vol: mean $64,861.94, median $30,975.95
		vol range: 

# promo codes

In [33]:
metrics = form_totals[['form', 'trans_vol', 'donations_vol', 'registrations_count']]
metrics_promos = metrics.merge(df.groupby('form')['promo_count'].max().reset_index(), on='form')
metrics_promos[['promo_count', 'trans_vol', 'donations_vol', 'registrations_count']].corr().iloc[0]

promo_count            1.000000
trans_vol              0.078558
donations_vol          0.049194
registrations_count    0.261151
Name: promo_count, dtype: float64

In [34]:
print("Correlations with only forms with promo codes")
metrics_promos[metrics_promos['promo_count']>0][['promo_count', 'trans_vol', 'donations_vol', 'registrations_count']].corr().iloc[0]

Correlations with only forms with promo codes


promo_count            1.000000
trans_vol              0.082296
donations_vol          0.047035
registrations_count    0.251208
Name: promo_count, dtype: float64

__Promo codes has a 25%+ positive correlation to registrations count__

In [37]:
promo_groups = [(0), (1, 3), (3, 5), (5, 10), (10, 20),
              (20)]
promo_data = []
for g in promo_groups:
    if g == 0:
        these_forms = df[df['promo_count']==0]['form'].to_list()
    else:
        try:
            these_forms = df[(df['promo_count']>g[0])&(df['promo_count']<=g[1])]['form'].to_list()
        except:
            these_forms = df[df['promo_count']>g]['form'].to_list()
        
    these_totals = form_totals[form_totals['form'].isin(these_forms)]
    
    print("{} promo cods ({:,} forms):".format(g, len(set(these_forms))))
    
    this_data = {
        'promo_counts': g,
        'sample_size': len(set(these_forms)),
        'vol_mean': these_totals['trans_vol'].mean(), 
        'vol_median': these_totals['trans_vol'].median(),
        'vol_min': these_totals['trans_vol'].min(), 
        'vol_max': these_totals['trans_vol'].max(),
        'outperformer': these_totals['is_top_performer'].mean(), 
        'underperformer': these_totals['is_bottom_performer'].mean(),
        'registrations_count_mean': these_totals['registrations_count'].mean(), 
        'registrations_count_median': these_totals['registrations_count'].median(),
        'registrations_vol_mean': these_totals['registrations_vol'].mean(), 
        'registrations_vol_median': these_totals['registrations_vol'].median(),
        'donations_vol_mean': these_totals['donations_vol'].mean(), 
        'donations_vol_median': these_totals['donations_vol'].median()
    }
    
    team_data.append(this_data)
    
    print("\tprocessing:")
    print("\t\tvol: mean ${:,.2f}, median ${:,.2f}".format(this_data['vol_mean'], this_data['vol_median']))
    print("\t\tvol range: ${:,.2f} to ${:,.2f}".format(this_data['vol_min'], this_data['vol_max']))
    print("\t\toutperformer: {:.2f}; underperformer: {:.2f}".format(this_data['outperformer'], this_data['underperformer']))
    
    print("\tregistrations")
    print("\t\tcount: mean {:,.2f}, median {:,.2f}".format(this_data['registrations_count_mean'], this_data['registrations_count_median']))
    print("\t\tvol: mean ${:,.2f}, median ${:,.2f}".format(this_data['registrations_vol_mean'], this_data['registrations_vol_median']))
    
    print("\tdonations")
    print("\t\tvol: mean ${:,.2f}, median ${:,.2f}".format(this_data['donations_vol_mean'], this_data['donations_vol_median']))
    

0 promo cods (10,616 forms):
	processing:
		vol: mean $16,142.62, median $4,520.00
		vol range: $0.00 to $1,997,325.60
		outperformer: 0.09; underperformer: 0.91
	registrations
		count: mean 79.26, median 14.00
		vol: mean $696.57, median $0.00
	donations
		vol: mean $15,033.52, median $4,048.00
(1, 3) promo cods (727 forms):
	processing:
		vol: mean $31,612.92, median $15,546.68
		vol range: $0.00 to $559,923.41
		outperformer: 0.24; underperformer: 0.76
	registrations
		count: mean 249.61, median 110.00
		vol: mean $5,782.29, median $2,320.00
	donations
		vol: mean $24,865.89, median $10,846.10
(3, 5) promo cods (386 forms):
	processing:
		vol: mean $40,815.08, median $23,633.59
		vol range: $0.00 to $559,923.41
		outperformer: 0.32; underperformer: 0.68
	registrations
		count: mean 364.50, median 180.00
		vol: mean $8,683.84, median $4,025.00
	donations
		vol: mean $30,795.32, median $15,763.41
(5, 10) promo cods (385 forms):
	processing:
		vol: mean $40,225.75, median $25,242.67
		