In [2]:
import sys
sys.path.insert(1, '../../scripts/')
from s3_support import *
%matplotlib inline

# load data

In [3]:
q = '''select * from form where type=3'''
forms_table = redshift_query_read(q, schema='production')

In [4]:
forms_table.head(2)

Unnamed: 0,id,org,datecreated,datelive,path,status,type,template,enableenddate,enablestartdate,recurringplanenddaterequired,categorization,isvirtual
0,1637,1648,NaT,2011-11-10 11:27:42,andhob,11,3,9,True,False,False,0,0
1,1782,53,2012-01-13 09:44:43,2007-09-10 00:00:00,nubawh,11,3,9,True,False,False,0,0


In [5]:
forms_table['isvirtual'].value_counts()

0    13620
1      535
Name: isvirtual, dtype: int64

In [6]:
forms_table['categorization'].value_counts()

0     10370
8       843
6       639
1       624
5       387
9       329
3       188
4       147
11      144
7       143
15      101
2        97
12       48
14       46
13       26
10       23
Name: categorization, dtype: int64

## processing

In [7]:
q = '''select
            t.org,
            t.form,
            date_trunc('week', t.date) as week,
            count(t.id) as trans_count,
            sum(t.amount) as trans_vol,
            sum(t.donations_count) as donations_count,
            sum(t.donations_amt) as donations_vol,
            sum(t.purchases_count) as purchases_count,
            sum(t.purchases_amt) as purchases_vol,
            sum(t.registrations_count) as registrations_count,
            sum(t.registrations_amt) as registrations_vol,
            sum(t.events_amt) as events_vol,
            sum(t.events_count) as events_count
        from transactions as t
            left join form as f on f.id=t.form
        where
            t.status='A' and
            f.type=3
        group by date_trunc('week', t.date), t.form, t.org'''
trans = redshift_query_read(q, schema='production')

## p2p 

In [8]:
q = "select * from analyticsp2p_weekly"
df = redshift_query_read(q, schema='public')

## merging

In [9]:
# $41k floor for top 10 volume
top_10_vol_floor = 41000

form_totals = trans.groupby(['form', 'org'])[['trans_vol', 'donations_vol', 'purchases_count', 'purchases_vol', 'registrations_count', 'registrations_vol']].sum().reset_index()
form_totals['is_top_performer'] = form_totals['trans_vol']>top_10_vol_floor
top_performers = form_totals[form_totals['is_top_performer']]['form'].tolist()

In [10]:
trans['is_top_performer'] = trans['form'].isin(top_performers)
df['is_top_performer'] = df['form'].isin(top_performers)

## top performers

top 10% processing

In [11]:
buckets = [(0, 250), (250, 500), (500, 1000), (1000, 2500), 
           (2500, 5000), (5000, 10000), (10000, 20000), 
           (20000, 50000), (50000, 100000), (100000)]
for b in buckets:
    try:
        len_forms = len(form_totals[(form_totals['trans_vol']>=b[0])&(form_totals['trans_vol']<b[1])])
    except:
        len_forms = len(form_totals[form_totals['trans_vol']>=b])
        
    perc_forms = (len_forms / len(form_totals)) * 100.
    print("{}: {:,} forms ({:.2f}%)".format(b, len_forms, perc_forms))

(0, 250): 1,182 forms (13.15%)
(250, 500): 401 forms (4.46%)
(500, 1000): 608 forms (6.77%)
(1000, 2500): 1,170 forms (13.02%)
(2500, 5000): 1,159 forms (12.90%)
(5000, 10000): 1,260 forms (14.02%)
(10000, 20000): 1,284 forms (14.29%)
(20000, 50000): 1,188 forms (13.22%)
(50000, 100000): 472 forms (5.25%)
100000: 262 forms (2.92%)


In [12]:
# top 10%
len_all = len(form_totals)
top_10perc = int(len_all/10)

top_10_total_vol = form_totals.sort_values('trans_vol', ascending=False).head(top_10perc)

print("Top 10% total volume:")
print("{:,} forms".format(len(top_10_total_vol)))
print("${:,.2f} median volume".format(top_10_total_vol['trans_vol'].median()))
print("${:,.2f} mean volume".format(top_10_total_vol['trans_vol'].mean()))
print("Volume range: ${:,.2f} to ${:,.2f}".format(top_10_total_vol['trans_vol'].min(), top_10_total_vol['trans_vol'].max()))
print()
print("{:,.2f} mean registrants".format(top_10_total_vol['registrations_count'].mean()))
print("{:,.2f} median registrants".format(top_10_total_vol['registrations_count'].median()))
print("registrants range: {:,} to {:,}".format(top_10_total_vol['registrations_count'].min(), top_10_total_vol['registrations_count'].max()))

Top 10% total volume:
898 forms
$69,161.88 median volume
$113,641.10 mean volume
Volume range: $42,785.78 to $1,997,325.60

449.89 mean registrants
191.00 median registrants
registrants range: 0 to 10,032


### not top performers

In [13]:
# not top 10%
len_all = len(form_totals)
bottom_90perc = len_all - top_10perc

not_top_10_total_vol = form_totals.sort_values('trans_vol', ascending=False).tail(bottom_90perc)

print("Not top 10% total volume:")
print("{:,} forms".format(len(not_top_10_total_vol)))
print("${:,.2f} median volume".format(not_top_10_total_vol['trans_vol'].median()))
print("${:,.2f} mean volume".format(not_top_10_total_vol['trans_vol'].mean()))
print("Volume range: ${:,.2f} to ${:,.2f}".format(not_top_10_total_vol['trans_vol'].min(), not_top_10_total_vol['trans_vol'].max()))
print()
print("{:,.2f} mean registrants".format(not_top_10_total_vol['registrations_count'].mean()))
print("{:,.2f} median registrants".format(not_top_10_total_vol['registrations_count'].median()))
print("registrants range: {:,} to {:,}".format(not_top_10_total_vol['registrations_count'].min(), not_top_10_total_vol['registrations_count'].max()))

Not top 10% total volume:
8,088 forms
$3,821.56 median volume
$7,942.56 mean volume
Volume range: $0.00 to $42,783.67

51.78 mean registrants
11.00 median registrants
registrants range: 0 to 2,643


## underperformers

bottom 10%

In [14]:
# top 10%
len_all = len(form_totals)
bottom_10perc = int(len_all/10)

bottom_10_total_vol = form_totals.sort_values('trans_vol', ascending=True).head(bottom_10perc)

print("Bottom 10% total volume:")
print("{:,} forms".format(len(bottom_10_total_vol)))
print("${:,.2f} median volume".format(bottom_10_total_vol['trans_vol'].median()))
print("${:,.2f} mean volume".format(bottom_10_total_vol['trans_vol'].mean()))
print("Volume range: ${:,.2f} to ${:,.2f}".format(bottom_10_total_vol['trans_vol'].min(), bottom_10_total_vol['trans_vol'].max()))
print()
print("{:,.2f} mean registrants".format(bottom_10_total_vol['registrations_count'].mean()))
print("{:,.2f} median registrants".format(bottom_10_total_vol['registrations_count'].median()))
print("registrants range: {:,} to {:,}".format(bottom_10_total_vol['registrations_count'].min(), bottom_10_total_vol['registrations_count'].max()))

Bottom 10% total volume:
898 forms
$4.50 median volume
$27.31 mean volume
Volume range: $0.00 to $110.00

5.46 mean registrants
1.00 median registrants
registrants range: 0 to 385


In [15]:
form_totals['is_bottom_performer'] = form_totals['form'].isin(bottom_10_total_vol['form'].tolist())

# org conistency

are over performers and under performers consistent within a given org? can we assume that if an org has performed within a certain category in the past that their future events will perform within the same category?

In [16]:
org_consistency = form_totals.groupby('org')[['is_top_performer', 'is_bottom_performer']].median().reset_index()
org_consistency[['is_top_performer', 'is_bottom_performer']].value_counts()

is_top_performer  is_bottom_performer
0.0               0.0                    1044
1.0               0.0                     151
0.0               1.0                      91
0.5               0.0                      35
0.0               0.5                      25
0.5               0.5                       3
dtype: int64

the values appear to be very consistent with only 61 org's having a value that's not 1.0 or 0.0

In [17]:
org_counts = form_totals.groupby('org')['is_top_performer'].count().reset_index()
org_counts.columns = ['org', 'event_count']

e_cs = [(1), (2), (3, 5), (5)]
print("Events per org")
print("-"*40)
print("Orgs with 1 event: {:,}".format(len(org_counts[org_counts['event_count']==1])))
print("Orgs with 2 events: {:,}".format(len(org_counts[org_counts['event_count']==2])))
print("Orgs with 3 to 5 events: {:,}".format(len(org_counts[(org_counts['event_count']>=3)&(org_counts['event_count']<=5)])))
print("Orgs with 6+ events: {:,}".format(len(org_counts[org_counts['event_count']>=6])))

print()
print("Orgs with more than 1 event: {:,}".format(len(org_counts[org_counts['event_count']>=2])))

Events per org
----------------------------------------
Orgs with 1 event: 480
Orgs with 2 events: 241
Orgs with 3 to 5 events: 304
Orgs with 6+ events: 324

Orgs with more than 1 event: 869


There __are only 61 orgs with variable performance categories across events__ and __there are 835 orgs with more than 1 event__. This is solid evidence of consistency of performance category across multiple events as only 7% have changed categories between events.

# settings

In [18]:
aggs = {
    'reg_count': 'sum',
    'sub_reg_count': 'sum',
    'teams_count': 'sum',
    'reg_volume': 'sum',
    'don_volume': 'sum',
    'don_count': 'sum',
    'class_count': 'mean',
    'cat_count': 'mean',
    'promo_count': 'mean',
    'rest_count': 'mean',
    'amt_count': 'mean',
    'ded_count': 'mean',
    'fields': 'mean',
    'opt_fields': 'mean',
    'req_fields': 'mean',
    'allows_reg_ind': 'mean',
    'allows_teams': 'mean',
    'allows_reg_team_create': 'mean',
    'allows_sub_reg': 'mean',
    'allows_sub_reg_pfp': 'mean',
    'allows_other_don_amt': 'mean',
    'allows_pfp_off_don': 'mean',
    'allows_tfp_off_don': 'mean',
    'share_tfp': 'mean',
    'share_therm': 'mean',
    'share_donation': 'mean',
    'allows_social': 'mean',
    'social_templt_count': 'mean',
    'social_auto': 'mean',
    'pcnt_posts': 'mean',
    'mon_posts': 'mean',
    'count_posts': 'sum',
    'date_posts': 'mean',
    'email_templt_count': 'mean',
    'sponsors_count': 'mean'
}
event_stats = df.groupby('form').agg(aggs).reset_index()

In [19]:
event_stats['is_top_performer'] = event_stats['form'].isin(top_10_total_vol['form'].tolist())
event_stats['is_bottom_performer'] = event_stats['form'].isin(bottom_10_total_vol['form'].tolist())

In [21]:
pd.set_option('display.max_rows', 100)

tbl = event_stats.groupby(['is_top_performer', 'is_bottom_performer']).agg(['mean', 'median']).transpose()
tbl.columns = ['middle 80%', 'bottom 10%', 'top 10%']
tbl

Unnamed: 0,Unnamed: 1,middle 80%,bottom 10%,top 10%
form,mean,904990.489101,903457.090305,920687.985092
form,median,954434.5,953918.0,958414.5
reg_count,mean,0.0,0.0,0.0
reg_count,median,0.0,0.0,0.0
sub_reg_count,mean,2.405189,0.585657,23.497706
sub_reg_count,median,0.0,0.0,0.0
teams_count,mean,3.459105,0.633466,31.036697
teams_count,median,0.0,0.0,5.0
reg_volume,mean,0.0,0.0,0.0
reg_volume,median,0.0,0.0,0.0


### settings by event category

In [22]:
def form_cat(form):
    if form in forms_table['id'].tolist():
        return forms_table[forms_table['id']==form]['categorization'].iloc[0]
    return 0

event_stats['category'] = event_stats['form'].apply(form_cat)

In [23]:
event_stats.groupby(['category', 'is_top_performer', 'is_bottom_performer']).median().transpose()

category,0,0,0,1,1,1,2,2,2,3,...,12,13,13,13,14,14,14,15,15,15
is_top_performer,False,False,True,False,False,True,False,False,True,False,...,False,False,False,True,False,False,True,False,False,True
is_bottom_performer,False,True,False,False,True,False,False,True,False,False,...,True,False,True,False,False,True,False,False,True,False
form,947271.0,946487.0,943400.5,984672.5,980278.0,981321.0,984220.0,973873.0,982314.0,981315.5,...,976820.0,980654.0,983209.0,947646.0,982786.0,981371.5,960088.0,979735.0,979103.5,971837.0
reg_count,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
sub_reg_count,0.0,0.0,0.0,0.0,0.0,21.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
teams_count,0.0,0.0,2.0,1.0,1.0,34.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0
reg_volume,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
don_volume,0.0,0.0,44006.465,1408.0,0.0,58446.75,1150.0,65.0,59478.56,2759.35,...,0.0,1890.0,20.0,434832.96,0.0,0.0,149660.65,0.0,27.5,68331.19
don_count,0.0,0.0,314.5,21.0,0.0,623.0,11.0,2.0,231.0,26.5,...,0.0,14.5,1.0,5106.0,0.0,0.0,715.0,0.0,1.0,386.0
class_count,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,...,0.962963,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
cat_count,1.0,1.0,2.0,2.0,2.0,2.5,2.0,1.0,1.973684,2.0,...,1.0,0.45098,1.0,0.0,2.0,1.528302,1.0,1.0,1.0,2.0
promo_count,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


### form settings by isvirtual

In [24]:
def form_isvirtual(form):
    if form in forms_table['id'].tolist():
        return forms_table[forms_table['id']==form]['isvirtual'].iloc[0]
    return 0

event_stats['isvirtual'] = event_stats['form'].apply(form_isvirtual)

In [25]:
event_stats.groupby(['isvirtual', 'is_top_performer', 'is_bottom_performer']).median().transpose()

isvirtual,0,0,0,1,1,1
is_top_performer,False,False,True,False,False,True
is_bottom_performer,False,True,False,False,True,False
form,953684.0,953606.0,956204.0,982488.0,982957.0,979981.0
reg_count,0.0,0.0,0.0,0.0,0.0,0.0
sub_reg_count,0.0,0.0,0.0,0.0,0.0,0.0
teams_count,0.0,0.0,6.0,0.0,0.0,0.0
reg_volume,0.0,0.0,0.0,0.0,0.0,0.0
don_volume,0.0,0.0,48212.84,2767.0,0.0,59718.3
don_count,0.0,0.0,359.0,23.0,0.0,376.0
class_count,0.0,0.0,0.0,0.0,0.0,0.0
cat_count,1.0,1.0,2.0,1.852941,2.0,1.933333
promo_count,0.0,0.0,0.0,0.0,0.0,0.0


# category representation by performance group

In [34]:
cat_counts = event_stats.groupby('category')['form'].nunique()

In [32]:
event_stats.groupby('category')['isvirtual', 'is_top_performer', 'is_bottom_performer'].mean().reset_index().merge(cat_counts, on='category')

  """Entry point for launching an IPython kernel.


Unnamed: 0,category,isvirtual,is_top_performer,is_bottom_performer,form
0,0,0.0,0.058791,0.062975,9083
1,1,0.078723,0.095745,0.03617,470
2,2,0.1875,0.0875,0.025,80
3,3,0.19708,0.160584,0.036496,137
4,4,0.168317,0.19802,0.069307,101
5,5,0.064748,0.26259,0.028777,278
6,6,0.137868,0.139706,0.060662,544
7,7,0.166667,0.055556,0.101852,108
8,8,0.097633,0.048817,0.081361,676
9,9,0.316017,0.134199,0.077922,231
