In [1]:
import pandas as pd
import numpy as np

import sys
sys.path.insert(1, '../../../../scripts/')
from s3_support import *

# Calculate growth stats from transactions

In [16]:
q = '''select 
        form,
        org, 
        date_trunc('month', date) as month, 
        count(id) as count, 
        sum(amount) as volume
    from transactions
        where status='A'
        group by form, org, date_trunc('month', date)'''
df = redshift_query_read(q)

In [17]:
df.tail(3)

Unnamed: 0,form,org,month,count,volume
233753,955243,185347,2020-04-01,1,102.75
233754,950247,444400,2020-04-01,1,26.25
233755,939782,443313,2020-04-01,1,1500.0


In [18]:
print("iterating through {} forms".format(len(df['form'].unique())))

counter = 0
form_data = []

for form in df['form'].unique():
    this_form_df = df[df['form']==form].copy()
    if len(this_form_df) < 3:
        continue
        
    this_form_df.sort_values('month', ascending=True)
    this_form_df['count_growth'] = this_form_df['count'].diff() / this_form_df['count'].shift(1)
    this_form_df['volume_growth'] = this_form_df['volume'].diff() / this_form_df['volume'].shift(1)

    form_data.append({
        'form': form,
        'org': this_form_df['org'].iloc[0],
        'samples': len(this_form_df),
        'count_growth': this_form_df['count_growth'].replace([np.inf, -np.inf], np.nan).dropna().mean(),
        'volume_growth': this_form_df['volume_growth'].replace([np.inf, -np.inf], np.nan).dropna().mean()
    })
    
    counter = counter + 1
    if counter % 2500 == 0:
        print("\tdone with {} forms".format(counter))

iterating through 21797 forms
	done with 2500 forms
	done with 5000 forms
	done with 7500 forms
	done with 10000 forms
	done with 12500 forms


In [19]:
form_growth_df = pd.DataFrame(form_data)
form_growth_df.tail(3)

Unnamed: 0,count_growth,form,org,samples,volume_growth
13035,0.041667,954811,445227,3,-0.011256
13036,0.0,954791,380931,3,-1.0
13037,0.0,954063,443192,3,-1.0


In [20]:
print("{} forms with {:.2f} mean observations".format(len(form_growth_df), form_growth_df['samples'].mean()))
print("\t{:.2f} mean count growth; {:.2f} mean volume growth".format(form_growth_df['count_growth'].mean(), form_growth_df['volume_growth'].mean()))

for i in [6, 12, 24, 36]:
    form_growth_subset = form_growth_df[form_growth_df['samples']<= i]
    this_len = len(form_growth_subset)
    this_perc = (float(this_len) / float(len(form_growth_df))) * 100.
    
    print("{} forms ({:.2f}%) with fewer than {} observations".format(this_len, this_perc, i))
    print("\t{:.2f} mean count growth; {:.2f} mean volume growth".format(form_growth_subset['count_growth'].mean(), form_growth_subset['volume_growth'].mean()))

13038 forms with 16.94 mean observations
	2.72 mean count growth; 53.68 mean volume growth
6307 forms (48.37%) with fewer than 6 observations
	3.78 mean count growth; 66.60 mean volume growth
8521 forms (65.36%) with fewer than 12 observations
	3.54 mean count growth; 66.27 mean volume growth
10394 forms (79.72%) with fewer than 24 observations
	3.14 mean count growth; 61.61 mean volume growth
11279 forms (86.51%) with fewer than 36 observations
	2.99 mean count growth; 59.29 mean volume growth


# Calculate growth stats from analytics

In [21]:
q = '''select
            form,
            org,
            date_trunc('month', date) as month,
            sum(vt_trans_count) as vt_count,
            sum(don_form_trans_count) as donform_count,
            sum(kiosk_trans_count) as kiosk_count,
            sum(p2p_trans_count) as p2p_count,
            sum(mobile_trans_count) as mobile_count,
            sum(sms_trans_count) as sms_count,
            sum(fb_trans_count) as fb_count,
            sum(vt_trans_vol) as vt_volume,
            sum(don_form_trans_vol) as donform_volume,
            sum(kiosk_trans_vol) as kiosk_volume,
            sum(p2p_trans_vol) as p2p_volume,
            sum(mobile_trans_vol) as mobile_volume,
            sum(sms_trans_vol) as sms_volume,
            sum(fb_trans_vol) as fb_volume
        from analytics
            group by form, org, date_trunc('month', date)'''
df = redshift_query_read(q)

In [22]:
df.tail(3)

Unnamed: 0,form,org,month,vt_count,donform_count,kiosk_count,p2p_count,mobile_count,sms_count,fb_count,vt_volume,donform_volume,kiosk_volume,p2p_volume,mobile_volume,sms_volume,fb_volume
687919,956314,442196,2020-04-01,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
687920,956319,444852,2020-04-01,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
687921,956267,445377,2020-04-01,0,1,0,0,0,0,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [23]:
print("iterating through {} forms".format(len(df['form'].unique())))

counter = 0
form_data = []

for form in df['form'].unique():
    this_form_df = df[df['form']==form].copy()
    if len(this_form_df) < 3:
        continue
        
    this_form_df.sort_values('month', ascending=True)
    growth_cols = []
    for col in this_form_df.columns:
        if '_count' in col or '_volume' in col:
            growth_cols.append("{}_growth".format(col))
            this_form_df["{}_growth".format(col)] = this_form_df[col].diff() / this_form_df[col].shift(1)
    
    form_data.append({**{
        'form': form,
        'org': this_form_df['org'].iloc[0],
        'samples': len(this_form_df)
    }, **dict(this_form_df[growth_cols].replace([np.inf, -np.inf], np.nan).fillna(0).mean())})
    
    counter = counter + 1
    if counter % 5000 == 0:
        print("\tdone with {} forms".format(counter))

iterating through 32448 forms
	done with 5000 forms
	done with 10000 forms
	done with 15000 forms
	done with 20000 forms
	done with 25000 forms
	done with 30000 forms


In [24]:
form_analytics_growth_df = pd.DataFrame(form_data)
form_analytics_growth_df.tail(3)

Unnamed: 0,donform_count_growth,donform_volume_growth,fb_count_growth,fb_volume_growth,form,kiosk_count_growth,kiosk_volume_growth,mobile_count_growth,mobile_volume_growth,org,p2p_count_growth,p2p_volume_growth,samples,sms_count_growth,sms_volume_growth,vt_count_growth,vt_volume_growth
31234,-0.333333,-0.333333,0.0,0.0,955127,0.0,0.0,0.0,0.0,80899,0.0,0.0,3,0.0,0.0,0.0,0.0
31235,0.0,0.0,0.0,0.0,955128,0.0,0.0,0.0,0.0,444340,0.0,0.0,3,0.0,0.0,0.0,0.0
31236,0.0,0.0,0.0,0.0,955129,0.0,0.0,0.0,0.0,438316,-0.333333,-0.333333,3,0.0,0.0,0.0,0.0


In [25]:
for col in form_analytics_growth_df.columns:
    if col != 'org' and col != 'form':
        print("{}: {:.4f}".format(col, form_analytics_growth_df[col].mean()))

donform_count_growth: 0.0741
donform_volume_growth: 3.6168
fb_count_growth: -0.0000
fb_volume_growth: 0.0015
kiosk_count_growth: 0.0003
kiosk_volume_growth: 0.0125
mobile_count_growth: 0.0038
mobile_volume_growth: 0.0529
p2p_count_growth: 0.0690
p2p_volume_growth: 0.3241
samples: 21.9497
sms_count_growth: 0.0056
sms_volume_growth: 0.3507
vt_count_growth: 0.0055
vt_volume_growth: 0.1349


# Joining transaction growth to analytics growth

In [30]:
mrgd = form_growth_df.merge(form_analytics_growth_df, on="form")
cols = [col for col in mrgd.columns if 'volume_growth' in col]
mrgd_vol = mrgd[['form'] + cols]

In [33]:
mrgd_vol.corr()['volume_growth']

form                     0.009115
volume_growth            1.000000
donform_volume_growth    0.042621
fb_volume_growth        -0.000726
kiosk_volume_growth     -0.000459
mobile_volume_growth    -0.001624
p2p_volume_growth        0.018082
sms_volume_growth        0.014476
vt_volume_growth         0.019099
Name: volume_growth, dtype: float64

The analytics source growth has some meaningful correlations, but not many and not strong. These could nonetheless prove useful in modeling.

# monthly analytics correlations

query monthly aggregates of base and qgiv analytics, calculate percent change month to month and examine correlations

In [2]:
q = "select * from analytics_month where date >= 2018"
df_base = redshift_query_read(q)

q = "select * from analyticsqgiv_month where date >= 2018"
df_qgiv = redshift_query_read(q)

## qgiv

In [23]:
df = df_base.merge(df_qgiv, on=["org", "form", "date"])
df.drop(['p2p_trans_count', 'p2p_trans_vol'], axis=1, inplace=True)
df.sort_values('date', ascending=True, inplace=True)

In [25]:
agg_change = df.drop('date', axis=1).groupby(['org', 'form']).pct_change()

In [27]:
c = agg_change[[c for c in agg_change.columns if c not in ['date', 'org', 'form', 'product']]].corr()
s = c.unstack()
so = s.sort_values(kind='quicksort')

In [30]:
so = so.dropna()
so = so[so!=1.]
so.tail(50)

mobile_trans_count        dl_trans_volume             0.652211
dl_trans_volume           mobile_trans_count          0.652211
mobile_trans_count        dl_trans_count              0.655954
dl_trans_count            mobile_trans_count          0.655954
collect_optin             permit_create_own_pledge    0.660464
permit_create_own_pledge  collect_optin               0.660464
permit_anonymous          permit_create_own_pledge    0.694858
permit_create_own_pledge  permit_anonymous            0.694858
vt_trans_count            new_rec_count               0.696130
new_rec_count             vt_trans_count              0.696130
kiosk_trans_count         dl_new_rec_volume           0.728306
dl_new_rec_volume         kiosk_trans_count           0.728306
dl_new_rec_count          sms_trans_vol               0.772449
sms_trans_vol             dl_new_rec_count            0.772449
                          kiosk_trans_count           0.782433
kiosk_trans_count         sms_trans_vol               0

In [36]:
df_so = so.reset_index()
msk_1 = (df_so['level_0'].str.contains('vol'))&(~df_so['level_1'].str.contains('vol'))&(~df_so['level_1'].str.contains('count'))
msk_2 = (~df_so['level_0'].str.contains('vol'))&(df_so['level_1'].str.contains('vol'))&(~df_so['level_0'].str.contains('count'))
df_so[msk_1|msk_2].tail(25)

Unnamed: 0,level_0,level_1,0
1279,collect_address_mobile,sms_trans_vol,0.093719
1292,fb_trans_vol,permit_create_own_pledge,0.099377
1293,permit_create_own_pledge,fb_trans_vol,0.099377
1304,enable_sms,new_rec_volume,0.101835
1305,new_rec_volume,enable_sms,0.101835
1306,req_ded_flds,new_rec_volume,0.104018
1307,new_rec_volume,req_ded_flds,0.104018
1310,dl_trans_volume,req_ded_flds,0.105581
1311,req_ded_flds,dl_trans_volume,0.105581
1360,permit_anonymous,fb_trans_vol,0.123876


## p2p

In [37]:
q = "select * from analyticsp2p_month where date >= 2018"
df_p2p = redshift_query_read(q)

In [38]:
df = df_p2p.merge(df_base, on=["org", "form", "date"])
df.sort_values('date', ascending=True, inplace=True)

In [40]:
agg_change = df.drop('date', axis=1).groupby(['org', 'form']).pct_change()

c = agg_change[[c for c in agg_change.columns if c not in ['date', 'org', 'form', 'product']]].corr()
s = c.unstack()
so = s.sort_values(kind='quicksort')

In [45]:
so = so.dropna()
so = so[so!=1.]
so.tail(75)

don_count                social_auto                0.255400
allows_sub_reg_pfp       allows_reg_team_create     0.256142
allows_reg_team_create   allows_sub_reg_pfp         0.256142
allows_sub_reg_pfp       allows_pfp_off_don         0.262080
                         allows_tfp_off_don         0.262080
allows_tfp_off_don       allows_sub_reg_pfp         0.262080
allows_pfp_off_don       allows_sub_reg_pfp         0.262080
allows_opt_reg_donation  allows_teams               0.262388
allows_teams             allows_opt_reg_donation    0.262388
don_volume               social_auto                0.263255
social_auto              don_volume                 0.263255
allows_social            allows_sub_reg_pfp         0.267958
allows_sub_reg_pfp       allows_social              0.267958
count_posts              p2p_trans_vol              0.282261
p2p_trans_vol            count_posts                0.282261
don_volume               fb_trans_vol               0.297117
fb_trans_vol            

There are certainly meaningful connections between processing and social posts.

Data points to explore:

- social post settings/counts
- teams settings

In [52]:
df_so = so.reset_index()
df_so[df_so['level_0'].str.contains('posts')].tail(20)

Unnamed: 0,level_0,level_1,0
462,pcnt_posts,allows_opt_reg_donation,0.008059
470,count_posts,amt_count,0.009204
511,count_posts,cat_count,0.014594
530,count_posts,fields,0.017255
602,count_posts,allows_reg_team_join,0.034254
616,count_posts,allows_reg_team_create,0.035783
648,pcnt_posts,allows_pfp_off_don,0.041707
649,pcnt_posts,allows_tfp_off_don,0.041707
686,count_posts,ded_count,0.052099
747,count_posts,p2p_trans_count,0.071913


In [53]:
df_so[df_so['level_0'].str.contains('team')].tail(20)

Unnamed: 0,level_0,level_1,0
862,teams_count,fb_trans_vol,0.144035
865,teams_count,fb_trans_count,0.144035
866,allows_teams,allows_pfp_off_don,0.148557
867,allows_teams,allows_tfp_off_don,0.148557
877,allows_reg_team_join,fields,0.162531
880,allows_reg_team_create,fields,0.175988
884,allows_teams,allows_social,0.186894
888,allows_reg_team_join,req_fields,0.188911
891,allows_reg_team_join,cat_count,0.201642
894,allows_reg_team_create,cat_count,0.203148
