In [9]:
!pwd

/home/ubuntu/recommendation


In [29]:
import sys, datetime
sys.path.append("scripts/")
sys.path.append("../../../scripts/")
from s3_support import *

import pandas as pd
import numpy as np

# load data

In [30]:
START_DATE = '2024-01-01'

### load cached data

@TODO add build script for these files

In [31]:
bucket = 'qgiv-stats-data'
filename_dailies = 'new_forms.dailies.csv'
filename_trans = 'new_for from ms.trans.csv'

In [32]:
dailies = get_dataframe_from_file(bucket, filename_dailies)

In [33]:
print("{:,} dailies entries".format(len(dailies)))
print("{} to {}".format(dailies['date'].min(), dailies['date'].max()))
print("{:,} forms".format(len(dailies['form'].unique())))
print()
print("{:.2f}% mean conversion".format(dailies['conversion'].mean() * 100.))
print("{:.2f}% median conversion".format(dailies['conversion'].median() * 100.))
print("{:.2f}% new forms mean conversion".format(dailies[dailies['is new template']]['conversion'].mean() * 100.))
print("{:.2f}% new forms median conversion".format(dailies[dailies['is new template']]['conversion'].median() * 100.))
print("{:.2f}% old forms mean conversion".format(dailies[~dailies['is new template']]['conversion'].mean() * 100.))
print("{:.2f}% old forms median conversion".format(dailies[~dailies['is new template']]['conversion'].median() * 100.))

496,916 dailies entries
2024-01-01 to 2025-01-22
32,954 forms

11.51% mean conversion
0.00% median conversion
16.06% new forms mean conversion
4.55% new forms median conversion
2.47% old forms mean conversion
0.00% old forms median conversion


In [34]:
trans_all = get_dataframe_from_file(bucket, filename_trans)

In [35]:
print("{:,} transactions".format(len(trans_all)))
print("{} to {}".format(trans_all['date'].min(), trans_all['date'].max()))
print("{:,} transactions".format(len(trans_all['form'].unique())))

3,135,756 transactions
2024-01-01 to 2025-01-22
27,843 transactions


In [36]:
form_rec_counts = dailies.groupby('form')['trans_count_recurring'].sum().reset_index()

In [37]:
no_rec_forms = form_rec_counts[form_rec_counts['trans_count_recurring']==0]

In [38]:
len(form_rec_counts), len(no_rec_forms)

(32954, 30045)

### new form settings

In [39]:
q = '''select 
            date, form, appearance, cta_before, 
            cta_after, conditional_fields
        from analyticsqgiv_weekly
        where date>='{}'
        order by date asc;'''.format(trans_all['date'].min())
nuform_extra = redshift_query_read(q, schema="public")

In [40]:
nuform_extra['date_end'] = nuform_extra['date'] + pd.Timedelta(days=7)

In [41]:
nuform_extra.head(3)

Unnamed: 0,date,form,appearance,cta_before,cta_after,conditional_fields,date_end
0,2024-01-01,933801,1,0,0,0,2024-01-08
1,2024-01-01,943563,2,1,0,0,2024-01-08
2,2024-01-01,968448,1,0,0,0,2024-01-08


### merge & prep data

In [42]:
nuform_extra['week'] = nuform_extra['date']
dailies['week'] = pd.to_datetime(dailies['date']).dt.to_period('W').apply(lambda r: r.start_time)
dailies = dailies.merge(nuform_extra[['week', 'form', 'cta_before', 'cta_after']], on=['week', 'form'], how='left')
dailies.tail(3)

Unnamed: 0,date,form,views,bounces,is new template,is embed,trans_count_onetime,trans_count_recurring,trans_vol_onetime,trans_vol_recurring,conversion,conversion_onetime,conversion_recurring,trans_count,trans_vol,month,week,cta_before,cta_after
496913,2025-01-22,1060991,9.0,0.0,False,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2025-01,2025-01-20,,
496914,2025-01-22,1061006,7.0,0.0,False,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2025-01,2025-01-20,,
496915,2025-01-22,1061029,10.0,0.0,False,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2025-01,2025-01-20,,


In [43]:
dailies['cta_after'].value_counts()

0.0    252997
1.0     35377
Name: cta_after, dtype: int64

In [44]:
dailies['week'].min(), dailies['week'].max()

(Timestamp('2024-01-01 00:00:00'), Timestamp('2025-01-20 00:00:00'))

# CTA conversion

In [18]:
dailies.groupby(['cta_after', 'cta_before'])['conversion'].agg(['mean', 'median', 'count']).reset_index()

Unnamed: 0,cta_after,cta_before,mean,median,count
0,0.0,0.0,0.155981,0.032258,51138
1,0.0,1.0,0.160476,0.05,201859
2,1.0,0.0,0.130086,0.0,4050
3,1.0,1.0,0.166353,0.072727,31327


In [19]:
cta = dailies[(dailies['cta_after']==1)|(dailies['cta_before']==1)]['conversion'].mean()
cta_after = dailies[dailies['cta_after']==1]['conversion'].mean()
cta_before = dailies[dailies['cta_before']==1]['conversion'].mean()
non_cta = dailies[(dailies['cta_after']!=1)|(dailies['cta_before']!=1)]['conversion'].mean()

print("CTA (either): {:.2f}%".format(cta * 100.))
print("CTA after: {:.2f}%".format(cta_after * 100.))
print("CTA before: {:.2f}%".format(cta_before * 100.))
print("No CTA: {:.2f}%".format(non_cta * 100.))
print()

cta = dailies[(dailies['cta_after']==1)|(dailies['cta_before']==1)]['conversion_recurring'].mean()
cta_after = dailies[dailies['cta_after']==1]['conversion_recurring'].mean()
cta_before = dailies[dailies['cta_before']==1]['conversion_recurring'].mean()
non_cta = dailies[(dailies['cta_after']!=1)|(dailies['cta_before']!=1)]['conversion_recurring'].mean()

print("CTA (either) recurring: {:.2f}%".format(cta * 100.))
print("CTA after recurring: {:.2f}%".format(cta_after * 100.))
print("CTA before recurring: {:.2f}%".format(cta_before * 100.))
print("No CTA recurring: {:.2f}%".format(non_cta * 100.))
print()

cta = dailies[(dailies['cta_after']==1)|(dailies['cta_before']==1)]['conversion_onetime'].mean()
cta_after = dailies[dailies['cta_after']==1]['conversion_onetime'].mean()
cta_before = dailies[dailies['cta_before']==1]['conversion_onetime'].mean()
non_cta = dailies[(dailies['cta_after']!=1)|(dailies['cta_before']!=1)]['conversion_onetime'].mean()

print("CTA (either) onetime: {:.2f}%".format(cta * 100.))
print("CTA after onetime: {:.2f}%".format(cta_after * 100.))
print("CTA before onetime: {:.2f}%".format(cta_before * 100.))
print("No CTA onetime: {:.2f}%".format(non_cta * 100.))

print()
print("One time/recurring")

cta = dailies[(dailies['cta_after']==1)|(dailies['cta_before']==1)]['trans_count_onetime'].sum() / dailies[(dailies['cta_after']==1)|(dailies['cta_before']==1)]['trans_count_recurring'].sum()
non_cta = dailies[(dailies['cta_after']!=1)&(dailies['cta_before']!=1)]['trans_count_onetime'].sum() / dailies[(dailies['cta_after']!=1)&(dailies['cta_before']!=1)]['trans_count_recurring'].sum()

print("CTA (either): {:.2f}".format(cta))
print("No CTA: {:.2f}".format(non_cta))

CTA (either): 16.07%
CTA after: 16.22%
CTA before: 16.13%
No CTA: 11.17%

CTA (either) recurring: 0.59%
CTA after recurring: 1.16%
CTA before recurring: 0.59%
No CTA recurring: 0.37%

CTA (either) onetime: 15.48%
CTA after onetime: 15.06%
CTA before onetime: 15.54%
No CTA onetime: 10.80%

One time/recurring
CTA (either): 30.18
No CTA: 16.15


In [20]:
one_time_only_forms = no_rec_forms['form'].unique().tolist()

ot_only = dailies[dailies['form'].isin(one_time_only_forms)]['conversion'].mean()
not_ot_only = dailies[~dailies['form'].isin(one_time_only_forms)]['conversion'].mean()

print("Conversion:")
print("One time only: {:.2f}%".format(ot_only * 100))
print("Non one time only: {:.2f}%".format(not_ot_only * 100))

Conversion:
One time only: 9.08%
Non one time only: 20.14%


In [21]:
dailies_recs = dailies[~dailies['form'].isin(one_time_only_forms)]

cta = dailies_recs[(dailies_recs['cta_after']==1)|(dailies_recs['cta_before']==1)]['conversion'].mean()
cta_after = dailies_recs[dailies_recs['cta_after']==1]['conversion'].mean()
cta_before = dailies_recs[dailies_recs['cta_before']==1]['conversion'].mean()
non_cta = dailies_recs[(dailies_recs['cta_after']!=1)|(dailies_recs['cta_before']!=1)]['conversion'].mean()

print("Conversion (forms w/ onetime + recurring)")
print("CTA (either): {:.2f}%".format(cta * 100.))
print("CTA after: {:.2f}%".format(cta_after * 100.))
print("CTA before: {:.2f}%".format(cta_before * 100.))
print("No CTA: {:.2f}%".format(non_cta * 100.))
print()

cta = dailies_recs[(dailies_recs['cta_after']==1)|(dailies_recs['cta_before']==1)]['conversion_recurring'].mean()
cta_after = dailies_recs[dailies_recs['cta_after']==1]['conversion_recurring'].mean()
cta_before = dailies_recs[dailies_recs['cta_before']==1]['conversion_recurring'].mean()
non_cta = dailies_recs[(dailies_recs['cta_after']!=1)|(dailies_recs['cta_before']!=1)]['conversion_recurring'].mean()

print("CTA (either) recurring: {:.2f}%".format(cta * 100.))
print("CTA after recurring: {:.2f}%".format(cta_after * 100.))
print("CTA before recurring: {:.2f}%".format(cta_before * 100.))
print("No CTA recurring: {:.2f}%".format(non_cta * 100.))
print()

cta = dailies_recs[(dailies_recs['cta_after']==1)|(dailies_recs['cta_before']==1)]['conversion_onetime'].mean()
cta_after = dailies_recs[dailies_recs['cta_after']==1]['conversion_onetime'].mean()
cta_before = dailies_recs[dailies_recs['cta_before']==1]['conversion_onetime'].mean()
non_cta = dailies_recs[(dailies_recs['cta_after']!=1)|(dailies_recs['cta_before']!=1)]['conversion_onetime'].mean()

print("CTA (either) onetime: {:.2f}%".format(cta * 100.))
print("CTA after onetime: {:.2f}%".format(cta_after * 100.))
print("CTA before onetime: {:.2f}%".format(cta_before * 100.))
print("No CTA onetime: {:.2f}%".format(non_cta * 100.))

Conversion (forms w/ onetime + recurring)
CTA (either): 19.73%
CTA after: 19.03%
CTA before: 19.90%
No CTA: 20.22%

CTA (either) recurring: 1.64%
CTA after recurring: 2.05%
CTA before recurring: 1.64%
No CTA recurring: 1.88%

CTA (either) onetime: 18.09%
CTA after onetime: 16.97%
CTA before onetime: 18.26%
No CTA onetime: 18.34%


In [22]:
form_aggs = dailies_recs.groupby('form')['trans_count_onetime'].sum().reset_index()
form_aggs.sort_values("trans_count_onetime", inplace=True)
ten_perc = round(len(form_aggs) * .1)

middle_80 = form_aggs.iloc[ten_perc:-ten_perc]['form'].tolist()

print("Middle 80% - onetime + recurring forms")
dailies_recs = dailies_recs[dailies_recs['form'].isin(middle_80)]

cta = dailies_recs[(dailies_recs['cta_after']==1)|(dailies_recs['cta_before']==1)]['conversion'].mean()
cta_after = dailies_recs[dailies_recs['cta_after']==1]['conversion'].mean()
cta_before = dailies_recs[dailies_recs['cta_before']==1]['conversion'].mean()
non_cta = dailies_recs[(dailies_recs['cta_after']!=1)|(dailies_recs['cta_before']!=1)]['conversion'].mean()

print("Conversion (forms w/ onetime + recurring)")
print("CTA (either): {:.2f}%".format(cta * 100.))
print("CTA after: {:.2f}%".format(cta_after * 100.))
print("CTA before: {:.2f}%".format(cta_before * 100.))
print("No CTA: {:.2f}%".format(non_cta * 100.))
print()

cta = dailies_recs[(dailies_recs['cta_after']==1)|(dailies_recs['cta_before']==1)]['conversion_recurring'].mean()
cta_after = dailies_recs[dailies_recs['cta_after']==1]['conversion_recurring'].mean()
cta_before = dailies_recs[dailies_recs['cta_before']==1]['conversion_recurring'].mean()
non_cta = dailies_recs[(dailies_recs['cta_after']!=1)|(dailies_recs['cta_before']!=1)]['conversion_recurring'].mean()

print("CTA (either) recurring: {:.2f}%".format(cta * 100.))
print("CTA after recurring: {:.2f}%".format(cta_after * 100.))
print("CTA before recurring: {:.2f}%".format(cta_before * 100.))
print("No CTA recurring: {:.2f}%".format(non_cta * 100.))
print()

cta = dailies_recs[(dailies_recs['cta_after']==1)|(dailies_recs['cta_before']==1)]['conversion_onetime'].mean()
cta_after = dailies_recs[dailies_recs['cta_after']==1]['conversion_onetime'].mean()
cta_before = dailies_recs[dailies_recs['cta_before']==1]['conversion_onetime'].mean()
non_cta = dailies_recs[(dailies_recs['cta_after']!=1)|(dailies_recs['cta_before']!=1)]['conversion_onetime'].mean()

print("CTA (either) onetime: {:.2f}%".format(cta * 100.))
print("CTA after onetime: {:.2f}%".format(cta_after * 100.))
print("CTA before onetime: {:.2f}%".format(cta_before * 100.))
print("No CTA onetime: {:.2f}%".format(non_cta * 100.))

Middle 80% - onetime + recurring forms
Conversion (forms w/ onetime + recurring)
CTA (either): 14.14%
CTA after: 14.73%
CTA before: 14.19%
No CTA: 15.55%

CTA (either) recurring: 1.42%
CTA after recurring: 1.71%
CTA before recurring: 1.42%
No CTA recurring: 1.82%

CTA (either) onetime: 12.72%
CTA after onetime: 13.02%
CTA before onetime: 12.78%
No CTA onetime: 13.74%


In [46]:
print("recurring/form")

cta_after_forms_len = len(dailies_recs[dailies_recs['cta_after']==1]['form'].unique())
cta_forms_len = len(dailies_recs[(dailies_recs['cta_after']==1)|(dailies_recs['cta_before']==1)]['form'].unique())
non_cta_forms_len = len(dailies_recs[(dailies_recs['cta_after']!=1)|(dailies_recs['cta_before']!=1)]['form'].unique())

cta = dailies_recs[(dailies_recs['cta_after']==1)|(dailies_recs['cta_before']==1)].groupby('form')['trans_count_recurring'].sum().agg(['mean', 'median']).reset_index()
cta_after = dailies_recs[dailies_recs['cta_after']==1].groupby('form')['trans_count_recurring'].sum().agg(['mean', 'median']).reset_index()
cta_before = dailies_recs[dailies_recs['cta_before']==1].groupby('form')['trans_count_recurring'].sum().agg(['mean', 'median']).reset_index()
non_cta = dailies_recs[(dailies_recs['cta_after']!=1)|(dailies_recs['cta_before']!=1)].groupby('form')['trans_count_recurring'].sum().agg(['mean', 'median']).reset_index()

print(cta_forms_len, cta_after_forms_len, non_cta_forms_len)
print("cta:")
print(cta)
print("cta_after:")
print(cta_after)
print("cta_before:")
print(cta_before)
print("non_cta:")
print(non_cta)

recurring/form


NameError: name 'dailies_recs' is not defined

In [24]:
nuform = dailies[dailies['is new template']]

cta = nuform[(nuform['cta_after']==1)|(nuform['cta_before']==1)]['trans_count_onetime'].sum() / nuform[(nuform['cta_after']==1)|(nuform['cta_before']==1)]['trans_count_recurring'].sum()
non_cta = nuform[(nuform['cta_after']!=1)&(nuform['cta_before']!=1)]['trans_count_onetime'].sum() / nuform[(nuform['cta_after']!=1)&(nuform['cta_before']!=1)]['trans_count_recurring'].sum()

print("CTA (either): {:.2f}".format(cta))
print("No CTA: {:.2f}".format(non_cta))

print("{:,} forms".format(len(nuform['form'].unique())))

CTA (either): 30.42
No CTA: 12.51
24,483 forms


In [25]:
# org NTEE
q = '''select 
            o.id as org, 
            f.id as form,
            o.segment as ntee 
        from organization as o
        left join form as f
            on f.org=o.id'''
orgs = redshift_query_read(q, schema='production')

In [26]:
dailies = dailies.merge(orgs, on='form', how='left')

In [27]:
pvt = dailies.groupby(['ntee', 'cta_before'])['conversion'].mean().reset_index()
pvt.pivot(index='ntee', columns='cta_before', values='conversion')

cta_before,0.0,1.0
ntee,Unnamed: 1_level_1,Unnamed: 2_level_1
A - Arts; Culture; and Humanities,0.131291,0.155216
B - Educational Institutions,0.168813,0.173838
C - Environmental Advocacy and Protection,0.171538,0.184615
D - Animal,0.135079,0.147925
E - Health; General and Rehabilitative,0.083499,0.147093
F - Mental Health and Crisis Intervention,0.197307,0.188926
G - Disease; Disorders; Medical Disciplines,0.12785,0.185558
H - Medical Research,0.165909,0.136989
I - Crime; Legal Related,0.1464,0.128595
J - Employment; Job Related,0.103333,0.126275


# weekly data build

In [2]:
START_DATE = "2024-01-01"

## get form transactions & settings data

In [3]:
q = """select
            *
        from analyticsqgiv_weekly
        where
            date>='{}'""".format(START_DATE)

df = redshift_query_read(q, schema='public')

In [4]:
print("{:,} entries".format(len(df)))
print("{:,} unique orgs".format(len(df['org'].unique())))
print("{:,} unique forms".format(len(df['form'].unique())))
print("{:%Y-%m-%d} to {:%Y-%m-%d}".format(df['date'].min(), df['date'].max()))

3,297,677 entries
9,739 unique orgs
79,810 unique forms
2024-01-01 to 2024-12-09


In [5]:
cols = ['date', 'form', 'org', 'new_rec_count', 'new_rec_volume',
        'cta_after', 'cta_before']

df[cols].tail()

Unnamed: 0,date,form,org,new_rec_count,new_rec_volume,cta_after,cta_before
3297672,2024-05-06,952777,444997,0,0.0,0,0
3297673,2024-10-14,966829,445468,0,0.0,0,1
3297674,2024-08-26,1014528,444852,0,0.0,0,1
3297675,2024-07-08,1007174,450413,0,0.0,0,0
3297676,2024-01-22,1013733,436247,0,0.0,0,1


## get traffic

In [6]:
q = """select
            date_trunc('week', date) as date,
            form,
            org,
            sum(views) as views
        from ga
        where
            date>='{}'
        group by form, org, date_trunc('week', date)""".format(START_DATE)
ga = redshift_query_read(q, schema='production')

In [7]:
print("{:,} entries".format(len(ga)))
print("{:,} unique orgs".format(len(ga['org'].unique())))
print("{:,} unique forms".format(len(ga['form'].unique())))
print("{:%Y-%m-%d} to {:%Y-%m-%d}".format(ga['date'].min(), ga['date'].max()))

176,666 entries
6,741 unique orgs
31,520 unique forms
2024-01-01 to 2024-12-09


## merge

In [8]:
mrgd = df[cols].merge(ga, on=['form', 'org', 'date'])

In [9]:
print("{:,} entries".format(len(mrgd)))
print("{:,} unique orgs".format(len(mrgd['org'].unique())))
print("{:,} unique forms".format(len(mrgd['form'].unique())))
print("{:%Y-%m-%d} to {:%Y-%m-%d}".format(mrgd['date'].min(), mrgd['date'].max()))

121,373 entries
5,586 unique orgs
22,729 unique forms
2024-01-01 to 2024-12-09


## CTA analysis

In [10]:
mrgd['rec_conversion'] = mrgd['new_rec_count'] / mrgd['views']

In [11]:
mrgd['cta_either'] = (mrgd['cta_after']!=0)|(mrgd['cta_before']!=0)

table_data = []
for q in ['cta_after', 'cta_before', 'cta_either']:
    len_forms = len(mrgd[mrgd[q]!=0]['form'].unique())
    perc_forms = (len_forms / len(mrgd)) * 100.
    
    table_data.append({
        'key': q,
        'entry count': f'{mrgd[q].sum():,}',
        'entry perc': f'{(mrgd[q].sum() / len(mrgd)) * 100.:,.2f}%',
        'form count': f'{len_forms:,}',
        'form perc': f'{perc_forms:,.2f}%'
    })
    
print("Recurring nudges and calls to action (CTA) representation in the dataset")
pd.DataFrame(table_data)

Recurring nudges and calls to action (CTA) representation in the dataset


Unnamed: 0,key,entry count,entry perc,form count,form perc
0,cta_after,15087,12.43%,2806,2.31%
1,cta_before,96214,79.27%,18029,14.85%
2,cta_either,98087,80.81%,18400,15.16%


In [12]:
_df = mrgd[mrgd['rec_conversion']<1.]

table_data = [{
    'key': 'cta_after',
    'mean, not grouped': _df[_df['cta_after']==1]['rec_conversion'].replace([np.inf, -np.inf], np.nan).dropna().mean(),
    'mean, grouped by form': _df[_df['cta_after']==1].groupby('form')['rec_conversion'].median().replace([np.inf, -np.inf], np.nan).dropna().mean()
},{
    'key': 'cta_before',
    'mean, not grouped': _df[_df['cta_before']==1]['rec_conversion'].replace([np.inf, -np.inf], np.nan).dropna().mean(),
    'mean, grouped by form': _df[_df['cta_before']==1].groupby('form')['rec_conversion'].median().replace([np.inf, -np.inf], np.nan).dropna().mean()
},{
    'key': 'cta_either',
    'mean, not grouped': _df[_df['cta_either']]['rec_conversion'].replace([np.inf, -np.inf], np.nan).dropna().mean(),
    'mean, grouped by form': _df[_df['cta_either']].groupby('form')['rec_conversion'].median().replace([np.inf, -np.inf], np.nan).dropna().mean()
},{
    'key': 'none',
    'mean, not grouped': _df[~_df['cta_either']]['rec_conversion'].replace([np.inf, -np.inf], np.nan).dropna().mean(),
    'mean, grouped by form': _df[~_df['cta_either']].groupby('form')['rec_conversion'].median().replace([np.inf, -np.inf], np.nan).dropna().mean()
}]

print("Average recurring conversion rate by CTA setting, grouped by form, all observations")
pd.DataFrame(table_data)

Average recurring conversion rate by CTA setting, grouped by form, all observations


Unnamed: 0,key,"mean, not grouped","mean, grouped by form"
0,cta_after,0.018164,0.008956
1,cta_before,0.010176,0.004573
2,cta_either,0.010458,0.004684
3,none,0.01346,0.006975


In [13]:
w_rec_forms = mrgd[mrgd['new_rec_count']>0]['form'].unique()
_df = mrgd[mrgd['form'].isin(w_rec_forms)&(mrgd['rec_conversion']<1.)]

table_data = [{
    'key': 'cta_after',
    'mean, not grouped': _df[(_df['cta_after']==1)]['rec_conversion'].replace([np.inf, -np.inf], np.nan).dropna().mean(),
    'mean, grouped by form': _df[(_df['cta_after']==1)].groupby('form')['rec_conversion'].median().replace([np.inf, -np.inf], np.nan).dropna().mean()
},{
    'key': 'cta_before',
    'mean, not grouped': _df[(_df['cta_before']==1)]['rec_conversion'].replace([np.inf, -np.inf], np.nan).dropna().mean(),
    'mean, grouped by form': _df[(_df['cta_before']==1)].groupby('form')['rec_conversion'].median().replace([np.inf, -np.inf], np.nan).dropna().mean()
},{
    'key': 'cta_either',
    'mean, not grouped': _df[_df['cta_either']]['rec_conversion'].replace([np.inf, -np.inf], np.nan).dropna().mean(),
    'mean, grouped by form': _df[_df['cta_either']].groupby('form')['rec_conversion'].median().replace([np.inf, -np.inf], np.nan).dropna().mean()
},{
    'key': 'none',
    'mean, not grouped': _df[~_df['cta_either']]['rec_conversion'].replace([np.inf, -np.inf], np.nan).dropna().mean(),
    'mean, grouped by form': _df[~_df['cta_either']].groupby('form')['rec_conversion'].median().replace([np.inf, -np.inf], np.nan).dropna().mean()
}]

print("Average recurring conversion rate by CTA setting, grouped by form, forms w/ recurring")
pd.DataFrame(table_data)

Average recurring conversion rate by CTA setting, grouped by form, forms w/ recurring


Unnamed: 0,key,"mean, not grouped","mean, grouped by form"
0,cta_after,0.032701,0.029826
1,cta_before,0.030256,0.027453
2,cta_either,0.030716,0.027756
3,none,0.05944,0.055299


In [15]:
# get the middle 80% by processing volume in past 2 years
q = '''select
            form,
            sum(amount) as volume
        from transactions
        where 
            status='A' and
            date>='{}' 
        group by form'''.format(START_DATE)
trans_sums = redshift_query_read(q, schema='production')

ten_perc = int(len(trans_sums) * .1)

middle_80_forms = trans_sums.sort_values("volume", ascending=True)[ten_perc:-ten_perc]['form'].tolist()

In [16]:
print(f'{len(trans_sums):,} forms retreived')
print(f"{len(middle_80_forms):,} forms in middle 80%")

32,271 forms retreived
25,817 forms in middle 80%


In [17]:
_df = mrgd[(mrgd['rec_conversion']<1)&mrgd['form'].isin(middle_80_forms)]

table_data = [{
    'key': 'cta_after',
    'mean, not grouped': _df[(_df['cta_after']==1)]['rec_conversion'].replace([np.inf, -np.inf], np.nan).dropna().mean(),
    'mean, grouped by form': _df[(_df['cta_after']==1)].groupby('form')['rec_conversion'].median().replace([np.inf, -np.inf], np.nan).dropna().mean()
},{
    'key': 'cta_before',
    'mean, not grouped': _df[(_df['cta_before']==1)]['rec_conversion'].replace([np.inf, -np.inf], np.nan).dropna().mean(),
    'mean, grouped by form': _df[(_df['cta_before']==1)].groupby('form')['rec_conversion'].median().replace([np.inf, -np.inf], np.nan).dropna().mean()
},{
    'key': 'cta_either',
    'mean, not grouped': _df[(_df['cta_either'])]['rec_conversion'].replace([np.inf, -np.inf], np.nan).dropna().mean(),
    'mean, grouped by form': _df[(_df['cta_either'])].groupby('form')['rec_conversion'].median().replace([np.inf, -np.inf], np.nan).dropna().mean()
},{
    'key': 'none',
    'mean, not grouped': _df[(~_df['cta_either'])]['rec_conversion'].replace([np.inf, -np.inf], np.nan).dropna().mean(),
    'mean, grouped by form': _df[(~_df['cta_either'])].groupby('form')['rec_conversion'].median().replace([np.inf, -np.inf], np.nan).dropna().mean()
}]

print("Average recurring conversion rate by CTA setting, grouped by form, middle 80% by processing")
pd.DataFrame(table_data)

Average recurring conversion rate by CTA setting, grouped by form, middle 80% by processing


Unnamed: 0,key,"mean, not grouped","mean, grouped by form"
0,cta_after,0.010693,0.008205
1,cta_before,0.005042,0.004449
2,cta_either,0.0053,0.004557
3,none,0.007965,0.006075


In [24]:
rep_forms = pd.read_csv("research/representative forms/filtered_forms.csv")['form'].tolist()
_df = mrgd[(mrgd['rec_conversion']<1.)&(mrgd['form'].isin(rep_forms))]

table_data = [{
    'key': 'cta_after',
    'mean, grouped by form': _df[(_df['cta_after']==1)].groupby('form')['rec_conversion'].median().replace([np.inf, -np.inf], np.nan).dropna().mean(),
    'mean, not grouped': _df[(_df['cta_after']==1)]['rec_conversion'].mean()
},{
    'key': 'cta_before',
    'mean, grouped by form': _df[(_df['cta_before']==1)].groupby('form')['rec_conversion'].median().replace([np.inf, -np.inf], np.nan).dropna().mean(),
    'mean, not grouped': _df[(_df['cta_before']==1)]['rec_conversion'].mean()
},{
    'key': 'cta_either',
    'mean, grouped by form': _df[(_df['cta_either'])].groupby('form')['rec_conversion'].median().replace([np.inf, -np.inf], np.nan).dropna().mean(),
    'mean, not grouped': _df[(_df['cta_either'])]['rec_conversion'].mean()
},{
    'key': 'none',
    'mean, grouped by form': _df[(~_df['cta_either'])].groupby('form')['rec_conversion'].median().replace([np.inf, -np.inf], np.nan).dropna().mean(),
    'mean, not grouped': _df[(~_df['cta_either'])]['rec_conversion'].mean()
}]

print("Average recurring conversion rate by CTA setting, grouped by form, representative forms")
pd.DataFrame(table_data)

Average recurring conversion rate by CTA setting, grouped by form, representative forms


Unnamed: 0,key,"mean, grouped by form","mean, not grouped"
0,cta_after,0.004397,0.004822
1,cta_before,0.000915,0.001866
2,cta_either,0.001134,0.001875
3,none,0.001559,0.001854


# report prep

In [25]:
def report(df):
    # recurring frequency?
    df = df.copy().replace(np.inf, np.nan)
    return {
        'form sample size': len(df['form'].unique().tolist()),
        'transactions': df['trans_count_onetime'].sum() + df['trans_count_recurring'].sum(),
        'conversion': df['conversion_all'].mean(),
        'conversion median': df['conversion_all'].median(),
        'conversion onetime': df['conversion_onetime'].mean(),
        'conversion recurring': df['conversion_recurring'].mean(),
        'mean transaction onetime': df['vol_onetime'].sum() / df['count_onetime'].sum(),
        'mean transaction recurring': df['vol_recurring'].sum() / df['count_recurring'].sum(),
        'onetime/recurring': df['trans_count_onetime'].sum() / df['trans_count_recurring'].sum(),
        'pageviews': df['views_all'].sum(),
        'bounce rate': df['bounces_all'].sum() / df['views_all'].sum()
    }

# report output

## single step vs multistep

In [45]:
data_all = {
    'all': report(weeklies),
    'single step': report(weeklies[(weeklies['is new template'])&(~weeklies['multistep'])]),
    'multi step': report(weeklies[(weeklies['is new template'])&(weeklies['multistep'])])
}

NameError: name 'weeklies' is not defined

In [None]:
print("{:%Y-%m-%d} +".format(weeklies['date'].min()))
df = pd.DataFrame(data_all)
df = df.transpose()

df['pageviews_perc'] = df['pageviews'] / df['pageviews']['all']
df['pageviews'] = df['pageviews_perc']
df['transactions_perc'] = df['transactions'] / df['transactions']['all']
df['transactions'] = df['transactions_perc']

df.drop(['pageviews_perc', 'transactions_perc'], axis=1, inplace=True)
df.transpose().drop('all', axis=1)

In [None]:
forms_multistep = weeklies[(weeklies['is new template'])&(weeklies['multistep'])]['form'].unique()
forms_singestep = weeklies[(weeklies['is new template'])&(~weeklies['multistep'])]['form'].unique()

print("forms multistep: {:,}".format(len(forms_multistep)))
print("forms single step: {:,}".format(len(forms_singestep)))
print("forms both: {:,}".format(len(list(set(forms_multistep) & set(forms_singestep)))))

## CTA vs no

In [27]:
cta = dailies[(dailies['cta_before']==1.0)|(dailies['cta_after']==1.0)]
nocta = dailies[(dailies['cta_before']==0.0)&(dailies['cta_after']==0.0)]

len(cta), len(nocta)

NameError: name 'dailies' is not defined

In [28]:
cta['ot/rec'] = cta['trans_count_onetime'] / cta['trans_count_recurring']
nocta['ot/rec'] = nocta['trans_count_onetime'] / nocta['trans_count_recurring']

cta['ot/rec'].describe()

NameError: name 'cta' is not defined

## CTA before vs after

In [None]:
data_all = {
    'all': report(dailies),
    'cta before': report(weeklies[(dailies['is new template'])&(dailies['cta_before']==1)]),
    'cta after': report(weeklies[(dailies['is new template'])&(dailies['cta_after']==1)])
}

In [None]:
print("{:%Y-%m-%d} +".format(weeklies['date'].min()))
df = pd.DataFrame(data_all)
df = df.transpose()

df['pageviews_perc'] = df['pageviews'] / df['pageviews']['all']
df['pageviews'] = df['pageviews_perc']
df['transactions_perc'] = df['transactions'] / df['transactions']['all']
df['transactions'] = df['transactions_perc']

df.drop(['pageviews_perc', 'transactions_perc'], axis=1, inplace=True)
df.transpose().drop('all', axis=1)

In [None]:
forms_cta_before = weeklies[weeklies['cta_before']==1]['form'].unique()
forms_cta_after = weeklies[weeklies['cta_after']==1]['form'].unique()

print("forms cta before: {:,}".format(len(forms_cta_before)))
print("forms cta after: {:,}".format(len(forms_cta_after)))
print("forms both: {:,}".format(len(list(set(forms_cta_before) & set(forms_cta_after)))))

## CTA after vs old forms

In [None]:
data_all = {
    'all': report(weeklies),
    'old forms': report(weeklies[~weeklies['is new template']]),
    'cta after': report(weeklies[(weeklies['is new template'])&(weeklies['cta_after']==1)])
}

In [None]:
print("{:%Y-%m-%d} +".format(weeklies['date'].min()))
df = pd.DataFrame(data_all)
df = df.transpose()

df['pageviews_perc'] = df['pageviews'] / df['pageviews']['all']
df['pageviews'] = df['pageviews_perc']
df['transactions_perc'] = df['transactions'] / df['transactions']['all']
df['transactions'] = df['transactions_perc']

df.drop(['pageviews_perc', 'transactions_perc'], axis=1, inplace=True)
df.transpose().drop('all', axis=1)

In [None]:
old_forms = weeklies[~weeklies['is new template']]['form'].unique()
forms_cta_after = weeklies[weeklies['cta_after']==1]['form'].unique().tolist()

print("old forms: {:,}".format(len(old_forms)))
print("forms cta after: {:,}".format(len(forms_cta_after)))

## CTA after or before vs old forms

In [None]:
data_all = {
    'all': report(dailies),
    'old forms': report(dailies[~dailies['is new template']]),
    'cta': report(dailies[(dailies['is new template'])&((dailies['cta_after']==1)|(dailies['cta_before']==1))])
}

In [None]:
print("{:%Y-%m-%d} +".format(weeklies['date'].min()))
df = pd.DataFrame(data_all)
df = df.transpose()

df['pageviews_perc'] = df['pageviews'] / df['pageviews']['all']
df['pageviews'] = df['pageviews_perc']
df['transactions_perc'] = df['transactions'] / df['transactions']['all']
df['transactions'] = df['transactions_perc']

df.drop(['pageviews_perc', 'transactions_perc'], axis=1, inplace=True)
df.transpose().drop('all', axis=1)

In [None]:
old_forms = weeklies[~weeklies['is new template']]['form'].unique()
forms_cta = weeklies[(weeklies['cta_after']==1)|(weeklies['cta_before']==1)]['form'].unique().tolist()

print("old forms: {:,}".format(len(old_forms)))
print("forms w/ cta: {:,}".format(len(forms_cta)))

## Conditional fields

In [None]:
data_all = {
    'all': report(weeklies),
    'conditional fields': report(weeklies[(weeklies['is new template'])&(weeklies['conditional_fields']>0)]),
    'no conditional fields': report(weeklies[(weeklies['is new template'])&(weeklies['conditional_fields']==0)])
}

In [None]:
print("{:%Y-%m-%d} +".format(weeklies['date'].min()))
df = pd.DataFrame(data_all)
df = df.transpose()

df['pageviews_perc'] = df['pageviews'] / df['pageviews']['all']
df['pageviews'] = df['pageviews_perc']
df['transactions_perc'] = df['transactions'] / df['transactions']['all']
df['transactions'] = df['transactions_perc']

df.drop(['pageviews_perc', 'transactions_perc'], axis=1, inplace=True)
df.transpose().drop('all', axis=1)