In [3]:
import pandas as pd
import numpy as np
import datetime

import sys
sys.path.insert(1, '../../scripts/')
from s3_support import *

In [6]:
%matplotlib inline

# load data

In [7]:
START_DATE = '2020-10-01'

In [8]:
# form status
q = "select id, status from form"
all_forms = redshift_query_read(q, schema='production')
active_forms = all_forms[all_forms['status']==1]['id'].tolist()

In [9]:
print("{:,} forms".format(len(all_forms)))
print("{:,} forms with active status".format(len(active_forms)))

86,468 forms
34,387 forms with active status


### embeds

In [10]:
# embeds
q = "select widget as form from embed"
embeds = redshift_query_read(q, schema='production')

In [11]:
def is_embed(r):
    if r['form'] in embeds['form'].unique().tolist():
        return True
    return False

### form upgrades

In [12]:
# logged upgrades
q = "select * from syslog_logs where message like '%Qgiv Form Template Upgraded%'"
form_upgrades = redshift_query_read(q, schema="production")[['org', 'form', 'created']]
forms_upgrades_ids_logged = form_upgrades['form'].unique().tolist()

In [13]:
print("first form upgrade: {}".format(form_upgrades['created'].min()))
print("last form upgrade: {}".format(form_upgrades['created'].max()))

first form upgrade: 2020-12-09 15:30:17
last form upgrade: 2023-03-28 11:33:38


In [14]:
# created new template forms
bucket = "qgiv-stats-data"
new_form_template_list = "form_download new template.csv"
new_forms = get_dataframe_from_file(bucket, new_form_template_list)

new_forms['Go Live Date'] = pd.to_datetime(new_forms['Go Live Date'])

new_forms = new_forms[~new_forms['Form ID'].isin(forms_upgrades_ids_logged)]
new_forms = new_forms[(new_forms['Status']=='active')&(new_forms['Go Live Date']>=START_DATE)]
new_forms_ids = new_forms['Form ID'].unique().tolist()

In [15]:
new_forms['Go Live Date'].max()

Timestamp('2023-01-08 00:00:00')

In [16]:
print("{:,} forms upgraded (noted in logs)".format(len(forms_upgrades_ids_logged)))
print("{:,} forms with new template (production export)".format(len(new_forms)))
print("{:,} forms with new template and active status (production export)".format(len(new_forms_ids)))
print()
print("{:,} new template forms & active".format(len([f for f in set(forms_upgrades_ids_logged + new_forms_ids) if f in active_forms])))
print("\n\nlog entries do not imply form status, so here we see an excess but status filters will be applied to remove non-active forms from the dataset")

18,420 forms upgraded (noted in logs)
9,454 forms with new template (production export)
9,454 forms with new template and active status (production export)

20,336 new template forms & active


log entries do not imply form status, so here we see an excess but status filters will be applied to remove non-active forms from the dataset


Last run:
- 2,740 forms upgraded (noted in logs)
- 7,116 forms with new template (production export)
- 5,154 forms with new template and active status (production export)

- 6,050 new template forms intersection

In [17]:
form_upgrades['month'] = form_upgrades['created'].dt.to_period('M')
form_upgrades['active'] = form_upgrades['form'].isin(active_forms)
monthly_upgrades = form_upgrades.groupby(['month', 'active'])['form'].nunique().reset_index()
monthly_upgrades = monthly_upgrades.pivot(index='month', columns='active', values='form').reset_index()
monthly_upgrades.columns = ['month', 'not active', 'active']

print("template conversions")
monthly_upgrades

template conversions


Unnamed: 0,month,not active,active
0,2020-12,63,150
1,2021-01,256,240
2,2021-02,173,255
3,2021-03,189,266
4,2021-04,141,225
5,2021-05,151,181
6,2021-06,135,202
7,2021-07,142,214
8,2021-08,120,183
9,2021-09,897,3182


In [18]:
new_forms['month'] = new_forms['Go Live Date'].dt.to_period('m')

print("created new template forms")
new_forms.groupby('month')['Form ID'].count().reset_index()

created new template forms


Unnamed: 0,month,Form ID
0,2020-10,135
1,2020-11,206
2,2020-12,134
3,2021-01,122
4,2021-02,138
5,2021-03,199
6,2021-04,207
7,2021-05,149
8,2021-06,198
9,2021-07,151


In [19]:
def is_new_template(daily_row):
    try:
        # check for log conversion
        if daily_row['form'] in form_upgrades['form'].unique().tolist():
            # is date after logged upgrade date?
            return daily_row['date'] >= form_upgrades[form_upgrades['form']==daily_row['form']]['created'].iloc[0]
        else:
            # form created as new form template
            return daily_row['form'] in new_forms_ids
    except:
        print("error with form {}".format(daily_row['form']))
        raise Exception

### transactions

In [20]:
q_trans_onetime = '''select 
                        id,
                        date, 
                        form, 
                        amount,
                        recurring,
                        source,
                        useragent
                    from transactions
                    where 
                        date>='{}' and
                        status='A' and 
                        recurring=0 and
                        (source='don_form' or source='mobile')'''
trans_onetime = redshift_query_read(q_trans_onetime.format(START_DATE), schema='production')
trans_onetime['is_recurring'] = False

trans_onetime = trans_onetime[trans_onetime['id']!=12774333]

In [21]:
print("{:,} transactions, {:,} unique".format(len(trans_onetime), len(trans_onetime['id'].unique())))
print("{:,} forms".format(len(trans_onetime['form'].unique())))

2,523,605 transactions, 2,523,605 unique
21,543 forms


In [22]:
q_trans_rec = '''select 
                        id,
                        date, 
                        form, 
                        amount,
                        recurring,
                        source,
                        useragent
                    from transactions
                    where 
                        date>='{}' and
                        status='A' and 
                        recurring!=0 and
                        (source='don_form' or source='mobile')'''
trans_rec = redshift_query_read(q_trans_rec.format(START_DATE), schema='production')
trans_rec['is_recurring'] = True

trans_rec = trans_rec[trans_rec['id']!=12774333]

In [23]:
print("{:,} recurring transactions, {:,} unique".format(len(trans_rec), len(trans_rec['id'].unique())))
print("{:,} forms".format(len(trans_rec['form'].unique())))

2,443,134 recurring transactions, 2,443,134 unique
7,660 forms


In [24]:
trans = trans_onetime.append(trans_rec)

In [25]:
print("forms: {:,}".format(trans['form'].nunique()))
print("transactions: {:,}, {:,} unique".format(len(trans), trans['id'].nunique()))
print("one time transactions: {:,}, {:,} unique".format(len(trans[~trans['is_recurring']]), trans[~trans['is_recurring']]['id'].nunique()))
print("recurring transactions: {:,}, {:,} unique".format(len(trans[trans['is_recurring']]), trans[trans['is_recurring']]['id'].nunique()))

forms: 22,972
transactions: 4,966,739, 4,966,739 unique
one time transactions: 2,523,605, 2,523,605 unique
recurring transactions: 2,443,134, 2,443,134 unique


In [26]:
trans['date'].min(), trans['date'].max()

(Timestamp('2020-10-01 00:00:00'), Timestamp('2023-03-13 00:00:00'))

### traffic

In [27]:
q_ga = '''select
                date,
                form,
                devicecategory,
                views,
                bounces
            from googleanalytics_traffic
            where
                date>='{}' and
                (qgiv_frontend=1 or p2p_frontend=1) and
                path not like '%/receipt/%' and
                path not like '%account/%' '''
traffic = redshift_query_read(q_ga.format(START_DATE), schema='production')

In [28]:
traffic = traffic.drop_duplicates()

In [29]:
print("{:,} entries; {:,} unique".format(len(traffic), len(traffic.drop_duplicates())))
print("{:,.2f} entries per form".format(traffic.groupby('form')['views'].count().mean()))
print("{:,.2f} observation dates per form".format(traffic.groupby('form')['date'].nunique().mean()))
print("{:,.2f} views per form".format(traffic.groupby('form')['views'].sum().mean()))
print("{:,.2f} views per form per day".format(traffic.groupby(['form', 'date'])['views'].sum().mean()))

7,342,847 entries; 7,342,847 unique
171.94 entries per form
60.95 observation dates per form
1,366.77 views per form
22.42 views per form per day


In [30]:
traffic['date'].min(), traffic['date'].max()

(Timestamp('2020-10-01 00:00:00'), Timestamp('2023-03-19 00:00:00'))

### merge & compile data

In [31]:
trans['is new template'] = trans[['form', 'date']].apply(is_new_template, axis=1)

In [32]:
trans['is embed'] = trans['form'].isin(embeds['form'].tolist())

In [33]:
trans_all = trans.copy()

trans_onetime = trans[~trans['is_recurring']]
trans_rec = trans[trans['is_recurring']]
trans_rec = trans_rec.groupby('recurring').first().reset_index()

trans = trans_onetime.append(trans_rec)

In [34]:
print("All transactions: {:,} ({:,} unique)".format(len(trans_all), len(trans_all['id'].unique())))
print("One time and originating recurring: {:,}".format(len(trans)))
print()
print("Trans per form: {:,.2f}".format(trans.groupby('form')['amount'].count().mean()))
print("Trans per form per day: {:,.2f}".format(trans.groupby(['form', 'date'])['amount'].count().mean()))
print("One time trans per form: {:,.2f}".format(trans[~trans['is_recurring']].groupby('form')['amount'].count().mean()))
print("Recurring trans per form: {:,.2f}".format(trans[trans['is_recurring']].groupby('form')['amount'].count().mean()))

All transactions: 4,966,739 (4,966,739 unique)
One time and originating recurring: 2,708,051

Trans per form: 117.88
Trans per form per day: 3.54
One time trans per form: 117.14
Recurring trans per form: 24.08


#### daily (all devices) data

In [35]:
# merge to daily
daily_trans = trans.groupby(['form', 'date', 'is_recurring', 'is new template', 'is embed'])['amount'].agg({'count', 'sum'}).reset_index()
daily_trans['trans_count'] = daily_trans['count']
daily_trans['trans_vol'] = daily_trans['sum']
daily_trans.drop(['count', 'sum'], axis=1, inplace=True)

daily_trans_pvt = daily_trans.pivot(index=['form', 'date', 'is new template', 'is embed'], columns='is_recurring', values=['trans_count', 'trans_vol']).reset_index()
cols = ['form', 'date', 'is new template', 'is embed', 
        'trans_count_onetime', 'trans_count_recurring', 
        'trans_vol_onetime', 'trans_vol_recurring']
daily_trans_pvt.columns = cols
daily_trans_pvt.fillna(0, inplace=True)

traffic_agg = traffic.groupby(['date', 'form'])[['views', 'bounces']].sum().reset_index()
dailies = traffic_agg.merge(daily_trans_pvt, on=['date', 'form'], how='outer')
dailies.fillna(0, inplace=True)
dailies = dailies[dailies['views']>0]

In [36]:
print("daily trans agg len: {:,}".format(len(daily_trans)))
print("daily traffic agg len: {:,}".format(len(traffic_agg)))
print("merged dailies len: {:,}".format(len(dailies)))

daily trans agg len: 806,394
daily traffic agg len: 2,603,024
merged dailies len: 2,581,724


In [38]:
dailies['conversion'] = (dailies['trans_count_onetime'] + dailies['trans_count_recurring']) / dailies['views']
dailies['conversion_onetime'] = dailies['trans_count_onetime'] / dailies['views']
dailies['conversion_recurring'] = dailies['trans_count_recurring'] / dailies['views']

In [39]:
# removing outliers
dailies = dailies[dailies['conversion']<1.]

In [40]:
dailies = dailies[dailies['date']>=START_DATE]

In [41]:
# limiting to prior report range
dailies = dailies[dailies['date']<='2023-02-01']

In [42]:
# need to tag embed & new template by prior entries for 0 transaction days
dailies['is new template'] = dailies[['form', 'date']].apply(is_new_template, axis=1)

In [43]:
dailies['is embed'] = dailies['form'].isin(embeds['form'].tolist())

In [45]:
dailies['date'].min(), dailies['date'].max()

(Timestamp('2020-10-01 00:00:00'), Timestamp('2023-02-01 00:00:00'))

In [60]:
dailies['trans_count'] = dailies['trans_count_onetime'] + dailies['trans_count_recurring']

In [62]:
dailies['trans_vol'] = dailies['trans_vol_onetime'] + dailies['trans_vol_recurring']

In [65]:
trans['month'] = trans['date'].dt.to_period('M')

In [106]:
#dailies.to_csv("dailies.csv", index=False)

# report

calculate data points from 01-2022 forward, report to include only 2023 with year over year changes

In [27]:
#dailies = pd.read_csv("dailies.csv")
bucket = 'qgiv-stats-data'
trans_all = get_dataframe_from_file(bucket, 'new_forms.trans.csv')
trans_all['date'] = pd.to_datetime(trans_all['date'])
dailies = get_dataframe_from_file(bucket, 'new_forms.dailies.csv')
dailies['date'] = pd.to_datetime(dailies['date'])
len(trans_all), len(dailies)

(1731780, 1530538)

In [28]:
def report(df, df_all_trans):
    df = df.copy().replace(np.inf, np.nan)
    return {
        'form sample size': len(df_all_trans['form'].unique().tolist()),
        'transactions': df_all_trans[df_all_trans['recurring']==0]['id'].nunique() + df['trans_count_recurring'].sum(),
        'transactions per form': df_all_trans.groupby('form')['id'].count().mean(),
        'conversion': df['conversion'].mean(),
        'conversion onetime': df['conversion_onetime'].mean(),
        'conversion recurring': df['conversion_recurring'].mean(),
        'mean transaction onetime': df['trans_vol_onetime'].sum() / df['trans_count_onetime'].sum(),
        'median transaction onetime': df_all_trans[~df_all_trans['is_recurring']]['amount'].median(),
        'mean transaction recurring': df['trans_vol_recurring'].sum() / df['trans_count_recurring'].sum(),
        'median transaction recurring': df_all_trans[df_all_trans['is_recurring']]['amount'].median(),
        'onetime/recurring': df['trans_count_onetime'].sum() / df['trans_count_recurring'].sum(),
        'pageviews': df['views'].sum(),
        'bounce rate': df['bounces'].sum() / df['views'].sum(),
        'transactions one time': len(df_all_trans[~df_all_trans['is_recurring']]),
        'transactions origin recurring': df['trans_count_recurring'].sum(), 
        'transactions all recurring': len(df_all_trans[df_all_trans['is_recurring']])
    }

In [29]:
# buid month ranges list to iterate through
month_ranges = []
for y in [2022, 2023]:
    for m in range(1, 13):
        y_1 = y_2 = y
        m_1 = m
        m_2 = m + 1
        if m == 12:
            m_2 = 1
            y_2 += 1
        month_ranges.append(("{:02d}-01-{}".format(m_1, y_1), ("{:02d}-01-{}".format(m_2, y_2))))

In [30]:
# build monthly dataframe from 01-2022 forward
monthly_data = None
for m in month_ranges:
    this_df = dailies[(dailies['date']>=m[0])&(dailies['date']<m[1])]
    this_df_all = trans_all[(trans_all['date']>=m[0])&(trans_all['date']<m[1])]
    
    print("{}: {:,}, {:,}".format(m, len(this_df), len(this_df_all)))
    
    new_template_data = report(this_df[this_df['is new template']], this_df_all[this_df_all['is new template']])
    old_template_data = report(this_df[~this_df['is new template']], this_df_all[~this_df_all['is new template']])
    
    # @TODO need to reorganize to have all month's data in single row
    #        currently split by new/old which makes YoY messy
    
    report_df = pd.DataFrame({'new template': new_template_data,
                             'old template': old_template_data})
    
    report_df = report_df.transpose()
    report_df['date'] = m[0]
    report_df['pageviews'] = report_df['pageviews'] / report_df['pageviews'].sum()
    
    monthly_data = pd.concat([monthly_data, report_df])

('01-01-2022', '02-01-2022'): 71,963, 79,363
('02-01-2022', '03-01-2022'): 76,662, 74,498
('03-01-2022', '04-01-2022'): 90,650, 97,508
('04-01-2022', '05-01-2022'): 87,194, 98,161
('05-01-2022', '06-01-2022'): 87,896, 85,247
('06-01-2022', '07-01-2022'): 81,237, 74,437
('07-01-2022', '08-01-2022'): 78,611, 64,978
('08-01-2022', '09-01-2022'): 89,365, 78,172
('09-01-2022', '10-01-2022'): 89,372, 88,534
('10-01-2022', '11-01-2022'): 93,300, 104,725
('11-01-2022', '12-01-2022'): 94,658, 142,529
('12-01-2022', '01-01-2023'): 86,495, 154,237
('01-01-2023', '02-01-2023'): 85,520, 68,642
('02-01-2023', '03-01-2023'): 59,700, 67,288
('03-01-2023', '04-01-2023'): 65,592, 88,317
('04-01-2023', '05-01-2023'): 68,900, 99,672
('05-01-2023', '06-01-2023'): 66,679, 85,447
('06-01-2023', '07-01-2023'): 61,952, 74,254
('07-01-2023', '08-01-2023'): 61,235, 70,399
('08-01-2023', '09-01-2023'): 33,557, 35,372
('09-01-2023', '10-01-2023'): 0, 0
('10-01-2023', '11-01-2023'): 0, 0
('11-01-2023', '12-01-2023'

  # Remove the CWD from sys.path while we load stuff.
  if sys.path[0] == "":
  
  


In [31]:
monthly_data = monthly_data.reset_index()
monthly_data['template'] = monthly_data['index'].str.replace(' template', '')
monthly_data.drop('index', axis=1, inplace=True)

In [32]:
vals_cols = [c for c in monthly_data.columns if c != 'template' and c != 'date']
monthly_pvt = monthly_data.pivot(index=['date'], columns='template', values=vals_cols)

In [33]:
monthly_pvt = monthly_pvt.reset_index().sort_values('date', ascending=True)

In [34]:
monthly_pvt.dropna()

Unnamed: 0_level_0,date,form sample size,form sample size,transactions,transactions,transactions per form,transactions per form,conversion,conversion,conversion onetime,...,pageviews,pageviews,bounce rate,bounce rate,transactions one time,transactions one time,transactions origin recurring,transactions origin recurring,transactions all recurring,transactions all recurring
template,Unnamed: 1_level_1,new,old,new,old,new,old,new,old,new,...,new,old,new,old,new,old,new,old,new,old
0,01-01-2022,3436.0,1132.0,52669.0,16541.0,16.967113,18.607774,0.041587,0.020853,0.037231,...,0.54826,0.45174,0.155348,0.310546,51487.0,15580.0,1182.0,961.0,6812.0,5484.0
1,01-01-2023,3600.0,691.0,53727.0,11377.0,15.658056,17.761216,0.036248,0.012052,0.033805,...,0.664114,0.335886,0.190505,0.365654,52876.0,11211.0,851.0,166.0,3493.0,1062.0
2,02-01-2022,3413.0,1110.0,48448.0,17875.0,15.539115,19.336036,0.039247,0.020897,0.035795,...,0.554658,0.445342,0.178075,0.306453,47440.0,16987.0,1008.0,888.0,5595.0,4476.0
3,02-01-2023,3857.0,661.0,52060.0,11735.0,14.195489,18.965204,0.048329,0.041881,0.045642,...,0.815274,0.184726,0.439296,0.435589,51157.0,11468.0,903.0,267.0,3595.0,1068.0
4,03-01-2022,3847.0,1172.0,59377.0,30197.0,16.645178,28.561433,0.040703,0.02125,0.037863,...,0.608912,0.391088,0.128371,0.306401,58339.0,29532.0,1038.0,665.0,5695.0,3942.0
5,03-01-2023,4432.0,463.0,73687.0,10646.0,17.430505,23.898488,0.056323,0.047188,0.053645,...,0.891316,0.108684,0.514186,0.480187,72117.0,10460.0,1570.0,186.0,5135.0,605.0
6,04-01-2022,3878.0,1209.0,61727.0,25839.0,17.816142,24.044665,0.044717,0.020203,0.041596,...,0.623345,0.376655,0.132283,0.335634,59483.0,25204.0,2244.0,635.0,9608.0,3866.0
7,04-01-2023,4483.0,294.0,79144.0,13703.0,19.104394,47.710884,0.055147,0.028765,0.05305,...,0.904805,0.095195,0.562804,0.530205,78175.0,13607.0,969.0,96.0,7470.0,420.0
8,05-01-2022,3861.0,1111.0,59201.0,18061.0,16.599068,19.044104,0.042509,0.018582,0.039678,...,0.6098,0.3902,0.136486,0.340485,58004.0,17487.0,1197.0,574.0,6085.0,3671.0
9,05-01-2023,4426.0,248.0,74704.0,8576.0,17.301853,35.762097,0.053063,0.030354,0.05154,...,0.927453,0.072547,0.554005,0.571471,74036.0,8507.0,668.0,69.0,2542.0,362.0


In [35]:
#monthly_pvt.dropna().to_csv("new_forms.yoy.csv", index=False)

In [36]:
#!rm 'new_forms.yoy.csv'