In [2]:
import sys, datetime
sys.path.append("../../../scripts/")
from s3_support import *

import pandas as pd
import numpy as np

# load data

In [3]:
START_DATE = '2019-10-01'

## logged upgrades for data integrity check

In [4]:
q = "select * from syslog_logs where message like '%Qgiv Form Template Upgraded%'"
form_upgrades = redshift_query_read(q, schema="production")[['org', 'form', 'created']]
forms_upgrades_ids_logged = form_upgrades['form'].unique().tolist()

## all new form templates

product: 1 == qgiv

template
- 0 == qgiv legacy
- 8 == new standard form

In [10]:
q = "select id, status, template, type as product from form"
all_forms = redshift_query_read(q, schema='production')

In [18]:
print("{:,} rows".format(len(all_forms)))
print("{:,} unique forms".format(len(all_forms['id'].unique())))
print("{:,} unique active forms".format(len(all_forms[all_forms['status']==1])))
print("{:,} unique year round forms".format(len(all_forms[all_forms['product']==1])))
print()
print("{:,} legacy templates; {:,} active legacy".format(len(all_forms[all_forms['template']==0]), len(all_forms[(all_forms['template']==0)&(all_forms['status']==1)])))
print("{:,} new standard form templates; {:,} active new stadard".format(len(all_forms[all_forms['template']==8]), len(all_forms[(all_forms['template']==8)&(all_forms['status']==1)])))

108,868 rows
108,868 unique forms
40,981 unique active forms
79,690 unique year round forms

2,162 legacy templates; 359 active legacy
56,376 new standard form templates; 27,521 active new stadard


In [11]:
all_forms.tail(2)

Unnamed: 0,id,status,template,product
108866,1026382,1,8,1
108867,1026401,9,7,1


In [60]:
new_forms_ids = all_forms[all_forms['template']==8]['id'].tolist()
year_round_form_ids = all_forms[all_forms['product']==1]['id'].tolist()

print("{:,} new standard template form IDs".format(len(new_forms_ids)))
print("{:,} year round form IDs".format(len(year_round_form_ids)))

56,376 new standard template form IDs
79,690 year round form IDs


## traffic

In [88]:
q = '''select
            date,
            form,
            sum(case when devicecategory='desktop' then views else 0 end) as pageviews_desktop,
            sum(case when devicecategory='mobile' or devicecategory='tablet' then views else 0 end) as pageviews_mobile,
            sum(views) as pageviews,
            sum(bounces) as bounces
        from ga
        where 
            date >= '{}' 
        group by date, form'''.format(START_DATE)
traffic = redshift_query_read(q, schema='production')
traffic['form'] = traffic['form'].astype(int)

In [89]:
print("{:,} rows".format(len(traffic)))
print("{} to {}".format(traffic['date'].min(), traffic['date'].max()))
print("{:,} unique forms".format(len(traffic['form'].unique())))

4,478,653 rows
2019-10-01 00:00:00 to 2024-02-28 00:00:00
62,506 unique forms


In [90]:
traffic.tail(3)

Unnamed: 0,date,form,pageviews_desktop,pageviews_mobile,pageviews,bounces
4478650,2022-02-18,951913,1,0,1,0
4478651,2022-08-22,958491,0,1,1,1
4478652,2021-10-23,934570,0,1,1,1


## transactions

In [119]:
q = '''select
            form,
            date,
            count(case when recurring=0 or recurring_origin=1 then id else null end) as trans_count,
            sum(amount) as trans_vol,
            count(case when recurring=0 then id else null end) as trans_onetime_count,
            sum(case when recurring=0 then amount else null end) as trans_onetime_vol,
            count(case when recurring_origin=1 then id else null end) as trans_rec_count,
            sum(case when recurring_origin=1 then amount else null end) as trans_rec_vol,
            count(case when useragent similar to '%(iPhone|iPad|Android)%' then id else null end) as mobile_count,
            count(case when useragent similar to '%(Macintosh|Windows)%' then id else null end) as desktop_count
        from transactions
        where
            status='A' and
            (source='don_form' or source='mobile') and
            date>='{}' 
        group by form, date'''.format(START_DATE)
trans = redshift_query_read(q, schema='production')

In [126]:
print("{:,} rows".format(len(trans)))
print("{:,} uniqe forms".format(len(trans['form'].unique())))
print("{} to {}".format(trans['date'].min(), trans['date'].max()))

2,236,870 rows
35,843 uniqe forms
2019-10-01 00:00:00 to 2024-03-27 00:00:00


In [121]:
trans['trans_onetime_vol'] = trans['trans_onetime_vol'].fillna(0)
trans['trans_rec_vol'] = trans['trans_rec_vol'].fillna(0)

In [122]:
trans.tail(3)

Unnamed: 0,form,date,trans_count,trans_vol,trans_onetime_count,trans_onetime_vol,trans_rec_count,trans_rec_vol,mobile_count,desktop_count
2236867,952644,2021-02-25,1,100.0,0,0.0,1,100.0,0,0
2236868,939167,2020-12-25,1,120.0,0,0.0,1,120.0,0,0
2236869,1009156,2024-03-01,1,50.0,0,0.0,1,50.0,0,0


## merge

In [123]:
mrgd = trans.merge(traffic, on=['form', 'date'])

filter to only year round forms

In [124]:
mrgd = mrgd[mrgd['form'].isin(year_round_form_ids)]

In [125]:
print("{:,} rows".format(len(mrgd)))
print("{:,} uniqe forms".format(len(mrgd['form'].unique())))
print("{} to {}".format(mrgd['date'].min(), mrgd['date'].max()))
print("{:,} rows w/ 0 page views".format(len(mrgd[mrgd['pageviews']==0])))

991,274 rows
25,581 uniqe forms
2019-10-01 00:00:00 to 2024-02-28 00:00:00
2,730 rows w/ 0 page views


we are primarily concerned with form conversion and transaction performance, so we drop any observations without page views

In [127]:
mrgd = mrgd[mrgd['pageviews']!=0]

we are aware google analytics misses page views, try to account for this by dropping observations with more transactions than page views

In [128]:
len_gt = len(mrgd[mrgd['trans_count']>mrgd['pageviews']])
perc_gt = len_gt / len(mrgd)

print("{:,} ({:.1f}%) observations trans > page views".format(len_gt, perc_gt * 100.))
mrgd = mrgd[mrgd['trans_count']<mrgd['pageviews']]

116,873 (11.8%) observations trans > page views


In [129]:
mrgd['conversion'] = mrgd['trans_count'] / mrgd['pageviews']
mrgd['conv_ot'] = mrgd['trans_onetime_count'] / mrgd['pageviews']
mrgd['conv_rec'] = mrgd['trans_rec_count'] / mrgd['pageviews']

In [130]:
mrgd.tail(3)

Unnamed: 0,form,date,trans_count,trans_vol,trans_onetime_count,trans_onetime_vol,trans_rec_count,trans_rec_vol,mobile_count,desktop_count,pageviews_desktop,pageviews_mobile,pageviews,bounces,conversion,conv_ot,conv_rec
1019674,937816,2021-03-01,1,50.0,0,0.0,1,50.0,0,0,5,0,5,0,0.2,0.0,0.2
1019675,1237,2021-01-15,1,56.25,0,0.0,1,56.25,0,0,8,11,19,0,0.052632,0.0,0.052632
1019676,955228,2021-01-15,1,25.0,0,0.0,1,25.0,0,0,0,2,2,2,0.5,0.0,0.5


### flag new standard form observations

In [131]:
def is_new_form_entry(daily_row):
    try:
        # check for log conversion
        if daily_row['form'] in form_upgrades['form'].unique().tolist():
            # is date after logged upgrade date?
            return daily_row['date'] >= form_upgrades[form_upgrades['form']==daily_row['form']]['created'].iloc[0]
        elif daily_row['form'] in new_forms_ids:
            # form created as new form template
            return True
        # not logged as upgrade or in new form template list
        return False
    except:
        print("error with form {}".format(daily_row['form']))
        raise Exception

In [133]:
mrgd['is_new_form'] = mrgd[['form', 'date']].apply(is_new_form_entry, axis=1)

In [159]:
'''
# trans count fix for ongoing recurring
mrgd['trans_count'] = mrgd['trans_onetime_count'] + mrgd['trans_rec_count']
mrgd['conversion'] = mrgd['trans_count'] / mrgd['pageviews']
mrgd['conv_ot'] = mrgd['trans_onetime_count'] / mrgd['pageviews']
mrgd['conv_rec'] = mrgd['trans_rec_count'] / mrgd['pageviews']
'''

### exploratory stats

In [160]:
mrgd.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
form,799030.0,774822.074708,348625.024405,1.0,835671.0,943765.0,973638.0,1023842.0
trans_count,799030.0,3.26409,13.833294,0.0,1.0,1.0,3.0,4037.0
trans_vol,799030.0,695.634858,3946.853689,0.0,50.0,150.0,500.0,1706259.0
trans_onetime_count,799030.0,3.12816,13.375697,0.0,1.0,1.0,3.0,4037.0
trans_onetime_vol,799030.0,599.617643,3751.046295,0.0,1.04,100.0,413.0,1685298.0
trans_rec_count,799030.0,0.13593,2.596877,0.0,0.0,0.0,0.0,1028.0
trans_rec_vol,799030.0,12.350111,275.466743,0.0,0.0,0.0,0.0,67976.2
mobile_count,799030.0,1.314623,7.776113,0.0,0.0,0.0,1.0,2493.0
desktop_count,799030.0,1.891373,7.769665,0.0,0.0,1.0,2.0,3050.0
pageviews_desktop,799030.0,24.104408,111.848229,0.0,3.0,7.0,19.0,18100.0


In [161]:
mrgd[mrgd['conversion']<1.].describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
form,799030.0,774822.074708,348625.024405,1.0,835671.0,943765.0,973638.0,1023842.0
trans_count,799030.0,3.26409,13.833294,0.0,1.0,1.0,3.0,4037.0
trans_vol,799030.0,695.634858,3946.853689,0.0,50.0,150.0,500.0,1706259.0
trans_onetime_count,799030.0,3.12816,13.375697,0.0,1.0,1.0,3.0,4037.0
trans_onetime_vol,799030.0,599.617643,3751.046295,0.0,1.04,100.0,413.0,1685298.0
trans_rec_count,799030.0,0.13593,2.596877,0.0,0.0,0.0,0.0,1028.0
trans_rec_vol,799030.0,12.350111,275.466743,0.0,0.0,0.0,0.0,67976.2
mobile_count,799030.0,1.314623,7.776113,0.0,0.0,0.0,1.0,2493.0
desktop_count,799030.0,1.891373,7.769665,0.0,0.0,1.0,2.0,3050.0
pageviews_desktop,799030.0,24.104408,111.848229,0.0,3.0,7.0,19.0,18100.0


### exploratory split by new/old template

In [162]:
print("New standard forms")
mrgd[(mrgd['is_new_form']==1)&(mrgd['conversion']<1.)].describe().transpose()

New standard forms


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
form,482837.0,837410.252787,307786.77737,1.0,930176.0,965593.0,986419.0,1023842.0
trans_count,482837.0,3.266922,14.956788,0.0,1.0,1.0,3.0,4037.0
trans_vol,482837.0,707.559086,4705.958849,0.0,50.0,142.1,500.0,1706259.0
trans_onetime_count,482837.0,3.139281,14.381639,0.0,1.0,1.0,3.0,4037.0
trans_onetime_vol,482837.0,616.356762,4467.562085,0.0,5.2,100.0,405.49,1685298.0
trans_rec_count,482837.0,0.127641,3.263325,0.0,0.0,0.0,0.0,1028.0
trans_rec_vol,482837.0,13.041062,324.504181,0.0,0.0,0.0,0.0,67976.2
mobile_count,482837.0,1.438593,9.02335,0.0,0.0,0.0,1.0,2493.0
desktop_count,482837.0,1.776279,7.603648,0.0,0.0,1.0,2.0,3050.0
pageviews_desktop,482837.0,21.794521,100.441006,0.0,2.0,7.0,18.0,18100.0


In [163]:
print("Legacy forms")
mrgd[(mrgd['is_new_form']==0)&(mrgd['conversion']<1.)].describe().transpose()

Legacy forms


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
form,316193.0,679247.890146,383872.719054,3.0,221249.0,928214.0,945995.0,1015161.0
trans_count,316193.0,3.259765,11.91503,0.0,1.0,1.0,3.0,1614.0
trans_vol,316193.0,677.426174,2355.205155,0.0,50.0,154.5,525.0,250000.0
trans_onetime_count,316193.0,3.111179,11.673521,0.0,1.0,1.0,3.0,1590.0
trans_onetime_vol,316193.0,574.056463,2253.213154,0.0,0.0,100.0,425.0,250000.0
trans_rec_count,316193.0,0.148586,0.882982,0.0,0.0,0.0,0.0,110.0
trans_rec_vol,316193.0,11.295005,175.935766,0.0,0.0,0.0,0.0,40540.5
mobile_count,316193.0,1.125316,5.330387,0.0,0.0,0.0,1.0,1143.0
desktop_count,316193.0,2.067127,8.01337,0.0,0.0,1.0,2.0,1036.0
pageviews_desktop,316193.0,27.631681,127.229672,0.0,3.0,7.0,20.0,17369.0


# report prep

In [164]:
dailies = mrgd.copy()

In [165]:
def report(df):
    # recurring frequency?
    df.replace(np.inf, np.nan, inplace=True)
    return {
        'form sample size': len(df['form'].unique().tolist()),
        'transactions': df['trans_count'].sum(),
        'conversion mean': df['conversion'].mean(),
        'conversion onetime mean': df['conv_ot'].mean(),
        'conversion recurring mean': df['conv_rec'].mean(),
        'conversion median': df['conversion'].median(),
        'conversion onetime median': df['conv_ot'].median(),
        'conversion recurring median': df['conv_rec'].median(),
        'mean transaction onetime': df['trans_onetime_vol'].sum() / df['trans_onetime_count'].sum(),
        'mean transaction recurring': df['trans_rec_vol'].sum() / df['trans_rec_count'].sum(),
        'onetime/recurring': df['trans_onetime_count'].sum() / df['trans_rec_count'].sum(),
        'pageviews': df['pageviews'].sum(),
        'bounce rate': df['bounces'].sum() / df['pageviews'].sum()
    }

# report output

In [166]:
data_all = {
    'all': report(dailies),
    'new template': report(dailies[dailies['is_new_form']]),
    'old template': report(dailies[~dailies['is_new_form']])
}

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  method=method,


## all

In [167]:
print("{:%Y-%m-%d} +".format(dailies['date'].min()))
df = pd.DataFrame(data_all)
df = df.transpose()

df['pageviews_perc'] = df['pageviews'] / df['pageviews']['all']
df['pageviews'] = df['pageviews_perc']
df['transactions_perc'] = df['transactions'] / df['transactions']['all']
df['transactions'] = df['transactions_perc']

df.drop(['pageviews_perc', 'transactions_perc'], axis=1, inplace=True)
df.transpose().drop('all', axis=1)

2019-10-01 +


Unnamed: 0,new template,old template
form sample size,19911.0,7537.0
transactions,0.604803,0.395197
conversion mean,0.148964,0.14649
conversion onetime mean,0.143415,0.140311
conversion recurring mean,0.005549,0.006178
conversion median,0.095238,0.1
conversion onetime median,0.089286,0.090909
conversion recurring median,0.0,0.0
mean transaction onetime,196.336923,184.514127
mean transaction recurring,102.169519,76.016377


## last 90 days

In [177]:
ninety_days_ago = dailies['date'].max() - datetime.timedelta(90)
dailies_last_90 = dailies[dailies['date']>=ninety_days_ago]
data_last_90_days = {
    'all': report(dailies_last_30),
    'new template': report(dailies_last_90[dailies_last_90['is_new_form']==1]),
    'old template': report(dailies_last_90[dailies_last_90['is_new_form']==0])
}

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  method=method,


In [190]:
print("Last 90 Days ({} to {})".format(dailies_last_90['date'].min(), dailies_last_90['date'].max()))
df = pd.DataFrame(data_last_90_days)
df = df.transpose()

df['pageviews_perc'] = df['pageviews'] / df['pageviews']['all']
df['pageviews'] = df['pageviews_perc']
df['transactions_perc'] = df['transactions'] / df['transactions']['all']
df['transactions'] = df['transactions_perc']

df.drop(['pageviews_perc', 'transactions_perc'], axis=1, inplace=True)
df.transpose().drop('all', axis=1)

Last 90 Days (2023-11-30 00:00:00 to 2024-02-28 00:00:00)


Unnamed: 0,new template,old template
form sample size,5170.0,89.0
transactions,3.686384,0.046578
conversion mean,0.204841,0.104307
conversion onetime mean,0.199275,0.101446
conversion recurring mean,0.005565,0.00286
conversion median,0.166667,0.0
conversion onetime median,0.153846,0.0
conversion recurring median,0.0,0.0
mean transaction onetime,213.651042,235.056384
mean transaction recurring,93.031662,64.186761


In [179]:
dailies_last_90[dailies_last_90['is_new_form']==0]['pageviews'].agg(['mean', 'median'])

mean      31.764521
median    18.000000
Name: pageviews, dtype: float64

In [180]:
dailies_last_90[dailies_last_90['is_new_form']==0]['trans_count'].agg(['mean', 'median', 'sum'])

mean         3.623234
median       0.000000
sum       2308.000000
Name: trans_count, dtype: float64

## last 30 days

In [172]:
thirty_days_ago = dailies['date'].max() - datetime.timedelta(30)
dailies_last_30 = dailies[dailies['date']>=thirty_days_ago]
data_last_30_days = {
    'all': report(dailies_last_30),
    'new template': report(dailies_last_30[dailies_last_30['is_new_form']==1]),
    'old template': report(dailies_last_30[dailies_last_30['is_new_form']==0])
}

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  method=method,


In [173]:
print("Last 30 Days")
df = pd.DataFrame(data_last_30_days)
df = df.transpose()

df['pageviews_perc'] = df['pageviews'] / df['pageviews']['all']
df['pageviews'] = df['pageviews_perc']
df['transactions_perc'] = df['transactions'] / df['transactions']['all']
df['transactions'] = df['transactions_perc']

df.drop(['pageviews_perc', 'transactions_perc'], axis=1, inplace=True)
df.transpose().drop('all', axis=1)

Last 30 Days


Unnamed: 0,new template,old template
form sample size,2262.0,27.0
transactions,0.991504,0.008496
conversion mean,0.181911,0.065467
conversion onetime mean,0.176185,0.062825
conversion recurring mean,0.005726,0.002641
conversion median,0.139303,0.0
conversion onetime median,0.130435,0.0
conversion recurring median,0.0,0.0
mean transaction onetime,148.545619,181.9028
mean transaction recurring,84.608456,71.462381


In [174]:
dailies_last_30[dailies_last_30['is_new_form']==0]['pageviews'].agg(['mean', 'median'])

mean      33.180645
median    17.000000
Name: pageviews, dtype: float64

In [175]:
dailies_last_30[dailies_last_30['is_new_form']==0]['trans_count'].agg(['mean', 'median', 'sum'])

mean        2.716129
median      0.000000
sum       421.000000
Name: trans_count, dtype: float64

In [188]:
new = dailies_last_30[dailies_last_30['is_new_form']==1]
old = dailies_last_30[dailies_last_30['is_new_form']==0]

new_five_perc = int(len(new)/10)
old_five_perc = int(len(old)/10)

new = new.sort_values('conversion').iloc[new_five_perc:-new_five_perc]
old = old.sort_values('conversion').iloc[old_five_perc:-old_five_perc]

print("{} to {}".format(new['date'].min(), new['date'].max()))
print("\tnew: {:.2f}% mean, {:.2f}% median, {:,} sample size, {:,.2f} mean pageviews per form".format(new['conversion'].mean() * 100., new['conversion'].median() * 100., len(new['form'].unique()), new['pageviews'].mean()))
print("\told: {:.2f}% mean, {:.2f}% median, {:,} sample size, {:,.2f} mean pageviews per form".format(old['conversion'].mean() * 100., old['conversion'].median() * 100., len(old['form'].unique()), old['pageviews'].mean()))

2024-01-29 00:00:00 to 2024-02-28 00:00:00
	new: 15.60% mean, 13.93% median, 1,978 sample size, 36.20 mean pageviews per form
	old: 4.70% mean, 0.00% median, 24 sample size, 36.64 mean pageviews per form


In [195]:
new_grpd = new.groupby('form')[['trans_count', 'trans_onetime_count', 'trans_rec_vol', 'pageviews']].sum().reset_index()
new_grpd['conversion'] = new_grpd['trans_count'] / new_grpd['pageviews']
new_grpd['conversion'].agg(['mean', 'median'])

mean      0.153593
median    0.138889
Name: conversion, dtype: float64

In [197]:
new['conv_desktop'] = new['desktop_count'] / new['pageviews_desktop']
new['conv_mobile'] = new['mobile_count'] / new['pageviews_mobile']
new[['conv_desktop', 'conv_mobile']].agg(['mean', 'median'])

Unnamed: 0,conv_desktop,conv_mobile
mean,inf,inf
median,0.166667,0.111111


In [198]:
old['conv_desktop'] = old['desktop_count'] / old['pageviews_desktop']
old['conv_mobile'] = old['mobile_count'] / old['pageviews_mobile']
old[['conv_desktop', 'conv_mobile']].agg(['mean', 'median'])

Unnamed: 0,conv_desktop,conv_mobile
mean,inf,0.022599
median,0.107377,0.0


In [199]:
old['pageviews_mobile'].mean(), old['pageviews_desktop'].mean()

(24.384, 12.256)