In [18]:
import sys
sys.path.append("../../scripts/")
from s3_support import *

import pandas as pd
import numpy as np

# upgrades from logs

In [19]:
q = "select * from syslog_logs where message like '%Qgiv Form Template Upgraded%'"
df = redshift_query_read(q, schema="production")

In [20]:
df['created'].max(), df['created'].min()

(Timestamp('2021-06-01 14:47:33'), Timestamp('2020-12-09 15:30:17'))

In [21]:
cols = ['id', 'org', 'form', 'created', 'message']
df[cols].tail()

Unnamed: 0,id,org,form,created,message
6803,40827250,447161,973599,2021-05-25 16:53:40,Qgiv Form Template Upgraded
6804,40876081,445882,973655,2021-05-27 10:01:51,Qgiv Form Template Upgraded
6805,40885809,447175,973691,2021-05-27 16:23:07,Qgiv Form Template Upgraded
6806,41016045,240,973752,2021-06-01 10:53:28,Qgiv Form Template Upgraded
6807,41020855,444551,949362,2021-06-01 14:00:24,Qgiv Form Template Upgraded


In [22]:
msk = df['message']=='Qgiv Form Template Upgraded'

df[msk][cols].head()

Unnamed: 0,id,org,form,created,message
0,36261262,445450,956860,2020-12-09 19:53:48,Qgiv Form Template Upgraded
1,36277866,446466,966619,2020-12-10 10:37:20,Qgiv Form Template Upgraded
2,36261262,445450,956860,2020-12-09 19:53:48,Qgiv Form Template Upgraded
3,36277866,446466,966619,2020-12-10 10:37:20,Qgiv Form Template Upgraded
4,36305671,443473,940976,2020-12-11 10:44:20,Qgiv Form Template Upgraded


In [23]:
converted_forms_dates = df[msk][['form', 'created']]
len(converted_forms_dates), len(converted_forms_dates['form'].unique())

(6808, 2298)

In [24]:
converted_forms_dates.drop_duplicates('form', keep='first', inplace=True)

In [25]:
#converted_forms_dates.to_csv("converted_forms_dates.csv", index=False)

In [74]:
# mercy ships form
converted_forms_dates[converted_forms_dates['form']==972972]

Unnamed: 0,form,converted


In [None]:
# fake upgrade entry for mercy ships
converted_forms_dates.append({
    'form': 972972,
    'created': '2021-05-15'
})

# multi-step vs single page

In [26]:
q = '''select date, form, appearance, cta_before, cta_after, conditional_fields
        from analyticsqgiv_weekly
        where date_part('year', date) = 2021
        order by date asc;'''
nuform_extra = redshift_query_read(q, schema="public") 

In [27]:
nuform_extra.tail(3)

Unnamed: 0,date,form,appearance,cta_before,cta_after,conditional_fields
231636,2021-04-05,967292,2,1,0,0
231637,2021-04-05,940126,1,0,0,0
231638,2021-04-05,968804,2,1,1,0


# recurring frequencies

In [28]:
converted_forms_list = converted_forms_dates['form'].unique().tolist()

def is_recurring_new_form(r):
    if r['form'] in converted_forms_list:
        if r['created'] >= converted_forms_dates[converted_forms_dates['form']==r['form']]['created'].iloc[0]:
            return True
    return False

In [29]:
# load recurring frequencies
rec_frequencies = pd.read_csv("recurring_frequencies.csv")
rec_frequencies['created'] = pd.to_datetime(rec_frequencies['created'])
rec_frequencies['frequency'] = pd.to_timedelta(rec_frequencies['frequency'])
rec_frequencies['new template'] = rec_frequencies.apply(is_recurring_new_form, axis=1)

In [30]:
#rec_frequencies.to_csv("recurring_frequencies.csv", index=False)

In [31]:
print(rec_frequencies['new template'].value_counts())
print(rec_frequencies['new template'].value_counts(normalize=True))

False    16504
True       411
Name: new template, dtype: int64
False    0.975702
True     0.024298
Name: new template, dtype: float64


In [32]:
print(rec_frequencies[rec_frequencies['new template']]['frequency'].mean())
print(rec_frequencies[~rec_frequencies['new template']]['frequency'].mean())

13 days 11:09:14.391651450
25 days 22:56:12.419087691


# device category conversions

In [33]:
start_date = '2020-09-01'

In [34]:
q = '''select 
            form,
            date,
            recurring,
            amount,
            source,
            useragent
        from transactions
        where
            status='A' and
            (source='don_form' or source='mobile') and
            recurring!=0 and
            date >= '{}'
        order by date asc
        '''.format(start_date)
rec = redshift_query_read(q, schema='public')
rec = rec.groupby('recurring').first().reset_index()
rec['useragent'].fillna('', inplace=True)
rec['is_recurring'] = True
rec.head(3)

Unnamed: 0,recurring,form,date,amount,source,useragent,is_recurring
0,544,55,2020-09-18,5.0,don_form,,True
1,764,55,2020-09-08,10.0,don_form,,True
2,1157,188,2020-09-06,78.65,don_form,,True


In [35]:
# load transactions
q = '''select 
            form,
            date,
            amount,
            source,
            useragent
        from transactions
        where
            status='A' and
            (source='don_form' or source='mobile') and
            recurring=0 and
            date >= '{}'
        '''.format(start_date)
trans = redshift_query_read(q, schema='public')
trans['useragent'].fillna('', inplace=True)
trans['is_recurring'] = False
trans.tail(3)

Unnamed: 0,form,date,amount,source,useragent,is_recurring
801936,931026,2021-06-01,20.0,mobile,Mozilla50iPhoneCPUiPhoneOS144likeMacOSXAppleWe...,False
801937,48174,2021-06-01,225.0,mobile,Mozilla50iPhoneCPUiPhoneOS144likeMacOSXAppleWe...,False
801938,393,2021-06-01,700.0,don_form,Mozilla50iPhoneCPUiPhoneOS1361likeMacOSXAppleW...,False


In [36]:
# merge onetime and recurring
trans = trans.append(rec)

In [37]:
len(trans[trans['useragent']=='']), len(trans[trans['useragent']=='']) / len(trans)

(159597, 0.1780185361381075)

In [38]:
keys = ['iPhone', 'iPad', 'Android', 'Macintosh', 'Windows', 'CrOS']
for k in keys:
    trans['{}'.format(k)] = trans['useragent'].str.contains(k)

In [39]:
trans[keys].value_counts()

iPhone  iPad   Android  Macintosh  Windows  CrOS 
False   False  False    False      True     False    328177
                                   False    False    165655
True    False  False    False      False    False    161662
False   False  False    True       False    False    146306
               True     False      False    False     73883
        True   False    False      False    False     12658
        False  False    False      False    True       8166
True    True   False    False      False    False        10
False   False  True     False      True     False         2
dtype: int64

In [40]:
trans['is_mobile'] = (trans['iPhone'])|(trans['iPad'])|(trans['Android'])
trans['is_desktop'] = (trans['Macintosh'])|(trans['Windows'])|(trans['CrOS'])

In [41]:
print("Unassigned by user agent string:")
print("\tmix of Linux, Cold Fusion, empty user agent, etc.")
trans[~trans['is_mobile']&~trans['is_desktop']]['source'].value_counts()

Unassigned by user agent string:
	mix of Linux, Cold Fusion, empty user agent, etc.


don_form    147733
mobile       17922
Name: source, dtype: int64

In [42]:
# defaulting to source for ambiguous/missing useragent
trans_assigned = trans[trans['is_mobile']|trans['is_desktop']].copy()
trans_unassigned = trans[~trans['is_mobile']&~trans['is_desktop']].copy()
trans_unassigned['is_mobile'] = trans_unassigned['source']=='mobile'
trans_unassigned['is_desktop'] = trans_unassigned['source']=='don_form'
trans = trans_assigned.append(trans_unassigned)

In [43]:
trans[(~trans['is_mobile'])&(~trans['is_desktop'])]['source'].value_counts()

Series([], Name: source, dtype: int64)

In [44]:
trans[['is_mobile', 'is_desktop']].value_counts()

is_mobile  is_desktop
False      True          630382
True       False         266135
           True               2
dtype: int64

In [45]:
trans.source.value_counts()

don_form    767884
mobile      128635
Name: source, dtype: int64

In [46]:
trans_src = trans.groupby(['form', 'date', 'is_mobile', 'is_desktop', 'is_recurring']).agg({
    'useragent': 'count',
    'amount': 'sum'
})
trans_src = trans_src.reset_index()
trans_src['trans_count'] = trans_src['useragent']
trans_src['trans_vol'] = trans_src['amount']
trans_src['source'] = trans_src[['is_mobile', 'is_desktop']].apply(lambda x: 'mobile' if x['is_mobile'] else 'desktop', axis=1)
trans_src.drop(['useragent', 'amount', 'is_mobile', 'is_desktop'], axis=1, inplace=True)
trans_src.head()

Unnamed: 0,form,date,is_recurring,trans_count,trans_vol,source
0,1,2020-09-01,False,1,21.0,mobile
1,1,2020-09-03,False,1,122.0,desktop
2,1,2020-09-04,False,1,144.0,desktop
3,1,2020-09-07,False,1,44.8,desktop
4,1,2020-09-08,False,1,99.98,desktop


In [47]:
trans_src['desktop_trans_count'] = trans_src[['source', 'trans_count']].apply(lambda x: x['trans_count'] if x['source']=='desktop' else 0, axis=1)
trans_src['desktop_trans_vol'] = trans_src[['source', 'trans_vol']].apply(lambda x: x['trans_vol'] if x['source']=='desktop' else 0, axis=1)

trans_src['mobile_trans_count'] = trans_src[['source', 'trans_count']].apply(lambda x: x['trans_count'] if x['source']=='mobile' else 0, axis=1)
trans_src['mobile_trans_vol'] = trans_src[['source', 'trans_vol']].apply(lambda x: x['trans_vol'] if x['source']=='mobile' else 0, axis=1)

In [48]:
trans_src_cols = ['form', 'date', 'is_recurring', 'desktop_trans_count', 'desktop_trans_vol', 
       'mobile_trans_count', 'mobile_trans_vol']
trans_src[trans_src_cols].tail(3)

Unnamed: 0,form,date,is_recurring,desktop_trans_count,desktop_trans_vol,mobile_trans_count,mobile_trans_vol
301609,973723,2021-05-31,False,0,0.0,1,41.2
301610,973731,2021-05-31,False,6,0.0,0,0.0
301611,973731,2021-06-01,False,2,0.0,0,0.0


In [49]:
trans_src['onetime_desktop'] = trans_src[['is_recurring', 'desktop_trans_count']].apply(lambda x: x['desktop_trans_count'] if not x['is_recurring'] else 0, axis=1)
trans_src['onetime_mobile'] = trans_src[['is_recurring', 'mobile_trans_count']].apply(lambda x: x['mobile_trans_count'] if not x['is_recurring'] else 0, axis=1)
trans_src['recurring_desktop'] = trans_src[['is_recurring', 'desktop_trans_count']].apply(lambda x: x['desktop_trans_count'] if x['is_recurring'] else 0, axis=1)
trans_src['recurring_mobile'] = trans_src[['is_recurring', 'mobile_trans_count']].apply(lambda x: x['mobile_trans_count'] if x['is_recurring'] else 0, axis=1)

In [50]:
grpd_cols = ['onetime_desktop', 'onetime_mobile', 'recurring_desktop', 'recurring_mobile']
trans_src.groupby(['form', 'date'])[grpd_cols].sum().reset_index().head()

Unnamed: 0,form,date,onetime_desktop,onetime_mobile,recurring_desktop,recurring_mobile
0,1,2020-09-01,0,1,0,0
1,1,2020-09-03,1,0,0,0
2,1,2020-09-04,1,0,0,0
3,1,2020-09-07,1,0,0,0
4,1,2020-09-08,1,2,0,0


In [51]:
# load traffic
q = '''select
            date,
            form,
            devicecategory,
            sum(views) as pageviews
        from googleanalytics_traffic
        where 
            qgiv_frontend=1 and
            date >= '{}'
        group by date, devicecategory, form'''.format(start_date)
traffic = redshift_query_read(q, schema='public')

In [52]:
traffic.tail()

Unnamed: 0,date,form,devicecategory,pageviews
666121,2021-05-10,265283,tablet,1
666122,2021-05-09,928241,mobile,1
666123,2021-05-08,930177,mobile,1
666124,2021-05-14,939053,desktop,1
666125,2021-05-20,939603,mobile,1


In [53]:
df_dc = traffic.pivot(index=['date', 'form'], columns='devicecategory', values='pageviews').reset_index().fillna(0)
# merging tablet & mobile
df_dc['mobile'] = df_dc['mobile'] + df_dc['tablet']
df_dc['desktop_pageviews'] = df_dc['desktop'].astype(int)
df_dc['mobile_pageviews'] = df_dc['mobile'].astype(int)
df_dc.drop(['tablet', 'desktop', 'mobile'], axis=1, inplace=True)
df_dc.head()

devicecategory,date,form,desktop_pageviews,mobile_pageviews
0,2020-09-01,1,2,11
1,2020-09-01,3,1,4
2,2020-09-01,9,5,0
3,2020-09-01,17,4,2
4,2020-09-01,18,3,0


In [54]:
# merging data
device_conversion = df_dc.merge(trans_src.groupby(['form', 'date'])[grpd_cols].sum().reset_index(), on=['form', 'date'], how='outer').fillna(0)
device_conversion['onetime_desktop_conversion'] = device_conversion['onetime_desktop'] / device_conversion['desktop_pageviews']
device_conversion['onetime_mobile_conversion'] = device_conversion['onetime_mobile'] / device_conversion['mobile_pageviews']
device_conversion['recurring_desktop_conversion'] = device_conversion['recurring_desktop'] / device_conversion['desktop_pageviews']
device_conversion['recurring_mobile_conversion'] = device_conversion['recurring_mobile'] / device_conversion['mobile_pageviews']
device_conversion.head()

Unnamed: 0,date,form,desktop_pageviews,mobile_pageviews,onetime_desktop,onetime_mobile,recurring_desktop,recurring_mobile,onetime_desktop_conversion,onetime_mobile_conversion,recurring_desktop_conversion,recurring_mobile_conversion
0,2020-09-01,1,2.0,11.0,0.0,1.0,0.0,0.0,0.0,0.090909,0.0,0.0
1,2020-09-01,3,1.0,4.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,2020-09-01,9,5.0,0.0,1.0,0.0,1.0,0.0,0.2,,0.2,
3,2020-09-01,17,4.0,2.0,1.0,0.0,4.0,0.0,0.25,0.0,1.0,0.0
4,2020-09-01,18,3.0,0.0,1.0,0.0,0.0,0.0,0.333333,,0.0,


In [55]:
onetime_desktop_conv = device_conversion['onetime_desktop_conversion'].replace(np.inf, np.nan).mean()
onetime_mobile_conv = device_conversion['onetime_mobile_conversion'].replace(np.inf, np.nan).mean()
recurring_desktop_conv = device_conversion['recurring_desktop_conversion'].replace(np.inf, np.nan).mean()
recurring_mobile_conv = device_conversion['recurring_mobile_conversion'].replace(np.inf, np.nan).mean()

print("One time:")
print("Desktop: {:.2f}%".format(onetime_desktop_conv * 100.))
print("Mobile: {:.2f}%".format(onetime_mobile_conv * 100.))
print()
print("Recurring:")
print("Desktop: {:.2f}%".format(recurring_desktop_conv * 100.))
print("Mobile: {:.2f}%".format(recurring_mobile_conv * 100.))

One time:
Desktop: 7.48%
Mobile: 4.36%

Recurring:
Desktop: 1.78%
Mobile: 0.36%


In [56]:
#device_conversion.to_csv("device_conversions.csv", index=False)

# traffic overall breakdown by device category

In [57]:
start_date = '2020-09-01'
end_date = '2021-04-15'

In [58]:
# load traffic
q = '''select
            devicecategory,
            sum(views) as pageviews
        from googleanalytics_traffic
        where 
            qgiv_frontend=1 and
            date >= '{}'
        group by devicecategory'''.format(start_date)
traffic_device = redshift_query_read(q, schema='public')

In [59]:
traffic_device['percentage'] = traffic_device['pageviews'] / traffic_device['pageviews'].sum()
print("Device category traffic overall:")
traffic_device

Device category traffic overall:


Unnamed: 0,devicecategory,pageviews,percentage
0,mobile,3362197,0.373518
1,tablet,255675,0.028404
2,desktop,5383571,0.598079


In [60]:
# reload form conversions
converted_forms_dates = pd.read_csv("converted_forms_dates.csv")
converted_forms_dates['converted'] = pd.to_datetime(converted_forms_dates['created'])
converted_forms_dates.drop('created', axis=1, inplace=True)

In [61]:
# load traffic by form
q = '''select
            date,
            form,
            devicecategory,
            sum(views) as pageviews
        from googleanalytics_traffic
        where 
            qgiv_frontend=1 and
            date >= '{}'
        group by devicecategory, date, form'''.format(start_date)
traffic_device = redshift_query_read(q, schema='public')
traffic_device['date'] = pd.to_datetime(traffic_device['date'])

In [62]:
converted_forms = converted_forms_dates['form'].unique().tolist()

def is_form_converted_date(r):
    if r['form'] in converted_forms:
        if r['date'] >= converted_forms_dates[converted_forms_dates['form']==r['form']]['converted'].iloc[0]:
            return True
    return False
    
traffic_device['is_new_form'] = traffic_device[['date', 'form']].apply(is_form_converted_date, axis=1)

### device category traffic for relevant period

In [63]:
pvt_device_traffic = traffic_device.groupby(['is_new_form', 'devicecategory'])['pageviews'].sum().reset_index().pivot(index='devicecategory', columns='is_new_form', values='pageviews').reset_index()

In [64]:
pvt_device_traffic['old form'] = pvt_device_traffic[False]
pvt_device_traffic['new form'] = pvt_device_traffic[True]
pvt_device_traffic.drop([True, False], axis=1, inplace=True)
pvt_device_traffic['old form perc'] = pvt_device_traffic['old form'] / pvt_device_traffic['old form'].sum()
pvt_device_traffic['new form perc'] = pvt_device_traffic['new form'] / pvt_device_traffic['new form'].sum()

In [65]:
print("New vs. old form device category traffic breakdown:")
pvt_device_traffic

New vs. old form device category traffic breakdown:


is_new_form,devicecategory,old form,new form,old form perc,new form perc
0,desktop,5259840,123731,0.596954,0.650138
1,mobile,3298626,63571,0.37437,0.33403
2,tablet,252662,3013,0.028675,0.015832


### device category by month for relevant period

In [66]:
traffic_device['month'] = traffic_device['date'].dt.to_period('M')
traffic_device.head(3)

Unnamed: 0,date,form,devicecategory,pageviews,is_new_form,month
0,2021-02-15,828502,desktop,4,False,2021-02
1,2021-03-11,923771,desktop,18,False,2021-03
2,2021-02-17,923771,tablet,2,False,2021-02


In [67]:
grp_traffic = traffic_device.groupby(['is_new_form', 'devicecategory', 'month'])['pageviews'].sum().reset_index()
pvt_traffic = grp_traffic.pivot(columns='devicecategory', index=['month', 'is_new_form'], values='pageviews').reset_index()

In [68]:
pvt_traffic['desktop_traffic'] = pvt_traffic['desktop']
pvt_traffic['mobile_traffic'] = pvt_traffic['mobile']
pvt_traffic['tablet_traffic'] = pvt_traffic['tablet']
pvt_traffic.drop(['desktop', 'mobile', 'tablet'], axis=1, inplace=True)

In [69]:
grp_forms = traffic_device.groupby(['is_new_form', 'devicecategory', 'month'])['form'].count().reset_index()
pvt_forms = grp_forms.pivot(columns='devicecategory', index=['month', 'is_new_form'], values='form').reset_index()

In [70]:
pvt_forms['desktop_forms'] = pvt_forms['desktop']
pvt_forms['mobile_forms'] = pvt_forms['mobile']
pvt_forms['tablet_forms'] = pvt_forms['tablet']
pvt_forms.drop(['desktop', 'mobile', 'tablet'], axis=1, inplace=True)

In [71]:
monthly_device_nuold = pvt_forms.merge(pvt_traffic, on=['month', 'is_new_form'])
monthly_device_nuold['desktop_traffic_per_form'] = monthly_device_nuold['desktop_traffic'] / monthly_device_nuold['desktop_forms']
monthly_device_nuold['mobile_traffic_per_form'] = monthly_device_nuold['mobile_traffic'] / monthly_device_nuold['mobile_forms']
monthly_device_nuold['tablet_traffic_per_form'] = monthly_device_nuold['tablet_traffic'] / monthly_device_nuold['tablet_forms']

In [72]:
cols = ['month', 'is_new_form', 'desktop_traffic', 
        'desktop_traffic_per_form', 'mobile_traffic',
        'mobile_traffic_per_form', 'tablet_traffic', 
        'tablet_traffic_per_form']
monthly_device_nuold[cols].pivot(index='month', columns='is_new_form', values=['desktop_traffic', 'desktop_traffic_per_form', 'mobile_traffic', 'mobile_traffic_per_form', 'tablet_traffic', 'tablet_traffic_per_form']).fillna(0).reset_index()

Unnamed: 0_level_0,month,desktop_traffic,desktop_traffic,desktop_traffic_per_form,desktop_traffic_per_form,mobile_traffic,mobile_traffic,mobile_traffic_per_form,mobile_traffic_per_form,tablet_traffic,tablet_traffic,tablet_traffic_per_form,tablet_traffic_per_form
is_new_form,Unnamed: 1_level_1,False,True,False,True,False,True,False,True,False,True,False,True
0,2020-09,604244.0,0.0,12.773094,0.0,340298.0,0.0,11.596851,0.0,30960.0,0.0,3.905146,0.0
1,2020-10,634908.0,0.0,13.242424,0.0,390979.0,0.0,13.30132,0.0,33582.0,0.0,4.342126,0.0
2,2020-11,775237.0,0.0,16.494755,0.0,653166.0,0.0,21.734527,0.0,47342.0,0.0,5.728703,0.0
3,2020-12,888548.0,849.0,18.191549,6.685039,628913.0,450.0,18.910127,6.923077,53864.0,36.0,5.923678,2.4
4,2021-01,518159.0,6083.0,12.401786,9.065574,299862.0,4024.0,11.059713,10.589474,22606.0,249.0,3.700442,2.706522
5,2021-02,491750.0,13446.0,13.009603,10.696897,242317.0,8725.0,9.832698,11.465177,18379.0,519.0,3.557685,4.086614
6,2021-03,275135.0,40607.0,13.062479,40.165183,150642.0,8007.0,11.357207,15.7,11193.0,296.0,3.898642,3.217391
7,2021-04,378706.0,33184.0,11.436432,16.038666,261211.0,19509.0,11.775278,15.557416,16922.0,904.0,3.695567,3.217082
8,2021-05,693153.0,29562.0,19.413875,10.649135,331238.0,22856.0,12.647499,11.922796,17814.0,1009.0,3.853342,2.826331
