In [1]:
import sys, datetime
sys.path.append("../../scripts/")
from s3_support import *

import pandas as pd
import numpy as np
%matplotlib inline

# old method

no source filter on transactions

In [2]:
q = '''select 
            form, 
            org, 
            sum(amount) as volume, 
            count(id) as count, 
            date 
        from transactions 
        where 
            status='A' and
            creatingtransactionfor=0 and
            recurring=0
        group by form, org, date'''
trans = redshift_query_read(q, schema="public")
trans['date'] = pd.to_datetime(trans['date'])

In [3]:
q = '''select
            date, 
            org,
            form,
            sum(views) as pageviews,
            sum(bounces) as bounces
        from googleanalytics_traffic
        where qgiv_frontend=1
        group by date, org, form'''
traffic = redshift_query_read(q, schema="public")
traffic['date'] = pd.to_datetime(traffic['date'])

In [4]:
# isolate active qgiv forms
form_onetime_counts = trans.groupby('form')['count'].sum().reset_index()
active_qgiv_forms = form_onetime_counts[form_onetime_counts['count']>0]['form'].unique().tolist()
len(active_qgiv_forms), len(trans['form'].unique())

trans = trans[trans['form'].isin(active_qgiv_forms)]

In [5]:
# limit data to matched dates
trans = trans[trans['date']>=traffic['date'].min()]
df = trans.merge(traffic, on=['form', 'org', 'date'], how='outer')
df.fillna(0, inplace=True)

In [6]:
df['conversion'] = df['count'] / df['pageviews']

df.tail(3)

Unnamed: 0,form,org,volume,count,date,pageviews,bounces,conversion
3841419,413763,185347,0.0,0.0,2021-05-14,2.0,0.0,0.0
3841420,814177,123907,0.0,0.0,2021-05-24,1.0,1.0,0.0
3841421,939977,441669,0.0,0.0,2021-05-15,1.0,1.0,0.0


In [7]:
len(df[df['conversion']==np.inf]), df[df['conversion']!=np.inf]['conversion'].mean(), df['conversion'].median()

(441613, 0.056141909876467284, 0.0)

In [8]:
# last 30 days
thirty_days_ago = datetime.datetime.now() - pd.to_timedelta("30day")
last_30_days = df[df['date']>=thirty_days_ago]
len(last_30_days[last_30_days['conversion']==np.inf]), last_30_days[last_30_days['conversion']!=np.inf]['conversion'].mean(), last_30_days['conversion'].median()

(17620, 0.06446798027104912, 0.0)

# localize transactions

In [9]:
# query onetime
q = '''select 
            form, 
            org, 
            sum(amount) as volume_onetime, 
            count(id) as count_onetime, 
            date 
        from transactions 
        where 
            status='A' and
            creatingtransactionfor=0 and
            recurring=0
        group by form, org, date'''
trans_onetime = redshift_query_read(q, schema="public")
trans_onetime['date'] = pd.to_datetime(trans_onetime['date'])

In [10]:
# query recurring
q = '''select 
            form, 
            org, 
            amount, 
            id as recurring, 
            date 
        from transactions 
        where 
            status='A' and
            recurring!=0
        order by date asc'''
trans_rec = redshift_query_read(q, schema="public")
trans_rec['date'] = pd.to_datetime(trans_rec['date'])
trans_rec.drop_duplicates(subset=['recurring'], keep='first', inplace=True)

In [11]:
# aggregate recurring
trans_rec = trans_rec.groupby(['form', 'org', 'date']).agg({'amount': 'sum', 'id': 'count'}).reset_index()
trans_rec['count_recurring'] = trans_rec['id']
trans_rec['volume_recurring'] = trans_rec['amount']
trans_rec.drop(['id', 'amount'], axis=1, inplace=True)

In [12]:
# merge one time & recurring
trans = trans_onetime.merge(trans_rec, on=['form', 'org', 'date'])

In [13]:
trans.head(3)

Unnamed: 0,form,org,volume_onetime,count_onetime,date,count_recurring,volume_recurring
0,63,89,87.0,5,2010-01-08,1,5.0
1,453,481,2752.0,9,2009-10-15,15,1020.0
2,453,481,17360.0,151,2009-07-29,5,145.0


In [14]:
trans['date'].min(), trans['date'].max()

(Timestamp('2006-06-02 00:00:00'), Timestamp('2021-06-01 00:00:00'))

# localize traffic

In [15]:
q = '''select
            date, 
            org,
            form,
            sum(views) as pageviews
        from googleanalytics_traffic
        where qgiv_frontend=1
        group by date, org, form'''
traffic = redshift_query_read(q, schema="public")
traffic['date'] = pd.to_datetime(traffic['date'])

In [16]:
traffic.head(3)

Unnamed: 0,date,org,form,pageviews
0,2021-03-10,427123,828502,44
1,2021-02-11,438482,923771,40
2,2021-02-23,555,939242,28


In [17]:
traffic['date'].min(), traffic['date'].max()

(Timestamp('2016-12-08 00:00:00'), Timestamp('2021-05-31 00:00:00'))

# localize form settings (appearance, CTA, conditional fields)

In [18]:
q = '''select date, form, appearance, cta_before, cta_after, conditional_fields
        from analyticsqgiv_weekly
        where date_part('year', date) = 2021
        order by date asc;'''
nuform_extra = redshift_query_read(q, schema="public")

In [19]:
print(len(nuform_extra)), print(len(nuform_extra['form'].unique()))
nuform_extra.tail(3)

231639
34030


Unnamed: 0,date,form,appearance,cta_before,cta_after,conditional_fields
231636,2021-04-05,967292,2,1,0,0
231637,2021-04-05,940126,1,0,0,0
231638,2021-04-05,968804,2,1,1,0


# merging data

In [20]:
q = "select id, type from form where status=1"
forms = redshift_query_read(q, schema="production")
active_qgiv_forms = forms[forms['type']==1]['id'].tolist()

In [21]:
# isolate active qgiv forms
len(active_qgiv_forms), len(trans['form'].unique())

trans = trans[trans['form'].isin(active_qgiv_forms)]

In [22]:
# limit data to matched dates
trans = trans[trans['date']>=traffic['date'].min()]
df = trans.merge(traffic, on=['form', 'org', 'date'], how='outer')
df.fillna(0, inplace=True)

In [23]:
nuform_extra_ls = nuform_extra['form'].unique().tolist()

def get_appearance(r):
    if r['form'] in nuform_extra_ls:
        msk_form = nuform_extra['form']==r['form']
        msk_date = nuform_extra['date']>=r['date']
        if len(nuform_extra[(msk_form)&(msk_date)]) > 0:
            return nuform_extra[(msk_form)&(msk_date)]['appearance'].iloc[0]
    return 0

def get_cta_before(r):
    if r['form'] in nuform_extra_ls:
        msk_form = nuform_extra['form']==r['form']
        msk_date = nuform_extra['date']>=r['date']
        if len(nuform_extra[(msk_form)&(msk_date)]) > 0:
            return nuform_extra[(msk_form)&(msk_date)]['cta_before'].iloc[0]
    return 0

def get_cta_after(r):
    if r['form'] in nuform_extra_ls:
        msk_form = nuform_extra['form']==r['form']
        msk_date = nuform_extra['date']>=r['date']
        if len(nuform_extra[(msk_form)&(msk_date)]) > 0:
            return nuform_extra[(msk_form)&(msk_date)]['cta_after'].iloc[0]
    return 0

def get_conditionals(r):
    if r['form'] in nuform_extra_ls:
        msk_form = nuform_extra['form']==r['form']
        msk_date = nuform_extra['date']>=r['date']
        if len(nuform_extra[(msk_form)&(msk_date)]) > 0:
            return nuform_extra[(msk_form)&(msk_date)]['conditional_fields'].iloc[0]
    return 0

In [24]:
xtra_msk = (nuform_extra['appearance']!=0)|(nuform_extra['cta_before']!=0)|(nuform_extra['cta_after']!=0)|(nuform_extra['conditional_fields']!=0)
df_msk = (nuform_extra['form'].isin(df['form'].unique().tolist()))
print(len(nuform_extra[xtra_msk&df_msk]['form'].unique()))
print(len(df['form'].unique()))
nuform_extra.tail(3)

14395
15806


Unnamed: 0,date,form,appearance,cta_before,cta_after,conditional_fields
231636,2021-04-05,967292,2,1,0,0
231637,2021-04-05,940126,1,0,0,0
231638,2021-04-05,968804,2,1,1,0


In [25]:
# merge form settings (appearance, CTA, conditional fields)
df['appearance'] = df[['form', 'date']].apply(get_appearance, axis=1)
print("done with appearance")
df['recurring_cta_before'] = df[['form', 'date']].apply(get_cta_before, axis=1)
print("done with CTA before")
df['recurring_cta_after'] = df[['form', 'date']].apply(get_cta_after, axis=1)
print("done with CTA after")
df['conditional_fields'] = df[['form', 'date']].apply(get_conditionals, axis=1)
print("done with conditional fields")

done with appearance
done with CTA before
done with CTA after
done with conditional fields


In [26]:
for c in ['appearance', 'recurring_cta_before', 'recurring_cta_after', 'conditional_fields']:
    print(c)
    print(df[c].mean())
    print(df[c].value_counts())
    print()

appearance
0.005591061053257664
0    3419548
1      14804
2       2205
Name: appearance, dtype: int64

recurring_cta_before
0.0005016648930892169
0    3434833
1       1724
Name: recurring_cta_before, dtype: int64

recurring_cta_after
8.00219522039064e-05
0    3436282
1        275
Name: recurring_cta_after, dtype: int64

conditional_fields
7.187426252496322e-05
0     3436470
1          38
2          22
3           7
5           7
4           4
10          4
9           3
13          2
Name: conditional_fields, dtype: int64



In [27]:
len(df), df['date'].min(), df['date'].max()

(3436557, Timestamp('2016-12-08 00:00:00'), Timestamp('2021-06-01 00:00:00'))

In [28]:
df['conversion_onetime'] = df['count_onetime'] / df['pageviews']
df['conversion_recurring'] = df['count_recurring'] / df['pageviews']

df.tail(3)

Unnamed: 0,form,org,volume_onetime,count_onetime,date,count_recurring,volume_recurring,pageviews,appearance,recurring_cta_before,recurring_cta_after,conditional_fields,conversion_onetime,conversion_recurring
3436554,525378,9154,0.0,0.0,2021-05-24,0.0,0.0,1.0,0,0,0,0,0.0,0.0
3436555,814177,123907,0.0,0.0,2021-05-11,0.0,0.0,2.0,0,0,0,0,0.0,0.0
3436556,928464,436247,0.0,0.0,2021-05-28,0.0,0.0,1.0,0,0,0,0,0.0,0.0


In [29]:
# dropping inf values
print("inf rows: {}".format(len(df[df['conversion_onetime']==np.inf])))

df = df[df['pageviews']>0]

inf rows: 36683


In [30]:
#df.to_csv("forms_conversion.csv", index=False)

# by device category

In [31]:
q = '''select
            date, 
            org,
            form,
            devicecategory,
            sum(views) as pageviews
        from googleanalytics_traffic
        where qgiv_frontend=1
        group by date, org, form, devicecategory'''
traffic_device = redshift_query_read(q, schema="public")
traffic_device['date'] = pd.to_datetime(traffic_device['date'])

In [32]:
df_device = trans.merge(traffic_device, on=['form', 'org', 'date'], how='outer')
df_device.fillna(0, inplace=True)

In [33]:
len(df_device), df_device['date'].min(), df_device['date'].max()

(5350625, Timestamp('2016-12-08 00:00:00'), Timestamp('2021-06-01 00:00:00'))

In [34]:
df_device['conversion_onetime'] = df_device['count_onetime'] / df_device['pageviews']
df_device['conversion_recurring'] = df_device['count_recurring'] / df_device['pageviews']

df_device.tail(3)

Unnamed: 0,form,org,volume_onetime,count_onetime,date,count_recurring,volume_recurring,devicecategory,pageviews,conversion_onetime,conversion_recurring
5350622,350274,175108,0.0,0.0,2021-05-24,0.0,0.0,desktop,1.0,0.0,0.0
5350623,871522,29748,0.0,0.0,2021-05-04,0.0,0.0,mobile,1.0,0.0,0.0
5350624,930279,442118,0.0,0.0,2021-05-28,0.0,0.0,mobile,1.0,0.0,0.0


In [35]:
# dropping inf values
print("inf rows: {}".format(len(df_device[df_device['conversion_onetime']==np.inf])))

df_device = df_device[df_device['pageviews']>0]

inf rows: 36720


In [36]:
#df_device.to_csv("forms_device_conversion.csv", index=False)