notes:
- have not been excluding receipts, so conversion is actually higher than has been previously reported
- outer merge didn't work as documented when joining traffic and transaction data, 0 transaction days were dropped from traffic, _this was the core issue_
- few gaps (week-ish long) in traffic data, forms w/ no traffic (all examples found so far have been single transactions, but still need to recover data)
- did not version/store prior dataset builds, can't say with any certainty which reports were meaningfully effected and which were not

In [1]:
import pandas as pd
import numpy as np
import datetime

import sys
sys.path.insert(1, '../../scripts/')
from s3_support import *

In [2]:
%matplotlib inline

# checking sample size

In [91]:
# form status
q = "select id, status from form"
all_forms = redshift_query_read(q, schema='production')
active_forms = all_forms[all_forms['status']==1]['id'].tolist()

In [93]:
# logged upgrades
q = "select * from syslog_logs where message like '%Qgiv Form Template Upgraded%'"
form_upgrades = redshift_query_read(q, schema="production")[['org', 'form', 'created']]
forms_upgrades_ids_logged = form_upgrades['form'].unique().tolist()

In [94]:
# created new template forms
bucket = "qgiv-stats-data"
new_form_template_list = "form_download new template.csv"
new_forms = get_dataframe_from_file(bucket, new_form_template_list)

new_forms_ids = new_forms[new_forms['Status']=='active']['Form ID'].unique().tolist()

In [96]:
q_trans_forms = '''select
                        distinct form
                    from transactions
                    where
                        date>='{}' and
                        recurring=0 and
                        status='A' and 
                        (source='don_form' or source='mobile')'''
trans_forms = redshift_query_read(q_trans_forms.format(START_DATE), schema='public')
trans_forms = trans_forms['form'].unique().tolist()

In [98]:
trans_template_download_overlap = [f for f in trans_forms if f in new_forms_ids]

print("{:,} forms in transactions".format(len(trans_forms)))
print("{:,} forms in new template download".format(len(new_forms_ids)))
print("{:,} forms in new template upgrades".format(len(forms_upgrades_ids_logged)))
print("{:,} active forms".format(len(active_forms)))
print()
print("{:,} forms intersection transactions & new template download".format(len(trans_template_download_overlap)))

9,797 forms in transactions
5,154 forms in new template download
2,740 forms in new template upgrades
26,194 active forms

3,373 forms intersection transactions & new template download


# looking for forms w/ no traffic

In [58]:
START_DATE = '2020-10-01'

In [72]:
q = "select distinct form from googleanalytics_traffic where date>='{}'".format(START_DATE)
traffic_forms = redshift_query_read(q, schema='public')['form'].tolist()

In [73]:
q = "select distinct form from transactions where date>='{}' and recurring=0".format(START_DATE)
trans_forms = redshift_query_read(q, schema='public')['form'].tolist()

In [74]:
# what forms have transactions and no traffic?
non_overlap_forms = [f for f in trans_forms if f not in traffic_forms]

In [75]:
print("{:,} forms in traffic, {:,} forms in transactions".format(len(traffic_forms), len(trans_forms)))
print("{:,} forms in transactions that are not in traffic".format(len(non_overlap_forms)))

8,782 forms in traffic, 13,329 forms in transactions
10,024 forms in transactions that are not in traffic


In [76]:
non_overlap_forms[-10:]

[965011,
 972995,
 949137,
 964046,
 958576,
 967571,
 968285,
 963751,
 972056,
 975073]

### spot checking

#### 965011

In [77]:
test_form = 965011
q = "select * from googleanalytics_traffic where form={}".format(test_form)
redshift_query_read(q, schema='public')

Unnamed: 0,date,org,form,views,sessions,sessionduration,bounces,path,devicecategory,controlpanel,qgiv_frontend,p2p_frontend


In [80]:
q = "select id, date, status, amount, recurring, source from transactions where form={} and date>='{}'".format(test_form, START_DATE)
redshift_query_read(q, schema='public')

Unnamed: 0,id,date,status,amount,recurring,source
0,10857943,2020-11-23,A,26.25,0,don_form


#### 972995

In [81]:
test_form = 972995
q = "select * from googleanalytics_traffic where form={}".format(test_form)
redshift_query_read(q, schema='public')

Unnamed: 0,date,org,form,views,sessions,sessionduration,bounces,path,devicecategory,controlpanel,qgiv_frontend,p2p_frontend


In [82]:
q = "select id, date, status, amount, recurring, source from transactions where form={} and date>='{}'".format(test_form, START_DATE)
redshift_query_read(q, schema='public')

Unnamed: 0,id,date,status,amount,recurring,source
0,12410977,2021-05-24,A,1.0,0,don_form


#### 949137

In [83]:
test_form = 949137
q = "select * from googleanalytics_traffic where form={}".format(test_form)
redshift_query_read(q, schema='public')

Unnamed: 0,date,org,form,views,sessions,sessionduration,bounces,path,devicecategory,controlpanel,qgiv_frontend,p2p_frontend


In [84]:
q = "select id, date, status, amount, recurring, source from transactions where form={} and date>='{}'".format(test_form, START_DATE)
redshift_query_read(q, schema='public')

Unnamed: 0,id,date,status,amount,recurring,source
0,12416677,2021-05-25,A,312.75,0,don_form


#### 964046

In [85]:
test_form = 964046
q = "select * from googleanalytics_traffic where form={}".format(test_form)
redshift_query_read(q, schema='public')

Unnamed: 0,date,org,form,views,sessions,sessionduration,bounces,path,devicecategory,controlpanel,qgiv_frontend,p2p_frontend


In [86]:
q = "select id, date, status, amount, recurring, source from transactions where form={} and date>='{}'".format(test_form, START_DATE)
redshift_query_read(q, schema='public')

Unnamed: 0,id,date,status,amount,recurring,source
0,10795136,2020-11-16,A,1.0,0,don_form


#### 963751

In [87]:
test_form = 963751
q = "select * from googleanalytics_traffic where form={}".format(test_form)
redshift_query_read(q, schema='public')

Unnamed: 0,date,org,form,views,sessions,sessionduration,bounces,path,devicecategory,controlpanel,qgiv_frontend,p2p_frontend


In [88]:
q = "select id, date, status, amount, recurring, source from transactions where form={} and date>='{}'".format(test_form, START_DATE)
redshift_query_read(q, schema='public')

Unnamed: 0,id,date,status,amount,recurring,source
0,11963275,2021-04-06,CR,5000.0,0,vt


#### 975073

In [89]:
test_form = 975073
q = "select * from googleanalytics_traffic where form={}".format(test_form)
redshift_query_read(q, schema='public')

Unnamed: 0,date,org,form,views,sessions,sessionduration,bounces,path,devicecategory,controlpanel,qgiv_frontend,p2p_frontend


In [90]:
q = "select id, date, status, amount, recurring, source from transactions where form={} and date>='{}'".format(test_form, START_DATE)
redshift_query_read(q, schema='public')

Unnamed: 0,id,date,status,amount,recurring,source
0,12749868,2021-07-15,A,1.0,0,don_form


# spot checking

### load dailies data

In [56]:
dailies = pd.read_csv("dailies.csv")

In [57]:
dailies[dailies['is new template']].groupby('form')['date'].count().reset_index().tail()

Unnamed: 0,form,date
559,943463,44
560,943544,7
561,943985,7
562,944136,17
563,944141,1


### form 943463

In [44]:
print("Entries for form 943463: {}".format(len(dailies[dailies['form']==943463])))
dailies[dailies['form']==943463]['is new template'].value_counts()

Entries for form 943463: 226


False    182
True      44
Name: is new template, dtype: int64

In [46]:
dailies[dailies['form']==943463].groupby('is new template')[['conversion', 'views', 'trans_count_onetime']].mean().reset_index()

Unnamed: 0,is new template,conversion,views,trans_count_onetime
0,False,0.0,28.32967,0.0
1,True,0.10776,112.204545,1.590909


### form 654

In [45]:
print("Entries for form 654: {}".format(len(dailies[dailies['form']==654])))
dailies[dailies['form']==654]['is new template'].value_counts()

Entries for form 654: 251


False    148
True     103
Name: is new template, dtype: int64

In [47]:
dailies[dailies['form']==654].groupby('is new template')[['conversion', 'views', 'trans_count_onetime']].mean().reset_index()

Unnamed: 0,is new template,conversion,views,trans_count_onetime
0,False,0.0,9.689189,0.0
1,True,0.159885,17.126214,1.708738


In [48]:
cols = ['date', 'trans_count_onetime', 'views', 'conversion']
dailies[(dailies['form']==654)&(dailies['is new template'])][cols]

Unnamed: 0,date,trans_count_onetime,views,conversion
6744,2020-10-02,2.0,7.0,0.285714
6745,2020-10-05,4.0,20.0,0.200000
6746,2020-10-06,1.0,8.0,0.125000
6747,2020-10-11,1.0,3.0,0.333333
6748,2020-10-18,0.0,2.0,1.000000
...,...,...,...,...
6842,2021-06-15,0.0,27.0,0.037037
6843,2021-06-17,1.0,8.0,0.125000
6844,2021-06-21,1.0,13.0,0.076923
6845,2021-06-22,1.0,9.0,0.111111


In [53]:
# 2020-10-07, 08, 09, 10
q_traffic = "select form, date, views, devicecategory, path from googleanalytics_traffic where form={} and date>='{}' and date<='{}'"
redshift_query_read(q_traffic.format(654, '2020-10-07', '2020-10-10'), schema='public')

Unnamed: 0,form,date,views,devicecategory,path
0,654,2020-10-07,1,mobile,/for/ronaldmcdonaldmm/embed
1,654,2020-10-08,1,desktop,/for/ronaldmcdonaldmm/embed
2,654,2020-10-07,2,desktop,/for/ronaldmcdonaldmm/embed
3,654,2020-10-10,1,desktop,/for/ronaldmcdonaldmm/embed
4,654,2020-10-08,1,mobile,/for/ronaldmcdonaldmm/embed
5,654,2020-10-10,1,mobile,/for/ronaldmcdonaldmm/embed/receipt/tf9f252ekp...
6,654,2020-10-10,1,mobile,/for/ronaldmcdonaldmm/embed


In [50]:
# 2020-10-05	4.0
q_trans = "select id, form, date, amount, recurring, status from transactions where form={} and date='{}'"
redshift_query_read(q_trans.format(654, '2020-10-05'), schema='public')

Unnamed: 0,id,form,date,amount,recurring,status
0,10460210,654,2020-10-05,36.38,0,D
1,10460242,654,2020-10-05,164.0,0,A
2,10462440,654,2020-10-05,103.95,0,A
3,10460538,654,2020-10-05,20.0,0,A
4,10460215,654,2020-10-05,36.38,0,A
5,10460211,654,2020-10-05,36.38,0,D


In [51]:
# 2020-10-05	20
q_traffic = "select form, date, views, devicecategory, path from googleanalytics_traffic where form={} and date='{}'"
redshift_query_read(q_traffic.format(654, '2020-10-05'), schema='public')

Unnamed: 0,form,date,views,devicecategory,path
0,654,2020-10-05,13,desktop,/for/ronaldmcdonaldmm/embed
1,654,2020-10-05,1,desktop,/for/ronaldmcdonaldmm/embed/receipt/rscsq7phwg...
2,654,2020-10-05,1,desktop,/for/ronaldmcdonaldmm/embed/receipt/sk36fffjwy...
3,654,2020-10-05,1,tablet,/for/ronaldmcdonaldmm/embed/receipt/ye5gyxae5a...
4,654,2020-10-05,1,desktop,/for/ronaldmcdonaldmm/embed/receipt/k5bfsecgw6...
5,654,2020-10-05,1,desktop,/for/ronaldmcdonaldmm/embed/receipt/jj26g42tcb...
6,654,2020-10-05,2,tablet,/for/ronaldmcdonaldmm/embed


### form 943985

In [30]:
print("Entries for form 943985: {}".format(len(dailies[dailies['form']==943985])))
dailies[dailies['form']==943985]['is new template'].value_counts()

Entries for form 943985: 88


False    81
True      7
Name: is new template, dtype: int64

In [31]:
dailies[dailies['form']==943985].groupby('is new template')[['conversion', 'views', 'trans_count_onetime']].mean().reset_index()

Unnamed: 0,is new template,conversion,views,trans_count_onetime
0,False,0.0,6.419753,0.0
1,True,0.197959,6.285714,1.142857


In [32]:
cols = ['date', 'trans_count_onetime', 'views', 'conversion']
dailies[(dailies['form']==943985)&(dailies['is new template'])][cols]

Unnamed: 0,date,trans_count_onetime,views,conversion
92757,2020-12-08,1.0,4.0,0.25
92758,2020-12-16,1.0,7.0,0.142857
92759,2020-12-18,1.0,10.0,0.1
92760,2020-12-20,1.0,4.0,0.25
92761,2020-12-29,2.0,8.0,0.25
92762,2020-12-31,1.0,7.0,0.142857
92763,2021-01-08,1.0,4.0,0.25


In [33]:
# 2020-12-08	1.0
q_trans = "select id, form, date, amount, recurring from transactions where form={} and date='{}'"
redshift_query_read(q_trans.format(943985, '2020-12-08'), schema='public')

Unnamed: 0,id,form,date,amount,recurring
0,11030210,943985,2020-12-08,60.0,0


In [34]:
# 2020-12-08	4.0
q_traffic = "select form, date, views, devicecategory, path from googleanalytics_traffic where form={} and date='{}'"
redshift_query_read(q_traffic.format(943985, '2020-12-08'), schema='public')

Unnamed: 0,form,date,views,devicecategory,path
0,943985,2020-12-08,1,desktop,/for/ccampaig/
1,943985,2020-12-08,1,desktop,/for/ccampaig/receipt/w483jehew2fra2rxw9qfryab...
2,943985,2020-12-08,1,desktop,/for/ccampaig/embed/
3,943985,2020-12-08,1,desktop,/for/ccampaig/widgetize/1921606/?form=943985&e...


In [35]:
# 2020-12-29	2.0
q_trans = "select id, form, date, amount, recurring from transactions where form={} and date='{}'"
redshift_query_read(q_trans.format(943985, '2020-12-29'), schema='public')

Unnamed: 0,id,form,date,amount,recurring
0,11213490,943985,2020-12-29,1000.0,0
1,11215775,943985,2020-12-29,100.0,0


In [36]:
# 2020-12-29	8.0
q_traffic = "select form, date, views, devicecategory, path from googleanalytics_traffic where form={} and date='{}'"
redshift_query_read(q_traffic.format(943985, '2020-12-29'), schema='public')

Unnamed: 0,form,date,views,devicecategory,path
0,943985,2020-12-29,1,desktop,/for/ccampaig/receipt/x592958jrr64bw9aaxwb7hxr...
1,943985,2020-12-29,2,desktop,/for/ccampaig/
2,943985,2020-12-29,2,desktop,/for/ccampaig/embed/
3,943985,2020-12-29,2,desktop,/for/ccampaig/widgetize/1921606/?form=943985&e...
4,943985,2020-12-29,1,desktop,/for/ccampaig/receipt/rqxe4b4d7e7jjsd27gped6sk...


In [39]:
# 2020-12-31	1.0
q_trans = "select id, form, date, amount, recurring from transactions where form={} and date='{}'"
redshift_query_read(q_trans.format(943985, '2020-12-31'), schema='public')

Unnamed: 0,id,form,date,amount,recurring
0,11248118,943985,2020-12-31,200.0,0


In [40]:
# 2020-12-31	7.0
q_traffic = "select form, date, views, devicecategory, path from googleanalytics_traffic where form={} and date='{}'"
redshift_query_read(q_traffic.format(943985, '2020-12-31'), schema='public')

Unnamed: 0,form,date,views,devicecategory,path
0,943985,2020-12-31,2,mobile,/for/ccampaig/
1,943985,2020-12-31,3,mobile,/for/ccampaig/receipt/pfbd46rx5twwk8t9w2eeqad4...
2,943985,2020-12-31,1,mobile,/for/ccampaig/widgetize/1921606/?form=943985&e...
3,943985,2020-12-31,1,mobile,/for/ccampaig/embed/


### form 944136

In [8]:
print("Entries for form 944136: {}".format(len(dailies[dailies['form']==944136])))
dailies[dailies['form']==944136]['is new template'].value_counts()

Entries for form 944136: 117


False    100
True      17
Name: is new template, dtype: int64

In [12]:
dailies[dailies['form']==944136].groupby('is new template')[['conversion', 'views', 'trans_count_onetime']].mean().reset_index()

Unnamed: 0,is new template,conversion,views,trans_count_onetime
0,False,0.04562,6.06,0.34
1,True,0.205322,6.941176,0.882353


In [14]:
cols = ['date', 'trans_count_onetime', 'views', 'conversion']
dailies[(dailies['form']==944136)&(dailies['is new template'])][cols]

Unnamed: 0,date,trans_count_onetime,views,conversion
92797,2021-04-01,0.0,6.0,0.166667
92798,2021-04-08,1.0,7.0,0.142857
92799,2021-04-17,1.0,7.0,0.142857
92800,2021-04-18,1.0,12.0,0.083333
92801,2021-05-05,1.0,10.0,0.1
92802,2021-05-06,0.0,15.0,0.066667
92803,2021-05-07,2.0,8.0,0.25
92804,2021-05-10,1.0,10.0,0.1
92805,2021-05-11,1.0,6.0,0.166667
92806,2021-05-12,1.0,3.0,0.333333


In [17]:
# 2021-05-07	2.0
q_trans = "select id, form, date, amount, recurring from transactions where form={} and date='{}'"
redshift_query_read(q_trans.format(944136, '2021-05-07'), schema='public')

Unnamed: 0,id,form,date,amount,recurring
0,12254749,944136,2021-05-07,386.25,0
1,12265586,944136,2021-05-07,400.0,0


In [20]:
# 2021-05-07	8.0
q_traffic = "select form, date, views, devicecategory, path from googleanalytics_traffic where form={} and date='{}'"
redshift_query_read(q_traffic.format(944136, '2021-05-07'), schema='public')

Unnamed: 0,form,date,views,devicecategory,path
0,944136,2021-05-07,1,desktop,/for/kks/receipt/ysy55jy3grp6b3wr98pj44wakftc45hd
1,944136,2021-05-07,2,desktop,/for/kks/
2,944136,2021-05-07,1,mobile,/for/kks/embed/
3,944136,2021-05-07,1,mobile,/for/kks/receipt/grbygyg2ghkhga3kkc8x73q6ytqqcc73
4,944136,2021-05-07,2,desktop,/for/kks/embed/
5,944136,2021-05-07,1,mobile,/for/kks/


In [21]:
# 2021-05-10	1.0
q_trans = "select id, form, date, amount, recurring from transactions where form={} and date='{}'"
redshift_query_read(q_trans.format(944136, '2021-05-10'), schema='public')

Unnamed: 0,id,form,date,amount,recurring
0,12295824,944136,2021-05-10,386.25,0


In [22]:
# 2021-05-10	10
q_traffic = "select form, date, views, devicecategory, path from googleanalytics_traffic where form={} and date='{}'"
redshift_query_read(q_traffic.format(944136, '2021-05-10'), schema='public')

Unnamed: 0,form,date,views,devicecategory,path
0,944136,2021-05-10,2,desktop,/for/kks/embed/
1,944136,2021-05-10,2,mobile,/for/kks/embed/
2,944136,2021-05-10,2,desktop,/for/kks/
3,944136,2021-05-10,1,mobile,/for/kks/receipt/p5992psy4jfb8dk9dy6bhj89cyperdp9
4,944136,2021-05-10,2,mobile,/for/kks/
5,944136,2021-05-10,1,mobile,/for/kks/receipt/jcpyffrrrxtfk5xhk875req3j9jyjtx3


In [23]:
# 2021-04-17	1.0
q_trans = "select id, form, date, amount, recurring from transactions where form={} and date='{}'"
redshift_query_read(q_trans.format(944136, '2021-04-17'), schema='public')

Unnamed: 0,id,form,date,amount,recurring
0,12058112,944136,2021-04-17,386.25,0


In [24]:
# 2021-04-17	7.0
q_traffic = "select form, date, views, devicecategory, path from googleanalytics_traffic where form={} and date='{}'"
redshift_query_read(q_traffic.format(944136, '2021-04-17'), schema='public')

Unnamed: 0,form,date,views,devicecategory,path
0,944136,2021-04-17,1,desktop,/for/kks/receipt/aq5hpqxa93e8673fgshxff522p4j6j8b
1,944136,2021-04-17,3,desktop,/for/kks/embed/
2,944136,2021-04-17,3,desktop,/for/kks/
