In [117]:
import sys, datetime
sys.path.append("../../scripts/")
from s3_support import *

import pandas as pd
import numpy as np

# load data

#### all new form templates

In [118]:
bucket = "qgiv-stats-data"
new_form_template_list = "form_download new template.csv"
new_forms = get_dataframe_from_file(bucket, new_form_template_list)

In [119]:
print("{} new template forms".format(len(new_forms)))
print("{} active".format(len(new_forms[new_forms['Status']=='active'])))

6510 new template forms
4745 active


In [120]:
new_forms.head()

Unnamed: 0,Form ID,Go Live Date,Status
0,1,05/18/2006,active
1,159827,,demo
2,159829,,demo
3,408644,05/15/2015,demo
4,625735,04/01/2015,demo


#### mercy ships

In [2]:
# mercy ships
# - new form key: ihruxh
TARGET_ORG = 436247
NEW_FORMS = [972972]
COMPARE_FORMS = [967125]

START_DATE = '2021-05-15'
START_DATE_BROAD = '2021-01-01'

TRAFFIC_LOAD_PATH = "../../scripts/export google analytics/update/mercy_ships_new_form.csv"

In [3]:
# load traffic
q = '''select
            date,
            form,
            devicecategory,
            views as pageviews,
            bounces
        from googleanalytics_traffic
        where 
            qgiv_frontend=1 and
            date >= '{}' and
            org = {}'''.format(START_DATE_BROAD, TARGET_ORG)
traffic = redshift_query_read(q, schema='public')

xtra = pd.read_csv(TRAFFIC_LOAD_PATH)
xtra['pageviews'] = xtra['views']
xtra.drop('views', axis=1, inplace=True)
traffic = traffic.append(xtra)

traffic['form'] = traffic['form'].astype(int)

In [6]:
"{:,}".format(len(traffic)), "{:,}".format(len(traffic[(traffic['form'].isin(NEW_FORMS)|(traffic['form'].isin(COMPARE_FORMS)))]))

('17,816', '10,602')

In [7]:
# load transactions
q = '''select 
            form,
            date,
            amount,
            source,
            useragent
        from transactions
        where
            status='A' and
            (source='don_form' or source='mobile') and
            recurring=0 and
            date >= '{}' and
            org = {}
        '''.format(START_DATE_BROAD, TARGET_ORG)
trans = redshift_query_read(q, schema='public')
trans['useragent'].fillna('', inplace=True)
trans['is_recurring'] = False

In [8]:
q = '''select 
            form,
            date,
            recurring,
            amount,
            source,
            useragent
        from transactions
        where
            status='A' and
            (source='don_form' or source='mobile') and
            recurring!=0 and
            date >= '{}' and
            org = {}
        order by date asc
        '''.format(START_DATE_BROAD, TARGET_ORG)
rec = redshift_query_read(q, schema='public')
rec = rec.groupby('recurring').first().reset_index()
rec['useragent'].fillna('', inplace=True)
rec['is_recurring'] = True

In [9]:
trans = trans.append(rec)

In [10]:
# flagging mobile & desktop
trans['is_mobile'] = (trans['useragent'].str.contains('iPhone').fillna(False))|(trans['useragent'].str.contains('iPad').fillna(False))|(trans['useragent'].str.contains('Android').fillna(False))
trans['is_desktop'] = (trans['useragent'].str.contains('Macintosh').fillna(False))|(trans['useragent'].str.contains('Windows').fillna(False))|(trans['useragent'].str.contains('CrOS').fillna(False))

# defaulting to source for ambiguous/missing useragent
trans_assigned = trans[trans['is_mobile']|trans['is_desktop']].copy()
trans_unassigned = trans[~trans['is_mobile']&~trans['is_desktop']].copy()
trans_unassigned['is_mobile'] = trans_unassigned['source']=='mobile'
trans_unassigned['is_desktop'] = trans_unassigned['source']=='don_form'
trans = trans_assigned.append(trans_unassigned)

In [11]:
trans = trans[trans['form'].isin(traffic['form'].unique().tolist())]

In [20]:
daily_trans = trans.groupby(['form', 'date', 'is_recurring'])['amount'].agg({'count', 'sum'}).reset_index()
daily_trans['trans_count'] = daily_trans['count']
daily_trans['trans_vol'] = daily_trans['sum']
daily_trans.drop(['count', 'sum'], axis=1, inplace=True)

daily_trans_pvt = daily_trans.pivot(index=['form', 'date'], columns='is_recurring', values=['trans_count', 'trans_vol']).reset_index()
daily_trans_pvt.columns = ['form', 'date', 'trans_count_onetime', 'trans_count_recurring', 'trans_vol_onetime', 'trans_vol_recurring']
daily_trans_pvt.fillna(0, inplace=True)
daily_trans_pvt.tail(3)

Unnamed: 0,form,date,trans_count_onetime,trans_count_recurring,trans_vol_onetime,trans_vol_recurring
552,972972,2021-06-03,7.0,0.0,1148.38,0.0
553,972972,2021-06-04,5.0,1.0,170.0,30.9
554,972972,2021-06-05,4.0,1.0,546.0,19.57


In [21]:
# calculate conversions
daily_trans = trans.groupby(['form', 'date', 'is_recurring'])['amount'].agg({'count', 'sum'}).reset_index()
daily_trans['trans_count'] = daily_trans['count']
daily_trans['trans_vol'] = daily_trans['sum']
daily_trans.drop(['count', 'sum'], axis=1, inplace=True)

daily_trans_pvt = daily_trans.pivot(index=['form', 'date'], columns='is_recurring', values=['trans_count', 'trans_vol']).reset_index()
daily_trans_pvt.columns = ['form', 'date', 'trans_count_onetime', 'trans_count_recurring', 'trans_vol_onetime', 'trans_vol_recurring']
daily_trans_pvt.fillna(0, inplace=True)

dailies = daily_trans_pvt.merge(traffic.groupby(['date', 'form'])[['pageviews', 'bounces']].sum().reset_index(), on=['date', 'form'], how='outer')
dailies.fillna(0, inplace=True)

dailies['conversion'] = (dailies['trans_count_onetime'] + dailies['trans_count_recurring']) / dailies['pageviews']
dailies['conversion_onetime'] = dailies['trans_count_onetime'] / dailies['pageviews']
dailies['conversion_recurring'] = dailies['trans_count_recurring'] / dailies['pageviews']

In [23]:
dailies.tail(3)

Unnamed: 0,form,date,trans_count_onetime,trans_count_recurring,trans_vol_onetime,trans_vol_recurring,pageviews,bounces,conversion,conversion_onetime,conversion_recurring
1708,972972,2021-06-07,0.0,0.0,0.0,0.0,46.0,21.0,0.0,0.0,0.0
1709,967125,2021-06-08,0.0,0.0,0.0,0.0,220.0,29.0,0.0,0.0,0.0
1710,972972,2021-06-08,0.0,0.0,0.0,0.0,24.0,10.0,0.0,0.0,0.0


In [24]:
print("Earliest traffic date for target form: {}".format(xtra['date'].min()))
print("Earliest transaction date for target form: {:%Y-%m-%d}".format(trans[trans['form'].isin(NEW_FORMS)]['date'].min()))

Earliest traffic date for target form: 2021-03-12
Earliest transaction date for target form: 2021-05-19


# fake inserts

In [None]:
fake_log = [{
    'form': 972972,
    'created': '2021-05-15'
}]
fake_traffic = [{
    'date': x['date'],
    'org': x['org'],
    'form': x['form'],
    'views': x['pageviews'],
    'sessions': x['sessions'],
    'sessionduration': x['sessionduration'],
    'bounces': x['bounces'],
    'path': x['path'],
    'devicecategory': x['devicecategory'],
    'controlpanel': False,
    'qgiv_frontend': True,
    'p2p_frontend': False
} for _, x in xtra.iterrows()]

In [None]:
str_cols = ['date', 'created', 'path', 'devicecategory']
bool_cols = ['controlpanel', 'qgiv_frontend', 'p2p_frontend']

def build_values_clause(q, vals):
    values_clause = []
    for val in vals:
        val_clause = []
        for c in val:
            if c in str_cols:
                val_clause.append("'" + str(val[c]) + "'")
            elif c in bool_cols:
                val_clause.append(str(val[c]))
            else:
                val_clause.append(str(int(val[c])))
        values_clause.append('(' + ', '.join(val_clause) + ')')
    return "{} {}".format(q, ", ".join(values_clause))

q = "insert into syslog_logs values"
print(build_values_clause(q, fake_log))
print()

q = "insert into googleanalytics_traffic values"
print(build_values_clause(q, fake_traffic)[:250] + "...")

Query output:

insert into syslog_logs values (972972, '2021-05-15')

insert into googleanalytics_traffic values ('2021-05-06', 436247, 972972, 1, 0, 0, 0, '/for/ihruxh/', 'desktop', False, True, False), ('2021-05-06', 436247, 972972, 1, 0, 0, 0, '/for/ihruxh/embed', 'desktop', False, True, False), ('2021-05-06', 43624...

In [None]:
'''
q = "insert into syslog_logs values"
q = build_values_clause(q, fake_log)
redshift_query_write(q, schema='production')

q = "insert into googleanalytics_traffic values"
q = build_values_clause(q, fake_traffic)
redshift_query_write(q, schema='public')
'''

# analysis

### support functions

In [104]:
def report(df):
    # recurring frequency?
    df.replace(np.inf, np.nan, inplace=True)
    return {
        'form sample size': len(df['form'].unique().tolist()),
        'transactions': df['trans_count_onetime'].sum() + df['trans_count_recurring'].sum(),
        'conversion': df['conversion'].mean(),
        'conversion onetime': df['conversion_onetime'].mean(),
        'conversion recurring': df['conversion_recurring'].mean(),
        'mean transaction onetime': df['trans_vol_onetime'].sum() / df['trans_count_onetime'].sum(),
        'mean transaction recurring': df['trans_vol_recurring'].sum() / df['trans_count_recurring'].sum(),
        'onetime/recurring': df['trans_count_onetime'].sum() / df['trans_count_recurring'].sum(),
        'pageviews': df['pageviews'].sum(),
        'bounce rate': df['bounces'].sum() / df['pageviews'].sum()
    }

In [105]:
def print_report(d, title):
    print(title)
    print()
    keys = [d_k for d_k in d[[k for k in d.keys()][0]].keys()]
    
    print("| " + " | ".join([""] + keys) + " |")
    print("|----".join(["" for i in range(len(keys) + 2)]) + "|")
    for k in d:
        print("| {} | ".format(k) + " | ".join([str(v) for v in d[k].values()]) + " |")

### data output

In [106]:
traffic['form'] = traffic['form'].astype(int)

data_all = {
    'org': report(dailies),
    'new forms': report(dailies[dailies['form'].isin(NEW_FORMS)]),
    'compare forms': report(dailies[dailies['form'].isin(COMPARE_FORMS)]),
    'old forms': report(dailies[~dailies['form'].isin(NEW_FORMS)])
}
thirty_days_ago = datetime.datetime.now() - datetime.timedelta(30)
dailies_last_30 = dailies[dailies['date']>=thirty_days_ago]
data_last_30_days = {
    'org': report(dailies_last_30),
    'new forms': report(dailies_last_30[dailies_last_30['form'].isin(NEW_FORMS)]),
    'compare forms': report(dailies_last_30[dailies_last_30['form'].isin(COMPARE_FORMS)]),
    'not new forms': report(dailies_last_30[~dailies_last_30['form'].isin(NEW_FORMS)])
}

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  method=method,


In [107]:
print("{:%Y-%m-%d} +".format(dailies['date'].min()))
df = pd.DataFrame(data_all)
df = df.transpose()

df['pageviews_perc'] = df['pageviews'] / df['pageviews']['org']
df['pageviews'] = df['pageviews_perc']
df['transactions_perc'] = df['transactions'] / df['transactions']['org']
df['transactions'] = df['transactions_perc']

df.drop(['pageviews_perc', 'transactions_perc'], axis=1, inplace=True)
df.transpose()

2021-01-01 +


Unnamed: 0,org,new forms,compare forms,old forms
form sample size,41.0,1.0,1.0,40.0
transactions,1.0,0.007761,0.514337,0.992239
conversion,0.314054,0.037213,0.128295,0.319833
conversion onetime,0.017746,0.029208,0.107672,0.017507
conversion recurring,0.296308,0.008005,0.020622,0.302327
mean transaction onetime,130.958599,142.579651,134.858944,130.791138
mean transaction recurring,34.870922,99.200909,62.224339,34.690381
onetime/recurring,0.770131,3.909091,3.829285,0.761322
pageviews,1.0,0.042382,0.563018,0.957618
bounce rate,0.308434,0.450161,0.215145,0.302161


In [108]:
print("Last 30 Days")
df = pd.DataFrame(data_last_30_days)
df = df.transpose()

df['pageviews_perc'] = df['pageviews'] / df['pageviews']['org']
df['pageviews'] = df['pageviews_perc']
df['transactions_perc'] = df['transactions'] / df['transactions']['org']
df['transactions'] = df['transactions_perc']

df.drop(['pageviews_perc', 'transactions_perc'], axis=1, inplace=True)
df.transpose()

Last 30 Days


Unnamed: 0,org,new forms,compare forms,not new forms
form sample size,25.0,1.0,1.0,24.0
transactions,1.0,0.106931,0.880198,0.893069
conversion,0.026831,0.042726,0.112064,0.025223
conversion onetime,0.024422,0.033536,0.095635,0.0235
conversion recurring,0.002409,0.00919,0.016429,0.001723
mean transaction onetime,128.687711,142.579651,128.200637,127.128042
mean transaction recurring,39.032785,99.200909,29.299706,29.299706
onetime/recurring,5.392405,3.909091,5.536765,5.632353
pageviews,1.0,0.159512,0.599237,0.840488
bounce rate,0.327401,0.459447,0.208037,0.30234
