Need to run new forms reports using both traffic sources: Google Analytics and Matomo

- start date 10/1/2020

Notes
- _dropping bounce rate; can't presently confidently state that from matomo stats_

In [80]:
import sys
sys.path.insert(1, '../../scripts/')
from s3_support import *
import pandas as pd
import numpy as np

%matplotlib inline

# load data

In [2]:
start_date = '2020-10-01'
ga_end_date = '2022-02-03'
matomo_start_date = '2022-09-01'

### embeds

In [3]:
# embeds
q = "select widget as form from embed"
embeds = redshift_query_read(q, schema='production')

In [4]:
def is_embed(f):
    if f in embeds['form'].unique().tolist():
        return True
    return False

### traffic

In [5]:
q = '''select
            date,
            form,
            sum(views) as pageviews_ga
        from googleanalytics_traffic
        where 
            date>='2020-10-01' and
            (qgiv_frontend=1 or p2p_frontend=1) and
            path not like '%/receipt/%' 
        group by date, form'''

traffic_ga = redshift_query_read(q, schema='production')

In [6]:
q = '''select
            trunc(timestamp) as date,
            form,
            count(id) as pageviews_matomo
        from matomo_traffic
        where url not like '%/receipt/%'
        group by form, trunc(timestamp);'''

traffic_matomo = redshift_query_read(q, schema='production')

In [32]:
# merge
traffic_ga['date'] = pd.to_datetime(traffic_ga['date'])
traffic_matomo['date'] = pd.to_datetime(traffic_matomo['date'])

traffic = traffic_ga.merge(traffic_matomo, on=['date', 'form'], how='outer')
traffic.sort_values('date', ascending=True, inplace=True)

In [33]:
print("len(traffic): {:,}".format(len(traffic)))
print("len(traffic[GA not NA]): {:,}".format(len(traffic[~traffic['pageviews_ga'].isna()])))
print("len(traffic[Matomo not NA]): {:,}".format(len(traffic[~traffic['pageviews_matomo'].isna()])))
print("len(traffic[Matomo & GA not NA]): {:,}".format(len(traffic[~traffic['pageviews_matomo'].isna()&~traffic['pageviews_ga'].isna()])))

len(traffic): 2,755,252
len(traffic[GA not NA]): 2,554,725
len(traffic[Matomo not NA]): 389,163
len(traffic[Matomo & GA not NA]): 188,636


### transactions

In [34]:
# query onetime
q = '''select
            date,
            form,
            count(id) as count_onetime,
            sum(amount) as volume_onetime
        from transactions
        where
            status='A' and
            date>='2020-10-01' and
            recurring=0 and
            (source='don_form' or source='mobile')
        group by form, date'''

trans = redshift_query_read(q, schema='production')

In [35]:
# query recurring
q = '''select
            recurring,
            min(date) as date,
            form,
            amount
        from transactions
        where
            status='A' and
            recurring!=0 and
            (source='don_form' or source='mobile')
        group by form, date, recurring, amount'''

rec = redshift_query_read(q, schema='production')
rec = rec[rec['date']>='2020-10-01']
rec = rec.groupby(['form', 'date']).agg({'recurring': 'count', 'amount': 'sum'}).reset_index()
rec.columns = ['form', 'date', 'count_recurring', 'volume_recurring']

In [36]:
# merge
trans = trans.merge(rec, on=['form', 'date'])
trans['count'] = trans['count_recurring'] + trans['count_onetime']
trans['volume'] = trans['volume_recurring'] + trans['volume_onetime']

In [37]:
trans.tail(3)

Unnamed: 0,date,form,count_onetime,volume_onetime,count_recurring,volume_recurring,count,volume
160118,2023-02-02,956878,1,250.0,2,135.0,3,385.0
160119,2023-02-03,833637,1,35.0,10,360.5,11,395.5
160120,2023-02-07,937277,1,160.0,1,20.0,2,180.0


In [38]:
trans['date'].min(), trans['date'].max()

(Timestamp('2020-10-01 00:00:00'), Timestamp('2023-02-07 00:00:00'))

### identify new forms

In [39]:
# upgrades
q = "select * from syslog_logs where message like '%Qgiv Form Template Upgraded%'"
form_upgrades = redshift_query_read(q, schema="production")[['org', 'form', 'created']]
forms_upgrades_ids_logged = form_upgrades['form'].unique().tolist()

In [40]:
q = "select id, status from form"
all_forms = redshift_query_read(q, schema='production')

In [41]:
bucket = "qgiv-stats-data"
new_form_template_list = "form_download new template.csv"
new_forms = get_dataframe_from_file(bucket, new_form_template_list)

new_forms_ids = new_forms[new_forms['Status']=='active']['Form ID'].unique().tolist()

print("{:,} new template forms".format(len(new_forms)))
print("{:,} active".format(len(new_forms[new_forms['Status']=='active'])))

28,671 new template forms
18,622 active


In [42]:
print("{:,} forms logged".format(len(form_upgrades['form'].unique().tolist())))
print("{:,} logged forms in new forms ID list".format(len(form_upgrades[form_upgrades['form'].isin(new_forms_ids)]['form'].unique().tolist())))
print("{:,} new forms in logged upgrades".format(len(new_forms[new_forms['Form ID'].isin(forms_upgrades_ids_logged)])))
print("{:,} total forms, {:,} active".format(len(all_forms), len(all_forms[all_forms['status']==1])))

11,717 forms logged
8,365 logged forms in new forms ID list
11,597 new forms in logged upgrades
83,687 total forms, 33,520 active


### merge

In [None]:
# merge trans & traffic, fill missing w/ 0
df = trans.merge(traffic, on=['form', 'date'], how='outer').fillna(0)

In [44]:
q = '''select 
            count(distinct(form)) as forms,
            count(id) as trans
        from transactions 
        where
            status='A' and
            date>='2022-12-01' and
            date<='2022-12-31' '''
dec_forms = redshift_query_read(q, schema='production')

print("{:,} forms, {:,} trans from db".format(dec_forms['forms'].iloc[0], dec_forms['trans'].iloc[0]))

8,893 forms, 276,801 trans from db


In [45]:
print("len(trans): {:,}".format(len(trans)))
print("len(traffic): {:,}".format(len(traffic)))
print("len(df): {:,}".format(len(df)))

trans['month'] = trans['date'].dt.to_period('M')
traffic['month'] = traffic['date'].dt.to_period('M')
df['month'] = df['date'].dt.to_period('M')

trans_forms = len(trans[trans['month']=='2022-12']['form'].unique())
traffic_forms = len(traffic[traffic['month']=='2022-12']['form'].unique())
df_forms = len(df[df['month']=='2022-12']['form'].unique())

print()
print("len month period:")
print("len(trans.forms): {:,}".format((trans_forms)))
print("len(traffic.forms): {:,}".format((traffic_forms)))
print("len(df.forms): {:,}".format((df_forms)))

trans_forms = len(trans[(trans['date']>='2022-12-01')&(trans['date']<='2022-12-31')]['form'].unique())
traffic_forms = len(traffic[(traffic['date']>='2022-12-01')&(traffic['date']<='2022-12-31')]['form'].unique())
df_forms = len(df[(df['date']>='2022-12-01')&(df['date']<='2022-12-31')]['form'].unique())

print()
print("len date range:")
print("len(trans.forms): {:,}".format((trans_forms)))
print("len(traffic.forms): {:,}".format((traffic_forms)))
print("len(df.forms): {:,}".format((df_forms)))

len(trans): 160,121
len(traffic): 2,755,252
len(df): 2,819,132

len month period:
len(trans.forms): 1,483
len(traffic.forms): 17,144
len(df.forms): 17,154

len date range:
len(trans.forms): 1,483
len(traffic.forms): 17,144
len(df.forms): 17,154


In [71]:
# conversion matomo
df['conversion_matomo'] = df['count'] / df['pageviews_matomo']
df['conversion_onetime_matomo'] = df['count_onetime'] / df['pageviews_matomo']
df['conversion_recurring_matomo'] = df['count_recurring'] / df['pageviews_matomo']

df['conversion_matomo'] = df['conversion_matomo'].fillna(0)
df['conversion_onetime_matomo'] = df['conversion_onetime_matomo'].fillna(0)
df['conversion_recurring_matomo'] = df['conversion_recurring_matomo'].fillna(0)

In [72]:
# conversion GA
df['conversion_ga'] = df['count'] / df['pageviews_ga']
df['conversion_onetime_ga'] = df['count_onetime'] / df['pageviews_ga']
df['conversion_recurring_ga'] = df['count_recurring'] / df['pageviews_ga']

df['conversion_ga'] = df['conversion_ga'].fillna(0)
df['conversion_onetime_ga'] = df['conversion_onetime_ga'].fillna(0)
df['conversion_recurring_ga'] = df['conversion_recurring_ga'].fillna(0)

In [73]:
len_w_outliers = len(df)
len_w_outliers_ga = len(df[df['date']<='2022-02'])
len_w_outliers_matomo = len(df[df['date']>='2022-09'])

df = df[df['conversion_ga']<1.]

len_wout_outliers = len(df)
len_wout_outliers_ga = len(df[df['date']<='2022-02'])
len_wout_outliers_matomo = len(df[df['date']>='2022-09'])

num_outliers = len_w_outliers - len_wout_outliers
num_outliers_ga = len_w_outliers_ga - len_wout_outliers_ga
num_outliers_matomo = len_w_outliers_matomo - len_wout_outliers_matomo
perc_outliers_ga = (num_outliers_ga / num_outliers) * 100.
perc_outliers_ga_sample = (num_outliers_ga / len_w_outliers_ga) * 100.
perc_outliers_matomo = (num_outliers_matomo / num_outliers) * 100.
perc_outliers_matomo_sample = (num_outliers_matomo / len_w_outliers_matomo) * 100.

print("w/ outliers:")
print("len: {:,}".format(len_w_outliers))
print("len GA: {:,}".format(len_w_outliers_ga))
print("len Matomo: {:,}".format(len_w_outliers_matomo))
print()
print("w/out outliers:")
print("len: {:,}".format(len_wout_outliers))
print("len GA: {:,} ({:.1f}% of outliers, {:.1f}% of sample)".format(len_wout_outliers_ga, perc_outliers_ga, perc_outliers_ga_sample))
print("len Matomo: {:,} ({:.1f}% of outliers, {:.1f}% of sample)".format(len_wout_outliers_matomo, perc_outliers_matomo, perc_outliers_matomo_sample))

ZeroDivisionError: division by zero

In [49]:
# tag date & form with new form or old form
def tag_new_form(r):
    if r['form'] in forms_upgrades_ids_logged:
        return r['date']>= form_upgrades[form_upgrades['form']==r['form']]['created'].iloc[0]
    else:
        return r['form'] in new_forms_ids
    
df['is_new_form'] = df[['date', 'form']].apply(tag_new_form, axis=1)
trans['is_new_form'] = trans[['date', 'form']].apply(tag_new_form, axis=1)

In [50]:
df['is_embed'] = df['form'].apply(is_embed)
trans['is_embed'] = trans['form'].apply(is_embed)

# analysis

In [120]:
pd.set_option('display.max_columns', 100)

In [52]:
# calculate stats by month for new & old forms
#df['month'] = df['date'].dt.to_period('M')
df['month'] = df['date'].apply(lambda x: "{}-{}".format(x.year, x.month))
trans['month'] = trans['date'].dt.to_period('M')

In [74]:
data = []
months = df['month'].unique()
for month in months:
    this_df = df[df['month']==month]
    this_trans = trans[trans['month']==month]
    
    data.append({
        'month': month,
        'forms': len(this_df['form'].unique()),
        'forms_trans': len(this_trans['form'].unique()),
        'transactions': this_df['count'].sum(),
        'transactions_trans': this_trans['count'].sum(),
        'new forms': len(this_df[this_df['is_new_form']]['form'].unique()),
        'new forms_trans': len(this_trans[this_trans['is_new_form']]['form'].unique()),
        'new forms transactions': this_df[this_df['is_new_form']]['count'].sum(),
        'new forms transactions_trans': this_trans[this_trans['is_new_form']]['count'].sum(),
        'new forms trans/form': this_df[this_df['is_new_form']]['count'].sum() / len(this_df[this_df['is_new_form']]['form'].unique()),
        'new forms mean onetime': this_df[this_df['is_new_form']]['volume_onetime'].sum() / this_df[this_df['is_new_form']]['count_onetime'].sum(),
        'new forms mean recurring': this_df[this_df['is_new_form']]['volume_recurring'].sum() / this_df[this_df['is_new_form']]['count_recurring'].sum(),
        'old forms': len(this_df[~this_df['is_new_form']]['form'].unique()),
        'old forms_trans': len(this_trans[~this_trans['is_new_form']]['form'].unique()),
        'old forms transactions': this_df[~this_df['is_new_form']]['count'].sum(),
        'old forms transactions_trans': this_trans[~this_trans['is_new_form']]['count'].sum(),
        'old forms trans/form': this_df[~this_df['is_new_form']]['count'].sum() / len(this_df[~this_df['is_new_form']]['form'].unique()),
        'old forms mean onetime': this_df[~this_df['is_new_form']]['volume_onetime'].sum() / this_df[~this_df['is_new_form']]['count_onetime'].sum(),
        'old forms mean recurring': this_df[~this_df['is_new_form']]['volume_recurring'].sum() / this_df[~this_df['is_new_form']]['count_recurring'].sum(),
        'new forms conversion GA': this_df[this_df['is_new_form']]['conversion_ga'].mean(),
        'new forms conversion GA onetime': this_df[this_df['is_new_form']]['conversion_onetime_ga'].mean(),
        'new forms conversion GA recurring': this_df[this_df['is_new_form']]['conversion_recurring_ga'].mean(),
        'old forms conversion GA': this_df[~this_df['is_new_form']]['conversion_ga'].mean(),
        'old forms conversion GA onetime': this_df[~this_df['is_new_form']]['conversion_onetime_ga'].mean(),
        'old forms conversion GA recurring': this_df[~this_df['is_new_form']]['conversion_recurring_ga'].mean(),
        'new forms conversion Matomo': this_df[this_df['is_new_form']]['conversion_matomo'].mean(),
        'new forms conversion Matomo onetime': this_df[this_df['is_new_form']]['conversion_onetime_matomo'].mean(),
        'new forms conversion Matomo recurring': this_df[this_df['is_new_form']]['conversion_recurring_matomo'].mean(),
        'old forms conversion Matomo': this_df[~this_df['is_new_form']]['conversion_matomo'].mean(),
        'old forms conversion Matomo onetime': this_df[~this_df['is_new_form']]['conversion_onetime_matomo'].mean(),
        'old forms conversion Matomo recurring': this_df[~this_df['is_new_form']]['conversion_recurring_matomo'].mean(),
        'new forms pageviews/form GA': this_df[this_df['is_new_form']]['pageviews_ga'].sum() / len(this_df[this_df['is_new_form']]['form'].unique()),
        'old forms pageviews/form GA': this_df[~this_df['is_new_form']]['pageviews_ga'].sum() / len(this_df[~this_df['is_new_form']]['form'].unique()),
        'new forms pageviews/form Matomo': this_df[this_df['is_new_form']]['pageviews_matomo'].sum() / len(this_df[this_df['is_new_form']]['form'].unique()),
        'old forms pageviews/form Matomo': this_df[~this_df['is_new_form']]['pageviews_matomo'].sum() / len(this_df[~this_df['is_new_form']]['form'].unique()),
        'embed conversion GA': this_df[this_df['is_embed']]['conversion_ga'].mean(),
        'embed conversion onetime GA': this_df[this_df['is_embed']]['conversion_onetime_ga'].mean(),
        'embed conversion recurring GA': this_df[this_df['is_embed']]['conversion_recurring_ga'].mean(),
        'embed conversion Matomo': this_df[this_df['is_embed']]['conversion_matomo'].mean(),
        'embed conversion onetime Matomo': this_df[this_df['is_embed']]['conversion_onetime_matomo'].mean(),
        'embed conversion recurring Matomo': this_df[this_df['is_embed']]['conversion_recurring_matomo'].mean(),
        'embed mean onetime': this_df[this_df['is_embed']]['volume_onetime'].sum() / this_df[this_df['is_embed']]['count_onetime'].sum(),
        'embed mean recurring': this_df[this_df['is_embed']]['volume_recurring'].sum() / this_df[this_df['is_embed']]['count_recurring'].sum(),
        'nonembed conversion GA': this_df[~this_df['is_embed']]['conversion_ga'].mean(),
        'nonembed conversion onetime GA': this_df[~this_df['is_embed']]['conversion_onetime_ga'].mean(),
        'nonembed conversion recurring GA': this_df[~this_df['is_embed']]['conversion_recurring_ga'].mean(),
        'nonembed conversion Matomo': this_df[~this_df['is_embed']]['conversion_matomo'].mean(),
        'nonembed conversion onetime Matomo': this_df[~this_df['is_embed']]['conversion_onetime_matomo'].mean(),
        'nonembed conversion recurring Matomo': this_df[~this_df['is_embed']]['conversion_recurring_matomo'].mean(),
        'nonembed mean onetime': this_df[~this_df['is_embed']]['volume_onetime'].sum() / this_df[~this_df['is_embed']]['count_onetime'].sum(),
        'nonembed mean recurring': this_df[~this_df['is_embed']]['volume_recurring'].sum() / this_df[~this_df['is_embed']]['count_recurring'].sum()
    })

In [75]:
pd.DataFrame(data).sort_values('month').tail(12)

Unnamed: 0,month,forms,forms_trans,transactions,transactions_trans,new forms,new forms_trans,new forms transactions,new forms transactions_trans,new forms trans/form,new forms mean onetime,new forms mean recurring,old forms,old forms_trans,old forms transactions,old forms transactions_trans,old forms trans/form,old forms mean onetime,old forms mean recurring,new forms conversion GA,new forms conversion GA onetime,new forms conversion GA recurring,old forms conversion GA,old forms conversion GA onetime,old forms conversion GA recurring,new forms conversion Matomo,new forms conversion Matomo onetime,new forms conversion Matomo recurring,old forms conversion Matomo,old forms conversion Matomo onetime,old forms conversion Matomo recurring,new forms pageviews/form GA,old forms pageviews/form GA,new forms pageviews/form Matomo,old forms pageviews/form Matomo,embed conversion GA,embed conversion onetime GA,embed conversion recurring GA,embed conversion Matomo,embed conversion onetime Matomo,embed conversion recurring Matomo,embed mean onetime,embed mean recurring,nonembed conversion GA,nonembed conversion onetime GA,nonembed conversion recurring GA,nonembed conversion Matomo,nonembed conversion onetime Matomo,nonembed conversion recurring Matomo,nonembed mean onetime,nonembed mean recurring
21,2022-11,10866,1271,33009.0,85001,5943,1054,18783.0,54441,3.160525,203.011891,67.730072,4952,220,14226.0,30560,2.872779,140.107707,46.392578,0.009066,0.00517,0.003895,0.004448,0.002494,0.001954,inf,inf,inf,inf,inf,inf,233.631499,451.643578,97.844018,170.527464,0.012731,0.00728,0.005451,inf,inf,inf,207.547228,58.025922,0.004608,0.002593,0.002015,inf,inf,inf,161.351888,60.301584
11,2022-12,9774,1483,34187.0,101894,5488,1266,23561.0,72802,4.293185,286.297385,71.692243,4298,218,10626.0,29092,2.472313,273.574416,51.222532,0.012055,0.007342,0.004713,0.005069,0.003025,0.002044,inf,inf,inf,inf,inf,inf,205.852952,293.001629,105.681851,65.950907,0.016579,0.010249,0.00633,inf,inf,inf,325.018191,68.776014,0.005727,0.003382,0.002345,inf,inf,inf,258.892564,63.221065
24,2022-2,10391,945,22938.0,59156,4789,723,9138.0,32296,1.908123,175.706084,62.416196,5617,223,13800.0,26860,2.456827,145.865534,47.699026,0.009564,0.004873,0.00469,0.006125,0.002645,0.00348,inf,inf,inf,inf,inf,inf,205.680727,415.974364,0.0,0.0,0.013799,0.006722,0.007077,inf,inf,inf,177.362301,60.49428,0.005002,0.002321,0.002682,inf,inf,inf,152.225067,49.035599
26,2022-3,9815,1001,24672.0,66276,4687,764,11631.0,35445,2.481545,212.81804,71.821609,5161,242,13041.0,30831,2.526836,141.970083,48.028028,0.008785,0.004559,0.004227,0.005348,0.002435,0.002913,inf,inf,inf,inf,inf,inf,319.718157,489.17361,0.0,0.0,0.012475,0.006167,0.006308,inf,inf,inf,201.861135,59.198234,0.004573,0.002251,0.002322,inf,inf,inf,165.46006,57.454985
0,2022-4,9177,981,41931.0,104461,4439,766,30134.0,77704,6.788466,183.57906,50.625432,4765,216,11797.0,26757,2.475761,154.706872,47.215661,0.009986,0.005149,0.004837,0.005325,0.002435,0.00289,inf,inf,inf,inf,inf,inf,349.261095,500.325918,0.0,0.0,0.013278,0.006711,0.006567,inf,inf,inf,192.798366,57.423624,0.005075,0.002446,0.002629,inf,inf,inf,160.955339,48.799203
19,2022-5,9373,976,22502.0,61987,4578,776,11908.0,37711,2.601136,213.217976,63.882693,4820,200,10594.0,24276,2.197925,173.025658,52.278386,0.008405,0.004348,0.004058,0.004391,0.001781,0.00261,inf,inf,inf,inf,inf,inf,252.019222,433.503734,0.0,0.0,0.01106,0.005333,0.005727,inf,inf,inf,186.888276,57.583262,0.004326,0.002046,0.002281,inf,inf,inf,201.179122,57.888962
25,2022-6,9198,1015,23653.0,61228,4506,815,13026.0,36983,2.890812,173.976157,66.505083,4717,200,10627.0,24245,2.252915,152.243338,50.364775,0.009339,0.004868,0.004471,0.00438,0.001942,0.002439,inf,inf,inf,inf,inf,inf,204.497115,310.864533,0.0,0.0,0.011358,0.005631,0.005727,inf,inf,inf,161.655425,55.907642,0.004915,0.00244,0.002475,inf,inf,inf,167.26759,59.177333
12,2022-7,8856,944,25772.0,61522,4459,768,15694.0,39222,3.519623,190.619025,51.975075,4412,176,10078.0,22300,2.284225,199.876541,53.118702,0.00904,0.004656,0.004384,0.004779,0.001978,0.002801,inf,inf,inf,inf,inf,inf,198.542498,305.627153,0.0,0.0,0.012169,0.005982,0.006187,inf,inf,inf,183.758911,57.054113,0.004723,0.002217,0.002506,inf,inf,inf,198.522879,51.481663
15,2022-8,9906,1014,23156.0,59916,4947,829,12204.0,36189,2.46695,334.955681,78.388762,4973,185,10952.0,23727,2.202292,168.466835,87.443204,0.008362,0.004114,0.004248,0.00478,0.002091,0.002689,inf,inf,inf,inf,inf,inf,238.323024,430.048864,0.0,0.0,0.011543,0.005588,0.005956,inf,inf,inf,179.066241,54.935269,0.004551,0.002096,0.002455,inf,inf,inf,304.065594,92.558068
7,2022-9,9737,1000,22460.0,60251,5036,811,11818.0,35909,2.346704,216.658646,64.889494,4718,190,10642.0,24342,2.255617,159.531751,42.617693,0.007606,0.004018,0.003588,0.004565,0.001981,0.002585,inf,inf,inf,inf,inf,inf,322.616759,558.377914,206.2222,263.108944,0.010633,0.005457,0.005176,inf,inf,inf,222.522622,57.4579,0.004233,0.00201,0.002223,inf,inf,inf,180.246189,51.482057


# debug: 12/2022

In [76]:
dec = df[df['month']=='2022-12']

print("len: {:,}".format(len(dec)))
print("forms: {:,}".format(dec['form'].nunique()))
print("trans: {:,}".format(dec['count'].sum()))
print("trans onetime: {:,}".format(dec['count_onetime'].sum()))
print("trans rec: {:,}".format(dec['count_recurring'].sum()))

len: 96,253
forms: 9,774
trans: 34,187.0
trans onetime: 24,274.0
trans rec: 9,913.0


In [82]:
cols = ['date', 'form', 'count_onetime', 'volume_onetime', 'count_recurring',
        'volume_recurring', 'count', 'pageviews_ga', 'pageviews_matomo']
dec[df['conversion_matomo']==np.inf][cols].head()

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,date,form,count_onetime,volume_onetime,count_recurring,volume_recurring,count,pageviews_ga,pageviews_matomo
428,2022-12-01,394,5.0,558.75,1.0,25.87,6.0,9.0,0.0
1037,2022-12-01,3,1.0,35.0,1.0,200.0,2.0,5.0,0.0
1299,2022-12-02,907,9.0,1100.0,1.0,50.0,10.0,164.0,0.0
3932,2022-12-01,937546,5.0,550.0,8.0,630.0,13.0,43.0,0.0
5014,2022-12-01,1836,2.0,1025.0,3.0,60.75,5.0,24.0,0.0


Examples of one time transactions on days with 0 traffic from matomo. Let's compare traffic in December from GA vs Matomo. This may illustrate failures in the tagging script vs missed traffic data.

_Matomo traffic stopped being tagged automatically on 2023-01-14, no idea why_

In [95]:
months_to_check = [('11', '2021'), ('12', '2021'), ('11', '2022'),
                  ('12', '2022'), ('01', '2023')]
for m in months_to_check:
    # check untagged
    q = '''select 
                sum(views) as views
            from googleanalytics_traffic
            where
                date_part('year', date)={} and
                date_part('month', date)={} and
                form=0'''.format(m[1], m[0])
    ga_untagged = redshift_query_read(q, schema='production')['views'].iloc[0]
    
    q = '''select 
                count(id) as views
            from matomo_traffic
            where
                date_part('year', timestamp)={} and
                date_part('month', timestamp)={} and
                form=0'''.format(m[1], m[0])
    matomo_untagged = redshift_query_read(q, schema='production')['views'].iloc[0]

    # check total traffic
    q = '''select 
                sum(views) as views
            from googleanalytics_traffic
            where
                date_part('year', date)={} and
                date_part('month', date)={}'''.format(m[1], m[0])
    ga_traff = redshift_query_read(q, schema='production')['views'].iloc[0]

    
    q = '''select
                count(id) as views
            from matomo_traffic
            where
                date_part('year', timestamp)={} and
                date_part('month', timestamp)={}'''.format(m[1], m[0])
    matomo_traff = redshift_query_read(q, schema='production')['views'].iloc[0]

    print("{}/{}".format(m[0], m[1]))
    print("Untagged: {:,} GA; {:,} Matomo".format(ga_untagged, matomo_untagged))
    print("Traffic: {:,} GA; {:,} Matomo".format(ga_traff, matomo_traff))
    print()

11/2021
Untagged: 322,234 GA; 0 Matomo
Traffic: 3,884,495 GA; 0 Matomo

12/2021
Untagged: 294,621 GA; 0 Matomo
Traffic: 3,189,233 GA; 0 Matomo

11/2022
Untagged: 457,789 GA; 179,178 Matomo
Traffic: 4,017,237 GA; 1,945,510 Matomo

12/2022
Untagged: 254,576 GA; 86,806 Matomo
Traffic: 2,716,948 GA; 1,835,922 Matomo

01/2023
Untagged: 249,879 GA; 2,710,060 Matomo
Traffic: 2,618,379 GA; 3,622,283 Matomo



In [105]:
jan = df[df['month']=='2023-1']

In [111]:
print("from compiled conversion data")
print("-"*40)

len_dec = len(dec)
len_match = len(dec[dec['pageviews_ga']==dec['pageviews_matomo']])
len_ga_gt_mat = len(dec[dec['pageviews_ga']>dec['pageviews_matomo']])
len_ga_lt_mat = len(dec[dec['pageviews_ga']<dec['pageviews_matomo']])

print("12/2022")
print("{:,} rows".format(len_dec))
print("{:,} ({:.2f}%) rows GA == Matomo".format(len_match, (len_match / len_dec) * 100.))
print("{:,} ({:.2f}%) GA > Matomo".format(len_ga_gt_mat, (len_ga_gt_mat / len_dec) * 100.))
print("{:,} ({:.2f}%) GA < Matomo".format(len_ga_lt_mat, (len_ga_lt_mat / len_dec) * 100.))

len_jan = len(jan)
len_match = len(jan[jan['pageviews_ga']==jan['pageviews_matomo']])
len_ga_gt_mat = len(jan[jan['pageviews_ga']>jan['pageviews_matomo']])
len_ga_lt_mat = len(jan[jan['pageviews_ga']<jan['pageviews_matomo']])

print()
print("1/2023")
print("{:,} rows".format(len_jan))
print("{:,} ({:.2f}%) rows GA == Matomo".format(len_match, (len_match / len_dec) * 100.))
print("{:,} ({:.2f}%) GA > Matomo".format(len_ga_gt_mat, (len_ga_gt_mat / len_dec) * 100.))
print("{:,} ({:.2f}%) GA < Matomo".format(len_ga_lt_mat, (len_ga_lt_mat / len_dec) * 100.))

from compiled conversion data
----------------------------------------
12/2022
96,253 rows
12,466 (12.95%) rows GA == Matomo
74,510 (77.41%) GA > Matomo
9,277 (9.64%) GA < Matomo

1/2023
91,438 rows
3 (0.00%) rows GA == Matomo
91,381 (94.94%) GA > Matomo
54 (0.06%) GA < Matomo


Matomo clearly has a dramatically lower page view count in 12/2022 than GA, so let's compare day by day to see if there's a clear dropoff that might be attributed to a technical failure

In [118]:
pd.set_option('display.max_rows', 500)

In [115]:
def perc_diff(r):
    return "{:.2f}%".format(((r['ga_views'] - r['matomo_views']) / r['ga_views']) * 100.)

In [116]:
q = '''select
            date,
            sum(views) as ga_views
        from googleanalytics_traffic
        where
            (date_part('year', date)=2022 and
            date_part('month', date)=12) or
            (date_part('year', date)=2023 and
            date_part('month', date)=1)
        group by date'''
df_ga = redshift_query_read(q, schema='production')

q = '''select
            date_trunc('day', timestamp) as date,
            count(id) as matomo_views
        from matomo_traffic
        where
            (date_part('year', timestamp)=2022 and
            date_part('month', timestamp)=12) or
            (date_part('year', date)=2023 and
            date_part('month', date)=1)
        group by date'''
df_matomo = redshift_query_read(q, schema='production')

df = df_ga.merge(df_matomo, on='date').sort_values('date', ascending=True)
df['diff'] = df[['ga_views', 'matomo_views']].apply(perc_diff, axis=1)

In [119]:
df

Unnamed: 0,date,ga_views,matomo_views,diff
48,2022-12-01,141714,246,99.83%
14,2022-12-02,110898,795,99.28%
55,2022-12-03,83417,7179,91.39%
34,2022-12-04,90147,68101,24.46%
3,2022-12-05,111473,85521,23.28%
28,2022-12-06,111245,82462,25.87%
44,2022-12-07,104996,73664,29.84%
12,2022-12-08,104965,75261,28.30%
17,2022-12-09,83109,72267,13.05%
50,2022-12-10,69491,48251,30.57%
