In [1]:
import sys
sys.path.insert(1, '../../scripts/')
from s3_support import *

import pandas as pd

In [2]:
start_date = '2018-01-01'

# system stats build

## median's

In [3]:
q = '''select
            date_trunc('month', date) as month,
            median(case when recurring=0 then amount else null end) as trans_mdn_onetime
        from production.transactions
        where status='A' and date>='{}'
        group by date_trunc('month', date)'''.format(start_date)
df_mdn_onetime = redshift_query_read(q, schema='production')

In [4]:
df_mdn_onetime.tail(2)

Unnamed: 0,month,trans_mdn_onetime
77,2023-03-01,50.0
78,2023-10-01,50.0


In [5]:
q = '''select
            date_trunc('month', date) as month,
            median(case when recurring_origin=1 then amount else null end) as trans_mdn_rec
        from production.transactions
        where status='A' and date>='{}'
        group by date_trunc('month', date)'''.format(start_date)
df_mdn_rec = redshift_query_read(q, schema='production')

In [6]:
df_mdn_rec.tail(2)

Unnamed: 0,month,trans_mdn_rec
77,2023-03-01,25.0
78,2023-10-01,26.25


## transactions general stats

In [7]:
# base
fields = [
    "date_trunc('month', t.date) as month",
    "count(distinct(t.org)) as orgs",
    "count(distinct(t.form)) as forms",
    "count(distinct(t.email)) as donors",
    "count(distinct(t.id)) as trans_count",
    "sum(t.amount) as trans_vol",
    "count(distinct(case when recurring=0 or recurring_origin=1 then id else null end)) as trans_new_count",
    "sum(case when recurring=0 or recurring_origin=1 then amount else null end) as trans_new_vol",
    "avg(case when recurring=0 or recurring_origin=1 then amount else null end) as trans_avg_amount"
]
# one time vs recurring
fields += [
    "count(distinct(case when recurring=0 then id else null end)) as trans_onetime",
    "avg(case when recurring=0 then amount else null end) as trans_avg_onetime",
    "count(distinct(case when recurring_origin=1 then id else null end)) as trans_rec_origin",
    "avg(case when recurring_origin=1 then amount else null end) as trans_avg_rec"
]
# platform (desktop, mobile, tablet)
fields += [
    "count(distinct(case when (platform='Windows' or platform='Mac') and (recurring=0 or recurring_origin=1) then id else null end)) as trans_count_desktop",
    "count(distinct(case when (platform='iPhone' or platform='Android') and (recurring=0 or recurring_origin=1) then id else null end)) as trans_count_mobile",
]
# transaction types (express checkout, gift assist, p2p purchases, auction purchases)
fields += [
    "sum(purchases_count) as purchases",
    "sum(registrations_count) as registrations",
    "count(distinct(case when t.isexpresscheckout then t.form else null end)) as expressdonate_forms",
    "sum(t.isexpresscheckout::int) as expressdonate_count",
    "sum(case when t.isexpresscheckout then t.amount else null end) as expressdonate_vol",
    "count(distinct(case when t.gift_assist_count then t.form else null end)) as giftassist_forms",
    "sum(t.gift_assist_count) as giftassist_count",
    "sum(t.gift_assist_amt) as giftassist_vol",
    "sum(t.purchases_count) as store_purchases_count",
    "sum(t.purchases_amt) as store_purchases_vol",
    "sum(t.auctionpurchase_count) as auction_purchases_count"
]
# teams (2) & registrations (10)
fields += [
    "count(distinct(case when t.transdonationentitytype=2 and t.source='p2p' then t.id else null end)) as donations_to_teams_count",
    "sum(case when t.transdonationentitytype=2 and t.source='p2p' then t.amount else null end) as donations_to_teams_vol",
    "count(distinct(case when t.transdonationentitytype=2 and t.source='p2p' then t.transdonationentity else null end)) as active_teams",
    "count(distinct(case when t.transdonationentitytype=10 and t.source='p2p' then t.id else null end)) as donations_to_registrations_count",
    "sum(case when t.transdonationentitytype=10 and t.source='p2p' then t.amount else null end) as donations_to_registrations_vol",
    "count(distinct(case when t.transdonationentitytype=10 and t.source='p2p' then t.transdonationentity else null end)) as active_registrations",
    "sum(t.registrations_count) as registrations_count",
    "sum(t.registrations_amt) as registrations_vol"
]

In [8]:
q = '''select {}
        from production.transactions as t
        where
            t.status='A' and t.date>='{}'
        group by date_trunc('month', t.date)'''.format(", ".join(fields), start_date)
df_trans = redshift_query_read(q, schema='production')

In [9]:
df_trans.tail(2)

Unnamed: 0,month,orgs,forms,donors,trans_count,trans_vol,trans_new_count,trans_new_vol,trans_avg_amount,trans_onetime,...,store_purchases_vol,auction_purchases_count,donations_to_teams_count,donations_to_teams_vol,active_teams,donations_to_registrations_count,donations_to_registrations_vol,active_registrations,registrations_count,registrations_vol
77,2021-06-01,2463,6748,141299,170986,19054260.84,89884,14933740.6,166.144593,86156,...,41313.26,1806,3364,672897.54,825,8329,853016.21,3261,6697,80862.0
78,2023-02-01,2858,8556,194939,235206,25442989.32,133694,19500351.56,145.858091,129864,...,87635.44,3344,6040,668787.68,1701,30590,2870688.05,9140,22568,270965.5


## GA

In [10]:
fields = ["date_trunc('month', ga.date) as month",
          "sum(ga.views) as pageviews",
          "sum(case when ga.devicecategory='desktop' then ga.views else null end) as pageviews_desktop",
          "sum(case when ga.devicecategory='mobile' then ga.views else null end) as pageviews_mobile",
          "sum(case when ga.devicecategory='tablet' then ga.views else null end) as pageviews_tablet"]

In [11]:
q = '''select {}
        from production.ga as ga
        where
            ga.date>='{}'
        group by date_trunc('month', ga.date)'''.format(", ".join(fields), start_date)
df_ga = redshift_query_read(q, schema='production')

In [12]:
df_ga['pageviews'].fillna(0, inplace=True)
df_ga['pageviews_desktop'].fillna(0, inplace=True)
df_ga['pageviews_mobile'].fillna(0, inplace=True)
df_ga['pageviews_tablet'].fillna(0, inplace=True)

In [13]:
df_ga['month'].min()

Timestamp('2018-01-01 00:00:00')

In [14]:
df_ga.sort_values('month', ascending=True).tail(4)

Unnamed: 0,month,pageviews,pageviews_desktop,pageviews_mobile,pageviews_tablet
72,2024-04-01,2839072,1622765,1205571,10736
48,2024-05-01,2428457,1499838,913981,14628
26,2024-06-01,2014907,1282191,720464,12216
37,2024-07-01,3963,2290,1647,26


### compare to lower resolution

In [15]:
fields = ["date_trunc('month', ga.week) as month",
          "sum(ga.views) as pageviews",
          "sum(case when ga.devicecategory='desktop' then ga.views else null end) as pageviews_desktop",
          "sum(case when ga.devicecategory='mobile' then ga.views else null end) as pageviews_mobile",
          "sum(case when ga.devicecategory='tablet' then ga.views else null end) as pageviews_tablet"]

q = '''select {}
        from production.ga4_traffic_weekly_device as ga
        where ga.week>='{}' 
        group by date_trunc('month', ga.week)'''.format(", ".join(fields), start_date)
df_ga_week = redshift_query_read(q, schema='production')

In [16]:
df_ga_week['pageviews'].fillna(0, inplace=True)
df_ga_week['pageviews_desktop'].fillna(0, inplace=True)
df_ga_week['pageviews_mobile'].fillna(0, inplace=True)
df_ga_week['pageviews_tablet'].fillna(0, inplace=True)

In [17]:
df_ga_week['month'].min()

Timestamp('2023-07-01 00:00:00')

In [18]:
q = '''select
            date_trunc('month', ga.week) as month,
            sum(ga.views) as pageviews_new
        from production.ga4_traffic_weekly as ga
        where ga.week>='{}' 
        group by date_trunc('month', ga.week)'''.format(start_date)
df_ga_week_nodev = redshift_query_read(q, schema='production')

In [19]:
df_months = df_ga[['month', 'pageviews']].copy()
df_months.columns = ['month', 'pageviews_old']

df_months = df_months.merge(df_ga_week[['month', 'pageviews']], on='month')
df_months = df_months.merge(df_ga_week_nodev, on='month')

df_months['pageviews_old'] = df_months['pageviews_old'].apply(lambda x: "{:,}".format(x))
df_months['pageviews_new'] = df_months['pageviews_new'].apply(lambda x: "{:,}".format(x))
df_months['pageviews_new_device'] = df_months['pageviews'].apply(lambda x: "{:,}".format(x))
df_months.drop('pageviews', axis=1, inplace=True)

df_months.sort_values('month')

Unnamed: 0,month,pageviews_old,pageviews_new,pageviews_new_device
10,2023-07-01,4489051,1153473,982181
2,2023-08-01,6010484,1619512,1396147
4,2023-09-01,6886439,2369275,2087717
1,2023-10-01,3212297,2766447,2437753
8,2023-11-01,2181027,1709492,1495564
6,2023-12-01,1544305,971188,826364
9,2024-01-01,1945672,2029309,1803461
0,2024-02-01,2341039,2457169,2231084
5,2024-03-01,2701030,2397268,2123792
11,2024-04-01,2839072,3496219,3145314


### pulling max views available

In [20]:
# old source
fields = ["date_trunc('month', ga.date) as month",
          "ga.org as org",
          "ga.form as form",
          "sum(ga.views) as pageviews",
          "sum(case when ga.devicecategory='desktop' then ga.views else null end) as pageviews_desktop",
          "sum(case when ga.devicecategory='mobile' then ga.views else null end) as pageviews_mobile",
          "sum(case when ga.devicecategory='tablet' then ga.views else null end) as pageviews_tablet"]
q = '''select {}
        from production.ga as ga
        where
            ga.date>='{}'
        group by org, form, date_trunc('month', ga.date)'''.format(", ".join(fields), start_date)
df_ga = redshift_query_read(q, schema='production')

# new, device
fields = ["date_trunc('month', ga.week) as month",
          "ga.org as org",
          "ga.form as form",
          "sum(ga.views) as pageviews_new_device",
          "sum(case when ga.devicecategory='desktop' then ga.views else null end) as pageviews_desktop_new",
          "sum(case when ga.devicecategory='mobile' then ga.views else null end) as pageviews_mobile_new",
          "sum(case when ga.devicecategory='tablet' then ga.views else null end) as pageviews_tablet_new"]
q = '''select {}
        from production.ga4_traffic_weekly_device as ga
        where
            ga.week>='{}'
        group by org, form, date_trunc('month', ga.week)'''.format(", ".join(fields), start_date)
df_new_dev = redshift_query_read(q, schema='production')

# new, no device
fields = ["date_trunc('month', ga.week) as month",
          "ga.org as org",
          "ga.form as form",
          "sum(ga.views) as pageviews_new"]
q = '''select {}
        from production.ga4_traffic_weekly as ga
        where
            ga.week>='{}'
        group by org, form, date_trunc('month', ga.week)'''.format(", ".join(fields), start_date)
df_new = redshift_query_read(q, schema='production')

In [21]:
mrg_cols = ['month', 'form', 'org']
df_traff_all = df_ga.merge(df_new_dev, on=mrg_cols).merge(df_new, on=mrg_cols)

In [22]:
df_traff_all['pageviews_max'] = df_traff_all[['pageviews', 'pageviews_new', 'pageviews_new_device']].max(axis=1)
df_traff_all['pageviews_desktop_max'] = df_traff_all[['pageviews_desktop', 'pageviews_desktop_new']].max(axis=1)
df_traff_all['pageviews_mobile_max'] = df_traff_all[['pageviews_mobile', 'pageviews_mobile_new']].max(axis=1)
df_traff_all['pageviews_tablet_max'] = df_traff_all[['pageviews_tablet', 'pageviews_tablet_new']].max(axis=1)

In [23]:
monthly_sums = df_traff_all.groupby('month')[['pageviews', 'pageviews_max']].sum().reset_index()
monthly_sums['perc_growth'] = monthly_sums['pageviews_max'] / monthly_sums['pageviews']

print("Mean growth: {:.2f}%; median growth: {:.2f}%".format(monthly_sums['perc_growth'].mean() * 100., monthly_sums['perc_growth'].median() * 100.))
print("Instances of old > new-max: {}".format(len(monthly_sums[monthly_sums['pageviews'] > monthly_sums['pageviews_max']])))
monthly_sums.tail(3)

Mean growth: 117.49%; median growth: 114.04%
Instances of old > new-max: 0


Unnamed: 0,month,pageviews,pageviews_max,perc_growth
9,2024-04-01,2325239,3424058,1.472562
10,2024-05-01,2333854,2753182,1.179672
11,2024-06-01,1620875,1621168,1.000181


In [24]:
max_cols = ['month', 'pageviews_max', 'pageviews_desktop_max',
            'pageviews_mobile_max', 'pageviews_tablet_max']
df_ga_max = df_traff_all.groupby('month')[max_cols].sum().reset_index()
df_ga_max.columns = ['month', 'pageviews', 'pageviews_desktop',
                     'pageviews_mobile', 'pageviews_tablet']
df_ga_max.sort_values('month', ascending=False).head(3)

Unnamed: 0,month,pageviews,pageviews_desktop,pageviews_mobile,pageviews_tablet
11,2024-06-01,1621168,1054335.0,557136.0,9782.0
10,2024-05-01,2753182,1643372.0,1027677.0,20134.0
9,2024-04-01,3424058,1910819.0,1329680.0,10027.0


In [25]:
df_ga_max['month'].min(), df_ga_max['month'].max()

(Timestamp('2023-07-01 00:00:00'), Timestamp('2024-06-01 00:00:00'))

In [26]:
grpd = df_ga[df_ga['month']<df_ga_max['month'].min()]
grpd = grpd.groupby('month')['pageviews', 'pageviews_desktop',
                             'pageviews_mobile', 'pageviews_tablet'].sum().reset_index()
df_ga_max = df_ga_max.append(grpd)

  


In [27]:
df_ga_max['month'].min(), df_ga_max['month'].max()

(Timestamp('2018-01-01 00:00:00'), Timestamp('2024-06-01 00:00:00'))

## analytics

In [28]:
q = '''select
            date as month,
            avg(events_count) as events_yearround,
            count(distinct(case when conditional_fields>0 then form else null end)) as conditional_fields_forms,
            count(distinct(case when cta_before=1 then form else null end)) as cta_before_forms,
            count(distinct(case when cta_after=1 then form else null end)) as cta_after_forms
        from analyticsqgiv_monthly 
        where date>='{}'
        group by date'''.format(start_date)
df_analytics_qgiv = redshift_query_read(q, schema='public')

In [29]:
df_analytics_qgiv.sort_values('month', ascending=True).tail(2)

Unnamed: 0,month,events_yearround,conditional_fields_forms,cta_before_forms,cta_after_forms
52,2024-06-01,0,1773,46213,4007
60,2024-07-01,0,1788,50067,4073


In [30]:
q = '''select
            date as month,
            sum(events_count) as events_yearround,
            sum(case when conditional_fields>0 then 1 else 0 end) as conditional_fields_forms,
            sum(cta_before) as cta_before_forms,
            sum(cta_after) as cta_after_forms
        from analyticsp2p_monthly 
        where date>='{}'
        group by date'''.format(start_date)
#df_analytics_p2p = redshift_query_read(q, schema='public')

In [31]:
#df_analytics_p2p.tail(2).transpose()

## merge

In [32]:
mrgd = df_trans.merge(df_mdn_onetime, on='month', how='left').merge(df_mdn_rec, on='month', how='left')
mrgd = mrgd.merge(df_ga_max, on='month', how='left').merge(df_analytics_qgiv, on='month', how='left')

In [33]:
print("{} rows".format(len(mrgd)))
print("{} to {}".format(mrgd['month'].min(), mrgd['month'].max()))

79 rows
2018-01-01 00:00:00 to 2024-07-01 00:00:00


In [34]:
mrgd.sort_values('month', ascending=True, inplace=True)

In [35]:
mrgd.tail(4).transpose()

Unnamed: 0,28,74,40,50
month,2024-04-01 00:00:00,2024-05-01 00:00:00,2024-06-01 00:00:00,2024-07-01 00:00:00
orgs,3831,3834,3853,1956
forms,11635,11668,11420,3930
donors,280031,236333,208772,19411
trans_count,397064,296443,273166,22729
trans_vol,44134696.09,42285115.76,33045754.63,4414063.87
trans_new_count,215325,166511,132895,10124
trans_new_vol,34993129.72,35107836.17,25699080.58,3616769.3
trans_avg_amount,162.513084,210.843945,193.378837,357.247066
trans_onetime,207400,160893,126778,8425


In [36]:
mrgd.to_csv("dashboard.stats.csv", index=False)

# forms stats build

## medians

In [37]:
# median one time
q = '''select
            form,
            date_trunc('month', date) as month,
            median(case when recurring=0 then amount else null end) as trans_mdn_onetime
        from production.transactions
        where status='A' and date>='{}'
        group by form, date_trunc('month', date)'''.format(start_date)
df_mdn_onetime = redshift_query_read(q, schema='production').fillna(0)

In [38]:
df_mdn_onetime.tail(2)

Unnamed: 0,form,month,trans_mdn_onetime
524706,1036210,2024-07-01,846.23
524707,1036526,2024-07-01,10.0


In [39]:
# median recurring
q = '''select
            form,
            date_trunc('month', date) as month,
            median(case when recurring_origin=1 then amount else null end) as trans_mdn_rec
        from production.transactions
        where status='A' and date>='{}'
        group by form, date_trunc('month', date)'''.format(start_date)
df_mdn_rec = redshift_query_read(q, schema='production').fillna(0)

In [40]:
df_mdn_rec.tail(2)

Unnamed: 0,form,month,trans_mdn_rec
524706,1036210,2024-07-01,0.0
524707,1036526,2024-07-01,0.0


## transactions general stats

In [41]:
# base
fields = [
    "t.form",
    "f.type as product",
    "f.template as frontend_template",
    "date_trunc('month', t.date) as month",
    "o.segment as ntee"
]
# base processing
fields += [
    "count(distinct(t.id)) as trans_count",
    "sum(t.amount) as trans_vol",
    "count(distinct(case when t.recurring=0 or t.recurring_origin=1 then t.id else null end)) as trans_new_count",
    "sum(case when t.recurring=0 or t.recurring_origin=1 then amount else null end) as trans_new_vol",
    "avg(case when t.recurring=0 or t.recurring_origin=1 then t.amount else null end) as trans_avg_amount"
]
# onetime vs recurring
fields += [
    "count(distinct(case when t.recurring=0 then t.id else null end)) as trans_onetime_count",
    "avg(case when t.recurring=0 then t.amount else null end) as trans_avg_onetime",
    "count(distinct(case when t.recurring_origin=1 then t.id else null end)) as trans_rec_origin_count",
    "avg(case when t.recurring_origin=1 then t.amount else null end) as trans_avg_rec"
]
# platform (desktop, mobile, tablet)
fields += [
    "count(distinct(case when (t.platform='Windows' or t.platform='Mac') and (recurring=0 or recurring_origin=1) then t.id else null end)) as trans_count_desktop",
    "count(distinct(case when (t.platform='iPhone' or t.platform='Android') and (t.recurring=0 or t.recurring_origin=1) then t.id else null end)) as trans_count_mobile"
]
# transaction types (gift assist, express donate, p2p store purchases, auction purchases)
fields += [
    "sum(t.isexpresscheckout::int) as expressdonate_count",
    "count(distinct(case when t.isexpresscheckout and (t.recurring=0 or t.recurring_origin=1) then t.id else null end)) as expressdonate_new_count",
    "sum(t.amount) as expressdonate_vol",
    "sum(case when t.isexpresscheckout and (t.recurring=0 or t.recurring_origin=1) then t.amount else null end) as expressdonate_new_vol",
    "sum(t.gift_assist_count) as giftassist_count",
    "sum(t.gift_assist_amt) as giftassist_vol",
    "sum(t.purchases_count) as store_purchases_count",
    "sum(t.purchases_amt) as store_purchases_vol",
    "sum(t.auctionpurchase_count) as auction_purchases_count",
]
# teams (2) & participants (10)
fields += [
    "count(distinct(case when t.transdonationentitytype=2 and t.source='p2p' then t.id else null end)) as donations_to_teams_count",
    "sum(case when t.transdonationentitytype=2 and t.source='p2p' then t.amount else null end) as donations_to_teams_vol",
    "count(distinct(case when t.transdonationentitytype=2 and t.source='p2p' then t.transdonationentity else null end)) as active_teams",
    "count(distinct(case when t.transdonationentitytype=10 and t.source='p2p' then t.id else null end)) as donations_to_registrations_count",
    "sum(case when t.transdonationentitytype=10 and t.source='p2p' then t.amount else null end) as donations_to_registrations_vol",
    "count(distinct(case when t.transdonationentitytype=10 and t.source='p2p' then t.transdonationentity else null end)) as active_registrations",
    "sum(t.registrations_count) as registrations_count",
    "sum(t.registrations_amt) as registrations_vol"
]

In [42]:
q = '''select {}
        from production.transactions as t
            left join production.form as f on t.form=f.id
            left join production.organization as o on f.org=o.id
        where
            t.status='A' and t.date>='{}'
        group by t.form, date_trunc('month', t.date), f.type, f.template, o.segment'''.format(", ".join(fields), start_date)
df_trans = redshift_query_read(q, schema='production')

In [43]:
df_trans.tail(2).transpose()

Unnamed: 0,524706,524707
form,1004308,890983
product,1.0,1.0
frontend_template,8.0,8.0
month,2024-07-01 00:00:00,2024-07-01 00:00:00
ntee,B - Educational Institutions,P - Human Services
trans_count,1,1
trans_vol,250.0,1600.0
trans_new_count,1,1
trans_new_vol,250.0,1600.0
trans_avg_amount,250.0,1600.0


In [44]:
df_trans.groupby('month')['form'].nunique().tail(6)

month
2024-02-01    10248
2024-03-01    10991
2024-04-01    11635
2024-05-01    11668
2024-06-01    11420
2024-07-01     3930
Name: form, dtype: int64

## GA

In [45]:
fields = ["ga.form",
          "date_trunc('month', ga.date) as month",
          "sum(ga.views) as pageviews",
          "sum(case when ga.devicecategory='desktop' then ga.views else null end) as pageviews_desktop",
          "sum(case when ga.devicecategory='mobile' then ga.views else null end) as pageviews_mobile",
          "sum(case when ga.devicecategory='tablet' then ga.views else null end) as pageviews_tablet"]

In [46]:
q = '''select {}
        from production.ga as ga
        where
            ga.date>='{}'
        group by ga.form, date_trunc('month', ga.date)'''.format(", ".join(fields), start_date)
df_ga = redshift_query_read(q, schema='production')

In [47]:
df_ga['pageviews'].fillna(0, inplace=True)
df_ga['pageviews_desktop'].fillna(0, inplace=True)
df_ga['pageviews_mobile'].fillna(0, inplace=True)
df_ga['pageviews_tablet'].fillna(0, inplace=True)

In [48]:
df_ga.tail(2)

Unnamed: 0,form,month,pageviews,pageviews_desktop,pageviews_mobile,pageviews_tablet
641375,979310,2023-12-01,3,3.0,0.0,0.0
641376,919671,2021-09-01,1,0.0,1.0,0.0


In [49]:
max_cols = ['month', 'form', 'pageviews_max',
            'pageviews_desktop_max', 'pageviews_mobile_max',
            'pageviews_tablet_max']

df_ga_max = df_traff_all[max_cols]
df_ga_max.columns = ['month', 'form', 'pageviews',
                     'pageviews_desktop', 'pageviews_mobile',
                     'pageviews_tablet']

In [50]:
len(df_ga_max), df_ga_max['month'].min(), df_ga_max['month'].max()

(10295, Timestamp('2023-07-01 00:00:00'), Timestamp('2024-06-01 00:00:00'))

In [51]:
df_ga_max.tail(2)

Unnamed: 0,month,form,pageviews,pageviews_desktop,pageviews_mobile,pageviews_tablet
10293,2024-05-01,1032459,18,14.0,,
10294,2024-05-01,1031799,9,9.0,,


In [52]:
df_ga_max = df_ga_max.append(df_ga[df_ga['month']<df_ga_max['month'].min()])

In [53]:
len(df_ga_max), df_ga_max['month'].min(), df_ga_max['month'].max()

(564453, Timestamp('2018-01-01 00:00:00'), Timestamp('2024-06-01 00:00:00'))

## analytics

In [54]:
q = '''select
            form,
            date as month,
            events_count as events_yearround,
            conditional_fields,
            cta_before,
            cta_after
        from analyticsqgiv_monthly 
        where date>='{}' '''.format(start_date)
df_analytics_qgiv = redshift_query_read(q, schema='public')

In [55]:
q = '''select widget as form from embed'''
embeds = redshift_query_read(q, schema='production')

df_analytics_qgiv['is_embed'] = df_analytics_qgiv['form'].isin(embeds['form'].tolist())

In [56]:
df_analytics_qgiv['is_embed'].value_counts()

False    1758035
True      517396
Name: is_embed, dtype: int64

In [57]:
df_analytics_qgiv.tail(2)

Unnamed: 0,form,month,events_yearround,conditional_fields,cta_before,cta_after,is_embed
2275429,974825,2021-06-01,0,0,0,0,False
2275430,1010235,2024-07-01,1,0,0,0,False


## representative forms

In [58]:
df_rep_forms = pd.read_csv("../representative forms/filtered_forms.csv")['form'].tolist()

In [59]:
df_rep_forms[-2:]

[1020596, 1022212]

## merge

In [60]:
len(df_trans)

524708

In [61]:
mrgd = df_trans.merge(df_mdn_onetime, on=['form', 'month'], how='left').merge(df_mdn_rec, on=['form', 'month'], how='left')
mrgd = mrgd.merge(df_ga_max, on=['form', 'month'], how='left').merge(df_analytics_qgiv, on=['form', 'month'], how='left')
mrgd['rep_forms'] = mrgd['form'].isin(df_rep_forms)

In [62]:
print("{:,} rows".format(len(mrgd)))
print("{:,} unique forms".format(len(mrgd['form'].unique())))
print("{:,} unique months".format(len(mrgd['month'].unique())))
print("{} to {}".format(mrgd['month'].min(), mrgd['month'].max()))

524,723 rows
57,260 unique forms
79 unique months
2018-01-01 00:00:00 to 2024-07-01 00:00:00


In [63]:
mrgd.to_csv("dashboard.stats_forms.csv", index=False)

In [64]:
mrgd.head(2)

Unnamed: 0,form,product,frontend_template,month,ntee,trans_count,trans_vol,trans_new_count,trans_new_vol,trans_avg_amount,...,pageviews,pageviews_desktop,pageviews_mobile,pageviews_tablet,events_yearround,conditional_fields,cta_before,cta_after,is_embed,rep_forms
0,11,1.0,8.0,2020-04-01,X - Religion; Spiritual Development,4,191.67,0,,,...,29.0,19.0,10.0,0.0,1.0,0.0,0.0,0.0,False,False
1,11,1.0,8.0,2020-06-01,X - Religion; Spiritual Development,3,91.67,0,,,...,38.0,26.0,12.0,0.0,1.0,0.0,0.0,0.0,False,False


In [65]:
mrgd['conversion_onetime'] = (mrgd['trans_onetime_count'] / mrgd['pageviews']) * 100.
mrgd['conversion_recurring'] = (mrgd['trans_rec_origin_count'] / mrgd['pageviews']) * 100.
mrgd['conversion'] = mrgd['conversion_onetime'] + mrgd['conversion_recurring']

In [66]:
mrgd[mrgd['rep_forms']][['month', 'form', 'pageviews_mobile', 'trans_count_mobile']]

Unnamed: 0,month,form,pageviews_mobile,trans_count_mobile
79,2020-01-01,1293,30.0,1
82,2019-02-01,1293,56.0,1
83,2019-05-01,1293,48.0,0
198,2018-02-01,47162,740.0,115
201,2019-01-01,47162,634.0,117
...,...,...,...,...
524667,2024-06-01,788554,16.0,2
524690,2024-06-01,928742,,0
524693,2024-06-01,1012737,,0
524712,2024-07-01,1017351,,2


In [67]:
grpd_embeds = mrgd.groupby(['month', 'is_embed'])['trans_vol'].agg(['sum', 'mean', 'median']).reset_index()

In [68]:
grpd_embeds.head(2)

Unnamed: 0,month,is_embed,sum,mean,median
0,2018-01-01,False,3673082.73,2810.315784,326.0
1,2018-01-01,True,4324674.25,3386.589076,686.67


In [69]:
pvt_embeds = grpd_embeds.pivot(index='month', columns='is_embed', values=['sum', 'mean', 'median']).reset_index()
pvt_embeds.head(2)

Unnamed: 0_level_0,month,sum,sum,mean,mean,median,median
is_embed,Unnamed: 1_level_1,False,True,False,True,False,True
0,2018-01-01,3673082.73,4324674.25,2810.315784,3386.589076,326.0,686.67
1,2019-02-01,4499717.05,6097481.24,2379.543654,3321.068214,225.0,612.5


In [70]:
cols = ['month', 
        'nonembed_vol', 'embed_vol',
        'nonembed_vol_mean', 'embed_vol_mean',
        'nonembed_vol_median', 'embed_vol_median']
pvt_embeds.columns = cols
pvt_embeds.head(2)

Unnamed: 0,month,nonembed_vol,embed_vol,nonembed_vol_mean,embed_vol_mean,nonembed_vol_median,embed_vol_median
0,2018-01-01,3673082.73,4324674.25,2810.315784,3386.589076,326.0,686.67
1,2019-02-01,4499717.05,6097481.24,2379.543654,3321.068214,225.0,612.5


# cool stuff data

- daily stats
- monthly basics with YoY

In [71]:
q = '''select
            date,
            count(distinct(org)) as orgs,
            count(distinct(form)) as forms,
            count(id) as trans_count,
            sum(amount) as trans_vol,
            count(case when recurring=0 then id else null end) as trans_count_onetime,
            count(case when recurring_origin=1 then id else null end) as trans_count_recurring,
            sum(case when recurring=0 then amount else null end) as trans_vol_onetime,
            sum(case when recurring_origin=1 then amount else null end) as trans_vol_recurring
        from transactions
        where status='A'
        group by date 
        order by date desc limit 30'''
cool_stuff_daily = redshift_query_read(q, schema='production')

In [72]:
cool_stuff_daily.head(2)

Unnamed: 0,date,orgs,forms,trans_count,trans_vol,trans_count_onetime,trans_count_recurring,trans_vol_onetime,trans_vol_recurring
0,2024-07-02,1343,2381,8922,1155446.89,4234,222,889027.89,21152.2
1,2024-07-01,1565,2879,13807,3258616.98,4191,1477,997404.77,1709184.44


In [73]:
cool_stuff_daily.to_csv("cool.daily.csv", index=False)

In [74]:
q = '''select
            date_trunc('month', date) as month,
            count(distinct(org)) as orgs,
            count(distinct(form)) as forms,
            count(id) as trans_count,
            sum(amount) as trans_vol,
            count(case when recurring=0 then id else null end) as trans_count_onetime,
            count(case when recurring_origin=1 then id else null end) as trans_count_recurring,
            sum(case when recurring=0 then amount else null end) as trans_vol_onetime,
            sum(case when recurring_origin=1 then amount else null end) as trans_vol_recurring
        from transactions
        where status='A'
        group by date_trunc('month', date)
        order by date_trunc('month', date) desc limit 24'''
cool_stuff_monthly = redshift_query_read(q, schema='production')

In [75]:
cool_stuff_monthly.head(2)

Unnamed: 0,month,orgs,forms,trans_count,trans_vol,trans_count_onetime,trans_count_recurring,trans_vol_onetime,trans_vol_recurring
0,2024-07-01,1956,3930,22729,4414063.87,8425,1699,1886432.66,1730336.64
1,2024-06-01,3853,11420,273166,33045754.63,126778,6117,25084174.55,614906.03


In [76]:
cool_stuff_monthly.to_csv("cool.monthly.csv", index=False)

In [77]:
cool_stuff_monthly['dt.month'] = pd.to_datetime(cool_stuff_monthly['month']).dt.month

cool_stuff_monthly.sort_values('month', ascending=True, inplace=True)
yoy = cool_stuff_monthly.drop('month', axis=1).groupby('dt.month').pct_change()
yoy['month'] = cool_stuff_monthly['month']

In [78]:
yoy.dropna()

Unnamed: 0,orgs,forms,trans_count,trans_vol,trans_count_onetime,trans_count_recurring,trans_vol_onetime,trans_vol_recurring,month
11,0.09065,0.14511,0.189126,0.198367,0.293386,0.004926,0.266424,-0.241237,2023-08-01
10,0.089013,0.142536,0.151272,0.210094,0.134,1.332711,0.225658,0.658298,2023-09-01
9,0.091411,0.138879,0.172444,0.259857,0.174509,-0.070497,0.291493,0.158825,2023-10-01
8,0.079652,0.132736,0.119452,0.149983,0.102514,-0.10995,0.155659,-0.07079,2023-11-01
7,0.077548,0.12954,0.103934,0.113225,0.076438,-0.042149,0.116041,-0.129796,2023-12-01
6,0.095016,0.151418,0.268053,0.269874,0.324296,0.997256,0.300723,0.902426,2024-01-01
5,0.148355,0.197756,0.240899,0.243509,0.263583,0.450131,0.266127,0.198716,2024-02-01
4,0.175217,0.192212,0.328845,0.253574,0.222755,0.570576,0.248532,0.180066,2024-03-01
3,0.278278,0.251614,0.150823,0.215309,0.257938,-0.088976,0.266568,-0.180832,2024-04-01
2,0.279706,0.247514,0.186664,0.401816,0.162354,0.196847,0.439577,0.691187,2024-05-01


In [79]:
cool_monthly_data = []
for month in yoy['month'].to_list():
    entry = {'month': month}
    
    for c in yoy.columns:
        if c != 'month':
            val = cool_stuff_monthly[cool_stuff_monthly['month']==month][c].iloc[0]
            change = yoy[yoy['month']==month][c].iloc[0]
            if 'vol' in c:
                entry[c] = "${:,.2f} ({:.2f}%)".format(val, change * 100.)
            else:
                entry[c] = "{:,} ({:.2f}%)".format(val, change * 100.)
    
    cool_monthly_data.append(entry)

In [80]:
pd.DataFrame(cool_monthly_data).tail(12)

Unnamed: 0,month,orgs,forms,trans_count,trans_vol,trans_count_onetime,trans_count_recurring,trans_vol_onetime,trans_vol_recurring
12,2023-08-01,"3,056 (9.06%)","9,296 (14.51%)","254,605 (18.91%)","$32,182,972.18 (19.84%)","142,125 (29.34%)","4,692 (0.49%)","$25,221,204.23 (26.64%)","$704,998.21 (-24.12%)"
13,2023-09-01,"3,132 (8.90%)","9,651 (14.25%)","301,944 (15.13%)","$35,874,083.75 (21.01%)","179,443 (13.40%)","11,239 (133.27%)","$28,928,926.99 (22.57%)","$563,784.72 (65.83%)"
14,2023-10-01,"3,164 (9.14%)","9,816 (13.89%)","316,432 (17.24%)","$40,294,357.35 (25.99%)","194,494 (17.45%)","4,061 (-7.05%)","$33,081,325.47 (29.15%)","$366,466.81 (15.88%)"
15,2023-11-01,"3,226 (7.97%)","10,232 (13.27%)","323,028 (11.95%)","$42,405,878.29 (15.00%)","201,394 (10.25%)","4,258 (-10.99%)","$35,360,213.31 (15.57%)","$327,440.59 (-7.08%)"
16,2023-12-01,"3,182 (7.75%)","10,045 (12.95%)","305,570 (10.39%)","$63,448,519.49 (11.32%)","182,130 (7.64%)","4,636 (-4.21%)","$56,153,068.10 (11.60%)","$396,158.09 (-12.98%)"
17,2024-01-01,"3,054 (9.50%)","9,376 (15.14%)","257,175 (26.81%)","$28,429,277.29 (26.99%)","128,511 (32.43%)","8,007 (99.73%)","$20,743,240.72 (30.07%)","$677,298.74 (90.24%)"
18,2024-02-01,"3,282 (14.84%)","10,248 (19.78%)","291,867 (24.09%)","$31,638,587.27 (24.35%)","164,094 (26.36%)","5,554 (45.01%)","$24,261,459.58 (26.61%)","$405,654.71 (19.87%)"
19,2024-03-01,"3,528 (17.52%)","10,991 (19.22%)","376,806 (32.88%)","$40,868,435.23 (25.36%)","197,393 (22.28%)","11,049 (57.06%)","$32,215,851.38 (24.85%)","$589,302.62 (18.01%)"
20,2024-04-01,"3,831 (27.83%)","11,635 (25.16%)","397,064 (15.08%)","$44,134,696.09 (21.53%)","207,400 (25.79%)","7,925 (-8.90%)","$34,506,362.04 (26.66%)","$486,767.68 (-18.08%)"
21,2024-05-01,"3,834 (27.97%)","11,668 (24.75%)","296,443 (18.67%)","$42,285,115.76 (40.18%)","160,893 (16.24%)","5,618 (19.68%)","$34,441,039.91 (43.96%)","$666,796.26 (69.12%)"
