In [91]:
import pandas as pd
import numpy as np

import sys
sys.path.insert(1, '../../scripts/')
from s3_support import *
%matplotlib inline

# load data

In [92]:
prod_forms_df = pd.read_csv("smart_amount_forms.csv")

In [93]:
# forms from Sandra
#     production query that have smart amounts enabled
prod_forms = prod_forms_df['id'].to_list()

print("production query for forms w/ smart amounts currently enabled:")
print("{:,} forms".format(len(prod_forms)))

production query for forms w/ smart amounts currently enabled:
500 forms


In [94]:
q = '''select 
            form, 
            created, 
            message 
        from syslog_logs 
        where message like '%Smart Amount%' '''
logs = redshift_query_read(q, schema='production')

print("{:,} smart amounts log entries; {:,} forms".format(len(logs), len(logs['form'].unique())))
print("ranging from {:%Y-%m-%d} to {:%Y-%m-%d}".format(logs['created'].min(), logs['created'].max()))

1,364 smart amounts log entries; 939 forms
ranging from 2022-11-07 to 2024-07-24


In [95]:
logs_forms = logs['form'].unique()
r = [p for p in prod_forms if p not in logs_forms]
print("Forms in production export not in logs: {:,}".format(len(r)))

Forms in production export not in logs: 268


In [96]:
logs['enabled'] = logs['message'].str.contains('Enabled').fillna(False)
logs['disabled'] = logs['message'].str.contains('Disabled').fillna(False)
entry_counts = logs.groupby('form')[['enabled', 'disabled']].sum().reset_index()

print("{:,} forms have only one entry (enabled)".format(len(entry_counts[(entry_counts['enabled']==1)&(entry_counts['disabled']==0)])))
print("{:,} forms enabled == disabled".format(len(entry_counts[entry_counts['enabled']==entry_counts['disabled']])))

424 forms have only one entry (enabled)
283 forms enabled == disabled


In [97]:
start_date = "{:%Y-%m-%d}".format(logs['created'].min())
print("collecting all transactions from {} forward".format(start_date))

q = '''select
            date,
            form,
            org,
            count(distinct(id)) trans_count,
            sum(amount) as trans_vol
        from transactions
        where
            date > '{}' and
            status='A' and
            recurring=0
        group by form, org, date'''.format(start_date)
trans = redshift_query_read(q, schema='production')

print("{:,} one time observations".format(len(trans)))
print("{:,.2f} transactions per form per date".format(trans['trans_count'].mean()))
print("{:,} forms possibly associated w/ smart amounts".format(len(trans[trans['form'].isin(logs['form'].unique())])))

collecting all transactions from 2022-11-07 forward
754,789 one time observations
4.23 transactions per form per date
26,239 forms possibly associated w/ smart amounts


In [98]:
forms_processed = len(trans[trans['form'].isin(logs['form'].unique())|trans['form'].isin(prod_forms)]['form'].unique())

print("{} smart amounts associated forms have accepted transactions".format(forms_processed))

889 smart amounts associated forms have accepted transactions


## tag transactions using smart amounts

In [99]:
# isolate smart amounts forms to more quickly tag transactions
trans['is_smart_amounts'] = False
trans_sa_forms = trans[trans['form'].isin(logs['form'].unique())|trans['form'].isin(prod_forms)].copy()
trans_nonsa_forms = trans[~trans['form'].isin(logs['form'].unique())&~trans['form'].isin(prod_forms)].copy()

In [100]:
print("trans_sa_forms (forms associated with SA)")
print("{:,} observations".format(len(trans_sa_forms)))
print("{:,} forms".format(len(trans_sa_forms['form'].unique())))
print("{:,} transactions".format(trans_sa_forms['trans_count'].sum()))

print()

print("trans_nonsa_forms (forms never associated with SA)")
print("{:,} observations".format(len(trans_nonsa_forms)))
print("{:,} forms".format(len(trans_nonsa_forms['form'].unique())))
print("{:,} transactions".format(trans_nonsa_forms['trans_count'].sum()))

trans_sa_forms (forms associated with SA)
27,819 observations
889 forms
113,858 transactions

trans_nonsa_forms (forms never associated with SA)
726,970 observations
27,447 forms
3,077,524 transactions


In [101]:
def is_using_smart_amounts(r):
    try:
        return logs[(logs['form']==r['form'])&(logs['created']<=r['date'])].sort_values('created', ascending=True)['enabled'].iloc[0]
    except:
        return False

In [102]:
# tag smart amounts transactions
trans_sa_forms_tagged = None
for f in trans_sa_forms['form'].unique().tolist():
    these_trans = trans_sa_forms[trans_sa_forms['form']==f].copy()
    
    if f in prod_forms and f not in logs_forms:
        these_trans['is_smart_amounts'] = True
    elif f in entry_counts[(entry_counts['enabled']==1)&(entry_counts['disabled']==0)]['form'].tolist():
        # forms that were just enabled and never disabled
        activation_date = logs[logs['form']==f]['created'].iloc[0]
        these_trans['is_smart_amounts'] = these_trans['date']>=activation_date
    else:
        # forms that were enabled and disabled, possibly multiple times
        # need to verify transactions fall within active windows
        these_trans['is_smart_amounts'] = these_trans.apply(is_using_smart_amounts, axis=1)
    
    if trans_sa_forms_tagged is None:
        trans_sa_forms_tagged = these_trans
    else:
        trans_sa_forms_tagged = pd.concat([trans_sa_forms_tagged, these_trans])

In [103]:
print("{:,} tagged transactions".format(trans_sa_forms_tagged['trans_count'].sum()))
print("{:,} tagged observations".format(len(trans_sa_forms_tagged)))
print("{:,} transactions".format(trans_sa_forms_tagged['trans_count'].sum()))
print("{:,} transactions that used smart amounts".format(trans_sa_forms_tagged[trans_sa_forms_tagged['is_smart_amounts']]['trans_count'].sum()))
print("{:,} forms that have used smart amounts".format(len(trans_sa_forms_tagged[trans_sa_forms_tagged['is_smart_amounts']]['form'].unique())))

113,858 tagged transactions
27,819 tagged observations
113,858 transactions
66,062 transactions that used smart amounts
674 forms that have used smart amounts


In [104]:
trans_tagged = pd.concat([trans_sa_forms_tagged, trans_nonsa_forms]).sort_values('date', ascending=True)

_prior run_

- 4,019 tagged transactions
- 1,693,433 tagged transactions
- 850 transactions that used smart amounts

## traffic

In [105]:
q = '''select
            date,
            form,
            sum(views) as views
        from ga
        where date>='{}' 
        group by date, form'''.format(start_date)
traff = redshift_query_read(q, schema='production')

In [106]:
len(traff), traff['date'].min(), traff['date'].max()

(1252782, Timestamp('2022-11-07 00:00:00'), Timestamp('2024-07-01 00:00:00'))

In [107]:
q = '''select
            date,
            form,
            sum(views) as views
        from ga4_traffic
        where date>='{}' 
        group by date, form'''.format(start_date)
traff_ga4 = redshift_query_read(q, schema='production')

In [108]:
len(traff_ga4), traff_ga4['date'].min(), traff_ga4['date'].max()

(1031946, Timestamp('2023-01-02 00:00:00'), Timestamp('2024-07-24 00:00:00'))

In [109]:
traff = pd.concat([traff, traff_ga4]).drop_duplicates(keep='last')

In [110]:
print("{:,} observations".format(len(traff)))
print("date range: {} - {}".format(traff['date'].min(), traff['date'].max()))
print("{:,} forms".format(len(traff['form'].unique())))

2,144,512 observations
date range: 2022-11-07 00:00:00 - 2024-07-24 00:00:00
48,841 forms


## merge trans + traffic

In [111]:
mrgd = trans_tagged.merge(traff, on=['form', 'date'])

In [112]:
print("{:,} merged observations".format(len(mrgd)))
print("{:,} orgs".format(len(mrgd['org'].unique())))
print("{:,} forms".format(len(mrgd['form'].unique())))
print("date range: {} - {}".format(mrgd['date'].min(), mrgd['date'].max()))
print("{:,} forms using smart amounts".format(len(mrgd[mrgd['is_smart_amounts']]['form'].unique())))
print("mean trans/date: {:.2f}".format(mrgd['trans_count'].mean()))
print("mean views/date: {:.2f}".format(mrgd['views'].mean()))

716,470 merged observations
5,145 orgs
21,822 forms
date range: 2022-11-08 00:00:00 - 2024-07-24 00:00:00
515 forms using smart amounts
mean trans/date: 5.67
mean views/date: 82.41


# analysis

what are the averages and representational counts?

In [113]:
mrgd['conversion'] = mrgd['trans_count'] / mrgd['views']
mrgd['conversion'].describe()

count    7.164700e+05
mean              inf
std               NaN
min      2.456882e-05
25%      4.545455e-02
50%      1.052632e-01
75%      2.500000e-01
max               inf
Name: conversion, dtype: float64

In [114]:
nan_perc = mrgd['conversion'].replace(np.inf, np.nan).isna().sum() / len(mrgd)
print("{:.4f}% NaN".format(nan_perc * 100.))

0.0744% NaN


In [115]:
mrgd['conversion'].replace(np.inf, np.nan, inplace=True)
mrgd = mrgd[~mrgd['conversion'].isna()]

In [116]:
mrgd['mean_trans_value'] = mrgd['trans_vol'] / mrgd['trans_count']

In [117]:
def base_report(d):
    print("observations: {:,}".format(len(d)))
    print("forms: {:,}".format(len(d['form'].unique())))
    print("transactions: {:,}".format(d['trans_count'].sum()))
    print("mean trans/day: {:,.2f}".format(d['trans_count'].mean()))
    print("median trans/day: {:,.2f}".format(d['trans_count'].median()))
    print("mean $/day: ${:,.2f}".format(d['trans_vol'].mean()))
    print("median $/day: ${:,.2f}".format(d['trans_vol'].median()))
    print("mean conversion/day/form: {:.2f}%".format(d['conversion'].mean()))
    print("median conversion/day/form: {:.2f}%".format(d['conversion'].median()))
    print("mean $/trans: ${:.2f}".format(d['mean_trans_value'].mean()))
    print("median $/trans: ${:.2f}".format(d['mean_trans_value'].median()))
    
    print()
    print("aggregate by form daily:")
    print(d.groupby('form')[['trans_count', 'trans_vol']].sum().agg(['mean', 'median']))
    print(d.groupby('form')[['conversion']].agg(['mean', 'median']).mean())
    
    print()
    print("aggregate by form (all observations)")
    grpd = d.groupby('form')[['trans_count', 'trans_vol', 'views']].sum().reset_index()
    grpd['conversion'] = grpd['trans_count'] / grpd['views']
    print("Mean $/form: ${:,.2f}".format(grpd['trans_vol'].sum() / len(grpd['form'].unique())))
    print("Mean $/page view: ${:,.2f}".format(grpd['trans_vol'].sum() / grpd['views'].sum()))
    print("Mean conversion: {:.2f}%".format(grpd['conversion'].mean() * 100.))
    print("Median conversion: {:.2f}%".format(grpd['conversion'].median() * 100.))
    print()
    print("all observations (platform)")
    print("Mean $/trans: ${:.2f}".format(d['trans_vol'].sum() / d['trans_count'].sum()))
    print("conversion: {:.2f}%".format((d['trans_count'].sum() / d['views'].sum()) * 100.))

In [118]:
print("Smart amounts:")
print("-"*40)
base_report(mrgd[mrgd['is_smart_amounts']])

print()

print("Not smart amounts:")
print("-"*40)
base_report(mrgd[~mrgd['is_smart_amounts']])

Smart amounts:
----------------------------------------
observations: 15,837
forms: 515
transactions: 83,969
mean trans/day: 5.30
median trans/day: 2.00
mean $/day: $847.27
median $/day: $180.00
mean conversion/day/form: 0.31%
median conversion/day/form: 0.12%
mean $/trans: $230.76
median $/trans: $77.92

aggregate by form daily:
        trans_count     trans_vol
mean     163.046602  26054.763417
median    29.000000   4308.000000
conversion  mean      0.352173
            median    0.266981
dtype: float64

aggregate by form (all observations)
Mean $/form: $26,054.76
Mean $/page view: $14.42
Mean conversion: 26.88%
Median conversion: 12.01%

all observations (platform)
Mean $/trans: $159.80
conversion: 9.02%

Not smart amounts:
----------------------------------------
observations: 700,100
forms: 21,469
transactions: 3,978,848
mean trans/day: 5.68
median trans/day: 2.00
mean $/day: $957.16
median $/day: $200.00
mean conversion/day/form: 0.36%
median conversion/day/form: 0.10%
mean $/tra

is the current stats greater or lower than the given forms historical averages?

In [119]:
q = '''select
            amount
        from transactions
        where 
            date > '2021-11-21' and date < '2022-02-10' and
            status='A' and
            recurring=0'''
trans_df = redshift_query_read(q, schema='production')

print("all forms in prior year period")

trans_df['amount'].agg(['mean', 'median', 'count']).reset_index()

all forms in prior year period


Unnamed: 0,index,amount
0,mean,228.222207
1,median,55.12
2,count,368420.0


## removing outliers (top & bottom 5%)

In [120]:
trans_tagged['date'].min(), trans_tagged['date'].max()

(Timestamp('2022-11-08 00:00:00'), Timestamp('2024-07-24 00:00:00'))

### all

In [121]:
form_totals = trans_tagged.groupby('form')['trans_vol'].sum().reset_index()
form_totals.sort_values('trans_vol', ascending=False, inplace=True)
len_forms = len(form_totals)
ten_perc = int(len_forms/25)
ex_forms = form_totals[:ten_perc]['form'].tolist() + form_totals[-ten_perc:]['form'].tolist()
print("{:,} outlier forms out of {:,}".format(len(ex_forms), len_forms))

2,266 outlier forms out of 28,336


In [122]:
trans_mid = trans_tagged[~trans_tagged['form'].isin(ex_forms)]
print("All: {:,} forms, {:,} transactions".format(len(trans_tagged['form'].unique()), len(trans_tagged)))
print("All: {:,} forms, {:,} transactions".format(len(trans_mid['form'].unique()), len(trans_mid)))

All: 28,336 forms, 754,789 transactions
All: 26,070 forms, 533,219 transactions


In [123]:
avgs = trans_mid.groupby('is_smart_amounts')['trans_vol'].agg(['count', 'mean', 'median']).reset_index()

used_smart_amounts = len(trans_mid[trans_mid['is_smart_amounts']]['form'].unique())

avgs['perc_count'] = avgs['count'] / avgs['count'].sum()
avgs['perc_count'] = avgs['perc_count'].apply(lambda x: "{:.2f}%".format(x * 100.))

avgs['forms'] = used_smart_amounts
avgs.loc[~avgs['is_smart_amounts'], 'forms'] = len(trans_tagged['form'].unique()) - used_smart_amounts
avgs['trans/form'] = avgs['count'] / avgs['forms']

avgs.transpose()

Unnamed: 0,0,1
is_smart_amounts,False,True
count,521311,11908
mean,487.842899,454.476216
median,110.0,103.95
perc_count,97.77%,2.23%
forms,27708,628
trans/form,18.814458,18.961783


### isolating 2024

In [124]:
form_totals = mrgd[mrgd['date']>='2024-01-01'].groupby('form')['trans_vol'].sum().reset_index()
form_totals.sort_values('trans_vol', ascending=False, inplace=True)
len_forms = len(form_totals)
ten_perc = int(len_forms/25)
ex_forms = form_totals[:ten_perc]['form'].tolist() + form_totals[-ten_perc:]['form'].tolist()
print("{:,} outlier forms out of {:,}".format(len(ex_forms), len_forms))

806 outlier forms out of 10,079


In [125]:
trans_mid = mrgd[~mrgd['form'].isin(ex_forms)&(mrgd['date']>='2024-01-01')].copy()
print("All: {:,} forms, {:,} transactions".format(len(mrgd['form'].unique()), len(mrgd)))
print("All: {:,} forms, {:,} transactions".format(len(trans_mid['form'].unique()), len(trans_mid)))

All: 21,817 forms, 715,937 transactions
All: 9,273 forms, 178,150 transactions


In [126]:
avgs = trans_mid.groupby('is_smart_amounts')['trans_vol'].agg(['count', 'mean', 'median']).reset_index()

used_smart_amounts = len(trans_mid[trans_mid['is_smart_amounts']]['form'].unique())

avgs['perc_count'] = avgs['count'] / avgs['count'].sum()
avgs['perc_count'] = avgs['perc_count'].apply(lambda x: "{:.2f}%".format(x * 100.))

avgs['forms'] = used_smart_amounts
avgs.loc[~avgs['is_smart_amounts'], 'forms'] = len(mrgd['form'].unique()) - used_smart_amounts
avgs['trans/form'] = avgs['count'] / avgs['forms']

avgs.transpose()

Unnamed: 0,0,1
is_smart_amounts,False,True
count,172713,5437
mean,526.228562,492.332146
median,151.25,105.0
perc_count,96.95%,3.05%
forms,21506,311
trans/form,8.030922,17.482315


In [127]:
print("average conversion:")
print(trans_mid.groupby('is_smart_amounts')['conversion'].agg(['mean', 'median']).reset_index())

trans_mid['avg_amount'] = trans_mid['trans_vol'] / trans_mid['trans_count']

print()
print("average transaction amount:")
print(trans_mid.groupby('is_smart_amounts')['avg_amount'].agg(['mean', 'median']).reset_index())

average conversion:
   is_smart_amounts      mean    median
0             False  0.151919  0.083333
1              True  0.199817  0.105769

average transaction amount:
   is_smart_amounts        mean  median
0             False  180.580732  60.000
1              True  193.816953  54.675


### 2024, rep forms

In [128]:
rep_forms = pd.read_csv("../representative forms/filtered_forms.csv")

In [129]:
rep_trans = trans_mid[trans_mid['form'].isin(rep_forms['form'].tolist())]

In [130]:
avgs = rep_trans.groupby('is_smart_amounts')['trans_vol'].agg(['count', 'mean', 'median']).reset_index()

used_smart_amounts = len(rep_trans[rep_trans['is_smart_amounts']]['form'].unique())

avgs['perc_count'] = avgs['count'] / avgs['count'].sum()
avgs['perc_count'] = avgs['perc_count'].apply(lambda x: "{:.2f}%".format(x * 100.))

avgs['forms'] = used_smart_amounts
avgs.loc[~avgs['is_smart_amounts'], 'forms'] = len(mrgd['form'].unique()) - used_smart_amounts
avgs['trans/form'] = avgs['count'] / avgs['forms']

avgs.transpose()

Unnamed: 0,0,1
is_smart_amounts,False,True
count,51665,1530
mean,393.64527,325.976549
median,150.0,46.88
perc_count,97.12%,2.88%
forms,21769,48
trans/form,2.373329,31.875


In [135]:
print("per day:")
print("average conversion:")
print(rep_trans.groupby('is_smart_amounts')['conversion'].agg(['mean', 'median']).reset_index())

rep_trans['avg_amount'] = rep_trans['trans_vol'] / rep_trans['trans_count']

print()
print("average transaction amount:")
print(rep_trans.groupby('is_smart_amounts')['avg_amount'].agg(['mean', 'median']).reset_index())

per day:
average conversion:
   is_smart_amounts      mean    median
0             False  0.140882  0.083333
1              True  0.185102  0.122628

average transaction amount:
   is_smart_amounts        mean     median
0             False  106.783708  52.255556
1              True  120.266628  20.600000


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [132]:
form_sums = rep_trans.groupby(['form'])['trans_vol'].sum().reset_index()
smart_amounts_form_ids = rep_trans[rep_trans['is_smart_amounts']]['form'].unique()
form_sums['is_smart_amounts'] = form_sums['form'].isin(smart_amounts_form_ids)

In [137]:
print("per form:")
form_sums.groupby('is_smart_amounts')['trans_vol'].agg(['mean', 'median']).reset_index()

per form:


Unnamed: 0,is_smart_amounts,mean,median
0,False,14277.958743,4985.215
1,True,10512.786667,3671.2
