In [None]:
import pandas as pd
import numpy as np
import datetime

import sys
sys.path.insert(1, '../../scripts/')
from s3_support import *

In [3]:
%matplotlib inline

# load data

In [4]:
START_DATE = '2022-01-01'

### transactions

In [5]:
q_trans_onetime = '''select 
                        id,
                        date, 
                        form, 
                        amount,
                        recurring,
                        source,
                        useragent
                    from transactions
                    where 
                        date>='{}' and
                        status='A' and 
                        recurring=0 and
                        channel=1 and
                        (source='don_form' or source='mobile')'''
trans_onetime = redshift_query_read(q_trans_onetime.format(START_DATE), schema='production')
trans_onetime['is_recurring'] = False

trans_onetime = trans_onetime[trans_onetime['id']!=12774333]

In [6]:
print("{:,} transactions, {:,} unique".format(len(trans_onetime), len(trans_onetime['id'].unique())))
print("{:,} forms".format(len(trans_onetime['form'].unique())))

4,337 transactions, 4,337 unique
524 forms


In [7]:
q_trans_rec = '''select 
                        id,
                        date, 
                        form, 
                        amount,
                        recurring,
                        source,
                        useragent
                    from transactions
                    where 
                        date>='{}' and
                        status='A' and 
                        recurring!=0 and
                        channel=1 and
                        (source='don_form' or source='mobile')'''
trans_rec = redshift_query_read(q_trans_rec.format(START_DATE), schema='production')
trans_rec['is_recurring'] = True

trans_rec = trans_rec[trans_rec['id']!=12774333]

In [8]:
print("{:,} recurring transactions, {:,} unique".format(len(trans_rec), len(trans_rec['id'].unique())))
print("{:,} forms".format(len(trans_rec['form'].unique())))

9,981 recurring transactions, 9,981 unique
51 forms


In [9]:
trans = trans_onetime.append(trans_rec)

In [10]:
print("forms: {:,}".format(trans['form'].nunique()))
print("transactions: {:,}, {:,} unique".format(len(trans), trans['id'].nunique()))
print("one time transactions: {:,}, {:,} unique".format(len(trans[~trans['is_recurring']]), trans[~trans['is_recurring']]['id'].nunique()))
print("recurring transactions: {:,}, {:,} unique".format(len(trans[trans['is_recurring']]), trans[trans['is_recurring']]['id'].nunique()))

forms: 542
transactions: 14,318, 14,318 unique
one time transactions: 4,337, 4,337 unique
recurring transactions: 9,981, 9,981 unique


In [11]:
trans['date'].min(), trans['date'].max()

(Timestamp('2022-01-01 00:00:00'), Timestamp('2023-03-07 00:00:00'))

### traffic

In [12]:
q_ga = '''select
                date,
                form,
                devicecategory,
                views,
                bounces
            from googleanalytics_traffic
            where
                date>='{}' and
                (qgiv_frontend=1 or p2p_frontend=1) and
                path not like '%/receipt/%' and
                path not like '%account/%' '''
traffic = redshift_query_read(q_ga.format(START_DATE), schema='production')

In [13]:
traffic = traffic.drop_duplicates()

In [14]:
print("{:,} entries; {:,} unique".format(len(traffic), len(traffic.drop_duplicates())))
print("{:,.2f} entries per form".format(traffic.groupby('form')['views'].count().mean()))
print("{:,.2f} observation dates per form".format(traffic.groupby('form')['date'].nunique().mean()))
print("{:,.2f} views per form".format(traffic.groupby('form')['views'].sum().mean()))
print("{:,.2f} views per form per day".format(traffic.groupby(['form', 'date'])['views'].sum().mean()))

3,620,705 entries; 3,620,705 unique
115.24 entries per form
40.40 observation dates per form
886.06 views per form
21.93 views per form per day


In [15]:
traffic['date'].min(), traffic['date'].max()

(Timestamp('2022-01-01 00:00:00'), Timestamp('2023-03-06 00:00:00'))

### merge & compile data

In [16]:
trans_all = trans.copy()

trans_onetime = trans[~trans['is_recurring']]
trans_rec = trans[trans['is_recurring']]
trans_rec = trans_rec.groupby('recurring').first().reset_index()

trans = trans_onetime.append(trans_rec)

In [17]:
print("All transactions: {:,} ({:,} unique)".format(len(trans_all), len(trans_all['id'].unique())))
print("One time and originating recurring: {:,}".format(len(trans)))
print()
print("Trans per form: {:,.2f}".format(trans.groupby('form')['amount'].count().mean()))
print("Trans per form per day: {:,.2f}".format(trans.groupby(['form', 'date'])['amount'].count().mean()))
print("One time trans per form: {:,.2f}".format(trans[~trans['is_recurring']].groupby('form')['amount'].count().mean()))
print("Recurring trans per form: {:,.2f}".format(trans[trans['is_recurring']].groupby('form')['amount'].count().mean()))

All transactions: 14,318 (14,318 unique)
One time and originating recurring: 5,209

Trans per form: 9.61
Trans per form per day: 2.10
One time trans per form: 8.28
Recurring trans per form: 17.10


#### daily (all devices) data

In [18]:
# merge to daily
daily_trans = trans.groupby(['form', 'date', 'is_recurring'])['amount'].agg({'count', 'sum'}).reset_index()
daily_trans['trans_count'] = daily_trans['count']
daily_trans['trans_vol'] = daily_trans['sum']
daily_trans.drop(['count', 'sum'], axis=1, inplace=True)

daily_trans_pvt = daily_trans.pivot(index=['form', 'date'], columns='is_recurring', values=['trans_count', 'trans_vol']).reset_index()
cols = ['form', 'date', 'trans_count_onetime', 
        'trans_count_recurring', 'trans_vol_onetime', 
        'trans_vol_recurring']
daily_trans_pvt.columns = cols
daily_trans_pvt.fillna(0, inplace=True)

traffic_agg = traffic.groupby(['date', 'form'])[['views', 'bounces']].sum().reset_index()
dailies = traffic_agg.merge(daily_trans_pvt, on=['date', 'form'], how='outer')
dailies.fillna(0, inplace=True)
dailies = dailies[dailies['views']>0]

In [19]:
print("daily trans agg len: {:,}".format(len(daily_trans)))
print("daily traffic agg len: {:,}".format(len(traffic_agg)))
print("merged dailies len: {:,}".format(len(dailies)))

daily trans agg len: 2,502
daily traffic agg len: 1,269,323
merged dailies len: 1,258,493


In [20]:
print("{:,} forms in transaction aggregates".format(len(daily_trans['form'].unique())))
print("{:,} forms in traffic aggregates".format(len(traffic_agg['form'].unique())))
print("{:,} forms in merged dailies".format(len(dailies['form'].unique())))
print()

print("Dailies w/ > 0 transactions and 0 pageviews:")
print("-"*40)
print("{:,} total dailies entries".format(len(dailies)))
print("{:,} entries".format(len(dailies[(dailies['views']==0)&(dailies['trans_count_onetime']>0)])))
print("{:,} forms".format(len(dailies[(dailies['views']==0)&(dailies['trans_count_onetime']>0)]['form'].unique())))

542 forms in transaction aggregates
31,419 forms in traffic aggregates
31,284 forms in merged dailies

Dailies w/ > 0 transactions and 0 pageviews:
----------------------------------------
1,258,493 total dailies entries
0 entries
0 forms


In [21]:
dailies['conversion'] = (dailies['trans_count_onetime'] + dailies['trans_count_recurring']) / dailies['views']
dailies['conversion_onetime'] = dailies['trans_count_onetime'] / dailies['views']
dailies['conversion_recurring'] = dailies['trans_count_recurring'] / dailies['views']

In [22]:
# removing outliers
dailies = dailies[dailies['conversion']<1.]

In [23]:
dailies = dailies[dailies['date']>=START_DATE]

In [24]:
dailies['date'].min(), dailies['date'].max()

(Timestamp('2022-01-01 00:00:00'), Timestamp('2023-03-06 00:00:00'))

In [25]:
over_conv_forms = dailies[dailies['conversion']>1.]['form'].unique().tolist()
intersect_over_conv = [f for f in over_conv_forms if f in dailies[dailies['conversion']<1.]['form'].unique()]

print("{:,} forms w/ > 100% conversion".format(len(over_conv_forms)))
print("{:,} forms overlap w/ < 100% conversion".format(len(intersect_over_conv)))

0 forms w/ > 100% conversion
0 forms overlap w/ < 100% conversion


In [26]:
dailies['trans_count'] = dailies['trans_count_onetime'] + dailies['trans_count_recurring']

In [27]:
dailies['trans_vol'] = dailies['trans_vol_onetime'] + dailies['trans_vol_recurring']

In [28]:
#dailies.to_csv("dailies.csv", index=False)

# report

calculate data points from 01-2022 forward, report to include only 2023 with year over year changes

In [29]:
#dailies = pd.read_csv("dailies.csv")

In [30]:
def report(df, df_all_trans):
    df = df.copy().replace(np.inf, np.nan)
    return {
        'form sample size': len(df_all_trans['form'].unique().tolist()),
        'transactions': df_all_trans[df_all_trans['recurring']==0]['id'].nunique() + df['trans_count_recurring'].sum(),
        'transactions per form': df_all_trans.groupby('form')['id'].count().mean(),
        'conversion': df['conversion'].mean(),
        'conversion onetime': df['conversion_onetime'].mean(),
        'conversion recurring': df['conversion_recurring'].mean(),
        'mean transaction onetime': df['trans_vol_onetime'].sum() / df['trans_count_onetime'].sum(),
        'median transaction onetime': df_all_trans[~df_all_trans['is_recurring']]['amount'].median(),
        'mean transaction recurring': df['trans_vol_recurring'].sum() / df['trans_count_recurring'].sum(),
        'median transaction recurring': df_all_trans[df_all_trans['is_recurring']]['amount'].median(),
        'onetime/recurring': df['trans_count_onetime'].sum() / df['trans_count_recurring'].sum(),
        'pageviews': df['views'].sum(),
        'bounce rate': df['bounces'].sum() / df['views'].sum(),
        'transactions one time': len(df_all_trans[~df_all_trans['is_recurring']]),
        'transactions origin recurring': df['trans_count_recurring'].sum(), 
        'transactions all recurring': len(df_all_trans[df_all_trans['is_recurring']])
    }

In [46]:
# buid month ranges list to iterate through
month_ranges = []
for y in [2022, 2023]:
    for m in range(1, 13):
        y_1 = y_2 = y
        m_1 = m
        m_2 = m + 1
        if m == 12:
            m_2 = 1
            y_2 += 1
        month_ranges.append(("{:02d}-01-{}".format(m_1, y_1), ("{:02d}-01-{}".format(m_2, y_2))))

In [47]:
# build monthly dataframe from 01-2022 forward
monthly_data = None
for m in month_ranges:
    this_df = dailies[(dailies['date']>=m[0])&(dailies['date']<m[1])]
    this_df_all = trans_all[(trans_all['date']>=m[0])&(trans_all['date']<m[1])]
    
    report_data = report(this_df, this_df_all)
        
    report_df = pd.DataFrame([report_data])
    
    report_df['pageviews'] = report_df['pageviews'] / report_df['pageviews'].sum()
    report_df = report_df.transpose()
    report_df['date'] = m[0]
    
    monthly_data = pd.concat([monthly_data, report_df])

  # Remove the CWD from sys.path while we load stuff.
  if sys.path[0] == '':
  
  app.launch_new_instance()


In [54]:
monthly_data = monthly_data.reset_index()
monthly_data['template'] = monthly_data['index'].str.replace(' template', '')
monthly_data.drop('index', axis=1, inplace=True)

In [55]:
vals_cols = [c for c in monthly_data.columns if c != 'template' and c != 'date']
monthly_pvt = monthly_data.pivot(index=['date'], columns='template', values=vals_cols)

In [56]:
monthly_pvt = monthly_pvt.reset_index().sort_values('date', ascending=True)

In [57]:
monthly_pvt.dropna()

Unnamed: 0_level_0,date,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
template,Unnamed: 1_level_1,bounce rate,conversion,conversion onetime,conversion recurring,form sample size,mean transaction onetime,mean transaction recurring,median transaction onetime,median transaction recurring,onetime/recurring,pageviews,transactions,transactions all recurring,transactions one time,transactions origin recurring,transactions per form
0,01-01-2022,0.193655,4.6e-05,4.5e-05,4.349064e-07,42.0,88.004,25.69,55.78,25.0,32.5,1.0,91.0,23.0,89.0,2.0,2.666667
1,01-01-2023,0.218524,8.8e-05,8e-05,7.552096e-06,90.0,684.421713,162.16,125.0,17.68,39.555556,1.0,450.0,155.0,441.0,9.0,6.622222
2,02-01-2022,0.212442,4.9e-05,4.5e-05,4.111065e-06,48.0,86.321884,50.0,51.97,25.0,69.0,1.0,140.0,21.0,139.0,1.0,3.333333
3,02-01-2023,0.296002,0.000189,0.00018,9.444381e-06,98.0,162.629771,32.875,100.0,10.0,32.75,1.0,208.0,140.0,204.0,4.0,3.510204
4,03-01-2022,0.188997,5.2e-05,5.1e-05,1.252963e-06,64.0,90.023309,17.995,55.0,25.0,69.5,1.0,277.0,24.0,275.0,2.0,4.671875
5,03-01-2023,0.300978,0.00029,0.000264,2.624672e-05,46.0,211.43,25.75,77.5,5.0,26.0,1.0,61.0,40.0,60.0,1.0,2.173913
6,04-01-2022,0.208167,7.1e-05,5.7e-05,1.411295e-05,60.0,152.51518,48.833393,69.0,10.0,0.246892,1.0,766.0,6868.0,203.0,563.0,117.85
8,05-01-2022,0.197364,7.7e-05,7.2e-05,5.34506e-06,69.0,123.656951,29.1348,62.0,10.0,3.28,1.0,335.0,770.0,285.0,50.0,15.289855
10,06-01-2022,0.209638,5.8e-05,5.2e-05,6.158962e-06,64.0,97.141838,9.974359,50.0,10.0,4.74359,1.0,331.0,227.0,292.0,39.0,8.109375
12,07-01-2022,0.193585,7.8e-05,5.7e-05,2.124469e-05,62.0,130.367941,16.398058,67.12,10.0,0.660194,1.0,215.0,1137.0,112.0,103.0,20.145161


In [None]:
#monthly_pvt.dropna().to_csv("new_forms.applepay.csv", index=False)

## Spot checking form 986366

In [70]:
target_form = 986366
trans_cols = ['trans_count_onetime', 'trans_count_recurring', 'trans_vol_onetime', 
              'trans_vol_recurring', 'trans_count', 'trans_vol']
dailies[(dailies['date']>='2022-04-01')&(dailies['date']<'2022-05-01')&(dailies['form']==target_form)][trans_cols].sum()

trans_count_onetime          4.0
trans_count_recurring      562.0
trans_vol_onetime            0.0
trans_vol_recurring      27478.2
trans_count                566.0
trans_vol                27478.2
dtype: float64

In [73]:
this_trans = trans_all[(trans_all['date']>='2022-04-01')&(trans_all['date']<'2022-05-01')&(trans_all['form']==target_form)]

print("{:,} transactions".format(len(this_trans)))
print("{:,} unique recurring".format(len(this_trans['recurring'].unique())))
this_trans[['amount', 'is_recurring']].sum()

6,843 transactions
608 unique recurring


amount          257125.8
is_recurring      6839.0
dtype: float64