In [75]:
import pandas as pd
import numpy as np
import sys
sys.path.insert(1, '../../scripts/')
from s3_support import *
%matplotlib inline

# transactions

In [32]:
# getting one time
q = '''select 
            form,
            date, 
            count(id) as count 
        from transactions 
        where status='A' and source='don_form' and recurring=0
        group by form, date
    '''
trans = redshift_query_read(q)

In [33]:
print(len(trans))
trans.head(3)

1034526


Unnamed: 0,form,date,count
0,187,2008-09-12,1
1,167,2008-09-12,3
2,163,2008-09-12,1


In [106]:
# get first recurring entry
q = '''select form, min(date), recurring from transactions where status='A' and source='don_form' group by form, recurring'''
rec = redshift_query_read(q)

rec.columns = ['form', 'date', 'recurring']
rec = rec.groupby(['form', 'date'])['recurring'].count().reset_index()

In [107]:
rec.tail(3)

Unnamed: 0,form,date,recurring
128670,960591,2020-07-31,1
128671,960611,2020-07-31,1
128672,960617,2020-07-31,1


In [108]:
# merge recurring w/ one time
trans['onetime'] = trans['count']
trans = trans[['form', 'date', 'onetime']].merge(rec[['form', 'date', 'recurring']], on=['form', 'date'], how='outer')

In [109]:
trans.fillna(0, inplace=True)
trans['count'] = trans['onetime'] + trans['recurring']
trans.tail()

Unnamed: 0,form,date,onetime,recurring,count
1080697,960092,2020-07-20,1.0,1.0,2.0
1080698,960177,2020-07-31,2.0,2.0,4.0
1080699,960263,2020-07-29,2.0,2.0,4.0
1080700,960263,2020-07-31,2.0,2.0,4.0
1080701,960263,2020-08-03,1.0,1.0,2.0


# page views

In [115]:
q = '''select
            form, date, views
        from googleanalytics_traffic'''
traffic = redshift_query_read(q)

In [116]:
print(len(traffic))
traffic.head(3)

24002767


Unnamed: 0,form,date,views
0,78904,2019-07-07,1
1,78904,2019-07-08,1
2,82999,2019-02-18,1


In [146]:
df = traffic.merge(trans, on=['form', 'date'], how='left').fillna(0)
print(len(df))
df.head(3)

24002767


Unnamed: 0,form,date,views,onetime,recurring,count
0,78904,2019-07-07,1,0.0,0.0,0.0
1,78904,2019-07-08,1,0.0,0.0,0.0
2,82999,2019-02-18,1,0.0,0.0,0.0


In [147]:
len_gt_views = len(df[df['count']>df['views']])
len_zero = len(df[df['views']==0])
"{:.2f}%".format((len_gt_views / len(df)) * 100.), "{:.2f}%".format((len_zero / len(df)) * 100.)

('11.12%', '0.00%')

In [148]:
df['conversion'] = df['count'] / df['views']
df['conversion'] = df['conversion'].replace(np.inf, 100.)

len(df[df['date'].dt.year>=2019]), len(df[df['date'].dt.year>=2019]['form'].unique())

(13897983, 18105)

In [120]:
print(df['conversion'].describe())
print(df['conversion'].clip(upper=1.).describe())

count    2.400277e+07
mean     2.099663e+00
std      2.138339e+01
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      0.000000e+00
max      1.296000e+03
Name: conversion, dtype: float64
count    2.400277e+07
mean     1.664643e-01
std      3.622457e-01
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      0.000000e+00
max      1.000000e+00
Name: conversion, dtype: float64


In [149]:
df_2019 = df[df['date'].dt.year>=2019].copy()
print(df_2019['conversion'].mean(), df_2019['conversion'].clip(upper=1.).mean(), df_2019[df_2019['count']<df_2019['views']]['conversion'].mean())

2.3546706066259806 0.13975502745197885 0.012919747375540185


In [122]:
df_2019[['count', 'views', 'conversion']].describe()

Unnamed: 0,count,views,conversion
count,13897980.0,13897980.0,13897980.0
mean,2.603639,3.447387,2.354671
std,26.59937,31.91847,25.78092
min,0.0,1.0,0.0
25%,0.0,1.0,0.0
50%,0.0,1.0,0.0
75%,0.0,2.0,0.0
max,1296.0,40433.0,1296.0
