In [1]:
import sys
sys.path.insert(1, '../../scripts/')
from s3_support import *

import pandas as pd
import numpy as np

# load data

## forms

QGIV = 1;
BARNSTORM = 2;
HOBNOB = 3;
AUCTION = 5;

In [4]:
q = '''select id as form, type as product from form'''
forms = redshift_query_read(q, schema='production')

## traffic

In [5]:
q = '''select 
            form,
            date_trunc('week', date) as week,
            sum(views) as views
        from ga 
        where 
            form!=0 and date>='2019-01-01' 
        group by form, date_trunc('week', date)'''
traff = redshift_query_read(q, schema='production')

In [6]:
print("{:,} observations".format(len(traff)))
print("{:,} forms".format(len(traff['form'].unique())))
print("{} to {}".format(traff['week'].min(), traff['week'].max()))

1,398,782 observations
64,560 forms
2018-12-31 00:00:00 to 2024-02-26 00:00:00


## transactions

In [7]:
q = '''select
            form,
            date_trunc('week', date) as week,
            count(id) as trans_count,
            sum(amount) as trans_vol,
            count(distinct(case when recurring=0 then id else null end)) as trans_onetime_count,
            sum(case when recurring=0 then amount else null end) as trans_onetime_vol,
            count(distinct(case when recurring_origin!=0 and recurring!=0 then id else null end)) as trans_rec_count,
            sum(case when recurring_origin!=0 and recurring!=0 then amount else null end) as trans_rec_vol
        from transactions
        where status='A' and year >= 2019
        group by form, date_trunc('week', date)'''
trans = redshift_query_read(q, schema='production')

In [8]:
print("{:,} observations".format(len(trans)))
print("{:,} forms".format(len(trans['form'].unique())))
print("{} to {}".format(trans['week'].min(), trans['week'].max()))

1,085,054 observations
49,166 forms
2018-12-31 00:00:00 to 2024-03-11 00:00:00


## merge

In [9]:
df = trans.merge(traff, on=['form', 'week']).merge(forms, on='form')

In [10]:
print("{:,} observations".format(len(df)))
print("{:,} forms".format(len(df['form'].unique())))
print("{} to {}".format(df['week'].min(), df['week'].max()))

df.groupby('form')['product'].first().value_counts(normalize=True)

554,454 observations
38,126 forms
2018-12-31 00:00:00 to 2024-02-26 00:00:00


1    0.766354
3    0.176940
5    0.056523
2    0.000184
Name: product, dtype: float64

# analysis

In [31]:
df['views'].fillna(0, inplace=True)
df['conversion_ot'] = df['trans_onetime_count'] / df['views']
df['conversion_rec'] = df['trans_rec_count'] / df['views']

## traffic

In [32]:
print("traffic")
df.groupby(['product', 'form'])['views'].agg(['mean', 'median']).reset_index().groupby('product')[['mean', 'median']].mean().reset_index()

traffic


Unnamed: 0,product,mean,median
0,1,117.118591,85.776302
1,2,24.904762,23.0
2,3,169.352768,121.426994
3,5,208.009916,130.056148


In [89]:
grpd_traff = df.groupby('product')['views'].describe().reset_index().drop(['mean'], axis=1)
grpd_traff = grpd_traff.merge(df.groupby('product')['views'].agg(['mean', 'median']).reset_index(), on='product')
grpd_traff.merge(df.groupby('product')['form'].nunique().reset_index(), on='product')

Unnamed: 0,product,count,std,min,25%,50%,75%,max,mean,median,form
0,1,495385.0,1124.134925,0.0,6.0,21.0,73.0,238870.0,132.379327,21.0,29218
1,2,22.0,39.345511,2.0,6.25,26.0,61.25,147.0,38.454545,26.0,7
2,3,48241.0,1041.617997,0.0,6.0,25.0,118.0,55437.0,234.381771,25.0,6746
3,5,10806.0,782.180002,1.0,6.0,28.0,138.0,25068.0,227.187581,28.0,2155


In [43]:
print("Qgiv:")
len_qgiv = len(df[df['product']==1])

len_zero = len(df[(df['product']==1)&(df['views']==0)])
perc_zero = len_zero / len_qgiv
print("{:,} ({:.1f}%) entries with 0 views".format(len_zero, perc_zero * 100.))

len_trans_gt = len(df[(df['product']==1)&(df['views']<df['trans_count'])])
perc_trans_gt = len_trans_gt / len_qgiv
print("{:,} ({:.1f}%) entries with trans > views".format(len_trans_gt, perc_trans_gt * 100.))

Qgiv:
1,410 (0.3%) entries with 0 views
74,058 (14.9%) entries with trans > views


## trans count

In [33]:
print("trans count")
df.groupby(['product', 'form'])['trans_count'].agg(['mean', 'median']).reset_index().groupby('product')[['mean', 'median']].mean().reset_index()

trans count


Unnamed: 0,product,mean,median
0,1,9.45122,7.110035
1,2,3.747619,3.5
2,3,27.103698,19.671287
3,5,29.499516,21.083063


In [90]:
grpd_trans_count = df.groupby('product')['trans_count'].describe().reset_index().drop(['mean'], axis=1)
grpd_trans_count = grpd_trans_count.merge(df.groupby('product')['trans_count'].agg(['mean', 'median']).reset_index(), on='product')
grpd_trans_count.merge(df.groupby('product')['form'].nunique().reset_index(), on='product')

Unnamed: 0,product,count,std,min,25%,50%,75%,max,mean,median,form
0,1,495385.0,105.51217,1.0,2.0,4.0,11.0,29171.0,16.315135,4.0,29218
1,2,22.0,3.328026,1.0,1.0,1.5,3.75,14.0,3.136364,1.5,7
2,3,48241.0,129.268517,1.0,2.0,8.0,26.0,10310.0,32.969113,8.0,6746
3,5,10806.0,44.112651,1.0,3.0,8.0,25.0,746.0,24.760226,8.0,2155


## conversion recurring

In [83]:
print("conversion recurring")
_df = df.copy()
_df['conversion_rec'] = _df['conversion_rec'].replace([np.inf, -np.inf], np.nan).fillna(0)
_df[~_df['conversion_rec'].isna()].groupby(['product', 'form'])['conversion_rec'].agg(['mean', 'median']).reset_index().groupby('product')[['mean', 'median']].mean().reset_index()

conversion recurring


Unnamed: 0,product,mean,median
0,1,0.016555,0.007075
1,2,0.0,0.0
2,3,0.00891,0.004572
3,5,0.0,0.0


## conversion one time

In [34]:
print("conversion one time")
_df = df.copy()
_df['conversion_ot'] = _df['conversion_ot'].replace([np.inf, -np.inf], np.nan).fillna(0)
_df[_df['views']>0].groupby(['product', 'form'])['conversion_ot'].agg(['mean', 'median']).reset_index().groupby('product')[['mean', 'median']].mean().reset_index()

conversion one time


Unnamed: 0,product,mean,median
0,1,0.513544,0.409812
1,2,0.271941,0.253968
2,3,3.709247,2.874078
3,5,2.144569,1.72246


In [93]:
grpd_conv_ot = df[(df['views']>0)&(df['views']>df['trans_count'])].groupby('product')['conversion_ot'].describe().reset_index().drop(['mean'], axis=1)
grpd_conv_ot = grpd_conv_ot.merge(df.groupby('product')['conversion_ot'].agg(['mean', 'median']).reset_index(), on='product')
grpd_conv_ot.merge(df.groupby('product')['form'].nunique().reset_index(), on='product')

Unnamed: 0,product,count,std,min,25%,50%,75%,max,mean,median,form
0,1,405191.0,0.144714,0.0,0.012324,0.059783,0.142857,0.98374,,0.066667,29218
1,2,20.0,0.116174,0.006803,0.04496,0.07884,0.2,0.375,0.248972,0.090993,7
2,3,35212.0,0.24343,0.0,0.036444,0.11,0.37931,0.994764,,0.285714,6746
3,5,7943.0,0.250568,0.000544,0.042735,0.165138,0.433053,0.99,1.454066,0.333333,2155


### qgiv

In [77]:
_df = df[df['product']==1].copy()
_df['conversion_ot'] = _df['conversion_ot'].replace([np.inf, -np.inf], np.nan).fillna(0)
_df.groupby('form')['conversion_ot'].agg(['mean', 'median']).reset_index().mean()

form      942133.046102
mean           0.512771
median         0.409172
dtype: float64

In [61]:
_df['conversion_ot'].agg(['mean', 'median'])

mean      0.371095
median    0.066667
Name: conversion_ot, dtype: float64

In [62]:
_df['conversion_ot'].describe()

count    495385.000000
mean          0.371095
std           5.021071
min           0.000000
25%           0.007246
50%           0.066667
75%           0.190476
max        2230.500000
Name: conversion_ot, dtype: float64

In [64]:
for s in range(0, 10, 1):
    floor = s / 10.
    ceiling = (s + 1) / 10.0
    
    subset = _df[(_df['conversion_ot']>=floor)&(_df['conversion_ot']<ceiling)]
    
    these_forms = subset['form'].unique().tolist()
    subset_median = df[df['form'].isin(these_forms)]['conversion_ot'].median()
    
    len_this = len(subset)
    perc_this = len_this / len(_df)
    print("{} - {}: {:,} ({:.1f}%); subset median: {:.1f}%".format(floor, ceiling, len_this, (perc_this * 100.), subset_median * 100.))

0.0 - 0.1: 293,272 (59.2%); subset median: 6.2%
0.1 - 0.2: 79,766 (16.1%); subset median: 7.8%
0.2 - 0.3: 35,976 (7.3%); subset median: 8.7%
0.3 - 0.4: 14,765 (3.0%); subset median: 10.0%
0.4 - 0.5: 6,108 (1.2%); subset median: 12.2%
0.5 - 0.6: 16,948 (3.4%); subset median: 10.0%
0.6 - 0.7: 5,188 (1.0%); subset median: 12.5%
0.7 - 0.8: 4,028 (0.8%); subset median: 13.8%
0.8 - 0.9: 2,635 (0.5%); subset median: 15.8%
0.9 - 1.0: 909 (0.2%); subset median: 20.0%


In [78]:
_df['views'].describe()

count    495385.000000
mean        132.379327
std        1124.134925
min           0.000000
25%           6.000000
50%          21.000000
75%          73.000000
max      238870.000000
Name: views, dtype: float64

### p2p

In [79]:
_df = df[df['product']==3].copy()
_df['conversion_ot'] = _df['conversion_ot'].replace([np.inf, -np.inf], np.nan).fillna(0)
_df.groupby('form')['conversion_ot'].agg(['mean', 'median']).reset_index().mean()

form      975802.386007
mean           3.709223
median         2.874078
dtype: float64

In [66]:
_df['conversion_ot'].agg(['mean', 'median'])

mean      2.698602
median    0.285714
Name: conversion_ot, dtype: float64

In [67]:
_df['conversion_ot'].describe()

count    48241.000000
mean         2.698602
std         14.510099
min          0.000000
25%          0.054708
50%          0.285714
75%          1.000000
max        735.500000
Name: conversion_ot, dtype: float64

In [69]:
for s in range(0, 10, 1):
    floor = s / 10.
    ceiling = (s + 1) / 10.0
    
    subset = _df[(_df['conversion_ot']>=floor)&(_df['conversion_ot']<ceiling)]
    
    these_forms = subset['form'].unique().tolist()
    subset_median = df[df['form'].isin(these_forms)]['conversion_ot'].median()
    
    len_this = len(subset)
    perc_this = len_this / len(_df)
    print("{} - {}: {:,} ({:.1f}%); subset median: {:.1f}%".format(floor, ceiling, len_this, (perc_this * 100.), subset_median * 100.))

0.0 - 0.1: 16,982 (35.2%); subset median: 6.7%
0.1 - 0.2: 4,398 (9.1%); subset median: 16.7%
0.2 - 0.3: 3,065 (6.4%); subset median: 33.3%
0.3 - 0.4: 2,415 (5.0%); subset median: 46.7%
0.4 - 0.5: 1,544 (3.2%); subset median: 50.0%
0.5 - 0.6: 3,395 (7.0%); subset median: 51.3%
0.6 - 0.7: 1,343 (2.8%); subset median: 63.8%
0.7 - 0.8: 1,012 (2.1%); subset median: 73.0%
0.8 - 0.9: 841 (1.7%); subset median: 75.4%
0.9 - 1.0: 484 (1.0%); subset median: 82.4%


In [80]:
_df['views'].describe()

count    48241.000000
mean       234.381771
std       1041.617997
min          0.000000
25%          6.000000
50%         25.000000
75%        118.000000
max      55437.000000
Name: views, dtype: float64

### auctions

In [81]:
_df = df[df['product']==5].copy()
_df['conversion_ot'] = _df['conversion_ot'].replace([np.inf, -np.inf], np.nan).fillna(0)
_df.groupby('form')['conversion_ot'].agg(['mean', 'median']).reset_index().mean()

form      990200.213457
mean           2.144569
median         1.722460
dtype: float64

In [72]:
_df['conversion_ot'].agg(['mean', 'median'])

mean      1.454066
median    0.333333
Name: conversion_ot, dtype: float64

In [73]:
_df['conversion_ot'].describe()

count    10806.000000
mean         1.454066
std          5.242032
min          0.000544
25%          0.070175
50%          0.333333
75%          1.000000
max        155.500000
Name: conversion_ot, dtype: float64

In [75]:
for s in range(0, 10, 1):
    floor = s / 10.
    ceiling = (s + 1) / 10.0
    
    subset = _df[(_df['conversion_ot']>=floor)&(_df['conversion_ot']<ceiling)]
    
    these_forms = subset['form'].unique().tolist()
    subset_median = df[df['form'].isin(these_forms)]['conversion_ot'].median()
    
    len_this = len(subset)
    perc_this = len_this / len(_df)
    print("{} - {}: {:,} ({:.1f}%); subset median: {:.1f}%".format(floor, ceiling, len_this, (perc_this * 100.), subset_median * 100.))

0.0 - 0.1: 3,181 (29.4%); subset median: 7.1%
0.1 - 0.2: 1,108 (10.3%); subset median: 16.7%
0.2 - 0.3: 863 (8.0%); subset median: 28.6%
0.3 - 0.4: 619 (5.7%); subset median: 37.5%
0.4 - 0.5: 395 (3.7%); subset median: 45.8%
0.5 - 0.6: 774 (7.2%); subset median: 50.0%
0.6 - 0.7: 370 (3.4%); subset median: 60.0%
0.7 - 0.8: 288 (2.7%); subset median: 70.0%
0.8 - 0.9: 228 (2.1%); subset median: 78.9%
0.9 - 1.0: 117 (1.1%); subset median: 87.4%


In [82]:
_df['views'].describe()

count    10806.000000
mean       227.187581
std        782.180002
min          1.000000
25%          6.000000
50%         28.000000
75%        138.000000
max      25068.000000
Name: views, dtype: float64