In [63]:
import pandas as pd
import numpy as np

import sys
sys.path.insert(1, '../../scripts/')
from s3_support import *
%matplotlib inline

In [64]:
q = '''select
            form,
            date_trunc('day', timestamp) as date,
            count(distinct(visitorid)) as visitors
        from matomo_traffic
        group by date_trunc('day', timestamp), form'''
traff = redshift_query_read(q, schema='production')

In [65]:
min_date = traff['date'].min()
max_date = traff['date'].max()

min_date, max_date

(Timestamp('2022-09-01 00:00:00'), Timestamp('2023-02-09 00:00:00'))

In [66]:
q = '''select
            form,
            date,
            count(distinct(id)) as transactions
        from transactions
        where
            status='A' and
            (recurring=0 or recurring_origin=1) and
            date>'{}' and
            date<'{}' and
            source not in ('vt', 'mobilevt')
        group by form, date'''.format(min_date, max_date)
trans = redshift_query_read(q, schema='production')

In [67]:
df = traff.merge(trans, how='outer').fillna(0)
df = df[df['form']!=0]

In [68]:
len_df = len(df)
perc_trans_gt_visitors = (len(df[df['transactions']>df['visitors']]) / len_df) * 100.
perc_no_visitors = (len(df[(df['transactions']>0)&(df['visitors']==0)]) / len_df) * 100.

print("{:,} entries".format(len_df))
print("{:.2f}% have more transactions then visitors".format(perc_trans_gt_visitors))
print("{:.2f}% have no visitors, >0 transactions".format(perc_no_visitors))

545,552 entries
16.97% have more transactions then visitors
15.07% have no visitors, >0 transactions


In [69]:
df['conversion'] = df['transactions'] / df['visitors']

In [70]:
df.head()

Unnamed: 0,form,date,visitors,transactions,conversion
0,992930,2022-11-22,26.0,0.0,0.0
1,928795,2023-02-03,6.0,1.0,0.166667
2,444339,2023-01-04,32.0,0.0,0.0
3,961447,2022-09-30,361.0,0.0,0.0
5,438418,2022-12-13,637.0,0.0,0.0


In [71]:
df_inf = df.copy()
df['conversion'] = df['conversion'].replace(np.inf, np.nan)

print("{:,} entries".format(len(df)))
print("{:,} entries after removing inf".format(len(df[~df['conversion'].isna()])))

545,552 entries
463,362 entries after removing inf


539,452 entries

455,649 entries after removing inf

In [72]:
df['conversion'].agg(['mean', 'median']).reset_index()

Unnamed: 0,index,conversion
0,mean,0.233127
1,median,0.0


In [73]:
df['month'] = df['date'].dt.to_period('M')
df.groupby('month')['conversion'].agg(['mean', 'median']).reset_index()

Unnamed: 0,month,mean,median
0,2022-09,0.188445,0.0
1,2022-10,0.199994,0.0
2,2022-11,0.218103,0.0
3,2022-12,0.459168,0.0
4,2023-01,0.181999,0.0
5,2023-02,0.26776,0.0


In [77]:
df_inf.iloc[-100:]

Unnamed: 0,form,date,visitors,transactions,conversion
545608,995683,2023-01-11,0.0,1.0,inf
545609,967503,2022-12-22,0.0,1.0,inf
545610,71732,2022-12-30,0.0,1.0,inf
545611,995488,2022-12-08,0.0,1.0,inf
545612,999063,2022-12-16,0.0,1.0,inf
...,...,...,...,...,...
545703,973969,2022-12-30,0.0,1.0,inf
545704,993272,2023-01-04,0.0,1.0,inf
545705,995732,2023-01-06,0.0,1.0,inf
545706,976888,2023-01-10,0.0,1.0,inf


In [76]:
df_inf[df_inf['form']==994529]

Unnamed: 0,form,date,visitors,transactions,conversion
826,994529,2022-10-18,267.0,20.0,0.074906
985,994529,2022-10-29,57.0,11.0,0.192982
2967,994529,2022-10-19,142.0,11.0,0.077465
8362,994529,2022-11-03,37.0,12.0,0.324324
10482,994529,2022-11-05,19.0,11.0,0.578947
18777,994529,2022-10-27,116.0,13.0,0.112069
19857,994529,2022-10-25,617.0,25.0,0.040519
20131,994529,2022-10-23,159.0,5.0,0.031447
24238,994529,2022-11-09,17.0,5.0,0.294118
32486,994529,2022-11-04,28.0,8.0,0.285714
