In [1]:
import sys, datetime, joblib
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor

sys.path.insert(1, '../../../../scripts/')
from s3_support import *

In [2]:
# Load Google Analytics pageviews summed per (day, org, form).
print("loading data")
print("\tgoogle analytics")
# NOTE(review): `date>=2020` compares the date column with a bare integer --
# confirm this really selects rows from 2020-01-01 onwards.
q = '''select
            date_trunc('day', date) as day,
            org,
            form,
            sum(views) as pageviews
        from googleanalytics_traffic
            where date>=2020
        group by date_trunc('day', date), org, form;'''
pageviews = redshift_query_read(q)
# Keep only rows whose form is not 0. The explicit .copy() makes the filtered
# result an independent frame, so adding a column below does not trigger
# pandas' SettingWithCopyWarning.
pageviews = pageviews[pageviews['form']!=0].copy()
# NOTE(review): the column is named 'week' but holds the per-day value of
# 'day' parsed to datetime; the name is kept because later cells may use it.
pageviews['week'] = pd.to_datetime(pageviews['day'])

loading data
	google analytics


In [3]:
# Load transactions with status 'A', counted and summed per (form, day).
print("\ttransactions")
q = '''select 
            form, 
            date_trunc('day', date) as day,
            count(id) as count, 
            sum(amount) as vol
        from transactions
        where status='A' and date>=2020
        group by form, date_trunc('day', date)
    '''
# 'week' is the 'day' column parsed to datetime (daily resolution, despite the name).
trans = redshift_query_read(q).assign(
    week=lambda frame: pd.to_datetime(frame['day'])
)

	transactions


In [4]:
# Attach pageviews to transactions. The default inner join keeps only the
# (form, day) pairs that appear in both frames.
trans = pd.merge(trans, pageviews, on=['form', 'day'])
del pageviews

# Conversion rate = transactions per pageview for each (form, day).
trans['conversion_rate'] = trans['count'].div(trans['pageviews'])

# Per-day form configuration columns used later as model features.
print("\tanalytics")
q = "select date as day, form, org, amounts, restrictions, opt_fields, req_fields, min_amount, max_amount from analyticsqgiv_daily where date>=2020"
df_analytics = redshift_query_read(q).assign(
    day=lambda frame: pd.to_datetime(frame['day'])
)

	analytics


In [5]:
# The "created" and "deleted" widget queries differ only in the event name,
# so both are rendered from one template. `{event}` fills in the table name
# (<event>_widget) and the count alias (widgets_<event>).
_WIDGET_EVENT_QUERY = '''select
            users.org as org,
            count({event}_widget.id) as widgets_{event},
            date_trunc('day', {event}_widget.original_timestamp) as day
        from {event}_widget
            left join users on {event}_widget.uuid=users.uuid
        group by day, org'''

# Widgets created per (org, day).
q = _WIDGET_EVENT_QUERY.format(event="created")
df_widget_created = redshift_query_read(q, schema="secure")

# Widgets deleted per (org, day).
q = _WIDGET_EVENT_QUERY.format(event="deleted")
df_widget_deleted = redshift_query_read(q, schema="secure")

In [6]:
# Combine daily created/deleted widget counts and derive per-org running totals.
#
# The join is an outer join: a day on which an org only created (or only
# deleted) widgets must still contribute to the running totals. With an inner
# join those days vanish and the cumulative sums under-count. The missing side
# of such a day is a count of 0.
df_widget_all = (
    df_widget_created
    .merge(df_widget_deleted, on=['org', 'day'], how='outer')
    .fillna({'widgets_created': 0, 'widgets_deleted': 0})
    # The outer join turns the count columns into floats; restore whole numbers.
    .astype({'widgets_created': 'int64', 'widgets_deleted': 'int64'})
    .sort_values('day', ascending=True)
)

# Rows without an org cannot be attributed to anyone and are left out of
# df_widget (the previous per-org loop never matched them either).
df_widget = df_widget_all.dropna(subset=['org']).copy()

# Rows are ordered by day, so a per-org cumulative sum is a chronological
# running total. groupby().cumsum() replaces the per-org loop built on
# DataFrame.append, which was removed in pandas 2.0 and grew the frame
# quadratically. An empty input now yields an empty frame instead of None.
df_widget['created_cumsum'] = df_widget.groupby('org')['widgets_created'].cumsum()
df_widget['deleted_cumsum'] = df_widget.groupby('org')['widgets_deleted'].cumsum()
# Net number of widgets in existence for the org as of each day.
df_widget['widgets_cumsum'] = df_widget['created_cumsum'] - df_widget['deleted_cumsum']

In [7]:
print("\tmerging data")

# Integer org ids of df_widget (missing orgs become 0). Computed once here
# rather than inside get_widgets_created, which runs once per row of `trans`.
_widget_org_ids = df_widget['org'].fillna(0).astype(int)


def get_widgets_created(r):
    """Return the widgets_created value for the row's org on the row's day.

    Parameters
    ----------
    r : mapping with 'org' and 'day' entries (a row of `trans`).

    Returns
    -------
    The `widgets_created` value of the first df_widget row matching both the
    org and the day, or 0 when the org is missing, is 0, or has no matching row.
    """
    org = r['org']
    # pd.isna covers None as well as NaN; NaN previously reached int() and
    # raised ValueError.
    if pd.isna(org) or org == 0:
        return 0
    same_org_and_day = (_widget_org_ids == int(org)) & (df_widget['day'] == r['day'])
    matches = df_widget.loc[same_org_and_day, 'widgets_created']
    return matches.iloc[0] if len(matches) > 0 else 0


trans['widgets_created'] = trans[['org', 'day']].apply(get_widgets_created, axis=1)

	merging data


In [50]:
# Build the modelling table: one row per (form, day) that has both a
# conversion rate and a row in df_analytics.
#
# Infinite conversion rates (transactions recorded against zero pageviews) are
# turned into NaN *before* the missing-target filter, so those rows are
# dropped. Previously the replacement ran after the filter and the following
# fillna(0) relabelled them as a conversion rate of 0.
dataset = (
    trans.dropna()[['form', 'day', 'conversion_rate']]
    # Right join followed by dropping rows without a target keeps only
    # (form, day) pairs present on both sides, in df_analytics' row order.
    .merge(df_analytics, on=['form', 'day'], how='right')
    .replace([np.inf, -np.inf], np.nan)
    .dropna(subset=['conversion_rate'])
    .assign(month=lambda frame: frame['day'].dt.month)
    # Remaining gaps (e.g. unset feature values) are encoded as 0.
    .fillna(0)
)

In [51]:
# Split the modelling table into the feature matrix and the regression target.
print("training")
target = 'conversion_rate'

# Feature columns, in the order they are presented to the model.
ftrs = [
    "restrictions",
    "amounts",
    "opt_fields",
    "req_fields",
    "min_amount",
    "max_amount",
]

X = dataset.loc[:, ftrs]
y = dataset.loc[:, target]

training


In [52]:
# Fit a random forest on the full feature matrix.
# A fixed seed makes the bootstrap samples and feature sub-sampling -- and
# therefore the fitted model -- identical on every Restart & Run All.
RANDOM_STATE = 42
rf = RandomForestRegressor(random_state=RANDOM_STATE)
rf.fit(X, y)

RandomForestRegressor()

In [27]:
# Row counts of the feature frame: all rows vs. rows with no missing feature.
df_ftrs = dataset.loc[:, ftrs]
(df_ftrs.shape[0], df_ftrs.dropna().shape[0])

(237735, 208675)

In [53]:
# Sanity check: number of training rows and missing values per feature column.
(X.shape[0], X.isnull().sum())

(208675,
 restrictions    0
 amounts         0
 opt_fields      0
 req_fields      0
 min_amount      0
 max_amount      0
 dtype: int64)

In [49]:
# Sanity check: number of missing values in the target.
y.isnull().sum()

0