In [2]:
import sys, datetime, joblib
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor

sys.path.insert(1, '../../../../scripts/')
from s3_support import *

In [3]:
print("loading data")
print("\tgoogle analytics")

# Daily pageviews per form from the Google Analytics traffic table.
# NOTE(review): the query groups by `org` but does not select it, so a form
# whose traffic is stored under more than one org would yield several rows for
# the same (form, day) — confirm whether that can happen.
# NOTE(review): `date>=2020` compares the date column with the integer 2020;
# presumably meant as "from 2020 onwards" — verify how Redshift evaluates it.
q = '''select
            date_trunc('day', date) as day,
            form,
            sum(views) as pageviews
        from googleanalytics_traffic
            where date>=2020
        group by date_trunc('day', date), org, form;'''
pageviews = redshift_query_read(q)

# Discard rows whose form id is 0. The explicit .copy() makes the filtered
# frame independent of the query result, so the column assignment below does
# not write into a slice (which raises SettingWithCopyWarning).
pageviews = pageviews[pageviews['form']!=0].copy()

# Despite its name, 'week' holds the parsed value of the daily 'day' column.
pageviews['week'] = pd.to_datetime(pageviews['day'])

loading data
	google analytics


In [4]:
print("\ttransactions")

# Per form and day: number of transactions with status 'A' and their summed
# amount.
q = '''select 
            form, 
            date_trunc('day', date) as day,
            count(id) as count, 
            sum(amount) as vol
        from transactions
        where status='A' and date>=2020
        group by form, org, date_trunc('day', date)
    '''
trans = redshift_query_read(q)
# 'week' is the parsed copy of the daily 'day' column.
trans = trans.assign(week=pd.to_datetime(trans['day']))

# Join traffic onto transactions (pandas' default inner join), keeping only
# the (form, day) pairs present in both frames.
trans = pd.merge(trans, pageviews, on=['form', 'day'])
# The standalone traffic frame is no longer needed once merged.
del pageviews

# Conversion rate = transactions per pageview for each form/day row.
trans['conversion_rate'] = trans['count'].div(trans['pageviews'])

	transactions


In [5]:
print("\tanalytics data")

# Daily per-form settings used later as model features.
q = '''select form, date, pledge_active, min_amount, max_amount, 
            permit_anonymous, donation_active, multirestriction_system,
            collect_company, collect_phone, collect_optin, 
            collect_address_mobile, enable_donorlogins, enable_sms
        from analyticsqgiv_daily where date>=2020'''
df_analytics = redshift_query_read(q)

# Expose the timestamp under 'day' (the join key used by the other frames)
# and remove the original 'date' column.
df_analytics = (
    df_analytics
    .assign(day=pd.to_datetime(df_analytics['date']))
    .drop(columns='date')
)

	analytics data


In [6]:
print("\twidgets data")

# Log entries of system type 36 whose message mentions "widget"; the log's
# `created` value is returned under the name `day`.
q = '''select 
            created as day, 
            form, 
            message 
        from logs
        where 
            systemtype=36 and
            message like '%widget%'
        '''
df = redshift_query_read(q, schema='public')

# Parse the creation value into pandas datetimes.
df = df.assign(day=pd.to_datetime(df['day']))

def action_sum(x):
    """Score a log message: +1, -1 or 0.

    Returns 1 when the message contains 'added', otherwise -1 when it
    contains 'delete', otherwise 0. 'added' takes precedence when both
    substrings are present.
    """
    if 'added' in x:
        return 1
    return -1 if 'delete' in x else 0
    
# Score every log message with action_sum (+1 / -1 / 0).
df['action_sum'] = df['message'].apply(action_sum)

# Running total of the scores per form, in chronological order.
# A single grouped cumsum replaces growing a frame form-by-form with
# DataFrame.append, which is quadratic and was removed in pandas 2.0. It also
# yields an empty frame (rather than None) when the log query returns no rows.
# Rows without a form id are dropped; the per-form loop never matched them.
# The stable sort keeps same-timestamp events in their original order.
# NOTE(review): 'day' comes from logs.created — if it carries a time of day,
# equality against day-truncated dates elsewhere will rarely match; confirm.
_widget_events = (
    df.dropna(subset=['form'])
      .sort_values('day', ascending=True, kind='mergesort')
)
df_widget = _widget_events.assign(
    embeds=_widget_events.groupby('form')['action_sum'].cumsum()
)[['form', 'day', 'embeds']]

	widgets data


In [7]:
# goals
# Number of saved goal items per org and day.
q = '''select
            users.org as org,
            count(gs.id) as goals_saved,
            date_trunc('day', gs.original_timestamp) as day
        from goals_saved_item_settings as gs
            left join users on gs.uuid=users.uuid
        group by day, org'''
df_goals_all = redshift_query_read(q, schema="secure")
# Parse 'day' like the other loaders do, so later comparisons are datetime
# against datetime.
df_goals_all['day'] = pd.to_datetime(df_goals_all['day'])

# Cumulative goals saved per org over time, for every org.
# The previous per-org loop called df_goals.append(_df) without keeping the
# result, so df_goals only ever contained the first org; DataFrame.append is
# also removed in pandas 2.0. Rows with no org (possible because of the left
# join) are dropped; the loop never matched them either.
_goal_rows = (
    df_goals_all.dropna(subset=['org'])
                .sort_values('day', ascending=True, kind='mergesort')
)
df_goals = _goal_rows.assign(
    goals_cumsum=_goal_rows.groupby('org')['goals_saved'].cumsum()
)

In [27]:
print("\tmerging data")

def get_widgets_created(r):
    """Look up the cumulative widget embed count for one (form, day) row.

    Parameters
    ----------
    r : mapping or pandas.Series
        Must provide ``r['form']`` and ``r['day']``.

    Returns
    -------
    The ``embeds`` value of the first ``df_widget`` row whose form and day
    both equal those of ``r``; 0 when the form id is missing/0 or when no
    row matches.

    Notes
    -----
    Reads the module-level ``df_widget`` frame (columns ``form``, ``day``,
    ``embeds``). The lookup previously read a ``'widgets_created'`` column
    that ``df_widget`` does not have, raising KeyError on any match.
    """
    form = r['form']
    # pd.isna covers both None and NaN; int(NaN) below would otherwise raise.
    if pd.isna(form) or form == 0:
        return 0

    same_form = df_widget['form'].fillna(0).astype(int) == int(form)
    # NOTE(review): exact equality — matches only if df_widget['day'] holds
    # the same (day-truncated) timestamps as r['day']; confirm.
    same_day = df_widget['day'] == r['day']
    _df = df_widget[same_form & same_day]
    if len(_df) == 0:
        return 0
    return _df['embeds'].iloc[0]

def get_goals_saved(r):
    """Look up the cumulative goals-saved count for one (org, day) row.

    Reads the module-level ``df_goals`` frame and returns the
    ``goals_cumsum`` value of the first row whose org and day both equal
    those of ``r``. Returns 0 when ``r['org']`` is None or 0, or when no
    row matches.
    """
    org = r['org']
    if org is None or org == 0:
        return 0

    org_matches = df_goals['org'].fillna(0).astype(int) == int(org)
    day_matches = df_goals['day'] == r['day']
    hits = df_goals[org_matches & day_matches]
    return hits['goals_cumsum'].iloc[0] if len(hits) > 0 else 0

# Per-row lookup of the widget count for each (form, day).
trans['widgets_created'] = trans[['form', 'day']].apply(get_widgets_created, axis=1)
# Goals are not wired in yet: the lookup below needs an 'org' column, which the
# transactions query does not select, so the feature is held at 0.
#trans['goals_saved'] = trans[['org', 'day']].apply(get_goals_saved, axis=1)
trans['goals_saved'] = 0

# Keep only complete rows and the columns needed downstream.
dataset = trans.dropna()[['form', 'day', 'conversion_rate', 'widgets_created', 'goals_saved']]

# Attach the daily form settings. A left join keeps exactly the rows that have
# a conversion rate; the earlier outer join followed by a "conversion_rate is
# not NaN" filter produced the same rows but built the full outer join first.
dataset = dataset.merge(df_analytics, on=['form', 'day'], how='left')
dataset['month'] = dataset['day'].dt.month

# Form/days without a settings record get 0 for every setting.
dataset = dataset.fillna(0)

# Rows with transactions but zero pageviews divide to +inf; cap those at 1.
# Only the conversion_rate column is replaced — assigning the result of
# dataset.replace(...) (a whole DataFrame) to one column was a bug.
dataset['conversion_rate'] = dataset['conversion_rate'].replace(np.inf, 1.)

# Boolean flags: is a minimum / maximum amount configured (non-zero)?
dataset['has_min_amount'] = dataset['min_amount']!=0.
dataset['has_max_amount'] = dataset['max_amount']!=0.

# Forms that have a non-zero widget / goal value on at least one row.
forms_widgets = dataset[dataset['widgets_created']!=0]['form'].unique().tolist()
forms_goals = dataset[dataset['goals_saved']!=0]['form'].unique().tolist()

# Vectorised membership test instead of a per-row `x in list` scan.
dataset['has_widgets'] = dataset['form'].isin(forms_widgets)
dataset['has_goals'] = dataset['form'].isin(forms_goals)

	merging data


In [28]:
print("training")

# Column the model is trained to predict.
target = 'conversion_rate'

# Model input columns, in the order they are passed to the estimator.
ftrs = [
    'month',
    'pledge_active',
    'has_min_amount',
    'has_max_amount',
    'permit_anonymous',
    'donation_active',
    'multirestriction_system',
    'collect_company',
    'collect_phone',
    'collect_optin',
    'collect_address_mobile',
    'enable_donorlogins',
    'enable_sms',
    'has_widgets',
    'has_goals',
]

training


In [29]:
# Feature matrix (the columns listed in ftrs) and the regression target.
X, y = dataset[ftrs], dataset[target]

In [30]:
# Fix the seed so that re-running the notebook on the same data yields the
# same forest; an unseeded RandomForestRegressor differs from run to run.
RANDOM_STATE = 42
rf = RandomForestRegressor(random_state=RANDOM_STATE)
rf.fit(X, y)

RandomForestRegressor()