In [1]:
import sys, datetime
import pandas as pd
import numpy as np

sys.path.insert(1, '../../../../scripts/')
from s3_support import *

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

  from numpy.core.umath_tests import inner1d


# load and prep data

## google traffic

In [2]:
q = '''select
            date_trunc('day', date) as day,
            org,
            form,
            sum(views) as pageviews
        from googleanalytics_traffic
            where date>=2020
        group by date_trunc('day', date), org, form;'''
pageviews = redshift_query_read(q)

In [3]:
pageviews = pageviews[pageviews['form']!=0]
pageviews['week'] = pd.to_datetime(pageviews['day'])
len(pageviews), len(pageviews['form'].unique())

(544966, 14571)

## transactions

In [4]:
q = '''select 
            form, 
            date_trunc('day', date) as day,
            count(id) as count, 
            sum(amount) as vol
        from transactions
        where status='A' and date>=2020
        group by form, date_trunc('day', date)
    '''
trans = redshift_query_read(q)

In [5]:
trans['week'] = pd.to_datetime(trans['day'])

# merge traffic and transactions
trans = trans.merge(pageviews, on=['form', 'day'])

# calculate conversion rate & change
trans['conversion_rate'] = trans['count'] / trans['pageviews']
trans.head(3)

Unnamed: 0,form,day,count,vol,week_x,org,pageviews,week_y,conversion_rate
0,929673,2020-01-01,3,1943.5,2020-01-01,441953,3,2020-01-01,1.0
1,96359,2020-01-01,8,9300.0,2020-01-01,31717,27,2020-01-01,0.296296
2,1197,2020-01-01,25,4158.69,2020-01-01,1214,162,2020-01-01,0.154321


In [6]:
len(trans), len(trans['form'].unique())

(111576, 5472)

## analytics data

In [7]:
q = "select * from analytics_daily where date>=2020"
df_base = redshift_query_read(q)
print("done with analytics")

q = "select * from analyticsqgiv_daily where date>=2020"
df_qgiv = redshift_query_read(q)
print("done with analyticsqgiv")

done with analytics
done with analyticsqgiv


In [8]:
df_analytics = df_base.merge(df_qgiv, on=["org", "form", "date"]).dropna()

df_analytics = df_analytics.drop(['org', 'product'], axis=1).groupby(['date', 'form']).sum().reset_index()

df_analytics['day'] = pd.to_datetime(df_analytics['date'])
df_analytics.drop('date', axis=1, inplace=True)

In [9]:
len(df_analytics), len(df_analytics['form'].unique())

(2391407, 24868)

## segment data

In [10]:
# CMS edits
q = '''select
            users.org as org,
            count(saved_page.id) as page_updates,
            date_trunc('day', saved_page.original_timestamp) as day
        from saved_page
            left join users on saved_page.uuid=users.uuid
        group by day, org'''
df_cms = redshift_query_read(q, schema="secure")
len(df_cms), len(df_cms['org'].unique()), df_cms['day'].min(), df_cms['day'].max()

(3812,
 2535,
 Timestamp('2020-01-25 00:00:00'),
 Timestamp('2020-08-03 00:00:00'))

In [11]:
# widgets
q = '''select
            users.org as org,
            count(created_widget.id) as widgets_created,
            date_trunc('day', created_widget.original_timestamp) as day
        from created_widget
            left join users on created_widget.uuid=users.uuid
        group by day, org'''
df_widget_created = redshift_query_read(q, schema="secure")

q = '''select
            users.org as org,
            count(deleted_widget.id) as widgets_deleted,
            date_trunc('day', deleted_widget.original_timestamp) as day
        from deleted_widget
            left join users on deleted_widget.uuid=users.uuid
        group by day, org'''
df_widget_deleted = redshift_query_read(q, schema="secure")

df_widget = df_widget_created.merge(df_widget_deleted, on=['org', 'day'])

df_widget.sort_values('day', ascending=True, inplace=True)
widget_data = None
for org in df_widget['org'].unique():
    _df = df_widget[df_widget['org']==org].copy()
    _df['created_cumsum'] = _df['widgets_created'].cumsum()
    _df['deleted_cumsum'] = _df['widgets_deleted'].cumsum()
    _df['widgets_cumsum'] = _df['created_cumsum'] - _df['deleted_cumsum']
    if widget_data is None:
        widget_data = _df
    else:
        widget_data = widget_data.append(_df)

widget_data.head(3)

Unnamed: 0,org,widgets_created,day,widgets_deleted,created_cumsum,deleted_cumsum,widgets_cumsum
8,443134,1,2020-02-11,2,1,2,-1
11,443134,2,2020-03-05,1,3,3,0
97,443134,1,2020-03-11,1,4,4,0


In [12]:
# goals
q = '''select
            users.org as org,
            count(gs.id) as goals_saved,
            date_trunc('day', gs.original_timestamp) as day
        from goals_saved_item_settings as gs
            left join users on gs.uuid=users.uuid
        group by day, org'''
df_goals = redshift_query_read(q, schema="secure")
len(df_goals), len(df_goals['org'].unique()), df_goals['day'].min(), df_goals['day'].max()

(190, 51, Timestamp('2020-03-17 00:00:00'), Timestamp('2020-08-03 00:00:00'))

In [None]:
goals_data = None
for org in df_goals['org'].unique():
    _df = df_goals[df_goals['org']==org].copy().sort_values('day', ascending=True)
    _df['goals_cumsum'] = _df['goals_saved'].cumsum()
    if goals_data is None:
        goals_data = _df
    else:
        goals_data.append(_df)

In [None]:
df_goals = goals_data

## merge data

In [13]:
def get_cms_updates(r):
    cms_updates = 0
    if r['org'] is not None and r['org'] != 0:
        _df = df_cms[(df_cms['org'].fillna(0).astype(int)==int(r['org']))&(df_cms['day']==r['day'])]
        if len(_df) > 0:
            cms_updates = _df['page_updates'].iloc[0]
    return cms_updates

def get_widgets_created(r):
    widgets_created = 0
    if r['org'] is not None and r['org'] != 0:
        _df = widget_data[(widget_data['org'].fillna(0).astype(int)==int(r['org']))&(widget_data['day']<=r['day'])]
        if len(_df) > 0:
            widgets_created = _df['widgets_cumsum'].iloc[0]
    return widgets_created

def get_goals_saved(r):
    goals_saved = 0
    if r['org'] is not None and r['org'] != 0:
        _df = df_goals[(df_goals['org'].fillna(0).astype(int)==int(r['org']))&(df_goals['day']==r['day'])]
        if len(_df) > 0:
            goals_saved = _df['goals_cumsum'].iloc[0]
    return goals_saved

In [14]:
trans['cms_updates'] = trans[['org', 'day']].apply(get_cms_updates, axis=1)
trans['widgets_created'] = trans[['org', 'day']].apply(get_widgets_created, axis=1)
trans['goals_saved'] = trans[['org', 'day']].apply(get_goals_saved, axis=1)

In [15]:
"{:.4f}".format(len(trans[trans['cms_updates']>0]) / len(trans)), "{:.4f}".format(len(trans[trans['widgets_created']>0]) / len(trans))

('0.0082', '0.0000')

In [16]:
dataset = trans.dropna()[['form', 'day', 'conversion_rate', 'cms_updates', 'widgets_created', 'goals_saved']]
dataset = dataset.merge(df_analytics, on=['form', 'day'], how='outer')
dataset['month'] = dataset['day'].dt.month

original_len = len(dataset)
dataset = dataset[~dataset['conversion_rate'].isna()]
dataset.fillna(0, inplace=True)

original_len, len(dataset)

(2430425, 111576)

In [17]:
cms_perc = "{:.4f}".format(len(dataset[dataset['cms_updates']>0]) / len(dataset))
widgets_perc = "{:.4f}".format(len(dataset[dataset['widgets_created']>0]) / len(dataset))

cms_perc, widgets_perc, dataset['day'].min()

('0.0082', '0.0000', Timestamp('2020-01-01 00:00:00'))

In [18]:
dataset['has_min_amount'] = dataset['min_amount']!=0.
dataset['has_max_amount'] = dataset['max_amount']!=0.

forms_widgets = dataset[dataset['widgets_created']!=0]['form'].unique().tolist()
forms_cms = dataset[dataset['cms_updates']!=0]['form'].unique().tolist()
forms_goals = dataset[dataset['goals_saved']!=0]['form'].unique().tolist()

dataset['has_widgets'] = dataset['form'].apply(lambda x: x in forms_widgets)
dataset['cms_active'] = dataset['form'].apply(lambda x: x in forms_cms)
dataset['has_goals'] = dataset['form'].apply(lambda x: x in forms_goals)

# modeling

In [27]:
target = 'conversion_rate'

ftrs = ['month', 'pledge_active', 'has_min_amount', 'has_max_amount', 
        'permit_anonymous', 'donation_active', 'multirestriction_system',
        'collect_company', 'collect_phone', 'collect_optin',
        'collect_address_mobile', 'enable_donorlogins', 'enable_sms',
        'has_widgets', 'cms_active', 'has_goals']

In [28]:
def test_training(X, y):
    scores = []
    mses = []

    for i in range(50):
        X_train, X_test, y_train, y_test = train_test_split(X, y)

        rf = RandomForestRegressor()
        rf.fit(X_train, y_train)
        y_pred = rf.predict(X_test)

        scores.append(rf.score(X_test, y_test))
        mses.append(mean_squared_error(y_test, y_pred))

    print("Random Forest:")
    print("\tr2: {:.4f}".format(np.mean(scores)))
    print("\tmse: {:.4f}".format(np.mean(mses)))
    print()

    scores = []
    mses = []

    for i in range(50):
        X_train, X_test, y_train, y_test = train_test_split(X, y)

        gbm = GradientBoostingRegressor()
        gbm.fit(X_train, y_train)
        y_pred = gbm.predict(X_test)

        scores.append(gbm.score(X_test, y_test))
        mses.append(mean_squared_error(y_test, y_pred))

    print("GBM:")
    print("\tr2: {:.4f}".format(np.mean(scores)))
    print("\tmse: {:.4f}".format(np.mean(mses)))
    
    return rf, gbm

In [30]:
print("Manually selected features ({})".format(len(ftrs)))

rf, gbm = test_training(dataset[ftrs], dataset[target])

Manually selected features (16)
Random Forest:
	r2: 0.0409
	mse: 2.1372

GBM:
	r2: 0.0222
	mse: 2.1041


In [21]:
for k, v in zip(X_train.columns, rf.feature_importances_):
    print("{}: {:.4f}".format(k, v))

month: 0.1875
pledge_active: 0.0518
has_min_amount: 0.0836
has_max_amount: 0.0351
permit_anonymous: 0.0481
donation_active: 0.0192
multirestriction_system: 0.0170
collect_company: 0.0352
collect_phone: 0.0431
collect_optin: 0.0642
collect_address_mobile: 0.1241
enable_donorlogins: 0.0764
enable_sms: 0.0707
has_widgets: 0.0001
cms_active: 0.1213
has_goals: 0.0223


# optimizations

1. dropping has_widgets

In [32]:
target = 'conversion_rate'

ftrs = ['month', 'pledge_active', 'has_min_amount', 'has_max_amount', 
        'permit_anonymous', 'donation_active', 'multirestriction_system',
        'collect_company', 'collect_phone', 'collect_optin',
        'collect_address_mobile', 'enable_donorlogins', 'enable_sms',
        'cms_active', 'has_goals']

In [33]:
rf, gbm = test_training(dataset[ftrs], dataset[target])

Random Forest:
	r2: 0.0467
	mse: 2.0724

GBM:
	r2: 0.0225
	mse: 2.1166


2. dropping has_widgets, has_goals

In [34]:
ftrs = ['month', 'pledge_active', 'has_min_amount', 'has_max_amount', 
        'permit_anonymous', 'donation_active', 'multirestriction_system',
        'collect_company', 'collect_phone', 'collect_optin',
        'collect_address_mobile', 'enable_donorlogins', 'enable_sms', 'cms_active']
rf, gbm = test_training(dataset[ftrs], dataset[target])

Random Forest:
	r2: 0.0421
	mse: 2.0583

GBM:
	r2: 0.0231
	mse: 2.1053


3. dropping has_widgets, has_goals, multirestriction_system

In [35]:
ftrs = ['month', 'pledge_active', 'has_min_amount', 'has_max_amount', 
        'permit_anonymous', 'donation_active',
        'collect_company', 'collect_phone', 'collect_optin',
        'collect_address_mobile', 'enable_donorlogins', 'enable_sms', 'cms_active']
rf, gbm = test_training(dataset[ftrs], dataset[target])

Random Forest:
	r2: 0.0428
	mse: 2.0766

GBM:
	r2: 0.0227
	mse: 2.1089


4. dropping has_widgets, has_goals, multirestriction_system, donation_active

In [36]:
ftrs = ['month', 'pledge_active', 'has_min_amount', 'has_max_amount', 
        'permit_anonymous', 'collect_company', 'collect_phone', 'collect_optin',
        'collect_address_mobile', 'enable_donorlogins', 'enable_sms', 'cms_active']
rf, gbm = test_training(dataset[ftrs], dataset[target])

Random Forest:
	r2: 0.0451
	mse: 2.1294

GBM:
	r2: 0.0237
	mse: 2.0837


# retrain full dataset & store model

In [22]:
X = dataset[ftrs]
y = dataset[target]

rf = RandomForestRegressor()
rf.fit(X, y)

rf.score(X, y)

0.10015410726300389

In [23]:
'''
import joblib

joblib.dump({
    "model": rf,
    "features": X.columns
}, "model.settings.joblib", protocol=2)
'''

'\nimport joblib\n\njoblib.dump({\n    "model": rf,\n    "features": X.columns\n}, "model.settings.joblib", protocol=2)\n'