In [3]:
import sys, datetime
import pandas as pd
import numpy as np

sys.path.insert(1, '../../../../scripts/')
from s3_support import *

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

# load and prep data

## google traffic

In [4]:
q = '''select
            date_trunc('day', date) as day,
            org,
            form,
            sum(views) as pageviews
        from googleanalytics_traffic
            where date>=2020
        group by date_trunc('day', date), org, form;'''
pageviews = redshift_query_read(q)

In [5]:
pageviews = pageviews[pageviews['form']!=0]
pageviews['week'] = pd.to_datetime(pageviews['day'])
len(pageviews), len(pageviews['form'].unique())

(544966, 14571)

## transactions

In [6]:
q = '''select 
            form, 
            date_trunc('day', date) as day,
            count(id) as count, 
            sum(amount) as vol
        from transactions
        where status='A' and date>=2020
        group by form, date_trunc('day', date)
    '''
trans = redshift_query_read(q)

In [7]:
trans['week'] = pd.to_datetime(trans['day'])

# merge traffic and transactions
trans = trans.merge(pageviews, on=['form', 'day'])

# calculate conversion rate & change
trans['conversion_rate'] = trans['count'] / trans['pageviews']
trans.head(3)

Unnamed: 0,form,day,count,vol,week_x,org,pageviews,week_y,conversion_rate
0,929673,2020-01-01,3,1943.5,2020-01-01,441953,3,2020-01-01,1.0
1,96359,2020-01-01,8,9300.0,2020-01-01,31717,27,2020-01-01,0.296296
2,1197,2020-01-01,25,4158.69,2020-01-01,1214,162,2020-01-01,0.154321


In [8]:
len(trans), len(trans['form'].unique())

(111576, 5472)

## analytics data

In [9]:
q = "select * from analytics_daily where date>=2020"
df_base = redshift_query_read(q)
print("done with analytics")

q = "select * from analyticsqgiv_daily where date>=2020"
df_qgiv = redshift_query_read(q)
print("done with analyticsqgiv")

done with analytics
done with analyticsqgiv


In [10]:
df_analytics = df_base.merge(df_qgiv, on=["org", "form", "date"]).dropna()

df_analytics = df_analytics.drop(['org', 'product'], axis=1).groupby(['date', 'form']).sum().reset_index()

df_analytics['day'] = pd.to_datetime(df_analytics['date'])
df_analytics.drop('date', axis=1, inplace=True)

In [11]:
len(df_analytics), len(df_analytics['form'].unique())

(2391407, 24868)

## segment data

In [12]:
# CMS edits
q = '''select
            users.org as org,
            count(saved_page.id) as page_updates,
            date_trunc('day', saved_page.original_timestamp) as day
        from saved_page
            left join users on saved_page.uuid=users.uuid
        group by day, org'''
df_cms = redshift_query_read(q, schema="secure")
len(df_cms), len(df_cms['org'].unique()), df_cms['day'].min(), df_cms['day'].max()

(3812,
 2535,
 Timestamp('2020-01-25 00:00:00'),
 Timestamp('2020-08-03 00:00:00'))

In [13]:
df_cms.sort_values('day', ascending=True, inplace=True)
cms_data = None
for org in df_cms['org'].unique():
    _df = df_cms[df_cms['org']==org].copy()
    _df['cms_cumsum'] = _df['page_updates'].cumsum()
    if cms_data is None:
        cms_data = _df
    else:
        cms_data = cms_data.append(_df)
cms_data.head(3)

Unnamed: 0,org,page_updates,day,cms_cumsum
1248,438418,1,2020-01-25,1
258,438418,1,2020-01-29,2
7,438418,1,2020-01-31,3


In [14]:
# widgets
q = '''select
            users.org as org,
            count(created_widget.id) as widgets_created,
            date_trunc('day', created_widget.original_timestamp) as day
        from created_widget
            left join users on created_widget.uuid=users.uuid
        group by day, org'''
df_widget_created = redshift_query_read(q, schema="secure")

q = '''select
            users.org as org,
            count(deleted_widget.id) as widgets_deleted,
            date_trunc('day', deleted_widget.original_timestamp) as day
        from deleted_widget
            left join users on deleted_widget.uuid=users.uuid
        group by day, org'''
df_widget_deleted = redshift_query_read(q, schema="secure")

df_widget = df_widget_created.merge(df_widget_deleted, on=['org', 'day'])
len(df_widget), len(df_widget['org'].unique()), df_widget['day'].min(), df_widget['day'].max()

(180, 6, Timestamp('2020-01-27 00:00:00'), Timestamp('2020-08-03 00:00:00'))

In [15]:
df_widget.sort_values('day', ascending=True, inplace=True)
widget_data = None
for org in df_widget['org'].unique():
    _df = df_widget[df_widget['org']==org].copy()
    _df['created_cumsum'] = _df['widgets_created'].cumsum()
    _df['deleted_cumsum'] = _df['widgets_deleted'].cumsum()
    _df['widgets_cumsum'] = _df['created_cumsum'] - _df['deleted_cumsum']
    if widget_data is None:
        widget_data = _df
    else:
        widget_data = widget_data.append(_df)
widget_data.head(3)

Unnamed: 0,org,widgets_created,day,widgets_deleted,created_cumsum,deleted_cumsum,widgets_cumsum
84,443134,1,2020-02-11,2,1,2,-1
1,443134,2,2020-03-05,1,3,3,0
172,443134,1,2020-03-11,1,4,4,0


## merge data

In [16]:
def get_cms_updates(r):
    cms_updates = 0
    if r['org'] is not None and r['org'] != 0:
        _df = cms_data[(cms_data['org'].fillna(0).astype(int)==int(r['org']))&(cms_data['day']<=r['day'])]
        if len(_df) > 0:
            cms_updates = _df['cms_cumsum'].iloc[0]
    return cms_updates

def get_widgets_created(r):
    widgets_created = 0
    if r['org'] is not None and r['org'] != 0:
        _df = widget_data[(widget_data['org'].fillna(0).astype(int)==int(r['org']))&(widget_data['day']<=r['day'])]
        if len(_df) > 0:
            widgets_created = _df['widgets_cumsum'].iloc[0]
    return widgets_created

In [17]:
trans['cms_updates'] = trans[['org', 'day']].apply(get_cms_updates, axis=1)
trans['widgets_created'] = trans[['org', 'day']].apply(get_widgets_created, axis=1)

In [18]:
"{:.4f}".format(len(trans[trans['cms_updates']>0]) / len(trans)), "{:.4f}".format(len(trans[trans['widgets_created']>0]) / len(trans))

('0.1861', '0.0000')

In [19]:
dataset = trans.dropna()[['form', 'day', 'conversion_rate', 'cms_updates', 'widgets_created']]
dataset = dataset.merge(df_analytics, on=['form', 'day'], how='outer')
dataset['month'] = dataset['day'].dt.month

original_len = len(dataset)
dataset = dataset[~dataset['conversion_rate'].isna()]
dataset.fillna(0, inplace=True)

original_len, len(dataset)

(2430425, 111576)

In [20]:
cms_perc = "{:.4f}".format(len(dataset[dataset['cms_updates']>0]) / len(dataset))
widgets_perc = "{:.4f}".format(len(dataset[dataset['widgets_created']>0]) / len(dataset))

cms_perc, widgets_perc, dataset['day'].min()

('0.1861', '0.0000', Timestamp('2020-01-01 00:00:00'))

# modeling

In [21]:
target = 'conversion_rate'

ftrs = ["restrictions", "amounts", "opt_fields", "req_fields", "min_amount", 
        "max_amount", "cms_updates", "widgets_created"]

In [22]:
print("Manually selected features ({})".format(len(ftrs)))

scores = []
mses = []

for i in range(50):
    X_train, X_test, y_train, y_test = train_test_split(dataset[ftrs], dataset[target])

    rf = RandomForestRegressor()
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)

    scores.append(rf.score(X_test, y_test))
    mses.append(mean_squared_error(y_test, y_pred))
    
print("Random Forest:")
print("\tr2: {:.4f}".format(np.mean(scores)))
print("\tmse: {:.4f}".format(np.mean(mses)))
print()

scores = []
mses = []

for i in range(50):
    X_train, X_test, y_train, y_test = train_test_split(dataset[ftrs], dataset[target])

    gbm = GradientBoostingRegressor()
    gbm.fit(X_train, y_train)
    y_pred = gbm.predict(X_test)

    scores.append(gbm.score(X_test, y_test))
    mses.append(mean_squared_error(y_test, y_pred))
    
print("GBM:")
print("\tr2: {:.4f}".format(np.mean(scores)))
print("\tmse: {:.4f}".format(np.mean(mses)))

Manually selected features (8)
Random Forest:
	r2: 0.0639
	mse: 1.9562

GBM:
	r2: 0.0335
	mse: 2.1804


Prior performance:

Random Forest:
- r2: 0.0669
- mse: 2.0112

GBM:
- r2: 0.0352
- mse: 2.0177

In [23]:
for k, v in zip(X_train.columns, rf.feature_importances_):
    print("{}: {:.4f}".format(k, v))

restrictions: 0.1647
amounts: 0.2755
opt_fields: 0.1584
req_fields: 0.0546
min_amount: 0.1553
max_amount: 0.1528
cms_updates: 0.0383
widgets_created: 0.0002


# retrain full dataset & store model

In [24]:
X = dataset[ftrs]
y = dataset[target]

rf = RandomForestRegressor()
rf.fit(X, y)

training_err = rf.score(X, y)
print("training error: {}".format(training_err))

training error: 0.1261147885111943


In [25]:
'''
import joblib

joblib.dump({
    "model": rf,
    "features": X.columns
}, "model.features.joblib", protocol=2)
'''

'\nimport joblib\n\njoblib.dump({\n    "model": rf,\n    "features": X.columns\n}, "model.features.joblib", protocol=2)\n'