In [1]:
import pandas as pd
import numpy as np

import sys
sys.path.insert(1, '../../../../scripts/')
from s3_support import *

# load & prep data

## traffic data

In [2]:
q = '''select
            date_trunc('week', date) as week,
            org,
            form,
            sum(views) as pageviews
        from googleanalytics_traffic
            where date >= 2016 and date <= 2019
        group by date_trunc('week', date), org, form;'''
pageviews = redshift_query_read(q)

In [3]:
pageviews = pageviews[pageviews['form']!=0]
pageviews.sort_values('week', ascending=True, inplace=True)

## transaction data

In [4]:
q = '''select 
            form, 
            date_trunc('week', date) as week,
            count(id) as count, 
            sum(amount) as vol
        from transactions
        where status='A' and date>=2016 and date<=2019
        group by form, date_trunc('week', date)
    '''
trans = redshift_query_read(q)

In [5]:
len(trans), len(trans['form'].unique())

(256129, 11558)

In [6]:
trans['week'] = pd.to_datetime(trans['week'])
trans.sort_values('week', ascending=True, inplace=True)

# merge traffic and transactions
trans = trans.merge(pageviews, on=['form', 'week'])

# calculate conversion rate & change
trans['conversion_rate'] = trans['count'] / trans['pageviews']
trans.head(3)

Unnamed: 0,form,week,count,vol,org,pageviews,conversion_rate
0,506,2017-02-06,2,2500.0,534,9,0.222222
1,17,2017-02-06,15,833.0,42,37,0.405405
2,106597,2017-02-06,13,500.0,29769,2,6.5


## logs data

In [7]:
q = '''select
            form,
            date_trunc('week', created) as week,
            systemtype,
            count(id) as count
        from logs
        where created>=2016 and created <=2019 and form!=0
        group by form, systemtype, date_trunc('week', created)'''
logs = redshift_query_read(q)

In [8]:
len(logs), len(logs['form'].unique())

(161595, 20563)

In [9]:
logs_pvt = pd.pivot_table(logs, index=['form', 'week'], columns='systemtype', values='count').reset_index()
logs_pvt.fillna(0, inplace=True)
logs_pvt.head(3)

systemtype,form,week,0,4,8,11,12,13,15,18,...,34,35,36,38,39,40,41,42,43,44
0,1,2016-05-02,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,2016-05-30,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,2016-06-20,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## analytics data

In [10]:
q = "select * from analytics_weekly"
df_base = redshift_query_read(q)
q = "select * from analyticsqgiv_weekly"
df_qgiv = redshift_query_read(q)

In [11]:
len(df_base['date'].unique()), len(df_qgiv['date'].unique())

(152, 86)

In [18]:
df_analytics = df_base.merge(df_qgiv, on=["org", "form", "date"]).dropna()

df_analytics = df_analytics.drop(['org', 'product'], axis=1).groupby(['date', 'form']).sum().reset_index()
df_analytics['week'] = pd.to_datetime(df_analytics['date'])
df_analytics.drop('date', axis=1, inplace=True)

In [20]:
len(df_analytics), len(df_analytics['form'].unique()), len(df_analytics['week'].unique())

(1189306, 23750, 86)

In [21]:
df_analytics.head(3)

Unnamed: 0,form,vt_trans_count,don_form_trans_count,kiosk_trans_count,p2p_trans_count,mobile_trans_count,mobilevt_trans_count,sms_trans_count,fb_trans_count,vt_trans_vol,...,enable_donorlogins,enable_sms,new_rec_volume,new_rec_count,reg_count,dl_trans_volume,dl_trans_count,dl_new_rec_count,dl_new_rec_volume,week
0,1,0,0,0,0,0,0,0,0,0.0,...,1,0,0.0,0,0,0.0,0,0,0.0,2017-05-08
1,2,0,0,0,0,0,0,0,0,0.0,...,0,1,0.0,0,0,0.0,0,0,0.0,2017-05-08
2,3,0,0,0,0,0,0,0,0,0.0,...,0,0,0.0,0,0,0.0,0,0,0.0,2017-05-08


## merge data

In [62]:
# build data set
dataset = trans.dropna()[['form', 'week', 'count', 'vol', 'conversion_rate']]
dataset = dataset.merge(logs_pvt, on=['form', 'week'], how='outer')
dataset = dataset.merge(df_analytics, on=['form', 'week'], how='outer')
dataset['month'] = dataset['week'].dt.month
dataset.drop('week', axis=1, inplace=True)

In [63]:
len(dataset), len(trans.dropna()), len(logs_pvt), len(df_analytics)

(1354583, 140680, 102836, 1189306)

In [64]:
dataset.head(3)

Unnamed: 0,form,count,vol,conversion_rate,0,4,8,11,12,13,...,enable_donorlogins,enable_sms,new_rec_volume,new_rec_count,reg_count,dl_trans_volume,dl_trans_count,dl_new_rec_count,dl_new_rec_volume,month
0,506,2.0,2500.0,0.222222,,,,,,,...,,,,,,,,,,2
1,17,15.0,833.0,0.405405,0.0,0.0,0.0,3.0,0.0,0.0,...,,,,,,,,,,2
2,106597,13.0,500.0,6.5,0.0,0.0,0.0,10.0,0.0,0.0,...,,,,,,,,,,2


In [66]:
len(dataset[~dataset['conversion_rate'].isna()]), dataset['conversion_rate'].isna().sum()

(140680, 1213903)

In [67]:
# drop where conversion rate is NA, fill other NA with 0
dataset = dataset[~dataset['conversion_rate'].isna()]
dataset.fillna(0, inplace=True)

In [68]:
len(dataset)

140680

# modeling on raw features

here we will use the full feature set modeled against conversion rate, transaction volume, and transaction count with no embeddings to form a baseline. we will perform some feature selection in order to put in a minimum effort to improve this model before moving on to more advanced techniques such as more complicated models and feature engineering

In [69]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

In [70]:
target_cols = ['conversion_rate', 'vol', 'count']

In [71]:
for target in target_cols:
    print(target)
    
    scores = []
    mses = []
    for i in range(50):
        X_train, X_test, y_train, y_test = train_test_split(dataset.drop(target_cols, axis=1), dataset[target])

        rf = RandomForestRegressor()
        rf.fit(X_train, y_train)
        y_pred = rf.predict(X_test)
        
        scores.append(rf.score(X_test, y_test))
        mses.append(mean_squared_error(y_test, y_pred))
        
    print("\tRandom Forest:")
    print("\t\tr2: {:.4f}".format(np.mean(scores)))
    print("\t\tmse: {:.4f}".format(np.mean(mses)))
    
    scores = []
    mses = []
    for i in range(50):
        X_train, X_test, y_train, y_test = train_test_split(dataset.drop(target_cols, axis=1), dataset[target])

        gbm = GradientBoostingRegressor()
        gbm.fit(X_train, y_train)
        y_pred = gbm.predict(X_test)
        
        scores.append(gbm.score(X_test, y_test))
        mses.append(mean_squared_error(y_test, y_pred))
        
    print("\tGBM:")
    print("\t\tr2: {:.4f}".format(np.mean(scores)))
    print("\t\tmse: {:.4f}".format(np.mean(mses)))
    
    print("-"*40)

conversion_rate
	Random Forest:
		r2: 0.1086
		mse: 9.8257
	GBM:
		r2: 0.1016
		mse: 9.1203
----------------------------------------
vol
	Random Forest:
		r2: -0.1364
		mse: 54355230.4523
	GBM:
		r2: 0.3014
		mse: 47595863.4427
----------------------------------------
count
	Random Forest:
		r2: 0.6520
		mse: 370.9641
	GBM:
		r2: 0.5460
		mse: 469.0126
----------------------------------------


The volume and transaction count models could prove useful but we are primarily interested in the conversion rate model. Here we see the conversion rate model performed very poorly. The volume and transaction count models however appear to be of fair quality with R2's between .80 and .90. Let's now perform some feature selection to see if we can't improve the performance.

## automated feature selection

In [72]:
for target in target_cols:
    print(target)
    
    # model for feature importances
    X_train = dataset.drop(target_cols, axis=1)
    y_train = dataset[target]
    
    rf = RandomForestRegressor()
    rf.fit(X_train, y_train)
    
    feature_importances = []
    for e in zip(X_train.columns, rf.feature_importances_):
        feature_importances.append(e)
    
    for threshold in [0.01, 0.02, 0.05, 0.1, 0.15]:
        print("\tfeature importance threshold: {}".format(threshold))
        important_features = [c[0] for c in feature_importances if c[1] >= threshold]
        print("\tfeatures ({}): {}".format(len(important_features), ", ".join([str(f) for f in important_features])))
        print()
        
        # trial random forest model
        scores = []
        mses = []
        for i in range(50):
            X_train, X_test, y_train, y_test = train_test_split(dataset[important_features], dataset[target])

            rf = RandomForestRegressor()
            rf.fit(X_train, y_train)
            y_pred = rf.predict(X_test)

            scores.append(rf.score(X_test, y_test))
            mses.append(mean_squared_error(y_test, y_pred))

        print("\tRandom Forest:")
        print("\t\tr2: {:.4f}".format(np.mean(scores)))
        print("\t\tmse: {:.4f}".format(np.mean(mses)))

        # trial GBM model
        scores = []
        mses = []
        for i in range(50):
            X_train, X_test, y_train, y_test = train_test_split(dataset[important_features], dataset[target])

            gbm = GradientBoostingRegressor()
            gbm.fit(X_train, y_train)
            y_pred = gbm.predict(X_test)

            scores.append(gbm.score(X_test, y_test))
            mses.append(mean_squared_error(y_test, y_pred))

        print("\tGBM:")
        print("\t\tr2: {:.4f}".format(np.mean(scores)))
        print("\t\tmse: {:.4f}".format(np.mean(mses)))

        print("-"*40)

conversion_rate
	feature importance threshold: 0.01
	features (11): form, 23, 34, don_form_trans_count, kiosk_trans_vol, one_time_trans_count, rec_trans_vol, rec_trans_count, new_rec_volume, new_rec_count, month

	Random Forest:
		r2: 0.0770
		mse: 9.5113
	GBM:
		r2: 0.0575
		mse: 10.5100
----------------------------------------
	feature importance threshold: 0.02
	features (7): form, 23, 34, don_form_trans_count, kiosk_trans_vol, rec_trans_count, month

	Random Forest:
		r2: 0.0738
		mse: 9.3291
	GBM:
		r2: 0.0876
		mse: 9.6900
----------------------------------------
	feature importance threshold: 0.05
	features (4): form, 23, rec_trans_count, month

	Random Forest:
		r2: 0.0891
		mse: 9.5761
	GBM:
		r2: 0.0876
		mse: 9.9544
----------------------------------------
	feature importance threshold: 0.1
	features (3): form, 23, month

	Random Forest:
		r2: 0.0888
		mse: 9.7267
	GBM:
		r2: 0.0938
		mse: 9.6136
----------------------------------------
	feature importance threshold: 0.15
	f

Examining the feature importance filtering of feature sets used proves that this is not a fruitful direction. None of the models appear to significantly improve over the models with all available features.

## manual feature selection

here we will manually pick fields to drop and/or model against

In [84]:
drop_cols = ['vt_trans_count', 'don_form_trans_count', 'kiosk_trans_count',
             'p2p_trans_count', 'mobile_trans_count', 'mobilevt_trans_count',
             'sms_trans_count', 'fb_trans_count', 'vt_trans_vol',
             'don_form_trans_vol', 'kiosk_trans_vol', 'p2p_trans_vol',
             'mobile_trans_vol', 'mobilevt_trans_vol', 'sms_trans_vol',
             'fb_trans_vol', 'one_time_trans_vol', 'one_time_trans_count',
             'rec_trans_vol', 'rec_trans_count', 'pledges_count',
             'dl_trans_volume', 'dl_trans_count', 'dl_new_rec_count', 
             'dl_new_rec_volume']

In [85]:
for target in target_cols:
    print(target)
    
    scores = []
    mses = []
    for i in range(50):
        X_train, X_test, y_train, y_test = train_test_split(dataset.drop(target_cols + drop_cols, axis=1), dataset[target])

        rf = RandomForestRegressor()
        rf.fit(X_train, y_train)
        y_pred = rf.predict(X_test)
        
        scores.append(rf.score(X_test, y_test))
        mses.append(mean_squared_error(y_test, y_pred))
        
    print("\tRandom Forest:")
    print("\t\tr2: {:.4f}".format(np.mean(scores)))
    print("\t\tmse: {:.4f}".format(np.mean(mses)))
    
    scores = []
    mses = []
    for i in range(50):
        X_train, X_test, y_train, y_test = train_test_split(dataset.drop(target_cols + drop_cols, axis=1), dataset[target])

        gbm = GradientBoostingRegressor()
        gbm.fit(X_train, y_train)
        y_pred = gbm.predict(X_test)
        
        scores.append(gbm.score(X_test, y_test))
        mses.append(mean_squared_error(y_test, y_pred))
        
    print("\tGBM:")
    print("\t\tr2: {:.4f}".format(np.mean(scores)))
    print("\t\tmse: {:.4f}".format(np.mean(mses)))
    
    print("-"*40)

conversion_rate
	Random Forest:
		r2: 0.0702
		mse: 10.2045
	GBM:
		r2: 0.0945
		mse: 9.8222
----------------------------------------
vol
	Random Forest:
		r2: -0.1675
		mse: 61248839.1062
	GBM:
		r2: 0.1447
		mse: 54188345.5706
----------------------------------------
count
	Random Forest:
		r2: 0.5589
		mse: 462.7969
	GBM:
		r2: 0.4450
		mse: 569.6800
----------------------------------------


# embeddings

now we will introduce form embeddings modeled against the target variables (conversion rate, transaction volume, and transaction count)

In [86]:
from keras.layers import Embedding, Flatten, Dense, Conv2D, BatchNormalization
from keras import Sequential
import math

In [87]:
def generate_embedding(features, target, dimensions):
    X_train = np.array(features).reshape(len(features), 1)
    
    # create embedding model
    mdl = Sequential()
    mdl.add(Embedding(dim[0], dim[1], input_length=1))
    mdl.add(Flatten())
    mdl.add(Dense(1, activation='relu'))
    mdl.compile('rmsprop', 'mse')
    
    # train embedding model
    mdl.fit(X_train, target, epochs=10, batch_size=128, verbose=0)
    
    # return latent features
    return mdl.layers[0].get_weights()[0]

In [88]:
embedding_lookup_table = {}

unique_val_len = len(dataset['form'].unique())
col_cat_values = dataset['form'].astype('category').cat.codes
dataset['form'] = col_cat_values

# define embedding
dim = (unique_val_len, 30)

for t in target_cols:
    embedding_lookup_table[t] = generate_embedding(col_cat_values, dataset[t], dim)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [89]:
for target in target_cols:
    print(target)
    # build dataset for the given target
    #     continuous vars + embeddings for this target
    target_col = dataset[target]
    this_dataset = dataset.drop(target_cols + drop_cols, axis=1).copy()
    
    # set new feature columns for embeddings mapped to the categorical values    
    for new_col_i in range(embedding_lookup_table[target].shape[1]):
        new_col_label = "_".join(["form", str(new_col_i)])
        this_dataset[new_col_label] = this_dataset["form"].apply(lambda x: embedding_lookup_table[target][x][new_col_i])
    
    # train & evaluate
    scores = []
    mses = []
    for i in range(50):
        # train/test split
        X_train, X_test, y_train, y_test = train_test_split(this_dataset, target_col)
        
        rf = RandomForestRegressor()
        rf.fit(X_train, y_train)
        y_pred = rf.predict(X_test)
        
        scores.append(rf.score(X_test, y_test))
        mses.append(mean_squared_error(y_test, y_pred))
    
    print("\tRandom Forest:")
    print("\t\tr2: {}".format(np.mean(scores)))
    print("\t\tmse: {}".format(np.mean(mses)))
    
    scores = []
    mses = []
    for i in range(50):
        # train/test split
        X_train, X_test, y_train, y_test = train_test_split(this_dataset, target_col)
        
        gbm = GradientBoostingRegressor()
        gbm.fit(X_train, y_train)
        y_pred = gbm.predict(X_test)
        
        scores.append(gbm.score(X_test, y_test))
        mses.append(mean_squared_error(y_test, y_pred))
        
    print("\tGBM:")
    print("\t\tr2: {}".format(np.mean(scores)))
    print("\t\tmse: {}".format(np.mean(mses)))  
    
    print("-"*40)

conversion_rate
	Random Forest:
		r2: 0.21823923620003863
		mse: 8.76016661250582
	GBM:
		r2: 0.23766000418331212
		mse: 8.336743066518471
----------------------------------------
vol
	Random Forest:
		r2: 0.02881797956232817
		mse: 49014051.75955569
	GBM:
		r2: 0.26870094896470476
		mse: 44078988.19258527
----------------------------------------
count
	Random Forest:
		r2: 0.6788153990386693
		mse: 339.39709611633555
	GBM:
		r2: 0.6721600821022655
		mse: 351.9316075671405
----------------------------------------


the performance here is not substantially improved over the previous models. let's try dropping the logs features.

In [90]:
drop_cols = ['vt_trans_count', 'don_form_trans_count', 'kiosk_trans_count',
             'p2p_trans_count', 'mobile_trans_count', 'mobilevt_trans_count',
             'sms_trans_count', 'fb_trans_count', 'vt_trans_vol',
             'don_form_trans_vol', 'kiosk_trans_vol', 'p2p_trans_vol',
             'mobile_trans_vol', 'mobilevt_trans_vol', 'sms_trans_vol',
             'fb_trans_vol', 'one_time_trans_vol', 'one_time_trans_count',
             'rec_trans_vol', 'rec_trans_count', 'pledges_count',
             'dl_trans_volume', 'dl_trans_count', 'dl_new_rec_count', 
             'dl_new_rec_volume', 0, 4, 8, 11, 12, 13, 15, 18, 20, 21, 23, 24, 25, 
             26, 27, 28, 29, 32, 34, 35, 36, 38, 39, 40, 41, 42, 43, 44]

In [91]:
for target in target_cols:
    print(target)
    # build dataset for the given target
    #     continuous vars + embeddings for this target
    target_col = dataset[target]
    this_dataset = dataset.drop(target_cols + drop_cols, axis=1).copy()
    
    # set new feature columns for embeddings mapped to the categorical values    
    for new_col_i in range(embedding_lookup_table[target].shape[1]):
        new_col_label = "_".join(["form", str(new_col_i)])
        this_dataset[new_col_label] = this_dataset["form"].apply(lambda x: embedding_lookup_table[target][x][new_col_i])
    
    # train & evaluate
    scores = []
    mses = []
    for i in range(50):
        # train/test split
        X_train, X_test, y_train, y_test = train_test_split(this_dataset, target_col)
        
        rf = RandomForestRegressor()
        rf.fit(X_train, y_train)
        y_pred = rf.predict(X_test)
        
        scores.append(rf.score(X_test, y_test))
        mses.append(mean_squared_error(y_test, y_pred))
    
    print("\tRandom Forest:")
    print("\t\tr2: {}".format(np.mean(scores)))
    print("\t\tmse: {}".format(np.mean(mses)))
    
    scores = []
    mses = []
    for i in range(50):
        # train/test split
        X_train, X_test, y_train, y_test = train_test_split(this_dataset, target_col)
        
        gbm = GradientBoostingRegressor()
        gbm.fit(X_train, y_train)
        y_pred = gbm.predict(X_test)
        
        scores.append(gbm.score(X_test, y_test))
        mses.append(mean_squared_error(y_test, y_pred))
        
    print("\tGBM:")
    print("\t\tr2: {}".format(np.mean(scores)))
    print("\t\tmse: {}".format(np.mean(mses)))  
    
    print("-"*40)

conversion_rate
	Random Forest:
		r2: 0.20727948639712246
		mse: 9.036881787617105
	GBM:
		r2: 0.2686532193955593
		mse: 7.734312131069073
----------------------------------------
vol
	Random Forest:
		r2: -0.06297748334943935
		mse: 49806526.6833537
	GBM:
		r2: 0.2916495413296447
		mse: 32806750.993362583
----------------------------------------
count
	Random Forest:
		r2: 0.6551663792320632
		mse: 363.5063640817677
	GBM:
		r2: 0.6519776456726603
		mse: 366.1795924394368
----------------------------------------


In [92]:
target_col = dataset['conversion_rate']
X_train, X_test, y_train, y_test = train_test_split(this_dataset, target_col)

rf = RandomForestRegressor()
rf.fit(X_train, y_train)

feature_importances = []
for e in zip(X_train.columns, rf.feature_importances_):
    feature_importances.append(e)
feature_importances

[('form', 0.08628994719767717),
 ('events_priv_count', 4.666898261872729e-06),
 ('restrictions', 0.010414916090877895),
 ('amounts', 0.012806514643285954),
 ('ded_types', 0.0020796072964005346),
 ('opt_ded_flds', 0.0),
 ('req_ded_flds', 7.366160513063452e-05),
 ('opt_fields', 0.0015442082310413133),
 ('req_fields', 0.004165856382411219),
 ('pledge_active', 0.00042981510693742913),
 ('donation_active', 0.01164288647357134),
 ('multirestriction_system', 0.0012775614798510118),
 ('min_amount', 0.0006971784946854471),
 ('max_amount', 0.009107610261678976),
 ('permit_anonymous', 0.0005113841947945359),
 ('permit_recurring', 0.0),
 ('permit_other_amount', 0.007195401904719066),
 ('permit_create_own_pledge', 0.0022013709370277985),
 ('collect_company', 0.002955845434927846),
 ('collect_phone', 0.006170031094036755),
 ('collect_optin', 0.002321045754145229),
 ('collect_captcha', 0.0),
 ('collect_address_mobile', 0.0004634057427972193),
 ('enable_donorlogins', 0.003301904225400547),
 ('enable_s