In [2]:
import math
import pandas as pd
import numpy as np
from keras.layers import Embedding, Flatten, Dense, Conv2D, BatchNormalization
from keras import Sequential
from sklearn.externals import joblib
import statsmodels.api as sm
from statsmodels.formula.api import ols
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
  from pandas.core import datetools


# Load and prep data

In [4]:
# read in csv's
df_base = pd.read_csv("~/Repositories/datasets/analytics/a_base.csv")
df_qgiv = pd.read_csv("~/Repositories/datasets/analytics/analytics_qgiv.csv")
df_qgiv_base = pd.read_csv("~/Repositories/datasets/analytics/a_qgiv_base_id.csv")

In [5]:
# merge base ID into qgiv dataframe
df_qgiv = df_qgiv.merge(df_qgiv_base, left_on="id", right_on="id_x")

In [6]:
# merge base dataframe into qgiv dataframe
df = df_qgiv.merge(df_base, left_on="base", right_on="id", how="left")

In [7]:
len(df)

9000000

In [50]:
len(df.tm_stamp.unique())

72

In [8]:
df.columns

Index([u'pledges_count', u'events_count', u'events_priv_count',
       u'restrictions', u'amounts', u'ded_types', u'opt_ded_flds',
       u'req_ded_flds', u'opt_fields', u'req_fields', u'pledge_active',
       u'donation_active', u'multirestriction_system', u'min_amount',
       u'max_amount', u'show_amount', u'permit_anonymous', u'permit_recurring',
       u'permit_other_amount', u'permit_create_own_pledge', u'collect_company',
       u'collect_phone', u'collect_optin', u'collect_captcha',
       u'collect_address_mobile', u'enable_donorlogins', u'enable_sms',
       u'default_recurring_frequency', u'event_stats', u'new_rec_volume',
       u'new_rec_count', u'new_rec_volume.1', u'reg_count', u'dl_trans_volume',
       u'dl_trans_count', u'dl_new_rec_count', u'dl_new_rec_volume', u'id_x',
       u'org_x', u'id_x', u'base', u'id_y', u'org_y', u'sic', u'ein',
       u'visits', u'mobile_visits', u'vt_trans_count', u'don_form_trans_count',
       u'kiosk_trans_count', u'p2p_trans_count', u

In [156]:
df_base.columns
# form embedding columns added out of cell order here

Index([u'id', u'org', u'sic', u'ein', u'visits', u'mobile_visits',
       u'vt_trans_count', u'don_form_trans_count', u'kiosk_trans_count',
       u'p2p_trans_count', u'mobile_trans_count', u'mobilevt_trans_count',
       u'sms_trans_count', u'fb_trans_count', u'vt_trans_vol',
       u'don_form_trans_vol', u'kiosk_trans_vol', u'p2p_trans_vol',
       u'mobile_trans_vol', u'mobilevt_trans_vol', u'sms_trans_vol',
       u'fb_trans_vol', u'tm_stamp', u'one_time_trans_count',
       u'one_time_trans_vol', u'rec_trans_count', u'rec_trans_vol', u'product',
       u'form', u'form_cat', u'form_0', u'form_1', u'form_2', u'form_3',
       u'form_4', u'form_5', u'form_6', u'form_7', u'form_8', u'form_9',
       u'form_10', u'form_11', u'form_12', u'form_13', u'form_14', u'form_15',
       u'form_16', u'form_17', u'form_18', u'form_19', u'form_20', u'form_21',
       u'form_22', u'form_23', u'form_24', u'form_25', u'form_26', u'form_27',
       u'form_28', u'form_29'],
      dtype='object')

In [137]:
# cleaning up some unnecessary columns
df.drop(["id_x", "base", "id_y", "org_y"], axis=1, inplace=True)

In [10]:
df['conversion'] = df['don_form_trans_count'] / (df['visits']+df['mobile_visits'])
df['conversion'].fillna(0, inplace=True)

In [9]:
# breakout datetime components
df['tm_stamp'] = pd.to_datetime(df['tm_stamp'])
df['year'] = df['tm_stamp'].dt.year
df['month'] = df['tm_stamp'].dt.month
df['day'] = df['tm_stamp'].dt.day
df['hour'] = df['tm_stamp'].dt.hour
df['weekofyear'] = df['tm_stamp'].dt.weekofyear
df['dayofweek'] = df['tm_stamp'].dt.dayofweek
df['dayofyear'] = df['tm_stamp'].dt.dayofyear
df['quarter'] = df['tm_stamp'].dt.quarter

df['is_month_start'] = df['tm_stamp'].dt.is_month_start
df['is_month_end'] = df['tm_stamp'].dt.is_month_end
df['is_quarter_start'] = df['tm_stamp'].dt.is_quarter_start
df['is_quarter_end'] = df['tm_stamp'].dt.is_quarter_end
df['is_year_start'] = df['tm_stamp'].dt.is_year_start
df['is_year_end'] = df['tm_stamp'].dt.is_year_end

df.drop('tm_stamp', axis=1, inplace=True)

In [19]:
dt_cols = ['year', 'month', 'day', 'hour', 'weekofyear', 'dayofweek', 'dayofyear', 'quarter', 
           'is_month_start', 'is_month_end', 'is_quarter_start', 'is_quarter_end', 'is_year_start',
          'is_year_end']
for c in dt_cols:
    df[c] = df[c].astype('category').cat.codes
df['collect_captcha'] = df['collect_captcha'].astype('category')

In [14]:
df['weekofyear'].value_counts()

-1    8592979
 0     238870
 1     153093
 2      15058
Name: weekofyear, dtype: int64

In [9]:
ftrs_core = ['opt_fields', 'req_fields', 'donation_active', 'multirestriction_system', 
             'restrictions', 'permit_other_amount', 'collect_captcha', 'form']

ftrs = ['opt_fields', 'req_fields', 'donation_active', 'multirestriction_system', 'restrictions', 
        'permit_other_amount', 'collect_captcha', 'day', 'month', 'restrictionsXmultirestriction', 
        'restrictions^2', 'restrictions^2Xmultirestriction', 'restrictions^3', 'restrictions^3Xmultirestriction', 
        'opt_fields^2', 'opt_fields^3', 'req_fields^2', 'req_fields^3', 'fields', 'fields^2', 'fields^3']

# Quick examination of core features

Here we'll do a brief refresher on the core (non-engineered) chosen features with a look at principally related features that were omitted in the original model.

In [169]:
df_core = df[(df['visits']>0)|(df['mobile_visits']>0)][ftrs_core+['conversion']]

In [57]:
df_core.tail()

Unnamed: 0,opt_fields,req_fields,donation_active,multirestriction_system,restrictions,permit_other_amount,collect_captcha,conversion
409903,0,0,1,0,6,1,0,0.083333
409910,0,0,1,0,7,1,0,0.0
409958,0,0,1,0,5,1,0,0.0
410008,0,0,1,0,0,1,0,0.0
410032,0,0,0,0,0,0,0,0.0


# Create embeddings and add to dataframe

In [59]:
def generate_embedding(ftr, target, dim):
    X_train = np.array(ftr).reshape(len(ftr), 1)
    
    # create the embedding
    mdl = Sequential()
    mdl.add(Embedding(dim[0], dim[1], input_length=1))
    mdl.add(Flatten())
    mdl.add(Dense(1, activation='relu'))
    mdl.compile('rmsprop', 'mse')
    
    # train the embedding
    mdl.fit(X_train, target, epochs=10, batch_size=128, verbose=0)
    
    # return the latent features
    return mdl.layers[0].get_weights()[0]

In [199]:
u = 2
float(u + 1) / float(2)

1.5

In [200]:
target = 'conversion'
df_core = df_core.copy()
cat_ftrs = ['opt_fields', 'req_fields', 'donation_active', 'multirestriction_system',
           'restrictions', 'permit_other_amount', 'collect_captcha']
embedding_lookup_tables = {}

for c in cat_ftrs:
    print("Working on {}".format(c))
    df_core[c] = df_core[c].astype('category').cat.codes

    # define our embedding dimensions
    unique_values = len(df_core[c].unique())
    dim = (unique_values, int(math.ceil(float(unique_values + 1) / float(2))))

    print("\tcreating embedding of dimension {}".format(dim))
    embedding_lookup_tables[c] = generate_embedding(df_core[c], df_core['conversion'], dim)

Working on opt_fields
	creating embedding of dimension (15, 8)
Working on req_fields
	creating embedding of dimension (10, 6)
Working on donation_active
	creating embedding of dimension (2, 2)
Working on multirestriction_system
	creating embedding of dimension (2, 2)
Working on restrictions
	creating embedding of dimension (44, 23)
Working on permit_other_amount
	creating embedding of dimension (2, 2)
Working on collect_captcha
	creating embedding of dimension (1, 1)


In [73]:
embedding_lookup_tables['restrictions'][0]

array([ 2.72454433e-02, -7.51600601e-05,  3.06813158e-02, -6.48003258e-03,
       -1.32987285e-02,  4.84039001e-02, -2.72805374e-02, -1.34822624e-02,
        7.79448962e-03, -1.44388974e-02,  4.75697592e-03,  4.39986587e-02,
       -2.42316276e-02, -4.00250033e-03,  1.66943409e-02, -1.88743416e-02,
       -1.55542763e-02,  1.10865943e-02, -3.05083301e-02, -6.98660559e-04,
        2.20815856e-02, -3.45096923e-02], dtype=float32)

In [201]:
new_ftrs = []
for e in embedding_lookup_tables:
    print("adding embedding as features for {}".format(e))
    for new_col_i in range(embedding_lookup_tables[e].shape[1]):
        new_col_label = "_".join([e, str(new_col_i)])
        df_core[new_col_label] = df_core[c].apply(lambda x: embedding_lookup_tables[e][x][new_col_i])
        new_ftrs.append(new_col_label)

adding embedding as features for donation_active
adding embedding as features for restrictions
adding embedding as features for collect_captcha
adding embedding as features for req_fields
adding embedding as features for opt_fields
adding embedding as features for permit_other_amount
adding embedding as features for multirestriction_system


In [202]:
df_core.head()

Unnamed: 0,opt_fields,req_fields,donation_active,multirestriction_system,restrictions,permit_other_amount,collect_captcha,conversion,day,month,...,opt_fields_5,opt_fields_6,opt_fields_7,permit_other_amount_0,multirestriction_system_0,donation_active_1,restrictions_22,req_fields_5,permit_other_amount_1,multirestriction_system_1
3022,1,1,1,0,0,1,0,0.416667,10,1,...,-0.001259,0.017701,0.001325,-0.018796,0.030017,0.007289,0.008139,-0.020324,-0.000665,0.047206
3033,1,1,1,0,0,1,0,0.0,10,1,...,-0.001259,0.017701,0.001325,-0.018796,0.030017,0.007289,0.008139,-0.020324,-0.000665,0.047206
3035,3,0,1,0,5,1,0,0.0,10,1,...,-0.001259,0.017701,0.001325,-0.018796,0.030017,0.007289,0.008139,-0.020324,-0.000665,0.047206
3046,0,1,1,0,0,1,0,0.0,10,1,...,-0.001259,0.017701,0.001325,-0.018796,0.030017,0.007289,0.008139,-0.020324,-0.000665,0.047206
3050,2,0,1,0,0,1,0,0.0,10,1,...,-0.001259,0.017701,0.001325,-0.018796,0.030017,0.007289,0.008139,-0.020324,-0.000665,0.047206


## Modeling on embeddings for form features

In [203]:
X_train, X_test, y_train, y_test = train_test_split(df_core[new_ftrs], df_core['conversion'])

In [204]:
rf = RandomForestRegressor()
rf.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [205]:
y_pred = rf.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("MSE: {}".format(mse))

MSE: 0.0260674835779


In [206]:
gbm = GradientBoostingRegressor()
gbm.fit(X_train, y_train)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=100, presort='auto', random_state=None,
             subsample=1.0, verbose=0, warm_start=False)

In [207]:
y_pred = gbm.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("MSE: {}".format(mse))

MSE: 0.0260683568017


### Retraining old model

In [89]:
df_core = df[(df['visits']>0)|(df['mobile_visits']>0)][ftrs_core+['conversion', 'day', 'month']]

In [90]:
df_core['restrictionsXmultirestriction'] = df_core['restrictions'] * df_core['multirestriction_system']
df_core['restrictions^2'] = df_core['restrictions']**2
df_core['restrictions^2Xmultirestriction'] = df_core['restrictions^2'] * df_core['multirestriction_system']
df_core['restrictions^3'] = df_core['restrictions']**3
df_core['restrictions^3Xmultirestriction'] = df_core['restrictions^3'] * df_core['multirestriction_system']
df_core['opt_fields^2'] = df_core['opt_fields']**2
df_core['opt_fields^3'] = df_core['opt_fields']**3
df_core['req_fields^2'] = df_core['req_fields']**2
df_core['req_fields^3'] = df_core['req_fields']**3
df_core['fields'] = df_core['opt_fields'] + df_core['req_fields']
df_core['fields^2'] = df_core['fields']**2
df_core['fields^3'] = df_core['fields']**3

In [217]:
X_train, X_test, y_train, y_test = train_test_split(df_core[ftrs], df_core['conversion'])

In [92]:
rf = RandomForestRegressor()
rf.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [93]:
y_pred = rf.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("MSE: {}".format(mse))

MSE: 0.0228926035439


# Deep learning with embeddings

Looking to beat an MSE of **0.02289**

In [218]:
X_train, X_test, y_train, y_test = train_test_split(df_core[cat_ftrs], df_core['conversion'])

In [219]:
mdl = Sequential()
mdl.add(Dense(250, activation='sigmoid', input_shape=(len(X_train.columns), 1)))
mdl.add(Dense(250, activation='sigmoid'))
mdl.add(Dense(250, activation='sigmoid'))
mdl.add(Flatten())
mdl.add(Dense(1, activation='sigmoid'))
mdl.compile('rmsprop', 'mse')

In [220]:
mdl.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_115 (Dense)            (None, 7, 250)            500       
_________________________________________________________________
dense_116 (Dense)            (None, 7, 250)            62750     
_________________________________________________________________
dense_117 (Dense)            (None, 7, 250)            62750     
_________________________________________________________________
flatten_51 (Flatten)         (None, 1750)              0         
_________________________________________________________________
dense_118 (Dense)            (None, 1)                 1751      
Total params: 127,751
Trainable params: 127,751
Non-trainable params: 0
_________________________________________________________________


In [221]:
X_train_inp = np.array(X_train.values).reshape(len(X_train), len(X_train.columns), 1)
X_test_inp = np.array(X_test.values).reshape(len(X_test), len(X_test.columns), 1)

In [222]:
mdl.fit(X_train_inp, y_train, epochs=1, validation_data=(X_test_inp, y_test))

Train on 19079 samples, validate on 6360 samples
Epoch 1/1


<keras.callbacks.History at 0x173acd4d0>

In [223]:
mdl.fit(X_train_inp, y_train, epochs=10, validation_data=(X_test_inp, y_test))

Train on 19079 samples, validate on 6360 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x174472a90>

# Create form embeddings

In [147]:
df_base['form_cat'] = df_base.form.astype('category').cat.codes
df_trn = df_base[df_base.form_cat!=-1][['form_cat', 'conversion']]

In [123]:
df_trn['conversion'] = df_trn.conversion.apply(lambda x: 0.0000001 if x == 0. else x)
df_trn['conversion'] = df_trn['conversion'].replace(np.inf, 1.)

In [124]:
df_trn.conversion.describe()

count    5.124410e+05
mean     3.952223e-03
std      4.005100e-02
min      1.000000e-07
25%      1.000000e-07
50%      1.000000e-07
75%      1.000000e-07
max      1.000000e+00
Name: conversion, dtype: float64

In [125]:
# create the embedding
mdl = Sequential()
mdl.add(Embedding(len(df_trn['form_cat']), 30, input_length=1))
mdl.add(Flatten())
mdl.add(Dense(1, activation='relu'))
mdl.compile('rmsprop', 'mse')
mdl.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_20 (Embedding)     (None, 1, 30)             15373230  
_________________________________________________________________
flatten_20 (Flatten)         (None, 30)                0         
_________________________________________________________________
dense_21 (Dense)             (None, 1)                 31        
Total params: 15,373,261
Trainable params: 15,373,261
Non-trainable params: 0
_________________________________________________________________


In [126]:
# train the embedding
mdl.fit(df_trn['form_cat'].values, df_trn['conversion'].values, epochs=1, batch_size=128, verbose=1)

Epoch 1/1


<keras.callbacks.History at 0x176be5cd0>

In [128]:
# return the latent features
form_embedding_lookup = mdl.layers[0].get_weights()[0]

In [162]:
df_base['form_cat'] = df_base.form.astype('category').cat.codes
for i in range(form_embedding_lookup.shape[1]):
    df_base['form_'+str(i)] = df_base['form_cat'].apply(lambda x: form_embedding_lookup[x][i])

In [155]:
df_base.columns

Index([u'id', u'org', u'sic', u'ein', u'visits', u'mobile_visits',
       u'vt_trans_count', u'don_form_trans_count', u'kiosk_trans_count',
       u'p2p_trans_count', u'mobile_trans_count', u'mobilevt_trans_count',
       u'sms_trans_count', u'fb_trans_count', u'vt_trans_vol',
       u'don_form_trans_vol', u'kiosk_trans_vol', u'p2p_trans_vol',
       u'mobile_trans_vol', u'mobilevt_trans_vol', u'sms_trans_vol',
       u'fb_trans_vol', u'tm_stamp', u'one_time_trans_count',
       u'one_time_trans_vol', u'rec_trans_count', u'rec_trans_vol', u'product',
       u'form', u'form_cat', u'form_0', u'form_1', u'form_2', u'form_3',
       u'form_4', u'form_5', u'form_6', u'form_7', u'form_8', u'form_9',
       u'form_10', u'form_11', u'form_12', u'form_13', u'form_14', u'form_15',
       u'form_16', u'form_17', u'form_18', u'form_19', u'form_20', u'form_21',
       u'form_22', u'form_23', u'form_24', u'form_25', u'form_26', u'form_27',
       u'form_28', u'form_29'],
      dtype='object')

In [11]:
emb_features = [u'form_0', u'form_1', u'form_2', u'form_3',
       u'form_4', u'form_5', u'form_6', u'form_7', u'form_8', u'form_9',
       u'form_10', u'form_11', u'form_12', u'form_13', u'form_14', u'form_15',
       u'form_16', u'form_17', u'form_18', u'form_19', u'form_20', u'form_21',
       u'form_22', u'form_23', u'form_24', u'form_25', u'form_26', u'form_27',
       u'form_28', u'form_29']

In [16]:
df_core = df[(df['visits']>0)|(df['mobile_visits']>0)][ftrs_core+['conversion']+emb_features]

KeyError: "[u'form_0' u'form_1' u'form_2' u'form_3' u'form_4' u'form_5' u'form_6'\n u'form_7' u'form_8' u'form_9' u'form_10' u'form_11' u'form_12' u'form_13'\n u'form_14' u'form_15' u'form_16' u'form_17' u'form_18' u'form_19'\n u'form_20' u'form_21' u'form_22' u'form_23' u'form_24' u'form_25'\n u'form_26' u'form_27' u'form_28' u'form_29'] not in index"

In [173]:
df_core.columns

Index([u'opt_fields', u'req_fields', u'donation_active',
       u'multirestriction_system', u'restrictions', u'permit_other_amount',
       u'collect_captcha', u'form', u'conversion', u'form_0', u'form_1',
       u'form_2', u'form_3', u'form_4', u'form_5', u'form_6', u'form_7',
       u'form_8', u'form_9', u'form_10', u'form_11', u'form_12', u'form_13',
       u'form_14', u'form_15', u'form_16', u'form_17', u'form_18', u'form_19',
       u'form_20', u'form_21', u'form_22', u'form_23', u'form_24', u'form_25',
       u'form_26', u'form_27', u'form_28', u'form_29'],
      dtype='object')

In [174]:
X_train, X_test, y_train, y_test = train_test_split(df_core.drop(['form', 'conversion'], axis=1), df_core['conversion'])

In [175]:
rf = RandomForestRegressor()
rf.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [178]:
y_pred = rf.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("MSE: {}".format(mse))

MSE: 0.0155423188874


Standard (original) model resulted in an MSE of **0.02289** and here we beat that by achieving **0.01554**. This is pretty good but let's see if we can push that further with a relatively simple DNN.

In [18]:
# first we want to store the form embedding with ID's
# df_core[['form']+emb_features].to_csv("~/Repositories/datasets/analytics/form_conversion_embedding.csv", index=False)

In [26]:
df_emb = pd.read_csv("~/Repositories/datasets/analytics/form_conversion_embedding.csv")
df_core = df[(df['visits']>0)|(df['mobile_visits']>0)][ftrs_core+['conversion']]
df_core = df_core.merge(df_emb, on="form")
print("len(df): {}; len(df_core): {}".format(len(df), len(df_core)))

len(df): 9000000; len(df_core): 833215


In [27]:
df_core.columns

Index([u'opt_fields', u'req_fields', u'donation_active',
       u'multirestriction_system', u'restrictions', u'permit_other_amount',
       u'collect_captcha', u'form', u'conversion', u'form_0', u'form_1',
       u'form_2', u'form_3', u'form_4', u'form_5', u'form_6', u'form_7',
       u'form_8', u'form_9', u'form_10', u'form_11', u'form_12', u'form_13',
       u'form_14', u'form_15', u'form_16', u'form_17', u'form_18', u'form_19',
       u'form_20', u'form_21', u'form_22', u'form_23', u'form_24', u'form_25',
       u'form_26', u'form_27', u'form_28', u'form_29'],
      dtype='object')

In [29]:
X_train, X_test, y_train, y_test = train_test_split(df_core.drop(['form'd, 'conversion'], axis=1), df_core['conversion'])

In [30]:
mdl = Sequential()
mdl.add(Dense(250, activation='relu', input_shape=(len(X_train.columns), 1)))
mdl.add(Dense(250, activation='relu'))
mdl.add(Dense(250, activation='relu'))
mdl.add(Dense(250, activation='relu'))
mdl.add(Dense(250, activation='relu'))
mdl.add(Flatten())
mdl.add(Dense(1, activation='sigmoid'))
mdl.compile('rmsprop', 'mse')

In [31]:
mdl.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_7 (Dense)              (None, 37, 250)           500       
_________________________________________________________________
dense_8 (Dense)              (None, 37, 250)           62750     
_________________________________________________________________
dense_9 (Dense)              (None, 37, 250)           62750     
_________________________________________________________________
dense_10 (Dense)             (None, 37, 250)           62750     
_________________________________________________________________
dense_11 (Dense)             (None, 37, 250)           62750     
_________________________________________________________________
flatten_2 (Flatten)          (None, 9250)              0         
_________________________________________________________________
dense_12 (Dense)             (None, 1)                 9251      
Total para

In [32]:
X_train_inp = np.array(X_train).reshape(len(X_train), len(X_train.columns), 1)
X_test_inp = np.array(X_test).reshape(len(X_test), len(X_test.columns), 1)

In [33]:
mdl.fit(X_train_inp, y_train.values, epochs=1, validation_data=(X_test_inp, y_test.values))

Train on 624911 samples, validate on 208304 samples
Epoch 1/1


<keras.callbacks.History at 0x1264653d0>

In [None]:
mdl.fit(X_train_inp, y_train.values, epochs=100, validation_data=(X_test_inp, y_test.values))

At 50 epochs, we reached an MSE of **0.0196** (epoch 49). This is definitely a promising path and the network doesn't appear to be overtraining so let's keep it going and see if we can stabilize under **0.02**.

And after 101 epochs we're landing at an MSE of **0.0194**, and reached a minimum within a handful of epochs of the end at **0.0181**. While the improvement has leveled off and slowed quite a bit, it does not yet appear to have bottomed out so there's still improvement to be attained here.

___

_First training, lost kernel_. 

___

The improvement slowed to a crawl. We might be able to beat the **0.01554** we achieved with the random forest but it's obvious that it will take _a lot_ of training. The random forest model is a good middle step, and can be relied upon as a trial deployment of the embedding lookup while the DNN continues to be developed and tested.

### Clean form feature modeling w/ hyperparameter tuning

In [3]:
ftrs_core = ['opt_fields', 'req_fields', 'donation_active', 'multirestriction_system', 
             'restrictions', 'permit_other_amount', 'collect_captcha', 'form']

print("load & prep data")
print("\treading CSV's")
df_base = pd.read_csv("~/Repositories/datasets/analytics/a_base.csv")
df_qgiv = pd.read_csv("~/Repositories/datasets/analytics/analytics_qgiv.csv")
df_qgiv_base = pd.read_csv("~/Repositories/datasets/analytics/a_qgiv_base_id.csv")

print("\tmerge dataframes")
df_qgiv = df_qgiv.merge(df_qgiv_base, left_on="id", right_on="id_x")
df = df_qgiv.merge(df_base, left_on="base", right_on="id", how="left")

print("\tadding conversion")
df['conversion'] = df['don_form_trans_count'] / (df['visits']+df['mobile_visits'])
df['conversion'].fillna(0., inplace=True)
df['conversion'].replace(np.inf, 1.)

print("\tadding embedding columns")
df_emb = pd.read_csv("~/Repositories/datasets/analytics/form_conversion_embedding.csv")
df_core = df[(df['visits']>0)|(df['mobile_visits']>0)][ftrs_core+['conversion']]
df_core = df_core.merge(df_emb, on="form")

print("\tdata prepped, {} features of {} observations".format(len(df_core.columns) - 2, len(df_core)))
# ...len(columns) - 2 because form ID & conversion are in there but are not to be used as features

load & prep data
	reading CSV's
	merge dataframes
	adding conversion & embedding columns
	data prepped, 37 features of 833215 observations


In [4]:
print("hyperparameter tuning")
print("\ttrain/test split")
X_train, X_test, y_train, y_test = train_test_split(df_core.drop(['form', 'conversion'], axis=1), df_core['conversion'])

print("\ttrees...")
for i in [10, 25, 50, 75, 100]:
    rf = RandomForestRegressor(n_estimators=i)
    rf.fit(X_train, y_train)

    y_pred = rf.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    print("\t\t{} trees: MSE: {}".format(i, mse))

hyperparameter tuning
	train/test split
	trees...
		10 trees: MSE: 0.0134294187075
		25 trees: MSE: 0.0134282826641
		50 trees: MSE: 0.0134259004581
		75 trees: MSE: 0.0134250891819
		100 trees: MSE: 0.0134260802508


In [6]:
print("\tmin samples split...")
for i in [2, 3, 4, 5]:
    rf = RandomForestRegressor(min_samples_split=i)
    rf.fit(X_train, y_train)

    y_pred = rf.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    print("\t\t{} min_samples_split: MSE: {}".format(i, mse))

	min samples split...
		2 min_samples_split: MSE: 0.0134272272924
		3 min_samples_split: MSE: 0.0134290576679
		4 min_samples_split: MSE: 0.0134297032152
		5 min_samples_split: MSE: 0.013429558535


In [7]:
print("\tmin samples leaf...")
for i in [1, 2, 3, 4, 5]:
    rf = RandomForestRegressor(min_samples_leaf=i)
    rf.fit(X_train, y_train)

    y_pred = rf.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    print("\t\t{} min_samples_leaf: MSE: {}".format(i, mse))

	min samples leaf...
		1 min_samples_leaf: MSE: 0.0134276764383
		2 min_samples_leaf: MSE: 0.0134302918376
		3 min_samples_leaf: MSE: 0.0134293436736
		4 min_samples_leaf: MSE: 0.0134327199593
		5 min_samples_leaf: MSE: 0.0134306295739


Looks like hyperparameter tuning isn't going to improve anything by a meaningful amount. The default settings for Random Forest Regressor performed better than any of the alterations so _sticking with defaults for all_.

In [8]:
print("fit on training data & verify performance")
print("\ttrain/test split")
X_train, X_test, y_train, y_test = train_test_split(df_core.drop(['form', 'conversion'], axis=1), df_core['conversion'])

print("\tfitting model")
rf = RandomForestRegressor()
rf.fit(X_train, y_train)

print("\tevaluating model")
y_pred = rf.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("\t\tMSE: {}".format(mse))

fit on training data & verify performance
	train/test split
	fitting model
	evaluating model
		MSE: 0.0133835646281


## Modeling with embeddings and form settings

In [19]:
ftrs = ['pledge_active', 'donation_active', 'multirestriction_system', 'min_amount',
       'max_amount', 'show_amount', 'permit_anonymous', 'permit_recurring',
       'permit_other_amount', 'permit_create_own_pledge', 'collect_company',
       'collect_phone', 'collect_optin', 'collect_captcha',
       'collect_address_mobile', 'enable_donorlogins', 'enable_sms', 'form']

print("load & prep data")
print("\treading CSV's")
df_base = pd.read_csv("~/Repositories/datasets/analytics/a_base.csv")
df_qgiv = pd.read_csv("~/Repositories/datasets/analytics/analytics_qgiv.csv")
df_qgiv_base = pd.read_csv("~/Repositories/datasets/analytics/a_qgiv_base_id.csv")

print("\tmerge dataframes")
df_qgiv = df_qgiv.merge(df_qgiv_base, left_on="id", right_on="id_x")
df = df_qgiv.merge(df_base, left_on="base", right_on="id", how="left")

print("\tadding conversion & embedding columns")
df['conversion'] = df['don_form_trans_count'] / (df['visits']+df['mobile_visits'])
df['conversion'].fillna(0., inplace=True)
df['conversion'].replace(np.inf, 1.)

df_emb = pd.read_csv("~/Repositories/datasets/analytics/form_conversion_embedding.csv")
df_core = df[(df['visits']>0)|(df['mobile_visits']>0)][ftrs+['conversion']]
df_core = df_core.merge(df_emb, on="form")

print("\tdata prepped, {} features of {} observations".format(len(df_core.columns) - 2, len(df_core)))
# ...len(columns) - 2 because form & conversion are in there

load & prep data
	reading CSV's
	merge dataframes
	adding conversion & embedding columns
	data prepped, 47 features of 833215 observations


In [11]:
X_train, X_test, y_train, y_test = train_test_split(df_core.drop(['form', 'conversion'], axis=1), df_core['conversion'])

In [12]:
rf = RandomForestRegressor()
rf.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [13]:
y_pred = rf.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("MSE: {}".format(mse))

MSE: 0.0132923321163


In [18]:
# let's get the old model MSE
df_core = df[(df['visits']>0)|(df['mobile_visits']>0)][ftrs+['conversion']]
X_train, X_test, y_train, y_test = train_test_split(df_core.drop(['form', 'conversion'], axis=1), df_core['conversion'])
rf = RandomForestRegressor()
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("Old model MSE: {}".format(mse))

Old model MSE: 0.0194061475176


Looks like the form settings model is seeing the same range of improvement as features, in the 30% to 40% reduction in error. Let's evaluate hyperparameter tuning on this for the sake of thoroughness.

In [20]:
print("hyperparameter tuning")
print("\ttrain/test split")
X_train, X_test, y_train, y_test = train_test_split(df_core.drop(['form', 'conversion'], axis=1), df_core['conversion'])

print("\ttrees...")
for i in [10, 25, 50, 75, 100]:
    rf = RandomForestRegressor(n_estimators=i)
    rf.fit(X_train, y_train)

    y_pred = rf.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    print("\t\t{} trees: MSE: {}".format(i, mse))
print("\tmin samples split...")
for i in [2, 3, 4, 5]:
    rf = RandomForestRegressor(min_samples_split=i)
    rf.fit(X_train, y_train)

    y_pred = rf.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    print("\t\t{} min_samples_split: MSE: {}".format(i, mse))
print("\tmin samples leaf...")
for i in [1, 2, 3, 4, 5]:
    rf = RandomForestRegressor(min_samples_leaf=i)
    rf.fit(X_train, y_train)

    y_pred = rf.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    print("\t\t{} min_samples_leaf: MSE: {}".format(i, mse))

hyperparameter tuning
	train/test split
	trees...
		10 trees: MSE: 0.013467356763
		25 trees: MSE: 0.0134631413643
		50 trees: MSE: 0.0134624679065
		75 trees: MSE: 0.0134620077648
		100 trees: MSE: 0.0134621382401
	min samples split...
		2 min_samples_split: MSE: 0.0134636985479
		3 min_samples_split: MSE: 0.0134665846582
		4 min_samples_split: MSE: 0.0134663590206
		5 min_samples_split: MSE: 0.0134651985713
	min samples leaf...
		1 min_samples_leaf: MSE: 0.0134644503534
		2 min_samples_leaf: MSE: 0.0134638281396
		3 min_samples_leaf: MSE: 0.0134667043315
		4 min_samples_leaf: MSE: 0.0134647189421
		5 min_samples_leaf: MSE: 0.013466084465


Looks again like there's no meaningul improvement with hyperparameter tuning so _sticking with defaults_.

In [21]:
print("fit on training data & verify performance")
print("\ttrain/test split")
X_train, X_test, y_train, y_test = train_test_split(df_core.drop(['form', 'conversion'], axis=1), df_core['conversion'])

print("\tfitting model")
rf = RandomForestRegressor()
rf.fit(X_train, y_train)

print("\tevaluating model")
y_pred = rf.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("\t\tMSE: {}".format(mse))

fit on training data & verify performance
	train/test split
	fitting model
	evaluating model
		MSE: 0.0132296872743


In [None]:
# try training an embedding for these categorical features