In [133]:
import pandas as pd
import numpy as np

import sys
sys.path.insert(1, '../../../../scripts/')
from s3_support import *

# load & prep data

## traffic data

In [134]:
q = '''select
            date_trunc('week', date) as week,
            org,
            form,
            sum(views) as pageviews
        from googleanalytics_traffic
            where date >= 2016 and date <= 2019
        group by date_trunc('week', date), org, form;'''
pageviews = redshift_query_read(q)

In [135]:
pageviews = pageviews[pageviews['form']!=0]
pageviews.sort_values('week', ascending=True, inplace=True)

## transaction data

In [136]:
q = '''select 
            form, 
            date_trunc('week', date) as week,
            count(id) as count, 
            sum(amount) as vol
        from transactions
        where status='A' and date>=2016 and date<=2019
        group by form, date_trunc('week', date)
    '''
trans = redshift_query_read(q)

In [137]:
len(trans), len(trans['form'].unique())

(256129, 11558)

In [138]:
trans['week'] = pd.to_datetime(trans['week'])
trans.sort_values('week', ascending=True, inplace=True)

# merge traffic and transactions
trans = trans.merge(pageviews, on=['form', 'week'])

# calculate conversion rate & change
trans['conversion_rate'] = trans['count'] / trans['pageviews']
trans.head(3)

Unnamed: 0,form,week,count,vol,org,pageviews,conversion_rate
0,915548,2017-02-06,5,1525.99,31832,44,0.113636
1,526,2017-02-06,42,2243.8,554,25,1.68
2,43052,2017-02-06,6,725.0,15283,1,6.0


In [139]:
# build form growth data for forms with > 12 months of data
form_data = None

for form in trans['form'].unique():
    if form == 0:
        continue
    this_df = trans[trans['form']==form].copy()
    if len(this_df) >= 12:
        this_df['count_growth'] = this_df['count'].pct_change()
        this_df['vol_growth'] = this_df['vol'].pct_change()
        this_df['conversion_growth'] = this_df['conversion_rate'].pct_change()

        if form_data is None:
            form_data = this_df
        else:
            form_data = form_data.append(this_df)
len(form_data), len(form_data['form'].unique())

(118784, 2592)

## logs data

In [140]:
q = '''select
            form,
            date_trunc('week', created) as week,
            systemtype,
            count(id) as count
        from logs
        where created>=2016 and created <=2019 and form!=0
        group by form, systemtype, date_trunc('week', created)'''
logs = redshift_query_read(q)

In [141]:
len(logs), len(logs['form'].unique())

(161595, 20563)

In [142]:
logs_pvt = pd.pivot_table(logs, index=['form', 'week'], columns='systemtype', values='count').reset_index()
logs_pvt.fillna(0, inplace=True)
logs_pvt.head(3)

systemtype,form,week,0,4,8,11,12,13,15,18,...,34,35,36,38,39,40,41,42,43,44
0,1,2016-05-02,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,2016-05-30,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,2016-06-20,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [143]:
idxd_month = logs_pvt['week']
idxd_form = logs_pvt['form']
logs_pct_change = logs_pvt.drop('week', axis=1).groupby('form').pct_change()
logs_pct_change['week'] = idxd_month
logs_pct_change['form'] = idxd_form
logs_pct_change = logs_pct_change.fillna(0).replace(np.inf, 100.).replace(-np.inf, -100.)
logs_pct_change.head(3)

Unnamed: 0,0,4,8,11,12,13,15,18,20,21,...,36,38,39,40,41,42,43,44,week,form
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2016-05-02,1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2016-05-30,1
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2016-06-20,1


## analytics data

In [144]:
q = "select * from analytics_weekly"
df_base = redshift_query_read(q)
q = "select * from analyticsqgiv_weekly"
df_qgiv = redshift_query_read(q)

In [145]:
len(df_base['date'].unique()), len(df_qgiv['date'].unique())

(152, 86)

In [146]:
df_analytics = df_base.merge(df_qgiv, on=["org", "form", "date"]).dropna()

df_analytics = df_analytics.drop(['org', 'product'], axis=1).groupby(['date', 'form']).sum().reset_index()
df_analytics['date'] = pd.to_datetime(df_analytics['date'])

In [147]:
len(df_analytics), len(df_analytics['form'].unique()), len(df_analytics['date'].unique())

(1189306, 23750, 86)

In [148]:
agg_analytics = None
for form in df_analytics['form'].unique():
    this_df = df_analytics[df_analytics['form']==form].copy()
    this_df['week'] = this_df['date']

    for c in this_df.columns:
        if c not in ['date', 'form', 'week']:
            this_df["{}_pct_change".format(c)] = this_df[c].pct_change()
            
    ext_cols = ['form', 'week'] + [c for c in this_df.columns if '_pct_change' in c]
    
    if agg_analytics is None:
        agg_analytics = this_df[ext_cols]
    else:
        agg_analytics = agg_analytics.append(this_df[ext_cols])

In [149]:
agg_analytics = agg_analytics.fillna(0).replace(np.inf, 100.).replace(-np.inf, -100.)
agg_analytics.head(3)

Unnamed: 0,form,week,vt_trans_count_pct_change,don_form_trans_count_pct_change,kiosk_trans_count_pct_change,p2p_trans_count_pct_change,mobile_trans_count_pct_change,mobilevt_trans_count_pct_change,sms_trans_count_pct_change,fb_trans_count_pct_change,...,collect_address_mobile_pct_change,enable_donorlogins_pct_change,enable_sms_pct_change,new_rec_volume_pct_change,new_rec_count_pct_change,reg_count_pct_change,dl_trans_volume_pct_change,dl_trans_count_pct_change,dl_new_rec_count_pct_change,dl_new_rec_volume_pct_change
0,1,2017-05-08,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5534,1,2017-05-15,100.0,100.0,0.0,0.0,100.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,100.0,100.0,100.0,0.0,0.0
11106,1,2017-05-22,1.777778,4.5,0.0,0.0,3.0,0.0,0.0,0.0,...,0.0,-1.0,0.0,100.0,100.0,5.333333,8.081803,11.0,0.0,0.0


In [150]:
len(agg_analytics), len(agg_analytics['week'].unique()), len(agg_analytics['form'].unique())

(1189306, 86, 23750)

## merge data

In [151]:
# build data set
dataset = form_data.dropna()[['form', 'week', 'count_growth', 'vol_growth', 'conversion_rate', 'conversion_growth']]
dataset = dataset.merge(logs_pct_change, on=['form', 'week'])
dataset = dataset.merge(agg_analytics, on=['form', 'week'])

In [152]:
len(dataset), len(dataset['form'].unique()), len(dataset['week'].unique())

(7031, 1291, 39)

In [153]:
dataset['month'] = dataset['week'].dt.month
dataset['dayofmonth'] = dataset['week'].dt.day

for column in dataset.columns:
    if 'growth' in str(column) or 'pct_change' in str(column):
        dataset[column] = dataset[column].replace(np.inf, 100.).replace(-np.inf, -100.).fillna(0)
dataset.columns = [str(c) for c in dataset.columns]

In [154]:
print("dataset length: {} rows".format(len(dataset)))
print()
for column in dataset.columns:
    unique_val_len = len(dataset[column].unique())
    if unique_val_len > 1:
        print("{}: {} unique values".format(column, unique_val_len))

dataset length: 7031 rows

form: 1291 unique values
week: 39 unique values
count_growth: 2159 unique values
vol_growth: 6360 unique values
conversion_rate: 3684 unique values
conversion_growth: 5788 unique values
11: 188 unique values
15: 11 unique values
20: 2 unique values
21: 47 unique values
23: 179 unique values
24: 10 unique values
27: 2 unique values
29: 14 unique values
32: 39 unique values
34: 119 unique values
35: 3 unique values
36: 14 unique values
38: 3 unique values
39: 3 unique values
40: 3 unique values
41: 3 unique values
42: 3 unique values
43: 2 unique values
44: 5 unique values
vt_trans_count_pct_change: 373 unique values
don_form_trans_count_pct_change: 1523 unique values
kiosk_trans_count_pct_change: 268 unique values
mobile_trans_count_pct_change: 603 unique values
mobilevt_trans_count_pct_change: 15 unique values
sms_trans_count_pct_change: 87 unique values
fb_trans_count_pct_change: 14 unique values
vt_trans_vol_pct_change: 649 unique values
don_form_trans_vol_

# modeling full feature set

## build training data

In [155]:
from keras.layers import Embedding, Flatten, Dense, Conv2D, BatchNormalization
from keras import Sequential
import math

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

In [156]:
drop_cols_features = ['form', 'week']
target_cols = ['conversion_rate', 'conversion_growth', 'vol_growth', 'count_growth']

In [157]:
def generate_embedding(features, target, dimensions):
    X_train = np.array(features).reshape(len(features), 1)
    
    # create embedding model
    mdl = Sequential()
    mdl.add(Embedding(dim[0], dim[1], input_length=1))
    mdl.add(Flatten())
    mdl.add(Dense(1, activation='relu'))
    mdl.compile('rmsprop', 'mse')
    
    # train embedding model
    mdl.fit(X_train, target, epochs=10, batch_size=128, verbose=0)
    
    # return latent features
    return mdl.layers[0].get_weights()[0]

In [170]:
# list continuous variables and casting to category and generating an embedding 
# for any feature with fewer than 100 unique values
embedding_lookup_tables = {}
for t in target_cols:
    embedding_lookup_tables[t] = {}
continuous_vars = []

for column in dataset.columns:
    unique_val_len = len(dataset[column].unique())
    
    if unique_val_len > 1 and str(column) not in drop_cols_features + target_cols:
        if unique_val_len <= 100:
            # 100 or fewer unique values, treat as category
            col_cat_values = dataset[column].astype('category').cat.codes
            dataset[column] = col_cat_values

            # define embedding
            dim = (unique_val_len, int(math.ceil(float(unique_val_len + 1) / float(2))))

            for t in target_cols:
                embedding_lookup_tables[t][str(column)] = generate_embedding(col_cat_values, dataset[t], dim)
        else:
            continuous_vars.append(column)

In [171]:
for target in target_cols:
    print(target)
    # build dataset for the given target
    #     continuous vars + embeddings for this target
    target_col = dataset[target]
    this_dataset = dataset.drop(drop_cols_features + target_cols, axis=1).copy()
    
    # set new feature columns for embeddings mapped to the categorical values
    new_features = []
    for e in embedding_lookup_tables[target]:
        for new_col_i in range(embedding_lookup_tables[target][e].shape[1]):
            new_col_label = "_".join([str(e), str(new_col_i)])
            this_dataset[new_col_label] = this_dataset[e].apply(lambda x: embedding_lookup_tables[target][e][x][new_col_i])
            new_features.append(new_col_label)
    
    # collect continuous and embedding features into clean training dataset
    this_dataset_clean = this_dataset[new_features + continuous_vars]
    this_dataset_clean[target] = target_col
    
    # train & evaluate
    scores = []
    mses = []
    for i in range(50):
        # train/test split
        X_train, X_test, y_train, y_test = train_test_split(this_dataset_clean.drop(target, axis=1), this_dataset_clean[target])
        
        rf = RandomForestRegressor()
        rf.fit(X_train, y_train)
        y_pred = rf.predict(X_test)
        
        scores.append(rf.score(X_test, y_test))
        mses.append(mean_squared_error(y_test, y_pred))
    
    print("\tRandom Forest:")
    print("\t\tr2: {}".format(np.mean(scores)))
    print("\t\tmse: {}".format(np.mean(mses)))
    
    scores = []
    mses = []
    for i in range(50):
        # train/test split
        X_train, X_test, y_train, y_test = train_test_split(this_dataset_clean.drop(target, axis=1), this_dataset_clean[target])
        
        gbm = GradientBoostingRegressor()
        gbm.fit(X_train, y_train)
        y_pred = gbm.predict(X_test)
        
        scores.append(gbm.score(X_test, y_test))
        mses.append(mean_squared_error(y_test, y_pred))
        
    print("\tGBM:")
    print("\t\tr2: {}".format(np.mean(scores)))
    print("\t\tmse: {}".format(np.mean(mses)))  
    
    print("-"*40)

conversion_rate


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


	Random Forest:
		r2: -0.1823561195162361
		mse: 38.60357874936636
	GBM:
		r2: -0.038198996584297575
		mse: 33.53748036552423
----------------------------------------
conversion_growth


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


	Random Forest:
		r2: -1.317451438002779
		mse: 44.046966751314805
	GBM:
		r2: -0.36392690417969836
		mse: 39.7609976107329
----------------------------------------
vol_growth


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


	Random Forest:
		r2: -3.407605770159332
		mse: 16283.868368046815
	GBM:
		r2: -1.9647261383698764
		mse: 13363.708650931432
----------------------------------------
count_growth


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


	Random Forest:
		r2: -0.2487115346696035
		mse: 40.85145043119823
	GBM:
		r2: -0.09682488479450942
		mse: 46.8176380160755
----------------------------------------


The old model achieved an MSE of **0.01554** with a random forest trained only the forms  to conversion rates rather than every categorical variable. 

**Original model features**

[u'opt_fields', u'req_fields', u'donation_active',
       u'multirestriction_system', u'restrictions', u'permit_other_amount',
       u'collect_captcha', u'form', u'conversion', u'form_0', u'form_1',
       u'form_2', u'form_3', u'form_4', u'form_5', u'form_6', u'form_7',
       u'form_8', u'form_9', u'form_10', u'form_11', u'form_12', u'form_13',
       u'form_14', u'form_15', u'form_16', u'form_17', u'form_18', u'form_19',
       u'form_20', u'form_21', u'form_22', u'form_23', u'form_24', u'form_25',
       u'form_26', u'form_27', u'form_28', u'form_29']

# modeling form & date embeddings

here we're going to focus the embedding on form and date variables. specifically,

- form ID
- month
- day of month

In [172]:
from keras.layers import Embedding, Flatten, Dense, Conv2D, BatchNormalization
from keras import Sequential
import math

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

In [177]:
features_cols = ['form', 'month']
target_cols = ['conversion_rate', 'conversion_growth', 'vol_growth', 'count_growth']

In [178]:
def generate_embedding(features, target, dimensions):
    X_train = np.array(features).reshape(len(features), 1)
    
    # create embedding model
    mdl = Sequential()
    mdl.add(Embedding(dim[0], dim[1], input_length=1))
    mdl.add(Flatten())
    mdl.add(Dense(1, activation='relu'))
    mdl.compile('rmsprop', 'mse')
    
    # train embedding model
    mdl.fit(X_train, target, epochs=10, batch_size=128, verbose=0)
    
    # return latent features
    return mdl.layers[0].get_weights()[0]

In [179]:
# list continuous variables and casting to category and generating an embedding 
# for any feature with fewer than 100 unique values
embedding_lookup_tables = {}
for t in target_cols:
    embedding_lookup_tables[t] = {}

for column in features_cols:
    unique_val_len = len(dataset[column].unique())
    col_cat_values = dataset[column].astype('category').cat.codes
    dataset[column] = col_cat_values

    # define embedding
    if column == 'form':
        dim = (unique_val_len, 30)
    else:
        dim = (unique_val_len, int(math.ceil(float(unique_val_len + 1) / float(2))))

    for t in target_cols:
        embedding_lookup_tables[t][column] = generate_embedding(col_cat_values, dataset[t], dim)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [180]:
for target in target_cols:
    print(target)
    # build dataset for the given target
    #     continuous vars + embeddings for this target
    target_col = dataset[target]
    this_dataset = dataset[features_cols].copy()
    
    # set new feature columns for embeddings mapped to the categorical values
    new_features = []
    for e in embedding_lookup_tables[target]:
        for new_col_i in range(embedding_lookup_tables[target][e].shape[1]):
            new_col_label = "_".join([e, str(new_col_i)])
            this_dataset[new_col_label] = this_dataset[str(e)].apply(lambda x: embedding_lookup_tables[target][e][x][new_col_i])
            new_features.append(new_col_label)
    
    # collect continuous and embedding features into clean training dataset
    this_dataset_clean = this_dataset[new_features]
    this_dataset_clean[target] = target_col
    
    # train & evaluate
    scores = []
    mses = []
    for i in range(50):
        # train/test split
        X_train, X_test, y_train, y_test = train_test_split(this_dataset_clean.drop(target, axis=1), this_dataset_clean[target])
        
        rf = RandomForestRegressor()
        rf.fit(X_train, y_train)
        y_pred = rf.predict(X_test)
        
        scores.append(rf.score(X_test, y_test))
        mses.append(mean_squared_error(y_test, y_pred))
    
    print("\tRandom Forest:")
    print("\t\tr2: {}".format(np.mean(scores)))
    print("\t\tmse: {}".format(np.mean(mses)))
    
    scores = []
    mses = []
    for i in range(50):
        # train/test split
        X_train, X_test, y_train, y_test = train_test_split(this_dataset_clean.drop(target, axis=1), this_dataset_clean[target])
        
        gbm = GradientBoostingRegressor()
        gbm.fit(X_train, y_train)
        y_pred = gbm.predict(X_test)
        
        scores.append(gbm.score(X_test, y_test))
        mses.append(mean_squared_error(y_test, y_pred))
        
    print("\tGBM:")
    print("\t\tr2: {}".format(np.mean(scores)))
    print("\t\tmse: {}".format(np.mean(mses)))  
    
    print("-"*40)

conversion_rate


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


	Random Forest:
		r2: 0.14794473204469047
		mse: 23.620259558393787
	GBM:
		r2: 0.24839218012213532
		mse: 22.467871449972094
----------------------------------------
conversion_growth


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


	Random Forest:
		r2: -0.29686756079140664
		mse: 41.41156531078358
	GBM:
		r2: -0.522127151824169
		mse: 37.11788093929876
----------------------------------------
vol_growth


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


	Random Forest:
		r2: -1.4842613660420296
		mse: 11887.338506571123
	GBM:
		r2: -0.6192272764981258
		mse: 10416.77993744346
----------------------------------------
count_growth


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


	Random Forest:
		r2: -0.3691423231236264
		mse: 40.61228571587713
	GBM:
		r2: -0.1997723109964289
		mse: 36.72563805124084
----------------------------------------
