In [2]:
import sys, datetime
import pandas as pd
import numpy as np

sys.path.insert(1, '../../../../scripts/')
from s3_support import *

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

  from numpy.core.umath_tests import inner1d


# load & prep data weekly

## google traffic

In [3]:
q = '''select
            date_trunc('week', date) as week,
            org,
            form,
            sum(views) as pageviews
        from googleanalytics_traffic
            where date >= 2016 and date <= 2019
        group by date_trunc('week', date), org, form;'''
pageviews = redshift_query_read(q)

In [4]:
pageviews = pageviews[pageviews['form']!=0]
pageviews['week'] = pd.to_datetime(pageviews['week'])

In [5]:
len(pageviews), len(pageviews['form'].unique())

(321426, 12465)

## transactions

In [6]:
q = '''select 
            form, 
            date_trunc('week', date) as week,
            count(id) as count, 
            sum(amount) as vol
        from transactions
        where status='A' and date>=2016 and date<=2019
        group by form, date_trunc('week', date)
    '''
trans = redshift_query_read(q)

In [7]:
trans['week'] = pd.to_datetime(trans['week'])

# merge traffic and transactions
trans = trans.merge(pageviews, on=['form', 'week'])

# calculate conversion rate & change
trans['conversion_rate'] = trans['count'] / trans['pageviews']
trans.head(3)

Unnamed: 0,form,week,count,vol,org,pageviews,conversion_rate
0,421959,2017-02-06,5,250.0,197635,4,1.25
1,66611,2017-02-06,124,16174.0,19453,238,0.521008
2,30772,2017-02-06,76,11292.0,10172,135,0.562963


In [8]:
len(trans), len(trans['form'].unique())

(140680, 8075)

## analytics data

In [9]:
q = "select * from analytics_weekly"
df_base = redshift_query_read(q)
print("done with analytics")

q = "select * from analyticsqgiv_weekly"
df_qgiv = redshift_query_read(q)
print("done with analyticsqgiv")

done with analytics
done with analyticsqgiv


In [10]:
df_analytics = df_base.merge(df_qgiv, on=["org", "form", "date"]).dropna()

df_analytics = df_analytics.drop(['org', 'product'], axis=1).groupby(['date', 'form']).sum().reset_index()

df_analytics['week'] = pd.to_datetime(df_analytics['date'])
df_analytics.drop('date', axis=1, inplace=True)

In [11]:
len(df_analytics), len(df_analytics['form'].unique())

(1189306, 23750)

## merge data

In [12]:
dataset = trans.dropna()[['form', 'week', 'conversion_rate']]
dataset = dataset.merge(df_analytics, on=['form', 'week'], how='outer')
dataset['month'] = dataset['week'].dt.month
dataset.drop('week', axis=1, inplace=True)
len(dataset)

1281165

In [13]:
# store
#save_dataframe_to_file('qgiv-stats-data', 'form_health.analytics.csv', dataset)

# modeling weekly

In [14]:
target = 'conversion_rate'

drop_cols = ['form', 'vt_trans_count', 'don_form_trans_count',
           'kiosk_trans_count', 'p2p_trans_count', 'mobile_trans_count',
           'mobilevt_trans_count', 'sms_trans_count', 'fb_trans_count',
           'vt_trans_vol', 'don_form_trans_vol', 'kiosk_trans_vol',
           'p2p_trans_vol', 'mobile_trans_vol', 'mobilevt_trans_vol',
           'sms_trans_vol', 'fb_trans_vol', 'one_time_trans_vol',
           'one_time_trans_count', 'rec_trans_vol', 'rec_trans_count',
           'new_rec_volume', 'new_rec_count', 'reg_count', 'dl_trans_volume', 
           'dl_trans_count', 'dl_new_rec_count', 'dl_new_rec_volume']

In [15]:
dataset.drop(drop_cols, axis=1).columns

Index(['conversion_rate', 'pledges_count', 'events_priv_count', 'restrictions',
       'amounts', 'ded_types', 'opt_ded_flds', 'req_ded_flds', 'opt_fields',
       'req_fields', 'pledge_active', 'donation_active',
       'multirestriction_system', 'min_amount', 'max_amount',
       'permit_anonymous', 'permit_recurring', 'permit_other_amount',
       'permit_create_own_pledge', 'collect_company', 'collect_phone',
       'collect_optin', 'collect_captcha', 'collect_address_mobile',
       'enable_donorlogins', 'enable_sms', 'month'],
      dtype='object')

In [16]:
# cleaning up NAN
original_len = len(dataset)

dataset = dataset[~dataset['conversion_rate'].isna()]
dataset.fillna(0, inplace=True)

original_len, len(dataset)

(1281165, 140680)

In [17]:
scores = []
mses = []

for i in range(50):
    X_train, X_test, y_train, y_test = train_test_split(dataset.drop(drop_cols + [target], axis=1), dataset[target])

    rf = RandomForestRegressor()
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)

    scores.append(rf.score(X_test, y_test))
    mses.append(mean_squared_error(y_test, y_pred))
    
print("Random Forest:")
print("\tr2: {:.4f}".format(np.mean(scores)))
print("\tmse: {:.4f}".format(np.mean(mses)))
print()

scores = []
mses = []

for i in range(50):
    X_train, X_test, y_train, y_test = train_test_split(dataset.drop(drop_cols + [target], axis=1), dataset[target])

    gbm = GradientBoostingRegressor()
    gbm.fit(X_train, y_train)
    y_pred = gbm.predict(X_test)

    scores.append(gbm.score(X_test, y_test))
    mses.append(mean_squared_error(y_test, y_pred))
    
print("GBM:")
print("\tr2: {:.4f}".format(np.mean(scores)))
print("\tmse: {:.4f}".format(np.mean(mses)))

Random Forest:
	r2: 0.0223
	mse: 11.1223

GBM:
	r2: 0.0192
	mse: 10.3593


# load & prep data daily

## page views

In [18]:
# (1438392, 12465)
q = '''select
            date_trunc('day', date) as date,
            org,
            form,
            sum(views) as pageviews
        from googleanalytics_traffic
        group by date_trunc('day', date), org, form;'''
pageviews = redshift_query_read(q)

pageviews = pageviews[pageviews['form']!=0]
pageviews['date'] = pd.to_datetime(pageviews['date'])

len(pageviews), len(pageviews['form'].unique())

(2965415, 20954)

## transactions

In [19]:
# 370336 7940
q = '''select 
            form, 
            date_trunc('day', date) as date,
            count(id) as count, 
            sum(amount) as vol
        from transactions
        where status='A'
        group by form, date_trunc('day', date)
    '''
trans = redshift_query_read(q)

trans['date'] = pd.to_datetime(trans['date'])

# merge traffic and transactions
trans = trans.merge(pageviews, on=['form', 'date'])

# calculate conversion rate & change
trans['conversion_rate'] = trans['count'] / trans['pageviews']

print(len(trans), len(trans['form'].unique()))
trans.head(3)

708367 11558


Unnamed: 0,form,date,count,vol,org,pageviews,conversion_rate
0,831582,2017-02-11,3,350.0,428036,10,0.3
1,784488,2017-02-11,2,0.0,426008,18,0.111111
2,797776,2017-02-11,2,26.0,426013,15,0.133333


## qgiv analytics

In [20]:
# (8459914, 23750)
# new data
df_analytics = get_dataframe_from_file("form-health-v2", "analyticsqgiv_daily.csv000", separator="|")

df_analytics['date'] = pd.to_datetime(df_analytics['date'])
len(df_analytics), len(df_analytics['form'].unique())

(10851321, 26241)

## merge data

In [21]:
dataset = trans.dropna()[['form', 'date', 'conversion_rate']]
dataset = dataset.merge(df_analytics, on=['form', 'date'], how='outer')
dataset['month'] = dataset['date'].dt.month
dataset.drop('date', axis=1, inplace=True)
len(dataset)  # 8703987

11165941

# modeling daily

In [22]:
target = 'conversion_rate'

drop_cols = ['vt_trans_count', 'don_form_trans_count',
           'kiosk_trans_count', 'p2p_trans_count', 'mobile_trans_count',
           'mobilevt_trans_count', 'sms_trans_count', 'fb_trans_count',
           'vt_trans_vol', 'don_form_trans_vol', 'kiosk_trans_vol',
           'p2p_trans_vol', 'mobile_trans_vol', 'mobilevt_trans_vol',
           'sms_trans_vol', 'fb_trans_vol', 'one_time_trans_vol',
           'one_time_trans_count', 'rec_trans_vol', 'rec_trans_count',
           'new_rec_volume', 'new_rec_count', 'reg_count', 'dl_trans_volume', 
           'dl_trans_count', 'dl_new_rec_count', 'dl_new_rec_volume']
for c in dataset.columns:
    if 'form' in c or 'date' in c or 'org' in c or 'product' in c:
        drop_cols.append(c)

In [23]:
# cleaning up NAN
original_len = len(dataset)

dataset = dataset[~dataset['conversion_rate'].isna()]
dataset.fillna(0, inplace=True)

original_len, len(dataset)  # (8703987, 370337)

(11165941, 708368)

In [24]:
scores = []
mses = []

for i in range(50):
    X_train, X_test, y_train, y_test = train_test_split(dataset.drop(drop_cols + [target], axis=1), dataset[target])

    rf = RandomForestRegressor()
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)

    scores.append(rf.score(X_test, y_test))
    mses.append(mean_squared_error(y_test, y_pred))
    
print("Random Forest:")
print("\tr2: {:.4f}".format(np.mean(scores)))
print("\tmse: {:.4f}".format(np.mean(mses)))
print()

scores = []
mses = []

for i in range(50):
    X_train, X_test, y_train, y_test = train_test_split(dataset.drop(drop_cols + [target], axis=1), dataset[target])

    gbm = GradientBoostingRegressor()
    gbm.fit(X_train, y_train)
    y_pred = gbm.predict(X_test)

    scores.append(gbm.score(X_test, y_test))
    mses.append(mean_squared_error(y_test, y_pred))
    
print("GBM:")
print("\tr2: {:.4f}".format(np.mean(scores)))
print("\tmse: {:.4f}".format(np.mean(mses)))

Random Forest:
	r2: 0.0215
	mse: 11.0792

GBM:
	r2: 0.0301
	mse: 13.5718


In [25]:
X_train.columns

Index(['pledges_count', 'events_priv_count', 'restrictions', 'amounts',
       'ded_types', 'opt_ded_flds', 'req_ded_flds', 'opt_fields', 'req_fields',
       'pledge_active', 'donation_active', 'multirestriction_system',
       'min_amount', 'max_amount', 'permit_anonymous', 'permit_recurring',
       'permit_other_amount', 'permit_create_own_pledge', 'collect_company',
       'collect_phone', 'collect_optin', 'collect_captcha',
       'collect_address_mobile', 'enable_donorlogins', 'enable_sms', 'month'],
      dtype='object')

In [26]:
for l, v in zip(X_train.columns, gbm.feature_importances_):
    print("{}: {:.4f}".format(l, v))

pledges_count: 0.1071
events_priv_count: 0.0106
restrictions: 0.1658
amounts: 0.0892
ded_types: 0.0334
opt_ded_flds: 0.0000
req_ded_flds: 0.0038
opt_fields: 0.0329
req_fields: 0.0130
pledge_active: 0.0182
donation_active: 0.0451
multirestriction_system: 0.0079
min_amount: 0.0707
max_amount: 0.0899
permit_anonymous: 0.0090
permit_recurring: 0.0000
permit_other_amount: 0.0501
permit_create_own_pledge: 0.0025
collect_company: 0.0112
collect_phone: 0.0284
collect_optin: 0.0557
collect_captcha: 0.0000
collect_address_mobile: 0.0236
enable_donorlogins: 0.0282
enable_sms: 0.0205
month: 0.0834


In [27]:
zero_importance_features = ["opt_ded_flds", "permit_recurring", "collect_captcha"]
weak_importance_features = ["req_ded_flds", "permit_create_own_pledge"]

In [28]:
print("Removing zero importance features:")
scores = []
mses = []

for i in range(50):
    X_train, X_test, y_train, y_test = train_test_split(dataset.drop(drop_cols + zero_importance_features + [target], axis=1), dataset[target])

    rf = RandomForestRegressor()
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)

    scores.append(rf.score(X_test, y_test))
    mses.append(mean_squared_error(y_test, y_pred))
    
print("Random Forest:")
print("\tr2: {:.4f}".format(np.mean(scores)))
print("\tmse: {:.4f}".format(np.mean(mses)))
print()

scores = []
mses = []

for i in range(50):
    X_train, X_test, y_train, y_test = train_test_split(dataset.drop(drop_cols + zero_importance_features + [target], axis=1), dataset[target])

    gbm = GradientBoostingRegressor()
    gbm.fit(X_train, y_train)
    y_pred = gbm.predict(X_test)

    scores.append(gbm.score(X_test, y_test))
    mses.append(mean_squared_error(y_test, y_pred))
    
print("GBM:")
print("\tr2: {:.4f}".format(np.mean(scores)))
print("\tmse: {:.4f}".format(np.mean(mses)))

Removing zero importance features:
Random Forest:
	r2: 0.0231
	mse: 9.9119

GBM:
	r2: 0.0353
	mse: 9.3132


In [29]:
print("Removing features with importance < 0.005")

scores = []
mses = []

for i in range(50):
    X_train, X_test, y_train, y_test = train_test_split(dataset.drop(drop_cols + zero_importance_features + weak_importance_features + [target], axis=1), dataset[target])

    rf = RandomForestRegressor()
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)

    scores.append(rf.score(X_test, y_test))
    mses.append(mean_squared_error(y_test, y_pred))
    
print("Random Forest:")
print("\tr2: {:.4f}".format(np.mean(scores)))
print("\tmse: {:.4f}".format(np.mean(mses)))
print()

scores = []
mses = []

for i in range(50):
    X_train, X_test, y_train, y_test = train_test_split(dataset.drop(drop_cols + zero_importance_features + weak_importance_features + [target], axis=1), dataset[target])

    gbm = GradientBoostingRegressor()
    gbm.fit(X_train, y_train)
    y_pred = gbm.predict(X_test)

    scores.append(gbm.score(X_test, y_test))
    mses.append(mean_squared_error(y_test, y_pred))
    
print("GBM:")
print("\tr2: {:.4f}".format(np.mean(scores)))
print("\tmse: {:.4f}".format(np.mean(mses)))

Removing features with importance < 0.005
Random Forest:
	r2: 0.0188
	mse: 8.5973

GBM:
	r2: 0.0334
	mse: 10.4739


In [30]:
for k, v in zip(X_train.columns, gbm.feature_importances_):
    print("{}: {:.4f}".format(k, v))

pledges_count: 0.0673
events_priv_count: 0.0078
restrictions: 0.1860
amounts: 0.1029
ded_types: 0.0374
opt_fields: 0.0356
req_fields: 0.0134
pledge_active: 0.0090
donation_active: 0.0331
multirestriction_system: 0.0081
min_amount: 0.0857
max_amount: 0.0842
permit_anonymous: 0.0068
permit_other_amount: 0.0608
collect_company: 0.0113
collect_phone: 0.0514
collect_optin: 0.0596
collect_address_mobile: 0.0227
enable_donorlogins: 0.0211
enable_sms: 0.0077
month: 0.0879


In [31]:
more_drop_cols = ["multirestriction_system", "permit_anonymous", "collect_company"]

In [32]:
print("Removing features with importance < 0.01")

scores = []
mses = []

for i in range(50):
    X_train, X_test, y_train, y_test = train_test_split(dataset.drop(drop_cols + zero_importance_features + weak_importance_features + more_drop_cols + [target], axis=1), dataset[target])

    rf = RandomForestRegressor()
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)

    scores.append(rf.score(X_test, y_test))
    mses.append(mean_squared_error(y_test, y_pred))
    
print("Random Forest:")
print("\tr2: {:.4f}".format(np.mean(scores)))
print("\tmse: {:.4f}".format(np.mean(mses)))
print()

scores = []
mses = []

for i in range(50):
    X_train, X_test, y_train, y_test = train_test_split(dataset.drop(drop_cols + zero_importance_features + weak_importance_features + more_drop_cols + [target], axis=1), dataset[target])

    gbm = GradientBoostingRegressor()
    gbm.fit(X_train, y_train)
    y_pred = gbm.predict(X_test)

    scores.append(gbm.score(X_test, y_test))
    mses.append(mean_squared_error(y_test, y_pred))
    
print("GBM:")
print("\tr2: {:.4f}".format(np.mean(scores)))
print("\tmse: {:.4f}".format(np.mean(mses)))

Removing features with importance < 0.01
Random Forest:
	r2: 0.0272
	mse: 9.2238

GBM:
	r2: 0.0285
	mse: 14.7792


In [33]:
# manually selected features
ftrs = ["restrictions", "amounts", "opt_fields", "req_fields", "donation_active", 
        "multirestriction_system", "min_amount", "max_amount", 
        "permit_anonymous", "permit_other_amount", "collect_company", 
        "collect_phone", "collect_optin", "collect_address_mobile", 
        "enable_donorlogins", "enable_sms", "month"]

In [35]:
print("Manually selected features ({})".format(len(ftrs)))

scores = []
mses = []

for i in range(50):
    X_train, X_test, y_train, y_test = train_test_split(dataset[ftrs], dataset[target])

    rf = RandomForestRegressor()
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)

    scores.append(rf.score(X_test, y_test))
    mses.append(mean_squared_error(y_test, y_pred))
    
print("Random Forest:")
print("\tr2: {:.4f}".format(np.mean(scores)))
print("\tmse: {:.4f}".format(np.mean(mses)))
print()

scores = []
mses = []

for i in range(50):
    X_train, X_test, y_train, y_test = train_test_split(dataset[ftrs], dataset[target])

    gbm = GradientBoostingRegressor()
    gbm.fit(X_train, y_train)
    y_pred = gbm.predict(X_test)

    scores.append(gbm.score(X_test, y_test))
    mses.append(mean_squared_error(y_test, y_pred))
    
print("GBM:")
print("\tr2: {:.4f}".format(np.mean(scores)))
print("\tmse: {:.4f}".format(np.mean(mses)))

Manually selected features (17)
Random Forest:
	r2: 0.0341
	mse: 9.8444

GBM:
	r2: 0.0191
	mse: 19.7255


In [36]:
ftrs = ["restrictions", "amounts", "opt_fields", "req_fields", "min_amount", "max_amount"]

In [41]:
print("Manually selected features ({})".format(len(ftrs)))

scores = []
mses = []

for i in range(50):
    X_train, X_test, y_train, y_test = train_test_split(dataset[ftrs], dataset[target])

    rf = RandomForestRegressor()
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)

    scores.append(rf.score(X_test, y_test))
    mses.append(mean_squared_error(y_test, y_pred))
    
print("Random Forest:")
print("\tr2: {:.4f}".format(np.mean(scores)))
print("\tmse: {:.4f}".format(np.mean(mses)))
print()

scores = []
mses = []

for i in range(50):
    X_train, X_test, y_train, y_test = train_test_split(dataset[ftrs], dataset[target])

    gbm = GradientBoostingRegressor()
    gbm.fit(X_train, y_train)
    y_pred = gbm.predict(X_test)

    scores.append(gbm.score(X_test, y_test))
    mses.append(mean_squared_error(y_test, y_pred))
    
print("GBM:")
print("\tr2: {:.4f}".format(np.mean(scores)))
print("\tmse: {:.4f}".format(np.mean(mses)))

Manually selected features (7)
Random Forest:
	r2: 0.0268
	mse: 15.9739

GBM:
	r2: 0.0177
	mse: 12.4544
