In [1]:
import pandas as pd
import numpy as np

import sys
sys.path.append("../../../../scripts/")
from s3_support import *

We're going to model on everything conceivably relevant and use random forest regressor to select important features.

# load traffic data

In [2]:
q = '''select
            date_trunc('week', date) as week,
            org,
            form,
            sum(views) as pageviews
        from googleanalytics_traffic
            where date >= 2016 and date <= 2019
        group by date_trunc('week', date), org, form;'''
pageviews = redshift_query_read(q)

In [3]:
pageviews = pageviews[pageviews['form']!=0]
pageviews.sort_values('week', ascending=True, inplace=True)

# load transaction growth data

In [4]:
q = '''select 
            form, 
            date_trunc('week', date) as week,
            count(id) as count, 
            sum(amount) as vol
        from transactions
        where status='A' and date>=2016 and date<=2019
        group by form, date_trunc('week', date)
    '''
trans = redshift_query_read(q)

In [5]:
len(trans), len(trans['form'].unique())

(256129, 11558)

In [6]:
trans['week'] = pd.to_datetime(trans['week'])
trans.sort_values('week', ascending=True, inplace=True)

In [7]:
# merge traffic and transactions
trans = trans.merge(pageviews, on=['form', 'week'])

# calculate conversion rate & change
trans['conversion_rate'] = trans['count'] / trans['pageviews']

In [8]:
trans.head(3)

Unnamed: 0,form,week,count,vol,org,pageviews,conversion_rate
0,1849,2017-02-06,1,84.0,1793,9,0.111111
1,16414,2017-02-06,1,0.0,2994,26,0.038462
2,842872,2017-02-06,1,100.0,430121,12,0.083333


In [9]:
form_data = None

for form in trans['form'].unique():
    if form == 0:
        continue
    this_df = trans[trans['form']==form].copy()
    if len(this_df) >= 12:
        this_df['count_growth'] = this_df['count'].pct_change()
        this_df['vol_growth'] = this_df['vol'].pct_change()
        this_df['conversion_growth'] = this_df['conversion_rate'].pct_change()

        if form_data is None:
            form_data = this_df
        else:
            form_data = form_data.append(this_df)
len(form_data), len(form_data['form'].unique())

(118784, 2592)

# logs

In [10]:
q = '''select
            form,
            date_trunc('week', created) as week,
            systemtype,
            count(id) as count
        from logs
        where created>=2016 and created <=2019 and form!=0
        group by form, systemtype, date_trunc('week', created)'''
logs = redshift_query_read(q)

In [11]:
len(logs), len(logs['form'].unique())

(161595, 20563)

In [12]:
logs_pvt = pd.pivot_table(logs, index=['form', 'week'], columns='systemtype', values='count').reset_index()
logs_pvt.fillna(0, inplace=True)
logs_pvt.head()

systemtype,form,week,0,4,8,11,12,13,15,18,...,34,35,36,38,39,40,41,42,43,44
0,1,2016-05-02,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,2016-05-30,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,2016-06-20,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,2016-06-27,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,2016-07-04,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
idxd_month = logs_pvt['week']
idxd_form = logs_pvt['form']
logs_pct_change = logs_pvt.drop('week', axis=1).groupby('form').pct_change()
logs_pct_change['week'] = idxd_month
logs_pct_change['form'] = idxd_form
logs_pct_change = logs_pct_change.fillna(0).replace(np.inf, 100.).replace(-np.inf, -100.)
logs_pct_change.head()

Unnamed: 0,0,4,8,11,12,13,15,18,20,21,...,36,38,39,40,41,42,43,44,week,form
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2016-05-02,1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2016-05-30,1
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2016-06-20,1
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2016-06-27,1
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2016-07-04,1


# analytics

In [14]:
q = "select * from analytics_weekly"
df_base = redshift_query_read(q)
q = "select * from analyticsqgiv_weekly"
df_qgiv = redshift_query_read(q)

In [15]:
len(df_base['date'].unique()), len(df_qgiv['date'].unique())

(152, 86)

In [16]:
df_analytics = df_base.merge(df_qgiv, on=["org", "form", "date"]).dropna()

df_analytics = df_analytics.drop(['org', 'product'], axis=1).groupby(['date', 'form']).sum().reset_index()
df_analytics['date'] = pd.to_datetime(df_analytics['date'])

In [17]:
len(df_analytics), len(df_analytics['form'].unique()), len(df_analytics['date'].unique())

(1189306, 23750, 86)

In [18]:
agg_analytics = None
for form in df_analytics['form'].unique():
    this_df = df_analytics[df_analytics['form']==form].copy()
    this_df['week'] = this_df['date']

    for c in this_df.columns:
        if c not in ['date', 'form', 'week']:
            this_df["{}_pct_change".format(c)] = this_df[c].pct_change()
            
    ext_cols = ['form', 'week'] + [c for c in this_df.columns if '_pct_change' in c]
    
    if agg_analytics is None:
        agg_analytics = this_df[ext_cols]
    else:
        agg_analytics = agg_analytics.append(this_df[ext_cols])

In [19]:
agg_analytics = agg_analytics.fillna(0).replace(np.inf, 100.).replace(-np.inf, -100.)
agg_analytics.head()

Unnamed: 0,form,week,vt_trans_count_pct_change,don_form_trans_count_pct_change,kiosk_trans_count_pct_change,p2p_trans_count_pct_change,mobile_trans_count_pct_change,mobilevt_trans_count_pct_change,sms_trans_count_pct_change,fb_trans_count_pct_change,...,collect_address_mobile_pct_change,enable_donorlogins_pct_change,enable_sms_pct_change,new_rec_volume_pct_change,new_rec_count_pct_change,reg_count_pct_change,dl_trans_volume_pct_change,dl_trans_count_pct_change,dl_new_rec_count_pct_change,dl_new_rec_volume_pct_change
0,1,2017-05-08,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5534,1,2017-05-15,100.0,100.0,0.0,0.0,100.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,100.0,100.0,100.0,0.0,0.0
11106,1,2017-05-22,1.777778,4.5,0.0,0.0,3.0,0.0,0.0,0.0,...,0.0,-1.0,0.0,100.0,100.0,5.333333,8.081803,11.0,0.0,0.0
17186,1,2017-05-29,0.92,-0.454545,0.0,0.0,0.5,0.0,0.0,0.0,...,0.0,0.0,0.0,-1.0,-1.0,1.368421,-0.801471,-0.666667,0.0,0.0
23295,1,2017-06-05,-0.354167,2.166667,0.0,0.0,-0.833333,0.0,0.0,0.0,...,0.0,0.0,0.0,100.0,100.0,-0.488889,-0.796296,-0.75,100.0,100.0


In [20]:
len(agg_analytics), len(agg_analytics['week'].unique()), len(agg_analytics['form'].unique())

(1189306, 86, 23750)

# modeling

In [21]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

  from numpy.core.umath_tests import inner1d


In [22]:
# build data set
dataset = form_data.dropna()[['form', 'week', 'count_growth', 'vol_growth', 'conversion_rate', 'conversion_growth']]
dataset = dataset.merge(logs_pct_change, on=['form', 'week'])
dataset = dataset.merge(agg_analytics, on=['form', 'week'])

In [23]:
len(dataset), len(dataset['form'].unique()), len(dataset['week'].unique())

(7031, 1291, 39)

In [24]:
dataset['month_cat'] = dataset['week'].dt.month

target_cols = ['count_growth', 'vol_growth', 'vt_trans_count_pct_change', 
               'don_form_trans_count_pct_change', 'kiosk_trans_count_pct_change', 
               'p2p_trans_count_pct_change', 'mobile_trans_count_pct_change', 
               'mobilevt_trans_count_pct_change', 'sms_trans_count_pct_change', 
               'fb_trans_count_pct_change', 'vt_trans_vol_pct_change', 
               'don_form_trans_vol_pct_change', 'kiosk_trans_vol_pct_change', 
               'p2p_trans_vol_pct_change', 'mobile_trans_vol_pct_change', 
               'mobilevt_trans_vol_pct_change', 'sms_trans_vol_pct_change', 
               'fb_trans_vol_pct_change', 'one_time_trans_vol_pct_change', 
               'one_time_trans_count_pct_change', 'rec_trans_vol_pct_change', 
               'rec_trans_count_pct_change', 'new_rec_volume_pct_change', 
               'new_rec_count_pct_change', 'reg_count_pct_change', 
               'dl_trans_volume_pct_change', 'dl_trans_count_pct_change', 
               'dl_new_rec_count_pct_change', 'dl_new_rec_volume_pct_change',
               'conversion_rate', 'conversion_growth']

feature_cols = [c for c in dataset.columns if c not in target_cols and c != 'month']

In [25]:
for target in target_cols:
    scores = []
    for i in range(10):
        # build training data set
        feature_cols = [c for c in dataset.columns if c not in target_cols + ['week', 'form']]
        this_dataset = dataset[feature_cols + [target]].fillna(0).replace(np.inf, 100.).replace(-np.inf, -100.)

        # train/test split
        X_train, X_test, y_train, y_test = train_test_split(this_dataset.drop(target, axis=1), this_dataset[target])

        # fit & evaluate
        rf = RandomForestRegressor()
        rf.fit(X_train, y_train)
        scores.append(rf.score(X_test, y_test))
    
    print(target)
    print(np.mean(scores))
    print("-"*40)

count_growth
-0.5271733914176864
----------------------------------------
vol_growth
-1.666188140949322
----------------------------------------
vt_trans_count_pct_change
-0.22440504992914817
----------------------------------------
don_form_trans_count_pct_change
-0.13357885673147699
----------------------------------------
kiosk_trans_count_pct_change
-0.1753934993953516
----------------------------------------
p2p_trans_count_pct_change
1.0
----------------------------------------
mobile_trans_count_pct_change
-0.24969752555534286
----------------------------------------
mobilevt_trans_count_pct_change
-0.4091664699098265
----------------------------------------
sms_trans_count_pct_change
-0.23329877055533316
----------------------------------------
fb_trans_count_pct_change
-0.3300465762810979
----------------------------------------
vt_trans_vol_pct_change
-0.22998106933385762
----------------------------------------
don_form_trans_vol_pct_change
-0.10390985366877067
-------------

## conversion rate

In [44]:
from sklearn.metrics import mean_squared_error

In [45]:
target = 'conversion_rate'
# build training data set
feature_cols = [c for c in dataset.columns if c not in target_cols + ['week', 'form']]
this_dataset = dataset[feature_cols + [target]].fillna(0).replace(np.inf, 100.).replace(-np.inf, -100.)

# train/test split
X_train, X_test, y_train, y_test = train_test_split(this_dataset.drop(target, axis=1), this_dataset[target])

# fit & evaluate
rf = RandomForestRegressor()
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

score = rf.score(X_test, y_test)
mse = mean_squared_error(y_test, y_pred)

print("Score: {:.4f}".format(score))
print("MSE: {:.4f}".format(mse))

Score: -0.2409
MSE: 27.7901


In [46]:
feature_importances = []
for e in zip(X_train.columns, rf.feature_importances_):
    feature_importances.append(e)
sorted(feature_importances, key=lambda x: x[1], reverse=True)[:20]

[(23, 0.46165781559366276),
 (34, 0.27147130423714994),
 ('month_cat', 0.16718135920046273),
 (15, 0.0679959041853855),
 (11, 0.010728607293287801),
 ('restrictions_pct_change', 0.005859539997873421),
 (21, 0.0039453657215477705),
 (24, 0.0033608831249170926),
 (32, 0.002250226669552224),
 (29, 0.0016749183805165491),
 ('amounts_pct_change', 0.0008804325266941951),
 (36, 0.000715724873861379),
 (35, 0.0005578935457734992),
 ('max_amount_pct_change', 0.00042154034010384166),
 ('req_fields_pct_change', 0.0003425451438763546),
 ('collect_optin_pct_change', 0.00015902283879378243),
 ('permit_anonymous_pct_change', 0.0001465168660196595),
 ('donation_active_pct_change', 0.00010854831786132563),
 ('opt_fields_pct_change', 7.183126763265795e-05),
 ('pledge_active_pct_change', 6.484622694400228e-05)]

In [47]:
for threshold in [0.01, 0.02, 0.05, 0.1, 0.15]:
    scores = []
    mses = []
    
    for i in range(50):
        # features with importances greater than 1%
        features_gt_one = [c[0] for c in feature_importances if c[1] >= threshold]
        target = 'conversion_rate'

        # build training data set
        feature_cols = features_gt_one
        this_dataset = dataset[feature_cols + [target]].fillna(0).replace(np.inf, 100.).replace(-np.inf, -100.)

        # train/test split
        X_train, X_test, y_train, y_test = train_test_split(this_dataset.drop(target, axis=1), this_dataset[target])

        # fit & evaluate
        rf = RandomForestRegressor()
        rf.fit(X_train, y_train)
        
        y_pred = rf.predict(X_test)

        scores.append(rf.score(X_test, y_test))
        mses.append(mean_squared_error(y_test, y_pred))
        
    print("Feature importance threshold: {:.2f}%".format(threshold * 100.))
    print("\tscore: {:.4f}".format(np.mean(scores)))
    print("\tmse: {:.4f}".format(np.mean(mses)))
    print("\tfeature count: {}".format(len(features_gt_one)))

Feature importance threshold: 1.00%
	score: -0.1488
	mse: 36.5524
	feature count: 5
Feature importance threshold: 2.00%
	score: -0.1514
	mse: 35.7776
	feature count: 4
Feature importance threshold: 5.00%
	score: -0.1145
	mse: 34.8520
	feature count: 4
Feature importance threshold: 10.00%
	score: -0.1932
	mse: 36.7331
	feature count: 3
Feature importance threshold: 15.00%
	score: -0.1851
	mse: 37.8118
	feature count: 3


## conversion_growth

In [48]:
target = 'conversion_growth'
# build training data set
feature_cols = [c for c in dataset.columns if c not in target_cols + ['week', 'form']]
this_dataset = dataset[feature_cols + [target]].fillna(0).replace(np.inf, 100.).replace(-np.inf, -100.)

# train/test split
X_train, X_test, y_train, y_test = train_test_split(this_dataset.drop(target, axis=1), this_dataset[target])

# fit & evaluate
rf = RandomForestRegressor()
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

score = rf.score(X_test, y_test)
mse = mean_squared_error(y_test, y_pred)

print("Score: {:.4f}".format(score))
print("MSE: {:.4f}".format(mse))

Score: -0.2054
MSE: 70.8439


In [49]:
feature_importances = []
for e in zip(X_train.columns, rf.feature_importances_):
    feature_importances.append(e)
sorted(feature_importances, key=lambda x: x[1], reverse=True)[:20]

[(34, 0.42831441761252337),
 ('month_cat', 0.2666081387538331),
 (23, 0.15892488743632777),
 ('restrictions_pct_change', 0.031183908349918705),
 (11, 0.026156784179357868),
 ('multirestriction_system_pct_change', 0.014538548381268552),
 (24, 0.010467188585861932),
 (44, 0.007223573408846261),
 (21, 0.00704278517880986),
 ('collect_company_pct_change', 0.006971753637867335),
 ('amounts_pct_change', 0.006644023948236921),
 (15, 0.004740124294218902),
 (29, 0.004239275977218146),
 (32, 0.0038086178838063847),
 ('opt_fields_pct_change', 0.0032208351403693763),
 (35, 0.002691012713422169),
 ('collect_optin_pct_change', 0.002334469469790274),
 ('min_amount_pct_change', 0.002026455532645311),
 (36, 0.0016449823980282368),
 ('collect_phone_pct_change', 0.0015681450358530139)]

In [50]:
for threshold in [0.01, 0.02, 0.05, 0.1, 0.15]:
    scores = []
    mses = []
    
    for i in range(50):
        # features with importances greater than 1%
        features_gt_one = [c[0] for c in feature_importances if c[1] >= threshold]
        target = 'conversion_growth'

        # build training data set
        feature_cols = features_gt_one
        this_dataset = dataset[feature_cols + [target]].fillna(0).replace(np.inf, 100.).replace(-np.inf, -100.)

        # train/test split
        X_train, X_test, y_train, y_test = train_test_split(this_dataset.drop(target, axis=1), this_dataset[target])

        # fit & evaluate
        rf = RandomForestRegressor()
        rf.fit(X_train, y_train)

        y_pred = rf.predict(X_test)

        scores.append(rf.score(X_test, y_test))
        mses.append(mean_squared_error(y_test, y_pred))
        
    print("Feature importance threshold: {:.2f}%".format(threshold * 100.))
    print("\tscore: {:.4f}".format(np.mean(scores)))
    print("\tmse: {:.4f}".format(np.mean(mses)))
    print("\tfeature count: {}".format(len(features_gt_one)))

Feature importance threshold: 1.00%
	score: -0.6335
	mse: 36.4306
	feature count: 7
Feature importance threshold: 2.00%
	score: -0.5404
	mse: 46.9653
	feature count: 5
Feature importance threshold: 5.00%
	score: -0.7527
	mse: 42.6595
	feature count: 3
Feature importance threshold: 10.00%
	score: -0.5147
	mse: 45.4989
	feature count: 3
Feature importance threshold: 15.00%
	score: -0.5416
	mse: 48.2491
	feature count: 3


## don_form_trans_count_pct_change

In [51]:
target = 'don_form_trans_count_pct_change'
# build training data set
feature_cols = [c for c in dataset.columns if c not in target_cols + ['week', 'form']]
this_dataset = dataset[feature_cols + [target]].fillna(0).replace(np.inf, 100.).replace(-np.inf, -100.)

# train/test split
X_train, X_test, y_train, y_test = train_test_split(this_dataset.drop(target, axis=1), this_dataset[target])

# fit & evaluate
rf = RandomForestRegressor()
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

score = rf.score(X_test, y_test)
mse = mean_squared_error(y_test, y_pred)

print("Score: {:.4f}".format(score))
print("MSE: {:.4f}".format(mse))

Score: -0.1284
MSE: 1135.3879


In [52]:
feature_importances = []
for e in zip(X_train.columns, rf.feature_importances_):
    feature_importances.append(e)
sorted(feature_importances, key=lambda x: x[1], reverse=True)[:20]

[('month_cat', 0.1812163739155677),
 (23, 0.17094495239755125),
 (11, 0.15526687132749883),
 (34, 0.12621455387209926),
 ('restrictions_pct_change', 0.043091032914588594),
 (29, 0.03889496880387942),
 (21, 0.033783780346431966),
 (32, 0.03307026931154995),
 ('amounts_pct_change', 0.028825289329625338),
 (24, 0.027671950225588503),
 (36, 0.020452654635550842),
 ('collect_optin_pct_change', 0.014905976285460218),
 (15, 0.01416558824065842),
 ('min_amount_pct_change', 0.010217464085216638),
 ('opt_fields_pct_change', 0.00907240263245777),
 ('pledge_active_pct_change', 0.00794943329263253),
 ('enable_sms_pct_change', 0.007052312612578569),
 ('req_fields_pct_change', 0.006983309550379626),
 ('collect_phone_pct_change', 0.006758852344338385),
 ('pledges_count_pct_change', 0.006296840521217187)]

In [53]:
for threshold in [0.01, 0.02, 0.05, 0.1, 0.15]:
    scores = []
    mses = []
    
    for i in range(50):
        # features with importances greater than 1%
        features_gt_one = [c[0] for c in feature_importances if c[1] >= threshold]
        target = 'don_form_trans_count_pct_change'

        # build training data set
        feature_cols = features_gt_one
        this_dataset = dataset[feature_cols + [target]].fillna(0).replace(np.inf, 100.).replace(-np.inf, -100.)

        # train/test split
        X_train, X_test, y_train, y_test = train_test_split(this_dataset.drop(target, axis=1), this_dataset[target])

        # fit & evaluate
        rf = RandomForestRegressor()
        rf.fit(X_train, y_train)

        y_pred = rf.predict(X_test)

        scores.append(rf.score(X_test, y_test))
        mses.append(mean_squared_error(y_test, y_pred))
        
    print("Feature importance threshold: {:.2f}%".format(threshold * 100.))
    print("\tscore: {:.4f}".format(np.mean(scores)))
    print("\tmse: {:.4f}".format(np.mean(mses)))
    print("\tfeature count: {}".format(len(features_gt_one)))

Feature importance threshold: 1.00%
	score: -0.1564
	mse: 1250.9058
	feature count: 14
Feature importance threshold: 2.00%
	score: -0.1506
	mse: 1237.1203
	feature count: 11
Feature importance threshold: 5.00%
	score: -0.0968
	mse: 1205.1127
	feature count: 4
Feature importance threshold: 10.00%
	score: -0.0997
	mse: 1171.0008
	feature count: 4
Feature importance threshold: 15.00%
	score: -0.0557
	mse: 1139.7518
	feature count: 3


## new_rec_count_pct_change

In [54]:
target = 'new_rec_count_pct_change'
# build training data set
feature_cols = [c for c in dataset.columns if c not in target_cols + ['week', 'form']]
this_dataset = dataset[feature_cols + [target]].fillna(0).replace(np.inf, 100.).replace(-np.inf, -100.)

# train/test split
X_train, X_test, y_train, y_test = train_test_split(this_dataset.drop(target, axis=1), this_dataset[target])

# fit & evaluate
rf = RandomForestRegressor()
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

score = rf.score(X_test, y_test)
mse = mean_squared_error(y_test, y_pred)

print("Score: {:.4f}".format(score))
print("MSE: {:.4f}".format(mse))

Score: -0.1845
MSE: 1341.9832


In [55]:
feature_importances = []
for e in zip(X_train.columns, rf.feature_importances_):
    feature_importances.append(e)
sorted(feature_importances, key=lambda x: x[1], reverse=True)[:20]

[(23, 0.21518587523111044),
 (11, 0.19339405583988442),
 ('month_cat', 0.19291087463795614),
 (34, 0.12028278864975765),
 ('restrictions_pct_change', 0.05603631997519233),
 ('amounts_pct_change', 0.03380167776681638),
 (21, 0.03177765576430872),
 (29, 0.023932380007048414),
 (24, 0.018662611367285132),
 (32, 0.016993626889220563),
 (36, 0.015537081230713939),
 ('min_amount_pct_change', 0.012793955421866603),
 (35, 0.009815434255650344),
 (15, 0.00857980406983099),
 ('req_fields_pct_change', 0.00642302412715855),
 ('collect_phone_pct_change', 0.005198724314321957),
 ('opt_fields_pct_change', 0.004989744157058197),
 ('pledges_count_pct_change', 0.004492707885422661),
 ('ded_types_pct_change', 0.0043731032159567505),
 ('collect_optin_pct_change', 0.004161012856302714)]

In [57]:
for threshold in [0.01, 0.02, 0.05, 0.1, 0.15]:
    scores = []
    mses = []
    
    for i in range(50):
        # features with importances greater than 1%
        features_gt_one = [c[0] for c in feature_importances if c[1] >= threshold]
        target = 'new_rec_count_pct_change'

        # build training data set
        feature_cols = features_gt_one
        this_dataset = dataset[feature_cols + [target]].fillna(0).replace(np.inf, 100.).replace(-np.inf, -100.)

        # train/test split
        X_train, X_test, y_train, y_test = train_test_split(this_dataset.drop(target, axis=1), this_dataset[target])

        # fit & evaluate
        rf = RandomForestRegressor()
        rf.fit(X_train, y_train)

        y_pred = rf.predict(X_test)

        scores.append(rf.score(X_test, y_test))
        mses.append(mean_squared_error(y_test, y_pred))
        
    print("Feature importance threshold: {:.2f}%".format(threshold * 100.))
    print("\tscore: {:.4f}".format(np.mean(scores)))
    print("\tmse: {:.4f}".format(np.mean(mses)))
    print("\tfeature count: {}".format(len(features_gt_one)))

Feature importance threshold: 1.00%
	score: -0.1566
	mse: 1358.5875
	feature count: 12
Feature importance threshold: 2.00%
	score: -0.1529
	mse: 1339.6120
	feature count: 8
Feature importance threshold: 5.00%
	score: -0.1394
	mse: 1327.6880
	feature count: 5
Feature importance threshold: 10.00%
	score: -0.1227
	mse: 1305.8301
	feature count: 4
Feature importance threshold: 15.00%
	score: -0.0783
	mse: 1255.5223
	feature count: 3


## count_growth

In [38]:
target = 'count_growth'
# build training data set
feature_cols = [c for c in dataset.columns if c not in target_cols + ['week', 'form']]
this_dataset = dataset[feature_cols + [target]].fillna(0).replace(np.inf, 100.).replace(-np.inf, -100.)

# train/test split
X_train, X_test, y_train, y_test = train_test_split(this_dataset.drop(target, axis=1), this_dataset[target])

# fit & evaluate
rf = RandomForestRegressor()
rf.fit(X_train, y_train)
rf.score(X_test, y_test)

-0.029187671075280974

In [39]:
feature_importances = []
for e in zip(X_train.columns, rf.feature_importances_):
    feature_importances.append(e)
sorted(feature_importances, key=lambda x: x[1], reverse=True)[:20]

[('month_cat', 0.2898206330172784),
 (34, 0.27761555521016335),
 (23, 0.16888039243458336),
 (11, 0.0798658783509187),
 (29, 0.030887677361040016),
 ('opt_fields_pct_change', 0.027543045197842692),
 (35, 0.027346352145332542),
 (24, 0.019319812867044477),
 ('collect_optin_pct_change', 0.011871578446935758),
 ('restrictions_pct_change', 0.01179619559019152),
 ('enable_donorlogins_pct_change', 0.008955984965950198),
 ('amounts_pct_change', 0.007176280232531108),
 ('collect_phone_pct_change', 0.004770886939474752),
 (32, 0.004552895661607228),
 (21, 0.004301825156416668),
 ('min_amount_pct_change', 0.004258473531623267),
 ('ded_types_pct_change', 0.003934837963595551),
 (15, 0.0030973707643816204),
 ('collect_company_pct_change', 0.0025765889287081073),
 ('req_fields_pct_change', 0.0023556198081053333)]

In [40]:
for threshold in [0.0025, 0.005, 0.01, 0.02, 0.05, 0.1, 0.15]:
    scores = []
    for i in range(100):
        # features with importances greater than 1%
        features_gt_one = [c[0] for c in feature_importances if c[1] >= threshold]
        target = 'count_growth'

        # build training data set
        feature_cols = features_gt_one
        this_dataset = dataset[feature_cols + [target]].fillna(0).replace(np.inf, 100.).replace(-np.inf, -100.)

        # train/test split
        X_train, X_test, y_train, y_test = train_test_split(this_dataset.drop(target, axis=1), this_dataset[target])

        # fit & evaluate
        rf = RandomForestRegressor()
        rf.fit(X_train, y_train)

        scores.append(rf.score(X_test, y_test))
        
    print("Feature importance threshold: {:.2f}%".format(threshold * 100.))
    print("\tscore: {:.4f}".format(np.mean(scores)))
    print("\tfeature count: {}".format(len(features_gt_one)))

Feature importance threshold: 0.25%
	score: -0.4859
	feature count: 19
Feature importance threshold: 0.50%
	score: -0.6337
	feature count: 12
Feature importance threshold: 1.00%
	score: -0.5751
	feature count: 10
Feature importance threshold: 2.00%
	score: -0.2870
	feature count: 7
Feature importance threshold: 5.00%
	score: -0.2545
	feature count: 4
Feature importance threshold: 10.00%
	score: -0.2477
	feature count: 3
Feature importance threshold: 15.00%
	score: -0.1976
	feature count: 3


## vol_growth

In [41]:
target = 'vol_growth'
# build training data set
feature_cols = [c for c in dataset.columns if c not in target_cols + ['week', 'form']]
this_dataset = dataset[feature_cols + [target]].fillna(0).replace(np.inf, 100.).replace(-np.inf, -100.)

# train/test split
X_train, X_test, y_train, y_test = train_test_split(this_dataset.drop(target, axis=1), this_dataset[target])

# fit & evaluate
rf = RandomForestRegressor()
rf.fit(X_train, y_train)
rf.score(X_test, y_test)

-4.270770951214371

In [42]:
feature_importances = []
for e in zip(X_train.columns, rf.feature_importances_):
    feature_importances.append(e)
sorted(feature_importances, key=lambda x: x[1], reverse=True)[:20]

[('month_cat', 0.408707560017338),
 (11, 0.12347990737398015),
 ('collect_optin_pct_change', 0.09537905488911888),
 (23, 0.08020489135295499),
 ('pledges_count_pct_change', 0.07836067503993673),
 (24, 0.06530791744960326),
 (34, 0.06345382286104032),
 ('amounts_pct_change', 0.01627468847484685),
 ('req_fields_pct_change', 0.015088511796333203),
 (36, 0.014683330093155408),
 ('ded_types_pct_change', 0.011371829201326686),
 ('donation_active_pct_change', 0.0060258678427350726),
 ('restrictions_pct_change', 0.005188176157896121),
 (29, 0.0037251316399992463),
 (32, 0.0035676702536430243),
 (21, 0.0016520942028775006),
 ('min_amount_pct_change', 0.0013757744453010297),
 ('permit_create_own_pledge_pct_change', 0.0013206235633018565),
 ('permit_other_amount_pct_change', 0.0009123324941325306),
 (35, 0.0006437386793393383)]

In [43]:
for threshold in [0.0025, 0.005, 0.01, 0.02, 0.05, 0.1, 0.15]:
    scores = []
    for i in range(100):
        # features with importances greater than 1%
        features_gt_one = [c[0] for c in feature_importances if c[1] >= threshold]
        target = 'vol_growth'

        # build training data set
        feature_cols = features_gt_one
        this_dataset = dataset[feature_cols + [target]].fillna(0).replace(np.inf, 100.).replace(-np.inf, -100.)

        # train/test split
        X_train, X_test, y_train, y_test = train_test_split(this_dataset.drop(target, axis=1), this_dataset[target])

        # fit & evaluate
        rf = RandomForestRegressor()
        rf.fit(X_train, y_train)

        scores.append(rf.score(X_test, y_test))
        
    print("Feature importance threshold: {:.2f}%".format(threshold * 100.))
    print("\tscore: {:.4f}".format(np.mean(scores)))
    print("\tfeature count: {}".format(len(features_gt_one)))

Feature importance threshold: 0.25%
	score: -3.1009
	feature count: 15
Feature importance threshold: 0.50%
	score: -2.4128
	feature count: 13
Feature importance threshold: 1.00%
	score: -2.6525
	feature count: 11
Feature importance threshold: 2.00%
	score: -3.2317
	feature count: 7
Feature importance threshold: 5.00%
	score: -3.2286
	feature count: 7
Feature importance threshold: 10.00%
	score: -0.0702
	feature count: 2
Feature importance threshold: 15.00%
	score: -0.0229
	feature count: 1
