In [3]:
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [4]:
# Load the registrations dataset from a CSV under the current user's home
# directory ("~" is expanded by pandas).
df = pd.read_csv(
    filepath_or_buffer="~/Repositories/datasets/analytics_p2p_registrations.csv"
)

In [5]:
# Columns removed from the frame before any modeling is done.
# NOTE(review): re-running this cell without reloading `df` raises a KeyError,
# because the columns are already gone.
drop_cols = [
    'product',
    'sms_trans_vol',
    'p2p_trans_vol',
    'kiosk_trans_vol',
    'fb_trans_count',
    'sms_trans_count',
    'kiosk_trans_count',
    'p2p_trans_count',
    'mobile_visits',
    'vt_trans_count',
    'vt_trans_vol',
    'don_form_trans_count',
    'don_form_trans_vol',
    'form',
    'org',
    'id',
    'fb_trans_vol',
    'one_time_trans_count',
    'one_time_trans_vol',
    'rec_trans_count',
    'rec_trans_vol',
    'mobile_trans_count',
    'mobile_trans_vol',
    'mobilevt_trans_count',
    'mobilevt_trans_vol',
    'reg_volume',
    'don_volume',
    'don_count',
]
df = df.drop(labels=drop_cols, axis=1)

In [6]:
# Derived target: registrations per visit.
# NOTE(review): rows where `visits` is 0 would produce inf/NaN here — confirm
# that no such rows exist in the dataset.
df['reg_conversion'] = df['reg_count'].div(df['visits'])
df.columns

Index([u'Unnamed: 0', u'sic', u'ein', u'visits', u'tm_stamp', u'class_count',
       u'cat_count', u'promo_count', u'rest_count', u'amt_count', u'ded_count',
       u'fields', u'opt_fields', u'req_fields', u'allows_reg_ind',
       u'allows_teams', u'allows_reg_team_create', u'allows_reg_team_join',
       u'allows_opt_reg_donation', u'allows_sub_reg', u'allows_sub_reg_pfp',
       u'allows_other_don_amt', u'allows_pfp_off_don', u'allows_tfp_off_don',
       u'allows_soc_post_pfp_tcp', u'share_home', u'share_pfp', u'share_tfp',
       u'share_therm', u'share_donation', u'allows_social',
       u'social_templt_count', u'social_auto', u'pcnt_posts', u'mon_posts',
       u'count_posts', u'date_posts', u'email_templt_count', u'sponsors_count',
       u'inappr_content', u'reg_count', u'non_fund_reg', u'sub_reg_count',
       u'teams_count', u'reg_amt', u'reg_conversion'],
      dtype='object')

In [7]:
# Report the mean and standard deviation of each of the three target columns.
print("Target variable characteristics:")
for heading, column in (
    ("Registration Count", "reg_count"),
    ("Registration Amount", "reg_amt"),
    ("Registration Conversion", "reg_conversion"),
):
    print("\t{}:".format(heading))
    print("\t\tmean: {}".format(df[column].mean()))
    print("\t\tstd: {}".format(df[column].std()))

Target variable characteristics:
	Registration Count:
		mean: 0.370938022201
		std: 2.76980622132
	Registration Amount:
		mean: 3.8912518277
		std: 46.411891162
	Registration Conversion:
		mean: 0.162489781408
		std: 1.49476779465


In [8]:
# Columns kept for modeling. The last three entries are the targets; they stay
# in the frame here and are dropped from the predictors in the modeling cells.
ftrs = (
    ['class_count', 'cat_count', 'promo_count', 'rest_count']
    + ['fields', 'opt_fields', 'req_fields']
    + ['allows_reg_ind', 'allows_teams', 'allows_reg_team_create',
       'allows_reg_team_join', 'allows_opt_reg_donation',
       'allows_sub_reg', 'allows_sub_reg_pfp']
    + ['share_home', 'share_pfp', 'share_tfp', 'share_therm', 'share_donation']
    + ['allows_social', 'teams_count']
    + ['reg_count', 'reg_amt', 'reg_conversion']
)
df_ftrs = df[ftrs]

# Random Forest modeling

In [5]:
# Evaluate a 100-tree random forest on reg_count over 10 random 75/25
# train/test splits and report the mean of each metric.
#
# Every target-derived column is removed from the predictors. reg_conversion
# is computed as reg_count / visits, so leaving it in X (as the previous
# version did by dropping only reg_count and reg_amt) leaks the target into
# the features.
target_cols = ['reg_count', 'reg_amt', 'reg_conversion']
# Only drop the target columns that are actually present, so the cell also
# works on a feature frame built without reg_conversion.
predictors = df_ftrs.drop([c for c in target_cols if c in df_ftrs.columns], axis=1)

mse = []
exv = []
r2 = []
for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(predictors, df_ftrs['reg_count'], test_size=0.25)
    rf = RandomForestRegressor(n_estimators=100)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)
    mse.append(metrics.mean_squared_error(y_test, y_pred))
    exv.append(metrics.explained_variance_score(y_test, y_pred))
    r2.append(metrics.r2_score(y_test, y_pred))
print("RF w/ 100 trees:")
print("\tMSE: {}".format(np.mean(mse)))
print("\tExplained variance: {}".format(np.mean(exv)))
print("\tR^2: {}".format(np.mean(r2)))

RF w/ 100 trees:
	MSE: 8.28362236356
	Explained variance: -0.00229567213532
	R^2: -0.00231163238649


In [9]:
# Pair each predictor column with its importance in the most recently fitted
# forest (`rf`) and print one "name: importance" line per feature.
for feature_name, importance in zip(X_test.columns, rf.feature_importances_):
    print("{}: {}".format(feature_name, importance))

class_count: 0.175738004739
cat_count: 0.161261378846
promo_count: 0.161510568316
rest_count: 0.0118656674375
fields: 0.0969075601821
opt_fields: 0.0403493466081
req_fields: 0.0471703863334
allows_reg_ind: 0.0290808182716
allows_teams: 0.0108356534603
allows_reg_team_create: 0.00889787591117
allows_reg_team_join: 0.0105201364162
allows_opt_reg_donation: 0.030901570135
allows_sub_reg: 1.4627277453e-06
allows_sub_reg_pfp: 0.000872150081081
share_home: 0.0
share_pfp: 0.0
share_tfp: 0.0
share_therm: 0.0
share_donation: 0.0
allows_social: 0.00101219520463
teams_count: 0.21307522533


# GBR modeling

In [26]:
# Evaluate a 100-stage gradient boosting regressor on reg_count over 10 random
# 75/25 train/test splits and report the mean of each metric.
#
# Every target-derived column is removed from the predictors. reg_conversion
# is computed as reg_count / visits, so leaving it in X (as the previous
# version did by dropping only reg_count and reg_amt) leaks the target into
# the features.
target_cols = ['reg_count', 'reg_amt', 'reg_conversion']
# Only drop the target columns that are actually present, so the cell also
# works on a feature frame built without reg_conversion.
predictors = df_ftrs.drop([c for c in target_cols if c in df_ftrs.columns], axis=1)

mse = []
exv = []
r2 = []
for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(predictors, df_ftrs['reg_count'], test_size=0.25)
    gbr = GradientBoostingRegressor(n_estimators=100)
    gbr.fit(X_train, y_train)
    y_pred = gbr.predict(X_test)
    mse.append(metrics.mean_squared_error(y_test, y_pred))
    exv.append(metrics.explained_variance_score(y_test, y_pred))
    r2.append(metrics.r2_score(y_test, y_pred))
# The label previously read "RF w/ 100 trees:", a copy-paste leftover from the
# random forest cell; this cell fits a GradientBoostingRegressor.
print("GBR w/ 100 trees:")
print("\tMSE: {}".format(np.mean(mse)))
print("\tExplained variance: {}".format(np.mean(exv)))
print("\tR^2: {}".format(np.mean(r2)))

RF w/ 100 trees:
	MSE: 9.21816128196
	Explained variance: 8.42651662704e-05
	R^2: 5.26522176548e-05


# Random Forest modeling of registration conversion

In [7]:
# 100-tree random forest predicting reg_conversion, scored on 10 random 75/25
# splits; the mean of each metric is reported.
features = df_ftrs.drop(['reg_count', 'reg_amt', 'reg_conversion'], axis=1)
target = df_ftrs['reg_conversion']

mse, exv, r2 = [], [], []
scorers = (
    (mse, metrics.mean_squared_error),
    (exv, metrics.explained_variance_score),
    (r2, metrics.r2_score),
)
for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.25)
    rf = RandomForestRegressor(n_estimators=100)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)
    # Record this split's score under each metric.
    for scores, scorer in scorers:
        scores.append(scorer(y_test, y_pred))

print("RF w/ 100 trees against registration conversion:")
for label, scores in (("MSE", mse), ("Explained variance", exv), ("R^2", r2)):
    print("\t{}: {}".format(label, np.mean(scores)))

RF w/ 100 trees against registration conversion:
	MSE: 1.7350273127
	Explained variance: 0.000434554329489
	R^2: 0.000389561139766


# Modeling different feature sets

Dropping all binary setting features (`allows_*` and `share_*`) and adding sponsors count to see if results differ notably.

In [5]:
# Reduced feature set: sponsors_count plus the count-style columns, followed
# by the three target columns.
ftrs = (
    ['sponsors_count']
    + ['class_count', 'cat_count', 'promo_count', 'rest_count']
    + ['fields', 'opt_fields', 'req_fields']
    + ['teams_count']
    + ['reg_count', 'reg_amt', 'reg_conversion']
)
df_ftrs = df[ftrs]

## Random Forest modeling registration conversion w/ some feature selection

In [6]:
# Same evaluation as before, against whatever feature set df_ftrs currently
# holds: 10 random 75/25 splits, 100-tree forest, target reg_conversion.
non_feature_cols = ['reg_count', 'reg_amt', 'reg_conversion']

mse, exv, r2 = [], [], []
for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(
        df_ftrs.drop(non_feature_cols, axis=1),
        df_ftrs['reg_conversion'],
        test_size=0.25
    )
    rf = RandomForestRegressor(n_estimators=100)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)
    mse += [metrics.mean_squared_error(y_test, y_pred)]
    exv += [metrics.explained_variance_score(y_test, y_pred)]
    r2 += [metrics.r2_score(y_test, y_pred)]

# One tab-indented line per metric, each averaged over the 10 splits.
summary = [("MSE", mse), ("Explained variance", exv), ("R^2", r2)]
print("RF w/ 100 trees against registration conversion:")
print("\n".join("\t{}: {}".format(name, np.mean(vals)) for name, vals in summary))

RF w/ 100 trees against registration conversion:
	MSE: 2.53004399463
	Explained variance: 0.000683080475513
	R^2: 0.000660716413522


This performed worse than the original feature set (MSE of about 2.53 versus 1.74), so removing the binary setting features did not help.

Trying the original feature set + sponsors count.

In [None]:
# Original feature set with sponsors_count appended after the three target
# columns.
ftrs = (
    ['class_count', 'cat_count', 'promo_count', 'rest_count']
    + ['fields', 'opt_fields', 'req_fields']
    + ['allows_reg_ind', 'allows_teams', 'allows_reg_team_create',
       'allows_reg_team_join', 'allows_opt_reg_donation',
       'allows_sub_reg', 'allows_sub_reg_pfp']
    + ['share_home', 'share_pfp', 'share_tfp', 'share_therm', 'share_donation']
    + ['allows_social', 'teams_count']
    + ['reg_count', 'reg_amt', 'reg_conversion']
    + ['sponsors_count']
)
df_ftrs = df[ftrs]

In [13]:
# Score a 100-tree random forest on reg_conversion across 10 random 75/25
# splits, collecting one (mse, explained variance, r^2) triple per split.
runs = []
for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(
        df_ftrs.drop(['reg_count', 'reg_amt', 'reg_conversion'], axis=1),
        df_ftrs['reg_conversion'],
        test_size=0.25
    )
    rf = RandomForestRegressor(n_estimators=100)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)
    runs.append((
        metrics.mean_squared_error(y_test, y_pred),
        metrics.explained_variance_score(y_test, y_pred),
        metrics.r2_score(y_test, y_pred),
    ))

# Transpose the per-split triples into one list per metric.
mse, exv, r2 = [list(scores) for scores in zip(*runs)]

print("RF w/ 100 trees against registration conversion:")
print("\tMSE: {}".format(np.mean(mse)))
print("\tExplained variance: {}".format(np.mean(exv)))
print("\tR^2: {}".format(np.mean(r2)))

RF w/ 100 trees against registration conversion:
	MSE: 2.52451839273
	Explained variance: 0.000707498826208
	R^2: 0.000683219449899


Still notably worse than the original feature set, which suggests that sponsors count is dragging the model down. Next, let's try backward stepwise feature selection against the original random forest model and the original feature set, to see whether we can find anything more accurate or efficient.

In [None]:
# Backward stepwise screening: for each target, score a baseline random forest
# on all predictors, then re-score with each single predictor removed.
# Every configuration is averaged over 10 random 75/25 splits.
#
# NOTE(review): this uses whatever df_ftrs currently holds. If the preceding
# feature-set cell (original features + sponsors_count) was the last one run,
# sponsors_count is part of the screen — confirm that is intended.
target_cols = ['reg_count', 'reg_amt', 'reg_conversion']

# Fix the candidate list up front rather than iterating X_test.columns, since
# X_test is rebound on every split inside the loop.
predictor_cols = [col for col in df_ftrs.columns if col not in target_cols]

# Results for every target, keyed by target name and then by dropped column.
# Previously only `dropped_metrics` existed and was reset for each target, so
# the results for all but the last target were lost.
dropped_metrics_by_target = {}

for y in target_cols:
    print("Modeling against {}".format(y))
    dropped_metrics = {}
    # `None` stands for the baseline run in which no predictor is removed.
    for c in [None] + predictor_cols:
        excluded = target_cols if c is None else [c] + target_cols
        mse = []
        exv = []
        r2 = []
        for i in range(10):
            X_train, X_test, y_train, y_test = train_test_split(df_ftrs.drop(excluded, axis=1), df_ftrs[y], test_size=0.25)
            rf = RandomForestRegressor(n_estimators=100)
            rf.fit(X_train, y_train)
            y_pred = rf.predict(X_test)
            mse.append(metrics.mean_squared_error(y_test, y_pred))
            exv.append(metrics.explained_variance_score(y_test, y_pred))
            r2.append(metrics.r2_score(y_test, y_pred))
        print("Base:" if c is None else "Dropping {}:".format(c))
        print("\tMSE: {}; variance explained: {}; r^2: {}".format(np.mean(mse), np.mean(exv), np.mean(r2)))
        if c is not None:
            dropped_metrics[c] = {'mse': np.mean(mse), 'exv': np.mean(exv), 'r2': np.mean(r2)}
    # After the loop, `dropped_metrics` still refers to the last target's
    # results, as before; the per-target dict keeps all of them.
    dropped_metrics_by_target[y] = dropped_metrics

Modeling against reg_count
Base:
	MSE: 6.94991847449; variance explained: -0.00301030505204; r^2: -0.00301758947524
Dropping class_count:
	MSE: 7.56037292026; variance explained: -0.00181223442934; r^2: -0.00183441777741
Dropping cat_count:
	MSE: 7.54230862449; variance explained: -0.00257750303874; r^2: -0.00261176844221
Dropping promo_count:
	MSE: 6.85983089404; variance explained: -0.00319833358729; r^2: -0.00322776374637
Dropping rest_count:
	MSE: 7.25724724551; variance explained: -0.00232950789838; r^2: -0.00233731679647
Dropping fields:
	MSE: 9.19342738314; variance explained: -0.00263497467291; r^2: -0.00265925473631
Dropping opt_fields:
	MSE: 7.97173356645; variance explained: -0.00213623226474; r^2: -0.0021593096542
Dropping req_fields:
	MSE: 6.66111176494; variance explained: -0.00320425178996; r^2: -0.00321876428381
Dropping allows_reg_ind:
	MSE: 7.54170511101; variance explained: -0.00291855779953; r^2: -0.0029650419123
Dropping allows_teams:
	MSE: 8.22640369677; variance 

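Dropping `fields` substantially changed the MSE, but not the variance explained or $R^2$. Note that in the `reg_count` results above the MSE rose (from about 6.95 to 9.19) when `fields` was dropped, i.e. the model became less accurate for that target rather than more.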

In [16]:
from itertools import chain, combinations

# Baseline for reg_conversion: 100-tree random forest on all predictors,
# averaged over 20 random 75/25 splits.
print("Modeling against {}".format('reg_conversion'))
all_predictors = df_ftrs.drop(['reg_count', 'reg_amt', 'reg_conversion'], axis=1)
conversion = df_ftrs['reg_conversion']

mse, exv, r2 = [], [], []
for i in range(20):
    X_train, X_test, y_train, y_test = train_test_split(all_predictors, conversion, test_size=0.25)
    rf = RandomForestRegressor(n_estimators=100)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)
    mse.append(metrics.mean_squared_error(y_test, y_pred))
    exv.append(metrics.explained_variance_score(y_test, y_pred))
    r2.append(metrics.r2_score(y_test, y_pred))
print("Base:")
print("\tMSE: {}; variance explained: {}; r^2: {}".format(np.mean(mse), np.mean(exv), np.mean(r2)))

# Re-score the model with every non-empty subset of the candidate features
# removed (2**6 - 1 = 63 subsets), storing the averaged metrics keyed by the
# tuple of dropped column names.
dropped_metrics = {}
feature_list = ['allows_reg_team_create', 'req_fields', 'fields', 'opt_fields', 'rest_count', 'share_home']
drop_feature_lists = chain.from_iterable(
    combinations(feature_list, n) for n in range(1, len(feature_list) + 1)
)
for c in drop_feature_lists:
    reduced_predictors = all_predictors.drop(list(c), axis=1)
    mse, exv, r2 = [], [], []
    for i in range(20):
        X_train, X_test, y_train, y_test = train_test_split(reduced_predictors, conversion, test_size=0.25)
        rf = RandomForestRegressor(n_estimators=100)
        rf.fit(X_train, y_train)
        y_pred = rf.predict(X_test)
        mse.append(metrics.mean_squared_error(y_test, y_pred))
        exv.append(metrics.explained_variance_score(y_test, y_pred))
        r2.append(metrics.r2_score(y_test, y_pred))
    avg_mse, avg_exv, avg_r2 = np.mean(mse), np.mean(exv), np.mean(r2)
    print("Dropping {}:".format(c))
    print("\tMSE: {}; variance explained: {}; r^2: {}".format(avg_mse, avg_exv, avg_r2))
    dropped_metrics[c] = {'mse': avg_mse, 'exv': avg_exv, 'r2': avg_r2}

Modeling against reg_conversion
Base:
	MSE: 2.16182789706; variance explained: 0.00108410206803; r^2: 0.001059420598
Dropping ('allows_reg_team_create',):
	MSE: 2.06040585798; variance explained: 0.00039309129375; r^2: 0.000371853220205
Dropping ('req_fields',):
	MSE: 2.21490390637; variance explained: 0.000801581496315; r^2: 0.000788423441736
Dropping ('fields',):
	MSE: 2.419462945; variance explained: 0.000749546732996; r^2: 0.000725784201399
Dropping ('opt_fields',):
	MSE: 2.48332616884; variance explained: 0.0008303929663; r^2: 0.000814899633683
Dropping ('rest_count',):
	MSE: 2.29901520311; variance explained: 0.000663722792234; r^2: 0.000652213906218
Dropping ('share_home',):
	MSE: 2.0666510956; variance explained: 0.000730240222839; r^2: 0.000703350370759
Dropping ('allows_reg_team_create', 'req_fields'):
	MSE: 2.16155519209; variance explained: 0.000682762208875; r^2: 0.000658573326496
Dropping ('allows_reg_team_create', 'fields'):
	MSE: 1.94594357679; variance explained: 0.000