In [1]:
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# Data import & prep

In [2]:
df = pd.read_csv("~/Repositories/datasets/analytics_p2p_registrations.csv")

In [3]:
drop_cols = ['product', 'sms_trans_vol', 'p2p_trans_vol', 'kiosk_trans_vol', 
             'fb_trans_count', 'sms_trans_count', 'kiosk_trans_count', 'p2p_trans_count', 
             'mobile_visits', 'vt_trans_count', 'vt_trans_vol', 'don_form_trans_count', 
             'don_form_trans_vol', 'form', 'org', 'id', 'fb_trans_vol', 
             'one_time_trans_count', 'one_time_trans_vol', 'rec_trans_count', 
             'rec_trans_vol', 'mobile_trans_count', 'mobile_trans_vol', 
             'mobilevt_trans_count', 'mobilevt_trans_vol']
df = df.drop(drop_cols, axis=1)

In [4]:
df['reg_conversion'] = df['reg_count'] / df['visits']
df['don_conversion'] = df['don_count'] / df['visits']

In [5]:
df.columns

Index([u'Unnamed: 0', u'sic', u'ein', u'visits', u'tm_stamp', u'class_count',
       u'cat_count', u'promo_count', u'rest_count', u'amt_count', u'ded_count',
       u'fields', u'opt_fields', u'req_fields', u'allows_reg_ind',
       u'allows_teams', u'allows_reg_team_create', u'allows_reg_team_join',
       u'allows_opt_reg_donation', u'allows_sub_reg', u'allows_sub_reg_pfp',
       u'allows_other_don_amt', u'allows_pfp_off_don', u'allows_tfp_off_don',
       u'allows_soc_post_pfp_tcp', u'share_home', u'share_pfp', u'share_tfp',
       u'share_therm', u'share_donation', u'allows_social',
       u'social_templt_count', u'social_auto', u'pcnt_posts', u'mon_posts',
       u'count_posts', u'date_posts', u'email_templt_count', u'sponsors_count',
       u'inappr_content', u'reg_count', u'non_fund_reg', u'sub_reg_count',
       u'teams_count', u'reg_volume', u'don_volume', u'don_count', u'reg_amt',
       u'reg_conversion', u'don_conversion'],
      dtype='object')

In [25]:
ftrs = ['class_count', 'cat_count', 'promo_count', 'rest_count', 'fields', 
        'opt_fields', 'req_fields', 'allows_reg_ind', 'allows_teams', 'allows_reg_team_create',
       'allows_reg_team_join', 'allows_opt_reg_donation', 'allows_sub_reg', 'allows_sub_reg_pfp',
       'share_home', 'share_pfp', 'share_tfp', 'share_therm', 'share_donation', 'allows_social',
       'teams_count', 'reg_count', 'reg_amt', 'reg_conversion', 'don_volume', 'don_count', 'don_conversion']

targets = ['reg_count', 'reg_amt', 'reg_conversion', 'don_volume', 'don_count', 'don_conversion']

# Individual registration setting

## rebalancing & exploration

In [34]:
df.loc[df.allows_reg_ind==5, 'allows_reg_ind'] = 0
df.allows_reg_ind.value_counts()

1    202906
0     65190
Name: allows_reg_ind, dtype: int64

In [35]:
resampled = df[df.allows_reg_ind==0]
resampled = resampled.append(df[df.allows_reg_ind==1].sample(len(resampled)))

In [36]:
resampled.allows_reg_ind.value_counts()

1    65190
0    65190
Name: allows_reg_ind, dtype: int64

In [37]:
cols = ['allows_reg_ind', 'reg_count', 'reg_volume', 'don_volume', 'don_count', 
        'reg_amt', 'reg_conversion', 'don_conversion']
resampled[cols].corr()

Unnamed: 0,allows_reg_ind,reg_count,reg_volume,don_volume,don_count,reg_amt,reg_conversion,don_conversion
allows_reg_ind,1.0,-0.005654,,-0.080157,-0.108687,0.00026,0.000581,-0.072588
reg_count,-0.005654,1.0,,-0.000616,-0.00043,0.16666,0.649562,-0.004629
reg_volume,,,,,,,,
don_volume,-0.080157,-0.000616,,1.0,0.802023,-0.000932,-0.019732,0.150625
don_count,-0.108687,-0.00043,,0.802023,1.0,-0.001364,-0.023411,0.173411
reg_amt,0.00026,0.16666,,-0.000932,-0.001364,1.0,0.132524,-0.000996
reg_conversion,0.000581,0.649562,,-0.019732,-0.023411,0.132524,1.0,-0.018566
don_conversion,-0.072588,-0.004629,,0.150625,0.173411,-0.000996,-0.018566,1.0


## modeling for feature importance 

In [38]:
for target in targets:
    mse = []
    exv = []
    r2 = []

    df_ftrs = resampled[ftrs].drop(targets, axis=1)
    df_target = resampled[target]

    for i in range(5):
        X_train, X_test, y_train, y_test = train_test_split(df_ftrs, df_target, test_size=0.25)
        rf = RandomForestRegressor(n_estimators=100)
        rf.fit(X_train, y_train)
        y_pred = rf.predict(X_test)
        mse.append(metrics.mean_squared_error(y_test, y_pred))
        exv.append(metrics.explained_variance_score(y_test, y_pred))
        r2.append(metrics.r2_score(y_test, y_pred))

    print("RF for {}:".format(target))
    print("\tMSE: {}".format(np.mean(mse)))
    print("\tExplained variance: {}".format(np.mean(exv)))
    print("\tR^2: {}".format(np.mean(r2)))
    
    for i in range(len(df_ftrs.columns)):
        print("{}: {}".format(df_ftrs.columns[i], rf.feature_importances_[i]))

RF for reg_count:
	MSE: 8.85377875868
	Explained variance: -0.00884604278655
	R^2: -0.00889366666331
class_count: 0.172760631845
cat_count: 0.150587421728
promo_count: 0.168708329615
rest_count: 0.00121833820406
fields: 0.123077333703
opt_fields: 0.0486651563697
req_fields: 0.0609279667275
allows_reg_ind: 0.0340915685348
allows_teams: 0.0224022728343
allows_reg_team_create: 0.00730796354411
allows_reg_team_join: 0.00855843880182
allows_opt_reg_donation: 0.0461368838235
allows_sub_reg: 0.0
allows_sub_reg_pfp: 0.00116029335043
share_home: 0.0
share_pfp: 0.0
share_tfp: 0.0
share_therm: 0.0
share_donation: 0.0
allows_social: 3.1499313129e-05
teams_count: 0.154365901604
RF for reg_amt:
	MSE: 2679.60410356
	Explained variance: -0.0229903328509
	R^2: -0.0230201115001
class_count: 0.116811379903
cat_count: 0.141328255679
promo_count: 0.210782985784
rest_count: 0.0159182728579
fields: 0.107633326583
opt_fields: 0.0907029959821
req_fields: 0.0633237318789
allows_reg_ind: 0.0422447466773
allows_t

# Allows teams setting

## rebalancing and exploration

In [39]:
df.loc[df.allows_teams==5, 'allows_teams'] = 0
df.allows_teams.value_counts()

1    227966
0     40130
Name: allows_teams, dtype: int64

In [40]:
resampled = df[df.allows_teams==0]
resampled = resampled.append(df[df.allows_teams==1].sample(len(resampled)))
resampled.allows_teams.value_counts()

1    40130
0    40130
Name: allows_teams, dtype: int64

In [41]:
cols = ['allows_teams', 'reg_count', 'reg_volume', 'don_volume', 'don_count', 
        'reg_amt', 'reg_conversion', 'don_conversion']
resampled[cols].corr()

Unnamed: 0,allows_teams,reg_count,reg_volume,don_volume,don_count,reg_amt,reg_conversion,don_conversion
allows_teams,1.0,-0.004526,,0.046075,0.096507,-0.003325,-0.011977,-0.017141
reg_count,-0.004526,1.0,,-3.1e-05,-0.003862,0.267521,0.756511,-0.006616
reg_volume,,,,,,,,
don_volume,0.046075,-3.1e-05,,1.0,0.730526,-0.001755,-0.024516,0.210053
don_count,0.096507,-0.003862,,0.730526,1.0,0.000837,-0.027677,0.218075
reg_amt,-0.003325,0.267521,,-0.001755,0.000837,1.0,0.206503,-0.000513
reg_conversion,-0.011977,0.756511,,-0.024516,-0.027677,0.206503,1.0,-0.025962
don_conversion,-0.017141,-0.006616,,0.210053,0.218075,-0.000513,-0.025962,1.0


In [42]:
for target in targets:
    mse = []
    exv = []
    r2 = []

    df_ftrs = resampled[ftrs].drop(targets, axis=1)
    df_target = resampled[target]

    for i in range(5):
        X_train, X_test, y_train, y_test = train_test_split(df_ftrs, df_target, test_size=0.25)
        rf = RandomForestRegressor(n_estimators=100)
        rf.fit(X_train, y_train)
        y_pred = rf.predict(X_test)
        mse.append(metrics.mean_squared_error(y_test, y_pred))
        exv.append(metrics.explained_variance_score(y_test, y_pred))
        r2.append(metrics.r2_score(y_test, y_pred))

    print("RF for {}:".format(target))
    print("\tMSE: {}".format(np.mean(mse)))
    print("\tExplained variance: {}".format(np.mean(exv)))
    print("\tR^2: {}".format(np.mean(r2)))
    
    for i in range(len(df_ftrs.columns)):
        print("{}: {}".format(df_ftrs.columns[i], rf.feature_importances_[i]))

RF for reg_count:
	MSE: 5.14983248856
	Explained variance: -0.0132367120577
	R^2: -0.0133302365971
class_count: 0.167001143094
cat_count: 0.180790024998
promo_count: 0.175152380422
rest_count: 0.00307167727992
fields: 0.0910506937304
opt_fields: 0.0531629340302
req_fields: 0.067088211083
allows_reg_ind: 0.031862809494
allows_teams: 0.0141326948815
allows_reg_team_create: 0.0174222300098
allows_reg_team_join: 0.0141484079552
allows_opt_reg_donation: 0.0572930004653
allows_sub_reg: 9.73955723664e-06
allows_sub_reg_pfp: 0.000472257401993
share_home: 0.0
share_pfp: 0.0
share_tfp: 0.0
share_therm: 0.0
share_donation: 0.0
allows_social: 0.00203793639185
teams_count: 0.125303859205
RF for reg_amt:
	MSE: 1760.2470753
	Explained variance: -0.0124806995644
	R^2: -0.0125634735732
class_count: 0.384766277688
cat_count: 0.0679616174405
promo_count: 0.134175577674
rest_count: 0.00618372514252
fields: 0.039659849728
opt_fields: 0.0257268311925
req_fields: 0.0176056696685
allows_reg_ind: 0.03369315097