In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

# Data prep

## donation data

In [2]:
# read CSV's to dataframes
df_base = pd.read_csv("/Users/jeremyvanvalkenburg/Repositories/datasets/export_base.csv")
df_p2p = pd.read_csv("/Users/jeremyvanvalkenburg/Repositories/datasets/export_p2p.csv")

# filter out entries w/out any visitors
df_base = df_base[(df_base.visits!=0)&(df_base.visits!=0)]
df_p2p = df_p2p[df_p2p.base.isin(df_base.id.values.tolist())]

# merge the dataframes and generate donation conversion column
df = pd.merge(df_base, df_p2p, left_on='id', right_on='base')
df['donation_conversion'] = df['don_count'] / df['visits'] + df['mobile_visits']

## Registration data

In [3]:
df_reg = pd.read_csv("~/Repositories/datasets/analytics_p2p_registrations.csv")
df_reg = df_reg[df_reg.visits>0]
df_reg['reg_conversion'] = df['reg_count'] / df['visits']

# Donation data to amounts & fields

In [8]:
df[['donation_conversion', 'don_count', 'fields', 'amt_count']].corr()

Unnamed: 0,donation_conversion,don_count,fields,amt_count
donation_conversion,1.0,0.184281,-0.028306,-0.012636
don_count,0.184281,1.0,-0.026774,-0.029991
fields,-0.028306,-0.026774,1.0,0.072683
amt_count,-0.012636,-0.029991,0.072683,1.0


## Random forest modeling

In [18]:
ftrs = ['class_count', 'cat_count', 'promo_count', 'rest_count', 'amt_count', 'ded_count', 
'fields', 'opt_fields', 'req_fields', 'allows_reg_ind', 'allows_teams', 'allows_reg_team_create', 
'allows_reg_team_join', 'allows_opt_reg_donation', 'allows_pfp_off_don', 'allows_tfp_off_don']

In [20]:
X_train, X_test, y_train, y_test = train_test_split(df[ftrs], df['donation_conversion'], test_size=0.25)

rf = RandomForestRegressor()
rf.fit(X_train, y_train)
print("Feature importances in a random forest to donation_conversion:")
for c in range(len(X_train.columns)):
    if X_train.columns[c] in ['fields', 'amt_count']:
        print("\t{}: {}".format(X_train.columns[c], rf.feature_importances_[c]))

Feature importances in a random forest to donation_conversion:
	amt_count: 0.144098412859
	fields: 0.0737766847889


In [21]:
X_train, X_test, y_train, y_test = train_test_split(df[ftrs], df['don_count'], test_size=0.25)

rf = RandomForestRegressor()
rf.fit(X_train, y_train)
print("Feature importances in a random forest to don_count:")
for c in range(len(X_train.columns)):
    if X_train.columns[c] in ['fields', 'amt_count']:
        print("\t{}: {}".format(X_train.columns[c], rf.feature_importances_[c]))

Feature importances in a random forest to don_count:
	amt_count: 0.0428145443356
	fields: 0.0223967511331


Not much to see here other than the __feature importance of amount count to donation conversion__. This seems unusually high compared to what I've been seeing in this data set at 14%. Noteworthy, not sure it's relevant but noteworthy none the less.

The other feature importances and the correlations are pretty weak.

# Registration data to fields & amounts

In [9]:
df_reg[['reg_conversion', 'reg_count', 'fields', 'amt_count']].corr()

Unnamed: 0,reg_conversion,reg_count,fields,amt_count
reg_conversion,,,,
reg_count,,1.0,-0.000994,-0.001331
fields,,-0.000994,1.0,0.072683
amt_count,,-0.001331,0.072683,1.0


## Random forest modeling

In [22]:
ftrs = ['class_count', 'cat_count', 'promo_count', 'rest_count', 'amt_count', 'ded_count', 
'fields', 'opt_fields', 'req_fields', 'allows_reg_ind', 'allows_teams', 'allows_reg_team_create', 
'allows_reg_team_join', 'allows_opt_reg_donation', 'allows_pfp_off_don', 'allows_tfp_off_don']

In [26]:
X_train, X_test, y_train, y_test = train_test_split(df_reg[ftrs], df_reg['reg_conversion'], test_size=0.25)

rf = RandomForestRegressor()
rf.fit(X_train, y_train)
print("Feature importances in a random forest to reg_conversion:")
for c in range(len(X_train.columns)):
    if X_train.columns[c] in ['fields', 'amt_count']:
        print("\t{}: {}".format(X_train.columns[c], rf.feature_importances_[c]))

Feature importances in a random forest to reg_conversion:
	amt_count: 0.0
	fields: 0.0


In [28]:
X_train, X_test, y_train, y_test = train_test_split(df_reg[ftrs], df_reg['reg_count'], test_size=0.25)

rf = RandomForestRegressor()
rf.fit(X_train, y_train)
print("Feature importances in a random forest to reg_count:")
for c in range(len(X_train.columns)):
    if X_train.columns[c] in ['fields', 'amt_count']:
        print("\t{}: {}".format(X_train.columns[c], rf.feature_importances_[c]))

Feature importances in a random forest to reg_count:
	amt_count: 0.0820866472723
	fields: 0.0949688731904


The correlations are too weak to be meaningful here, the feature importances when modeling against the count are a little better. These values push these features up to the top 25% or so for the model, but still nothing astonishing. The importances are fairly evenly distributed across all features so these are still not strong predictors.

# Amounts data

In [11]:
df.amt_count.value_counts()

4     127571
5      70206
6      34186
7      10965
3       8858
8       5489
10      3701
0       2311
9       1765
11      1338
12       851
1        453
26       396
2          6
Name: amt_count, dtype: int64

In [12]:
df.amt_count.describe()

count    268096.000000
mean          4.857428
std           1.656156
min           0.000000
25%           4.000000
50%           4.000000
75%           5.000000
max          26.000000
Name: amt_count, dtype: float64

# Fields data

In [14]:
df.fields.value_counts()

0     94115
1     59935
2     39650
3     24811
4     13617
5     12575
6      6173
13     3442
9      2778
10     2713
8      2475
12     1914
21     1070
7      1032
11      787
27      607
16      402
Name: fields, dtype: int64

In [15]:
df.fields.describe()

count    268096.000000
mean          2.121878
std           3.152475
min           0.000000
25%           0.000000
50%           1.000000
75%           3.000000
max          27.000000
Name: fields, dtype: float64

In [29]:
df.columns

Index([u'id_x', u'org_x', u'form', u'sic', u'ein', u'visits', u'mobile_visits',
       u'vt_trans_count', u'don_form_trans_count', u'kiosk_trans_count',
       u'p2p_trans_count', u'mobile_trans_count', u'mobilevt_trans_count',
       u'sms_trans_count', u'fb_trans_count', u'vt_trans_vol',
       u'don_form_trans_vol', u'kiosk_trans_vol', u'p2p_trans_vol',
       u'mobile_trans_vol', u'mobilevt_trans_vol', u'sms_trans_vol',
       u'fb_trans_vol', u'tm_stamp', u'one_time_trans_count',
       u'one_time_trans_vol', u'rec_trans_count', u'rec_trans_vol', u'product',
       u'id_y', u'base', u'org_y', u'class_count', u'cat_count',
       u'promo_count', u'rest_count', u'amt_count', u'ded_count', u'fields',
       u'opt_fields', u'req_fields', u'allows_reg_ind', u'allows_teams',
       u'allows_reg_team_create', u'allows_reg_team_join',
       u'allows_opt_reg_donation', u'allows_sub_reg', u'allows_sub_reg_pfp',
       u'allows_other_don_amt', u'allows_pfp_off_don', u'allows_tfp_off_don',
 