In [26]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [25]:
def get_donation_data():
    df_base = pd.read_csv("/Users/jeremyvanvalkenburg/Repositories/datasets/export_base.csv")
    df_p2p = pd.read_csv("/Users/jeremyvanvalkenburg/Repositories/datasets/export_p2p.csv")
    df = pd.merge(df_base, df_p2p, left_on='id', right_on='base')
    df['donation_conversion'] = df['don_count'] / df['visits'] + df['mobile_visits']
    df['registration_conversion'] = df['reg_count'] / df['visits'] + df['mobile_visits']
    df['donation_mean'] = df['don_volume'] / df['don_count']
    
    return df

def get_registration_data():
    df = pd.read_csv("~/Repositories/datasets/analytics_p2p_registrations.csv")
    drop_cols = ['product', 'sms_trans_vol', 'p2p_trans_vol', 'kiosk_trans_vol', 
                 'fb_trans_count', 'sms_trans_count', 'kiosk_trans_count', 'p2p_trans_count', 
                 'mobile_visits', 'vt_trans_count', 'vt_trans_vol', 'don_form_trans_count', 
                 'don_form_trans_vol', 'form', 'org', 'id', 'fb_trans_vol', 
                 'one_time_trans_count', 'one_time_trans_vol', 'rec_trans_count', 
                 'rec_trans_vol', 'mobile_trans_count', 'mobile_trans_vol', 
                 'mobilevt_trans_count', 'mobilevt_trans_vol', 'reg_volume', 'don_volume', 'don_count']
    df = df.drop(drop_cols, axis=1)

    # the following are missing all data so dropping for clean views
    more_drop_cols = ['share_home', 'share_pfp', 'share_tfp', 'share_therm', 'share_donation', 
                      'allows_soc_post_pfp_tcp', 'social_templt_count', 'social_auto', 
                      'pcnt_posts', 'mon_posts', 'count_posts', 'date_posts', 'email_templt_count']
    df = df.drop(more_drop_cols, axis=1)

    # no disparity between values (ie, settings almost universally identifal to all observations 
    # or has no impact)
    further_drop_cols = ['allows_social', 'allows_sub_reg', 'allows_other_don_amt']
    df = df.drop(further_drop_cols, axis=1)

    # set binary features to binary just in case another value (ie, 5 for inactive) slipped in
    for c in df.columns:
        if 'allows' in c:
            df[c] = df[c].apply(lambda x: 0 if x == 5 else x)

    # adding conversion variable
    df['reg_conversion'] = df['reg_count'] / df['visits']

    return df

# Classifications

## exploration

In [20]:
print("|-------|------|")
print("| mean | {} |".format(df.class_count.mean()))
print("| std | {} |".format(df.class_count.std()))
print("| mode | {} |".format(df.class_count.mode()))
print("| unique | {} |".format(len(df.class_count.unique())))

|-------|------|
| mean | 5.03339848413 |
| std | 22.8468895312 |
| mode | 0    0
dtype: int64 |
| unique | 65 |


In [23]:
df[['class_count', 'reg_amt', 'reg_count']].corr()

Unnamed: 0,class_count,reg_amt,reg_count
class_count,1.0,-0.002278,-0.001805
reg_amt,-0.002278,1.0,0.206859
reg_count,-0.001805,0.206859,1.0


## modeling RF for feature importance

In [36]:
targets = ['reg_amt', 'reg_count', 'reg_conversion', 'don_count', 'don_volume', 'donation_conversion']

ftrs_list = ['class_count', 'cat_count', 'promo_count', 'rest_count', 'amt_count', 'ded_count', 
'fields', 'opt_fields', 'req_fields', 'allows_reg_ind', 'allows_teams', 'allows_reg_team_create', 
'allows_reg_team_join', 'allows_opt_reg_donation', 'allows_pfp_off_don', 'allows_tfp_off_don']

don_df = get_donation_data()
reg_df = get_registration_data()

In [37]:
# drop target columns & split data
ftrs = reg_df[ftrs_list]
target_col = reg_df['reg_amt']
X_train, X_test, y_train, y_test = train_test_split(ftrs, target_col, test_size=0.25)

# model with rf
rf = RandomForestRegressor(n_estimators=100)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
print("Random Forest targeting reg_amt")
print("\tPrediction error: {}".format((y_pred-y_test).mean()))
print("\tR^2: {}".format(rf.score(X_test, y_test)))

# print feature importances
print("\tfeature importances:")
for i in range(len(ftrs.columns)):
    if ftrs.columns[i] in ['class_count', 'cat_count']:
        print("\t\t{}: {}".format(ftrs.columns[i], rf.feature_importances_[i]))

Random Forest targeting reg_amt
	Prediction error: 0.105473496902
	R^2: -0.00203878920813
	feature importances:
		class_count: 0.109922880832
		cat_count: 0.135840000731


In [38]:
# drop target columns & split data
ftrs = reg_df[ftrs_list]
target_col = reg_df['reg_count']
X_train, X_test, y_train, y_test = train_test_split(ftrs, target_col, test_size=0.25)

# model with rf
rf = RandomForestRegressor(n_estimators=100)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
print("Random Forest targeting reg_count")
print("\tPrediction error: {}".format((y_pred-y_test).mean()))
print("\tR^2: {}".format(rf.score(X_test, y_test)))

# print feature importances
print("\tfeature importances:")
for i in range(len(ftrs.columns)):
    if ftrs.columns[i] in ['class_count', 'cat_count']:
        print("\t\t{}: {}".format(ftrs.columns[i], rf.feature_importances_[i]))

Random Forest targeting reg_count
	Prediction error: 0.0106870069106
	R^2: -0.0022696253644
	feature importances:
		class_count: 0.184880252543
		cat_count: 0.173049213546


In [39]:
# drop target columns & split data
ftrs = reg_df[ftrs_list]
target_col = reg_df['reg_conversion']
X_train, X_test, y_train, y_test = train_test_split(ftrs, target_col, test_size=0.25)

# model with rf
rf = RandomForestRegressor(n_estimators=100)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
print("Random Forest targeting reg_conversion")
print("\tPrediction error: {}".format((y_pred-y_test).mean()))
print("\tR^2: {}".format(rf.score(X_test, y_test)))

# print feature importances
print("\tfeature importances:")
for i in range(len(ftrs.columns)):
    if ftrs.columns[i] in ['class_count', 'cat_count']:
        print("\t\t{}: {}".format(ftrs.columns[i], rf.feature_importances_[i]))

Random Forest targeting reg_conversion
	Prediction error: 0.0113060766677
	R^2: 0.00110401894471
	feature importances:
		class_count: 0.12091805936
		cat_count: 0.145226961353


In [44]:
# drop target columns & split data
don_df = don_df[don_df.visits > 0]
ftrs = don_df[ftrs_list]
target_col = don_df['donation_conversion']
X_train, X_test, y_train, y_test = train_test_split(ftrs, target_col, test_size=0.25)

# model with rf
rf = RandomForestRegressor(n_estimators=100)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
print("Random Forest targeting don_conversion")
print("\tPrediction error: {}".format((y_pred-y_test).mean()))
print("\tR^2: {}".format(rf.score(X_test, y_test)))

# print feature importances
print("\tfeature importances:")
for i in range(len(ftrs.columns)):
    if ftrs.columns[i] in ['class_count', 'cat_count']:
        print("\t\t{}: {}".format(ftrs.columns[i], rf.feature_importances_[i]))

Random Forest targeting don_conversion
	Prediction error: -0.000349115261988
	R^2: 0.0942886713147
	feature importances:
		class_count: 0.109342008573
		cat_count: 0.132199345656


In [46]:
don_df[['class_count', 'cat_count', 'donation_conversion']].corr()

Unnamed: 0,class_count,cat_count,donation_conversion
class_count,1.0,0.108865,-0.005044
cat_count,0.108865,1.0,-0.047141
donation_conversion,-0.005044,-0.047141,1.0


In [47]:
reg_df[['class_count', 'cat_count', 'reg_conversion']].corr()

Unnamed: 0,class_count,cat_count,reg_conversion
class_count,1.0,0.108865,-0.007188
cat_count,0.108865,1.0,-0.004348
reg_conversion,-0.007188,-0.004348,1.0


# Categories

## exploration

In [11]:
print("|-------|------|")
print("| mean | {} |".format(df.cat_count.mean()))
print("| std | {} |".format(df.cat_count.std()))
print("| mode | {} |".format(df.cat_count.mode()))
print("| unique | {} |".format(len(df.cat_count.unique())))

|-------|------|
| mean | 3.12805860587 |
| std | 3.97652687242 |
| mode | 0    1
dtype: int64 |
| unique | 26 |


In [24]:
df[['cat_count', 'reg_amt', 'reg_count']].corr()

Unnamed: 0,cat_count,reg_amt,reg_count
cat_count,1.0,0.007043,0.002151
reg_amt,0.007043,1.0,0.206859
reg_count,0.002151,0.206859,1.0


In [17]:
df.reg_count.describe()

count    268096.0
mean          0.0
std           0.0
min           0.0
25%           0.0
50%           0.0
75%           0.0
max           0.0
Name: reg_count, dtype: float64