In [1]:
import math
import os

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.externals import joblib
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Form feature modeling

In [7]:
# Destination of the pickled form-feature model bundle.
# Anchored at the current user's home directory (the same convention the
# dataset CSV paths in this notebook use) instead of one developer's absolute
# /Users/... directory.  "~" is expanded explicitly because the path is later
# handed to joblib.dump as a plain filename.
path = os.path.expanduser(
    "~/Repositories/Recommendation/webroot/formhealth_features_04162018.pkl"
)

# Columns selected from the merged analytics frame for the feature model.
# 'form' is kept only as the join key for the embedding merge; it is dropped
# (together with the 'conversion' target) before fitting.
ftrs_core = [
    'opt_fields',
    'req_fields',
    'donation_active',
    'multirestriction_system',
    'restrictions',
    'permit_other_amount',
    'collect_captcha',
    'form',
]

In [3]:
# Load the analytics CSVs, join them into one frame, derive the 'conversion'
# target and attach the per-form embedding columns.  Produces `df_core`, the
# modelling frame used by the following cells.
print("load & prep data")
print("\treading CSV's")
df_base = pd.read_csv("~/Repositories/datasets/analytics/a_base.csv")
df_qgiv = pd.read_csv("~/Repositories/datasets/analytics/analytics_qgiv.csv")
df_qgiv_base = pd.read_csv("~/Repositories/datasets/analytics/a_qgiv_base_id.csv")

print("\tmerge dataframes")
df_qgiv = df_qgiv.merge(df_qgiv_base, left_on="id", right_on="id_x")
df = df_qgiv.merge(df_base, left_on="base", right_on="id", how="left")

print("\tadding conversion & embedding columns")
# conversion = transactions / (desktop + mobile visits).
# The cleanup is chained and assigned back in a single step:
#   * Series.replace returns a new Series, so calling it without using the
#     result (as the previous version did) left the infinities in place;
#   * fillna(inplace=True) on a column selection is a chained in-place update
#     that does not reach `df` under pandas Copy-on-Write.
total_visits = df['visits'] + df['mobile_visits']
df['conversion'] = (
    (df['don_form_trans_count'] / total_visits)
    .replace(np.inf, 1.)   # x / 0  -> treat as full conversion
    .fillna(0.)            # 0 / 0 or missing inputs -> no conversion
)

df_emb = pd.read_csv("~/Repositories/datasets/analytics/form_conversion_embedding.csv")
# Keep only rows with at least one visit; select rows and columns in one .loc
# call rather than chained indexing.
has_visits = (df['visits'] > 0) | (df['mobile_visits'] > 0)
df_core = df.loc[has_visits, ftrs_core + ['conversion']]
df_core = df_core.merge(df_emb, on="form")

print("\tdata prepped, {} features of {} observations".format(len(df_core.columns) - 2, len(df_core)))
# ...len(columns) - 2 because form & conversion are in there

load & prep data
	reading CSV's
	merge dataframes
	adding conversion & embedding columns
	data prepped, 37 features of 833215 observations


In [4]:
# Hold-out evaluation of a random forest on the form-feature frame.
# Both the split and the forest are seeded so the reported MSE is reproducible
# on a fresh kernel (Restart & Run All).
print("fit on training data & verify performance")
print("\ttrain/test split")
X_train, X_test, y_train, y_test = train_test_split(
    df_core.drop(['form', 'conversion'], axis=1),  # predictors only
    df_core['conversion'],                         # regression target
    random_state=42,
)

print("\tfitting model")
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)

print("\tevaluating model")
y_pred = rf.predict(X_test)
mse = mean_squared_error(y_test, y_pred)  # reused when the model is stored
print("\t\tMSE: {}".format(mse))

fit on training data & verify performance
	train/test split
	fitting model
	evaluating model
		MSE: 0.0134541628673


In [8]:
# Refit on every observation and persist the model bundle.
# `mse` is the hold-out error computed by the evaluation cell; it is stored
# next to the model as a quality reference.
print("fit on all data and store")
print("\tfitting model")
X_all = df_core.drop(['form', 'conversion'], axis=1)
y_all = df_core['conversion']
# Seeded so the persisted model can be regenerated exactly from the same data.
rf = RandomForestRegressor(random_state=42)
rf.fit(X_all, y_all)

print("\tstoring model to {}".format(path))
# Record the feature columns of the frame the stored model was actually
# fitted on, so the bundle does not depend on the train/test split variables.
joblib.dump({"fit": rf, "mse": mse, "features": X_all.columns}, path)

fit on all data and store
	fitting model
	storing model to /Users/jeremyvanvalkenburg/Repositories/Recommendation/webroot/formhealth_features_04162018.pkl


['/Users/jeremyvanvalkenburg/Repositories/Recommendation/webroot/formhealth_features_04162018.pkl']

In [9]:
# Shell escape: list the webroot directory to confirm the model file was written.
!ls -l ~/Repositories/Recommendation/webroot/

total 226504
-rw-r--r--  1 jeremyvanvalkenburg  staff      3941 Sep  7  2017 analytics.py
-rw-r--r--  1 jeremyvanvalkenburg  staff      4091 Feb 27 16:26 analytics.pyc
-rw-r--r--  1 jeremyvanvalkenburg  staff     12442 Mar  7 14:23 analytics_update.py
-rw-r--r--  1 jeremyvanvalkenburg  staff      7271 Feb 27 16:26 analytics_update.pyc
-rw-r--r--  1 jeremyvanvalkenburg  staff     15314 Sep  7  2017 db.py
-rw-r--r--  1 jeremyvanvalkenburg  staff     15166 Feb 27 16:26 db.pyc
-rw-r--r--  1 jeremyvanvalkenburg  staff   4025788 Sep  7  2017 form_features.pkl
-rw-r--r--  1 jeremyvanvalkenburg  staff    410212 Sep  7  2017 form_settings.pkl
-rw-r--r--  1 jeremyvanvalkenburg  staff   1411944 Apr 16 11:59 formhealth_features_04162018.pkl
-rw-r--r--  1 jeremyvanvalkenburg  staff  56049147 Mar 14 12:25 fraud.pkl
-rw-r--r--  1 jeremyvanvalkenburg  staff  53970079 Mar 14 12:20 fraud_sml.pkl
-rw-r--r--@ 1 jeremyvanvalkenburg  staff      7070 Mar 16 12:09 model_loader.py
-rw-r--r--  1 jeremyvanvalken

# Form settings modeling

In [10]:
# Destination of the pickled form-settings model bundle.
# Anchored at the current user's home directory (the same convention the
# dataset CSV paths in this notebook use) instead of one developer's absolute
# /Users/... directory.  "~" is expanded explicitly because the path is later
# handed to joblib.dump as a plain filename.
path = os.path.expanduser(
    "~/Repositories/Recommendation/webroot/formhealth_settings_04162018.pkl"
)

# Form-settings columns selected from the merged analytics frame.
# 'form' is kept only as the join key for the embedding merge; it is dropped
# (together with the 'conversion' target) before fitting.
ftrs = [
    'pledge_active',
    'donation_active',
    'multirestriction_system',
    'min_amount',
    'max_amount',
    'show_amount',
    'permit_anonymous',
    'permit_recurring',
    'permit_other_amount',
    'permit_create_own_pledge',
    'collect_company',
    'collect_phone',
    'collect_optin',
    'collect_captcha',
    'collect_address_mobile',
    'enable_donorlogins',
    'enable_sms',
    'form',
]

In [11]:
# Load the analytics CSVs, join them into one frame, derive the 'conversion'
# target and attach the per-form embedding columns.  Produces `df_core`, the
# modelling frame for the form-settings model, restricted to the `ftrs` columns.
print("load & prep data")
print("\treading CSV's")
df_base = pd.read_csv("~/Repositories/datasets/analytics/a_base.csv")
df_qgiv = pd.read_csv("~/Repositories/datasets/analytics/analytics_qgiv.csv")
df_qgiv_base = pd.read_csv("~/Repositories/datasets/analytics/a_qgiv_base_id.csv")

print("\tmerge dataframes")
df_qgiv = df_qgiv.merge(df_qgiv_base, left_on="id", right_on="id_x")
df = df_qgiv.merge(df_base, left_on="base", right_on="id", how="left")

print("\tadding conversion & embedding columns")
# conversion = transactions / (desktop + mobile visits).
# The cleanup is chained and assigned back in a single step:
#   * Series.replace returns a new Series, so calling it without using the
#     result (as the previous version did) left the infinities in place;
#   * fillna(inplace=True) on a column selection is a chained in-place update
#     that does not reach `df` under pandas Copy-on-Write.
total_visits = df['visits'] + df['mobile_visits']
df['conversion'] = (
    (df['don_form_trans_count'] / total_visits)
    .replace(np.inf, 1.)   # x / 0  -> treat as full conversion
    .fillna(0.)            # 0 / 0 or missing inputs -> no conversion
)

df_emb = pd.read_csv("~/Repositories/datasets/analytics/form_conversion_embedding.csv")
# Keep only rows with at least one visit; select rows and columns in one .loc
# call rather than chained indexing.
has_visits = (df['visits'] > 0) | (df['mobile_visits'] > 0)
df_core = df.loc[has_visits, ftrs + ['conversion']]
df_core = df_core.merge(df_emb, on="form")

print("\tdata prepped, {} features of {} observations".format(len(df_core.columns) - 2, len(df_core)))
# ...len(columns) - 2 because form & conversion are in there

load & prep data
	reading CSV's
	merge dataframes
	adding conversion & embedding columns
	data prepped, 47 features of 833215 observations


In [12]:
# Hold-out split for the form-settings model: predictors are every column
# except the 'form' join key and the 'conversion' target.
# Seeded so the split (and the MSE derived from it) is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df_core.drop(['form', 'conversion'], axis=1),
    df_core['conversion'],
    random_state=42,
)

In [13]:
# Fit on the training split and report hold-out error.
# The forest is seeded so the reported MSE is reproducible on a fresh kernel.
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)
mse = mean_squared_error(y_test, y_pred)  # reused when the model is stored
print("MSE: {}".format(mse))

MSE: 0.0133204596809


In [14]:
# Refit on every observation and persist the form-settings model bundle.
# `mse` is the hold-out error computed by the evaluation cell; it is stored
# next to the model as a quality reference.
print("fit on all data and store")
print("\tfitting model")
X_all = df_core.drop(['form', 'conversion'], axis=1)
y_all = df_core['conversion']
# Seeded so the persisted model can be regenerated exactly from the same data.
rf = RandomForestRegressor(random_state=42)
rf.fit(X_all, y_all)

print("\tstoring model to {}".format(path))
# Record the feature columns of the frame the stored model was actually
# fitted on, so the bundle does not depend on the train/test split variables.
joblib.dump({"fit": rf, "mse": mse, "features": X_all.columns}, path)

fit on all data and store
	fitting model
	storing model to /Users/jeremyvanvalkenburg/Repositories/Recommendation/webroot/formhealth_settings_04162018.pkl


['/Users/jeremyvanvalkenburg/Repositories/Recommendation/webroot/formhealth_settings_04162018.pkl']