In [50]:
%pylab inline
from sklearn import linear_model
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn import preprocessing, cross_validation, cluster, tree
from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics
import scipy, scipy.stats, matplotlib
matplotlib.style.use('ggplot')

Populating the interactive namespace from numpy and matplotlib


In [2]:
analytic_base = pd.read_csv('~/Documents/data/analytic_base.csv')
analytic_qgiv = pd.read_csv('~/Documents/data/analytic_qgiv_stats.csv')

ab = analytic_base[['id', 'org', 'form', 'timestamp', 'visits', 'mobile_visits', 'don_form_trans_count', 'don_form_trans_vol']]
aq = analytic_qgiv[['id', 'base', 'org', 'total_visits', 'opt_fields', 'req_fields', 'donation_active', 'amounts_system', 'multirestriction_system', 'restrictions', 'pledges_count', 'pledge_active', 'permit_anonymous', 'permit_mobile', 'permit_other_amount', 'enable_donorlogins', 'collect_captcha']]
d = pd.merge(ab, aq, left_on="id", right_on="base")

In [8]:
# filter out not visited observations
data = d[d.visits > 0]
# add conversion
conversion = pd.DataFrame({'conversion':data["don_form_trans_count"]/data["total_visits"]*100})
# merge conversion w/ the rest of the data
data = pd.concat([data, conversion], axis=1)
# add day_of_month & month
date_data = pd.DataFrame({
        'day': pd.to_datetime(data.timestamp).apply(lambda x: x.day), 
        'month': pd.to_datetime(data.timestamp).apply(lambda x: x.month)})
# merge date data w/ the rest of the data
data = pd.concat([data, date_data], axis=1)
data.drop("timestamp", axis=1, inplace=True)

# add restrictionsXmultirestriction interaction term, restrictions^2, opt_fields^2, req_fields^2
data['restrictionsXmultirestriction'] = data.restrictions*data.multirestriction_system
data['restrictions2'] = data.restrictions*data.restrictions
data['restrictions2Xmultirestriction'] = data.restrictions2*data.multirestriction_system
data['restrictions3'] = data.restrictions*data.restrictions*data.restrictions
data['restrictions3Xmultirestriction'] = data.restrictions3*data.multirestriction_system
data['opt_fields2'] = data.opt_fields*data.opt_fields
data['opt_fields3'] = data.opt_fields*data.opt_fields*data.opt_fields
data['req_fields2'] = data.req_fields*data.req_fields
data['req_fields3'] = data.req_fields*data.req_fields*data.req_fields
data['fields'] = data['opt_fields'] + data['req_fields']
data['fields2'] = data['fields']*data['fields']
data['fields3'] = data['fields']*data['fields']*data['fields']

In [9]:
data.columns

Index([u'id_x', u'org_x', u'form', u'visits', u'mobile_visits',
       u'don_form_trans_count', u'don_form_trans_vol', u'id_y', u'base',
       u'org_y', u'total_visits', u'opt_fields', u'req_fields',
       u'donation_active', u'amounts_system', u'multirestriction_system',
       u'restrictions', u'pledges_count', u'pledge_active',
       u'permit_anonymous', u'permit_mobile', u'permit_other_amount',
       u'enable_donorlogins', u'collect_captcha', u'conversion', u'day',
       u'month', u'restrictionsXmultirestriction', u'restrictions2',
       u'restrictions2Xmultirestriction', u'restrictions3',
       u'restrictions3Xmultirestriction', u'opt_fields2', u'opt_fields3',
       u'req_fields2', u'req_fields3', u'fields', u'fields2', u'fields3'],
      dtype='object')

# Cluster by visits

In [16]:
clusters = cluster.KMeans(n_clusters=5, random_state=42).fit(data.total_visits.values.reshape(-1,1))

In [22]:
clusters.cluster_centers_

array([[    4.01543321],
       [  364.51836735],
       [ 1074.38571429],
       [   98.35045679],
       [ 2873.65      ]])

In [32]:
data['traffic_clusters'] = clusters.predict(data.total_visits.values.reshape(-1,1))

In [41]:
data['cluster_0'] = data.traffic_clusters==0
data['cluster_1'] = data.traffic_clusters==1
data['cluster_2'] = data.traffic_clusters==2
data['cluster_3'] = data.traffic_clusters==3
data['cluster_4'] = data.traffic_clusters==4

In [42]:
data.drop('traffic_clusters', axis=1, inplace=True)

In [43]:
data.columns

Index([u'id_x', u'org_x', u'form', u'visits', u'mobile_visits',
       u'don_form_trans_count', u'don_form_trans_vol', u'id_y', u'base',
       u'org_y', u'total_visits', u'opt_fields', u'req_fields',
       u'donation_active', u'amounts_system', u'multirestriction_system',
       u'restrictions', u'pledges_count', u'pledge_active',
       u'permit_anonymous', u'permit_mobile', u'permit_other_amount',
       u'enable_donorlogins', u'collect_captcha', u'conversion', u'day',
       u'month', u'restrictionsXmultirestriction', u'restrictions2',
       u'restrictions2Xmultirestriction', u'restrictions3',
       u'restrictions3Xmultirestriction', u'opt_fields2', u'opt_fields3',
       u'req_fields2', u'req_fields3', u'fields', u'fields2', u'fields3',
       u'cluster_0', u'cluster_1', u'cluster_2', u'cluster_3', u'cluster_4'],
      dtype='object')

In [44]:
data.drop("id_x", axis=1, inplace=True)
data.drop("org_x", axis=1, inplace=True)
data.drop("form", axis=1, inplace=True)
data.drop("visits", axis=1, inplace=True)
data.drop("mobile_visits", axis=1, inplace=True)
data.drop("don_form_trans_count", axis=1, inplace=True)
data.drop("don_form_trans_vol", axis=1, inplace=True)
data.drop("id_y", axis=1, inplace=True)
data.drop("base", axis=1, inplace=True)
data.drop("org_y", axis=1, inplace=True)
data.drop("total_visits", axis=1, inplace=True)

In [45]:
data.columns

Index([u'opt_fields', u'req_fields', u'donation_active', u'amounts_system',
       u'multirestriction_system', u'restrictions', u'pledges_count',
       u'pledge_active', u'permit_anonymous', u'permit_mobile',
       u'permit_other_amount', u'enable_donorlogins', u'collect_captcha',
       u'conversion', u'day', u'month', u'restrictionsXmultirestriction',
       u'restrictions2', u'restrictions2Xmultirestriction', u'restrictions3',
       u'restrictions3Xmultirestriction', u'opt_fields2', u'opt_fields3',
       u'req_fields2', u'req_fields3', u'fields', u'fields2', u'fields3',
       u'cluster_0', u'cluster_1', u'cluster_2', u'cluster_3', u'cluster_4'],
      dtype='object')

# Run quick fit w/ Decision Tree Regressor

In [51]:
X_train, X_test, y_train, y_test = train_test_split(data.drop('conversion', axis=1), data.conversion, test_size=0.25, random_state=42)

dtr = tree.DecisionTreeRegressor()
dtr.fit(X_train, y_train)

y_pred = dtr.predict(X_test)

r2 = metrics.r2_score(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)

print "R2: "+str(r2)
print "MSE: "+str(mse)

R2: -0.562249047922
MSE: 2557.71004394


Let's try training on segmented data

In [56]:
r2s = []
mses = []
lr2s = []
lmses = []

for c in ['cluster_0', 'cluster_1', 'cluster_2', 'cluster_3', 'cluster_4']:
    _d = data[data[c]==True]
    X_train, X_test, y_train, y_test = train_test_split(_d.drop('conversion', axis=1), _d.conversion, test_size=0.25, random_state=42)

    dtr = tree.DecisionTreeRegressor()
    dtr.fit(X_train, y_train)
    
    lr = linear_model.LinearRegression()
    lr.fit(X_train, y_train)

    y_pred = dtr.predict(X_test)
    
    r2s.append(metrics.r2_score(y_test, y_pred))
    mses.append(metrics.mean_squared_error(y_test, y_pred))
    
    y_pred = lr.predict(X_test)
    
    lr2s.append(metrics.r2_score(y_test, y_pred))
    lmses.append(metrics.mean_squared_error(y_test, y_pred))
    
print "Stepped Decision Tree Regressor:"
print "\tR2: "+str(np.mean(r2s))
print "\tMSE: "+str(np.mean(mses))
print "Stepped Linear Regression:"
print "\tR2: "+str(np.mean(lr2s))
print "\tMSE: "+str(np.mean(lmses))

Stepped Decision Tree Regressor:
	R2: 0.196472948084
	MSE: 630.008368176
Stepped Linear Regression:
	R2: -1.29658248638
	MSE: 519.266983579


Here we're seeing a pretty significant improvement in the metrics! 

This is what we see in the multivariate linear model:
- R2: 0.00879105544379
- MSE: 255.329264828