In [22]:
import pandas as pd
import numpy as np
import feather
import gc

from sklearn.cross_validation import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LassoCV
from sklearn.linear_model import LarsCV
from sklearn.ensemble import GradientBoostingRegressor
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Category labels, in the order the per-category loops visit them; each label
# is compared against the ``Cat`` column of the difficulty table.
cats = [
    'Green',
    'Fairway',
    'Intermediate Rough',
    'Primary Rough',
    'Fringe',
    'Bunker',
    'Other',
]

In [3]:
def make_natural_spline_matrix(x,df):
    """Build a design matrix of natural-spline-style basis columns for ``x``.

    Knots are the ``df`` interior percentiles of ``x`` (the ``df + 2`` evenly
    spaced percentiles between 0 and 100 with both end points dropped).  The
    matrix contains a constant column, ``x`` itself, and one column per
    consecutive knot pair built from scaled differences of truncated cubic
    terms, each measured against the last knot.

    Parameters
    ----------
    x : array-like, 1-D
        Predictor values.  They are converted to float up front: the previous
        implementation wrote the cubed terms back into a copy of ``x``, so an
        integer-typed ``x`` silently truncated them.
    df : int
        Number of interior knots.  The result has ``2 + max(df - 2, 0)``
        columns, i.e. ``df`` columns whenever ``df >= 2``.

    Returns
    -------
    numpy.ndarray of shape ``(len(x), 2 + max(df - 2, 0))``.

    Notes
    -----
    If a knot coincides with the last knot (possible with heavily tied data)
    the scaling divisor is zero and the affected column holds inf/nan, exactly
    as before; callers should check for that on tied inputs.
    """
    x = np.asarray(x, dtype=float)
    knots = np.percentile(x,np.linspace(0,100,df+2)[1:-1])

    def cubed_excess(e):
        # (x - e)^3 where x > e, 0 where x <= e; NaNs in x stay NaN.
        return np.maximum(x - e, 0.0)**3

    def d(e_k,e_K):
        # Difference of truncated cubics, scaled by the knot spacing.
        return (cubed_excess(e_k) - cubed_excess(e_K))/(e_K-e_k)

    columns = [np.ones(len(x)), x]
    # Collect the columns in a list and stack once instead of re-stacking the
    # growing matrix on every iteration.
    for k in range(1,len(knots)-1):
        columns.append(d(knots[k],knots[-1]) - d(knots[k-1],knots[-1]))
    return np.column_stack(columns)

In [4]:
# Shared LarsCV estimator; it is the model handed to cross_val_score in the
# per-category spline run.
lcv = LarsCV(
    max_iter=10000,
    normalize=True,
    max_n_alphas=1000,
    copy_X=True,
)

In [5]:
def tensor_product(mats):
    """Row-wise tensor product of two or more matrices with matching rows.

    For two inputs ``A`` (n x p) and ``B`` (n x q) the result is n x (p*q),
    with column ``i*q + j`` equal to ``A[:, i] * B[:, j]``.  Further matrices
    are folded in from left to right in the same way.

    Parameters
    ----------
    mats : sequence of 2-D arrays
        At least two matrices.

    Returns
    -------
    numpy.ndarray
        The stacked element-wise products.

    Raises
    ------
    ValueError
        If fewer than two matrices are supplied.  (This used to return an
        error string, which then failed confusingly inside the caller's
        ``np.hstack``.)
    """
    if len(mats)<2:
        raise ValueError("I need at least two mats!")
    res = np.asarray(mats[0])
    for mat in mats[1:]:
        mat = np.asarray(mat)
        # (n, p, 1) * (n, 1, q) -> (n, p, q); flattening the last two axes
        # gives the left factor's column index as the slow-moving one.
        prod = res[:,:,None] * mat[:,None,:]
        res = prod.reshape(prod.shape[0], prod.shape[1]*prod.shape[2])
    return res

In [6]:
def run(df1,df2,df3,max_rows=40000,seed=None):
    """Cross-validate the shared ``lcv`` model on spline features, per category.

    For each label in ``cats`` the difficulty table is re-read, filtered to
    that category, has missing ``Green_to_work_with`` values replaced by the
    category mean, is randomly sub-sampled, and is expanded into spline bases
    plus their row-wise tensor product before being scored with
    ``cross_val_score``.

    Parameters
    ----------
    df1, df2, df3 : sequences indexed by category position
        ``df`` arguments passed to ``make_natural_spline_matrix`` for
        ``Distance_from_hole``, ``Green_to_work_with`` and ``Started_at_Z``
        respectively.  ``df2`` is not used for the 'Green' category.
    max_rows : int or None, optional
        Upper bound on the rows sampled per category (default 40000, the
        previously hard-coded value).  ``None`` keeps every row.
    seed : int or None, optional
        Seed for the row sampling.  ``None`` (default) draws from NumPy's
        global random state, as before; an integer makes the sample
        reproducible.

    Returns
    -------
    list
        One ``cross_val_score`` result per category, in ``cats`` order.
    """
    # With no seed, fall back to the module-level np.random functions so the
    # default behaviour is unchanged.
    rng = np.random if seed is None else np.random.RandomState(seed)
    errors = []
    for u,cat in enumerate(cats):
        gc.collect()
        # The table is re-read on every pass and dropped again before the
        # model is fitted, so the full frame is never held during fitting.
        data = feather.read_dataframe('./../difficulty.feather')
        data = data[data.Cat==cat]
        data.loc[data.Green_to_work_with.isnull(),'Green_to_work_with'] = \
        data.Green_to_work_with[data.Green_to_work_with.notnull()].mean()

        n_rows = len(data)
        n_keep = n_rows if max_rows is None else min(n_rows,max_rows)
        # Sampling without replacement also shuffles the rows when nothing
        # is dropped.
        samp = rng.choice(n_rows,n_keep,replace=False)
        data = data.iloc[samp,:]

        # Target is taken from the first column of the table.
        y = data.values[:,0]
        if cat=='Green':
            bases = [make_natural_spline_matrix(data.Distance_from_hole,df1[u]),
                     make_natural_spline_matrix(data.Started_at_Z,df3[u])]
        else:
            bases = [make_natural_spline_matrix(data.Distance_from_hole,df1[u]),
                     make_natural_spline_matrix(data.Green_to_work_with,df2[u]),
                     make_natural_spline_matrix(data.Started_at_Z,df3[u])]
        data = None
        gc.collect()
        # Main-effect bases first, then their interaction columns.
        X = np.hstack(bases + [tensor_product(bases)])
        bases = None
        gc.collect()
        errors.append(cross_val_score(lcv,X,y,scoring='mean_squared_error',n_jobs=1))
    return errors

In [7]:
# Display the category labels as a NumPy array (position = index used by the
# per-category parameter lists).
np.array(cats)

array(['Green', 'Fairway', 'Intermediate Rough', 'Primary Rough', 'Fringe',
       'Bunker', 'Other'], 
      dtype='|S18')

In [8]:
# Three rows, unpacked positionally into run(df1, df2, df3); each row holds
# seven entries that run indexes by category position.
param_grid = [
    [14] * 7,
    [7] * 7,
    [8] * 7,
]

In [9]:
# Unpack the three rows of param_grid as df1, df2, df3 and display the
# per-category cross-validation scores.
run(*param_grid)



[array([-0.15510111, -1.39432359, -6.62346583]),
 array([-0.33242987, -0.32931262, -0.31690858]),
 array([-0.30887963, -0.31894385, -0.31768968]),
 array([-0.36587546, -0.37081103, -0.36610863]),
 array([-0.18077784, -0.18793761, -0.18455915]),
 array([-0.38028543, -0.37734506, -0.38566152]),
 array([-0.51596645, -0.51014027, -0.49910991])]

In [30]:
# Load the full difficulty table into the global `data`; the pooled-error
# cells count rows per category from its `Cat` column.
# NOTE(review): this cell's execution count (30) is later than the first cell
# that reads `data` (21), so the notebook was not run top to bottom.
data = feather.read_dataframe('./../difficulty.feather')

In [21]:
# Pooled RMSE: each category's MSE is weighted by its row count in `data`,
# the total is divided by the number of rows, and the square root is taken.
# NOTE(review): each value below is the magnitude of only the FIRST of the
# three fold scores recorded for that category; for 'Green' the other two
# recorded folds (-1.39, -6.62) are far worse, so this figure is optimistic.
(sum(
    (data.Cat == label).sum() * mse
    for label, mse in [
        ("Green", 0.15510111),
        ("Fairway", 0.33242987),
        ("Intermediate Rough", 0.30887963),
        ("Primary Rough", 0.36587546),
        ("Fringe", 0.18077784),
        ("Bunker", 0.38028543),
        ("Other", 0.51596645),
    ]
) / len(data)) ** .5

0.48617642480294376

In [39]:
# Shared gradient-boosting regressor fitted once per category by run().
# No random_state is passed although subsample=.5 is used, so repeated fits
# are not expected to reproduce exactly.
gbr = GradientBoostingRegressor(
    loss='ls',
    learning_rate=0.01,
    n_estimators=1000,
    subsample=.5,
    min_weight_fraction_leaf=0.0,
    max_depth=3,
)

In [40]:
def run():
    """Fit the shared ``gbr`` model per category and report training-set MSE.

    Categories are visited in reverse ``cats`` order.  For each one the
    difficulty table is re-read, filtered to that category, and missing
    ``Green_to_work_with`` values are replaced by the category mean.  The
    target is column 0; the features are columns 2 and 4 for 'Green' and
    every column from 2 onward otherwise.

    The category label, the fitted feature importances and the error are
    printed for each category.

    NOTE(review): the error is computed by predicting on the same rows the
    model was just fitted on, so it is an in-sample figure and is not
    directly comparable with cross-validated scores.

    Returns
    -------
    list of float
        Training-set mean squared error per category, in visiting order.
    """
    errors = []
    for cat in cats[::-1]:
        gc.collect()
        data = feather.read_dataframe('./../difficulty.feather')
        data = data[data.Cat==cat]
        data.loc[data.Green_to_work_with.isnull(),'Green_to_work_with'] = \
        data.Green_to_work_with[data.Green_to_work_with.notnull()].mean()

        # Materialise the value array once; target and features are both
        # sliced from it.
        values = data.values
        y = values[:,0]
        if cat=='Green':
            X = values[:,np.array([2,4])]
        else:
            X = values[:,2:]
        data = None
        values = None
        gc.collect()
        gbr.fit(X,y)
        # Single-argument print(...) produces the same output on Python 2
        # and Python 3.
        print(cat)
        print(gbr.feature_importances_)
        error = np.mean((gbr.predict(X)-y)**2)
        print(error)
        errors.append(error)
    return errors

In [27]:
# NOTE(review): the output recorded for this cell is a list of three-element
# score arrays, which the zero-argument run() defined in this notebook (it
# returns one float per category) cannot produce; the cell's execution count
# (27) also predates that definition (40). Treat the recorded output as stale
# and re-run from a fresh kernel.
run()

[array([-0.11665466, -0.11131057, -0.11583314]),
 array([-0.3135188 , -0.30775616, -0.31905701]),
 array([-0.29630674, -0.30533181, -0.30735962]),
 array([-0.3609437 , -0.36251877, -0.3633638 ]),
 array([-0.18694726, -0.18624939, -0.18571111]),
 array([-0.38349447, -0.36652894, -0.37572808]),
 array([-0.50196789, -0.50959922, -0.5026008 ])]

In [41]:
# Fit the gradient-boosting model per category; prints the label, feature
# importances and error for each, then displays the returned list of errors.
run()

Other
[ 0.41335769  0.23836312  0.34827919]
0.493646743899
Bunker
[ 0.53774321  0.19835877  0.26389802]
0.372257282426
Fringe
[ 0.46624475  0.22086954  0.31288571]
0.182508595153
Primary Rough
[ 0.56121132  0.17140679  0.26738189]
0.361347962243
Intermediate Rough
[ 0.56586519  0.19200746  0.24212735]
0.300202209856
Fairway
[ 0.6112764   0.16812388  0.22059972]
0.31740937289
Green
[ 0.69468874  0.30531126]
0.115458066078


[0.4936467438990713,
 0.37225728242567796,
 0.1825085951531437,
 0.36134796224265625,
 0.30020220985632007,
 0.31740937288979143,
 0.11545806607753986]

In [43]:
# Pooled RMSE for the gradient-boosting fits: each category's MSE is weighted
# by its row count in `data`, averaged over all rows, then square-rooted.
# NOTE(review): the values are copied from the printed per-category errors,
# which are computed on the rows used for fitting (in-sample), so this figure
# is not directly comparable with a cross-validated one.
(sum(
    (data.Cat == label).sum() * mse
    for label, mse in [
        ("Green", 0.11545806607753986),
        ("Fairway", 0.31740937289),
        ("Intermediate Rough", 0.300202209856),
        ("Primary Rough", 0.361347962243),
        ("Fringe", 0.182508595153),
        ("Bunker", 0.372257282426),
        ("Other", 0.493646743899),
    ]
) / len(data)) ** .5

0.45785217793734079