In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix,bmat
import gc

from sklearn.preprocessing import LabelBinarizer,LabelEncoder
from sklearn.model_selection import GroupShuffleSplit

import xgboost as xgb

%matplotlib inline
import matplotlib.pyplot as plt

In [2]:
cats = ['Green','Fairway','Intermediate Rough','Primary Rough','Fringe','Bunker','Other']

In [None]:
params = {'objective':'reg:linear','silent':1,'eta':.2,'max_depth':5,'subsample':.7,'min_child_weight':10,
          'alpha':.2,'lambda':1.5}
results = {}
for cat in cats[1:]:
    print '************************* DOING %s *************************' % (cat,)
    
    data = pd.concat([pd.read_csv('./../new_data/%d.csv' % year)[['Cat','Shots_taken_from_location','Started_at_Z',
            'Distance_from_hole','Hole','Round','Course_#','Year','Green_to_work_with','Player_#']] for year in range(2003,2017)])
    data = data[data.Cat==cat]
    data.insert(len(data.columns),'Year-Course',data.Year.astype(str).str.cat(data['Course_#'].astype(str),sep='-'))
    data.insert(len(data.columns),'Hole-Course',data.Hole.astype(str).str.cat(data['Course_#'].astype(str),sep='-'))
    data.insert(len(data.columns),'Hole-Year-Course',data.Hole.astype(str).str.cat(
                                        [data.Year.astype(str),data['Course_#'].astype(str)],sep='-'))
    data.insert(len(data.columns),'Round-Year-Course',data.Round.astype(str).str.cat(
                                        [data.Year.astype(str),data['Course_#'].astype(str)],sep='-'))
    data.insert(len(data.columns),'Hole-Round-Year-Course',data.Hole.astype(str).str.cat(
                            [data.Round.astype(str),data.Year.astype(str),data['Course_#'].astype(str)],sep='-'))

    groups = ['-'.join(map(str,tup)) for tup in data[['Hole','Round','Course_#','Year','Player_#']].values.tolist()]
    le = LabelEncoder()
    groups = le.fit_transform(groups)

    y = data.Shots_taken_from_location.values
    
    lb = LabelBinarizer(sparse_output=True)
    results[cat] = {}
    
    ##simplest

    X = csr_matrix(data.values[:,np.array([2,3,8])].astype(float))
    results[cat][0] = get_best_estimators(params,X,y,groups)

    ## with course

    X = bmat([[X,lb.fit_transform(data.values[:,6].astype(str))]],format='csr')
    results[cat][1] = get_best_estimators(params,X,y,groups)

    ## with year-course

    X = bmat([[X,lb.fit_transform(data.values[:,9].astype(str))]],format='csr')
    results[cat][2] = get_best_estimators(params,X,y,groups)

    ## with hole-course

    X = bmat([[X,lb.fit_transform(data.values[:,10].astype(str))]],format='csr')
    results[cat][3] = get_best_estimators(params,X,y,groups)

    ## with round-year-course

    X = bmat([[X,lb.fit_transform(data.values[:,11].astype(str))]],format='csr')
    results[cat][4] = get_best_estimators(params,X,y,groups)

    # with hole-year-course

    X = bmat([[X,lb.fit_transform(data.values[:,12].astype(str))]],format='csr')
    results[cat][5] = get_best_estimators(params,X,y,groups)

    # with everything

    X = bmat([[X,lb.fit_transform(data.values[:,13].astype(str))]],format='csr')
    results[cat][6] = get_best_estimators(params,X,y,groups)

************************* DOING Fairway **************************
**** FOLD 0 ****
0.318324875069
**** FOLD 1 ****
0.318104613412
**** FOLD 2 ****
0.31809367166
**** FOLD 3 ****
0.317862614668
**** FOLD 4 ****
0.317256456262
**** FOLD 5 ****
0.31772669636
**** FOLD 6 ****
0.317683346039
**** FOLD 7 ****
0.317037372185
**** FOLD 8 ****
0.31783952489
**** FOLD 9 ****
0.316641135971
**** FOLD 10 ****
0.31715487122
**** FOLD 11 ****
0.317281606598
**** FOLD 0 ****
0.315923324298
**** FOLD 1 ****
0.314630678843
**** FOLD 2 ****
0.314760671425
**** FOLD 3 ****
0.316525000841
**** FOLD 4 ****
0.314669988856
**** FOLD 5 ****
0.315193253434
**** FOLD 6 ****
0.315124511348
**** FOLD 7 ****
0.317110512785
**** FOLD 8 ****
0.315709842492
**** FOLD 9 ****
0.316962941904
**** FOLD 10 ****
0.315144406845
**** FOLD 11 ****
0.316726635814
**** FOLD 0 ****
0.315110247101
**** FOLD 1 ****
0.316445731007
**** FOLD 2 ****
0.314056012664
**** FOLD 3 ****
0.315862593145
**** FOLD 4 ****

In [None]:
res_2[0]+res_2[1]

In [None]:
res_0[0]

In [None]:
for u,(mean,std) in enumerate([res_0,res_1,res_2,res_3,res_4,res_5,res_6]):
    plt.scatter([u],[mean])
    plt.plot((u, u), (mean-std, mean+std), 'k-')

plt.ylim(.115,.116);

In [None]:
feature_names = ['elev_change','distance'] + ['Course = ' + name for name in lbins[6].classes_] + \
                ['Year = ' + name.strip().split('-')[0] + ', Course = ' + name.strip().split('-')[1] for name in lbins[9].classes_] + \
                ['Hole = ' + name.split('-')[0] + ', Course = ' + name.split('-')[1] for name in lbins[10].classes_] + \
                ['Hole = ' + name.split('-')[0] + ', Year = ' + name.split('-')[1] + ', Course = ' + name.split('-')[2] for name in lbins[11].classes_] + \
                ['Round = ' + name.split('-')[0] + ', Year = ' + name.split('-')[1] + ', Course = ' + name.split('-')[2] for name in lbins[12].classes_] + \
                ['Hole = ' + name.split('-')[0] + ', Round = ' + name.split('-')[1] + ', Year = ' + name.split('-')[2] + ', Course = ' + name.split('-')[3] for name in lbins[13].classes_]

In [None]:
d = {(feature_names[int(key[1:])],int(key[1:])):value for key,value in bst.get_score().iteritems()}
df = pd.DataFrame({'label': [key[0] for key in d.iterkeys()],'ind': [key[1] for key in d.iterkeys()]
                   ,'count': [d[key] for key in d.iterkeys()]})
d = {feature_names[int(key[1:])]:(int(key[1:]),value) for key,value in bst.get_score().iteritems()}

In [19]:
def get_best_estimators(params,X,y,groups,cv_folds=12,early_stopping_rounds=50):
    cv = GroupShuffleSplit(n_splits=cv_folds, test_size=0.2)
    errors = []
    for u,(train,test) in enumerate(cv.split(X,y,groups)):
        print '**** FOLD %d ****' % (u,)
        dtrain = xgb.DMatrix(X[train],label=y[train])
        dtest = xgb.DMatrix(X[test],label=y[test])
        watchlist  = [(dtrain,'train'),(dtest,'eval')]
        num_round = 100000
        bst = xgb.train(params,dtrain,num_round,watchlist,early_stopping_rounds=early_stopping_rounds,verbose_eval=False)       
        error = np.mean((bst.predict(dtest,ntree_limit=bst.best_iteration) - y[test])**2)
        print error
        errors.append(error)
    return (np.array(errors).mean(),np.array(errors).std())

In [None]:
len(bst.get_score())

In [None]:
len(feature_names)

In [None]:
df.sort_values('count',ascending=False).head(10).reset_index(drop=True)

In [None]:
arr = np.zeros(52674)
arr[0] = -1
arr[1] = 10
arr = xgb.DMatrix(csr_matrix(arr),feature_names = dMat.feature_names)
bst.predict(arr)

In [None]:
try: print d['Hole = 18, Round = 4, Year = 2004, Course = 512']
except: print 'none'
try: print d['Round = 4, Year = 2004, Course = 512']
except: print 'none'
try: print d['Hole = 18, Course = 512']
except: print 'none'
try: print d['Year = 2004, Course = 512']
except: print 'none'
try: print d['Course = 512']
except: print 'none'

In [None]:
preds1 = []
preds2 = []
for dist in np.linspace(0.01,100,350):
    arr = np.zeros(52674)
    arr[1] = dist
    arr[0] = 0
    preds1.append(bst.predict(xgb.DMatrix(csr_matrix(arr),feature_names = dMat.feature_names)).tolist()[0])
    arr[35131] = 1
    arr[13667] = 1
    arr[1423] = 1
    arr[24] = 1
    preds2.append(bst.predict(xgb.DMatrix(csr_matrix(arr),feature_names = dMat.feature_names)).tolist()[0])

In [None]:
plt.figure(figsize=(10,10))
plt.plot(np.linspace(0.01,100,350)[0:30],preds1[0:30],label='generic');
plt.plot(np.linspace(0.01,100,350)[0:30],preds2[0:30],label='special green');
plt.legend();

In [None]:
np.array(preds1)-np.array(preds2)

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

In [None]:
gbr = GradientBoostingRegressor(loss='ls', learning_rate=0.1, n_estimators=100, subsample=.6, 
                                criterion='friedman_mse', min_samples_split=20, min_samples_leaf=5, 
                                max_depth=3, verbose=5)

In [None]:
arr = np.zeros(52674)
arr[0] = 0
arr[1] = 1
gbr.predict(np.matrix(arr))

In [None]:
(gbr.feature_importances_!=0).sum()

In [None]:
for ind in np.argsort(gbr.feature_importances_)[::-1][:25]:
    print ind,dMat.feature_names[ind],gbr.feature_importances_[ind]

In [None]:
gbr.fit(X,y)

In [None]:
preds1 = []
preds2 = []
for dist in np.linspace(0,100,100):
    arr = np.zeros(52674)
    arr[1] = dist
    arr[0] = -.1
    preds1.append(gbr.predict(np.matrix(arr)))
    arr[0] = .1
    preds2.append(gbr.predict(np.matrix(arr)))

In [None]:
plt.figure(figsize=(10,10))
plt.plot(np.linspace(0,100,100),preds1,label='uphill 1 ft.');
plt.plot(np.linspace(0,100,100),preds2,label='downhill 1 ft.');
plt.legend();

In [None]:
for arg,num in zip(np.argsort(gbr.feature_importances_)[::-1],np.sort(gbr.feature_importances_)[::-1])[0:7]:
    print arg,num

In [None]:
fig,axs = plot_partial_dependence(gbr,X_train_m2,[4],feature_names=,n_jobs=1,grid_resolution=50)

In [None]:
np.mean(((lcv.predict(normalizer.transform(X_test)) + gbr.predict(X_test_m))/2 - y_test)**2)

In [None]:
(((data.Cat=="Green").sum()*0.11545806607753986 +
 (data.Cat=="Fairway").sum()*0.31740937289 +
 (data.Cat=="Intermediate Rough").sum()*0.300202209856 +
 (data.Cat=="Primary Rough").sum()*0.361347962243 +
 (data.Cat=="Fringe").sum()*0.182508595153 + 
 (data.Cat=="Bunker").sum()*0.372257282426 +
 (data.Cat=="Other").sum()*0.493646743899)/len(data))**.5