In [2]:
import pandas as pd
import numpy as np
import itertools
from scipy.optimize import fmin_tnc

In [61]:
# Read the shot-level table and discard the leftover 'Unnamed: 0' column in one expression.
raw_csv_path = 'data/2014_with_hole_coordinates.csv'
data = pd.read_csv(raw_csv_path, sep = ',').drop('Unnamed: 0',axis=1)

In [63]:
# Every (course, round, hole) combination formed from the distinct values of each column.
# This is a full Cartesian product, so it can contain combinations with no rows in `data`.
course_names = np.unique(data['Course Name'])
round_numbers = np.unique(data['Round'])
hole_numbers = np.unique(data['Hole'])
uCRHtps = list(itertools.product(course_names,round_numbers,hole_numbers))

In [39]:
# Show the column labels of `data`.
# NOTE(review): the saved output below still lists 'Unnamed: 0', which the load cell drops,
# so this output appears to have been captured before that drop was run.
data.columns

Index([u'Unnamed: 0', u'# of Strokes', u'1st Putt Flag',
       u'Around the Green Flag', u'Course #', u'Course Name', u'Date',
       u'Distance', u'Distance from Center', u'Distance from Edge',
       u'Distance to Hole after the Shot', u'Distance to Pin', u'Elevation',
       u'From Location(Enhanced)', u'From Location(Scorer)', u'Hole',
       u'Hole Score', u'Hole X Coordinate', u'Hole Y Coordinate',
       u'Hole Z Coordinate', u'In the Hole Flag', u'Lie', u'Par Value',
       u'Permanent Tournament #', u'Player #', u'Player First Name',
       u'Player Last Name', u'Round', u'Shot', u'Shot Type(S/P/D)', u'Slope',
       u'Time', u'To Location(Enhanced)', u'To Location(Laser)', u'Tour Code',
       u'Tour Description', u'Tourn.#', u'Tournament Name', u'X Coordinate',
       u'Y Coordinate', u'Yardage', u'Year', u'Z Coordinate'],
      dtype='object')

In [None]:
# Goal: impute tee box locations, using all shots whose 'Shot' column equals 1.
# This follows the same process of optimizing a guess that was used when imputing the location of the hole:
# use a randomly selected shot as the initial guess, run the optimization, filter out
# shots over a certain threshold of implausibility, then rerun the optimization.

In [6]:
def f (a, xs=None, ys=None, zs=None, ds=None):
    """Mean squared gap between modelled and recorded shot distances.

    For a candidate point ``a = (x0, y0, z0)`` this computes, for every shot,
    the Euclidean distance from the shot's coordinates to the candidate and
    subtracts the recorded distance; the squared differences are averaged.

    Parameters
    ----------
    a : sequence of three numbers
        Candidate position ``(x0, y0, z0)``.
    xs, ys, zs, ds : array-like, optional
        Shot coordinates and recorded distances. When omitted, the
        notebook-level arrays ``x``, ``y``, ``z`` and ``d`` are used, so an
        optimiser that calls ``f(a)`` with a single argument keeps working.

    Returns
    -------
    float
        The mean of the squared distance residuals.

    Raises
    ------
    ValueError
        If there are no shots to evaluate.
    """
    # Fall back to the notebook globals only for arguments that were not supplied.
    if xs is None:
        xs = x
    if ys is None:
        ys = y
    if zs is None:
        zs = z
    if ds is None:
        ds = d
    xs, ys, zs, ds = (np.asarray(v, dtype=float) for v in (xs, ys, zs, ds))
    if len(xs) == 0:
        raise ValueError('f needs at least one shot to evaluate')
    x0,y0,z0 = a[0],a[1],a[2]
    residual = np.sqrt((xs-x0)**2 + (ys-y0)**2 + (zs-z0)**2) - ds
    # np.sum is vectorised; the builtin sum would iterate element by element in Python.
    return np.sum(residual**2)/len(xs)

def find_the_tee_box ():
    """Minimise ``f`` with ``fmin_tnc`` and return the best point found.

    Takes no arguments: the starting point is read from the notebook-level
    names ``x0``, ``y0`` and ``z0``. The gradient is approximated numerically
    (``approx_grad=1``) and at most 1000 function evaluations are allowed.

    Returns the optimised ``[x, y, z]`` as a plain Python list.
    """
    start_point = [x0,y0,z0]
    result = fmin_tnc(f,start_point,approx_grad=1,maxfun=1000)
    # fmin_tnc returns a tuple whose first element is the solution array.
    best_point = result[0]
    return best_point.tolist()

In [40]:
# initializing new data frame with two rows which will be deleted after
#newdata = pd.DataFrame(data.loc[1:2,:])
before_mean_and_size=[]
after_mean_and_size=[]
for u,i in enumerate(uCRHtps):
    if u%50==1:
        print u#, newdata.shape
    subset = data[(data['Course Name']==i[0]) & (data['Round']==int(i[1])) & (data['Hole']==int(i[2]))]
    before = subset.shape[0]
    if subset[subset['Distance to Hole after the Shot']!=0].shape[0] == 0:
        continue
    d = subset[subset['Shot']==1]['Distance'].values/12.0
    x = subset[subset['Shot']==1]['X Coordinate'].values
    y = subset[subset['Shot']==1]['Y Coordinate'].values
    z = subset[subset['Shot']==1]['Z Coordinate'].values
    rand_ind = np.random.choice(range(subset[subset['Shot']==1].shape[0]),size=1)
    rand_shot = subset[subset['Shot']==1][['X Coordinate','Y Coordinate','Z Coordinate']].values[rand_ind,:].tolist()[0]
    x0,y0,z0 = rand_shot[0],rand_shot[1],rand_shot[2]
    a = find_the_tee_box()
    subset.insert(len(subset.columns),'dist_w_impute',np.array([0]*subset[subset['Shot']!=1].shape[0] \
                  + (((x-a[0])**2 + (y-a[1])**2 + (z-a[2])**2)**.5).tolist()))
    subset.insert(len(subset.columns),'dist_diff',np.array([abs(subset['dist_w_impute'].values[j] - \
                  subset['Distance'].values[j]/12) if subset['Shot'].values[j]==1 else 0 for j in range(subset.shape[0])]))
    mean_err = subset[subset['dist_diff']>0]['dist_diff'].mean()
    before_mean_and_size.append((mean_err,len(subset)))
    std_err = subset[subset['dist_diff']>0]['dist_diff'].std()
    c=0
    while mean_err>216:
        c+=1
        if c>=20:
            break
        print u,mean_err
        subset = subset.drop(subset[subset['dist_diff'] > mean_err + 2.5*std_err].index,axis=0)
        subset = subset.drop('dist_w_impute',axis=1)
        subset = subset.drop('dist_diff',axis=1)
        d = subset[subset['Shot']==1]['Distance'].values/12.0
        x = subset[subset['Shot']==1]['X Coordinate'].values
        y = subset[subset['Shot']==1]['Y Coordinate'].values
        z = subset[subset['Shot']==1]['Z Coordinate'].values
        rand_ind = np.random.choice(range(subset[subset['Shot']==1].shape[0]),size=1)
        rand_shot = subset[subset['Shot']==1][['X Coordinate','Y Coordinate','Z Coordinate']].values[rand_ind,:].tolist()[0]
        x0,y0,z0 = rand_shot[0],rand_shot[1],rand_shot[2]
        a = find_the_tee_box()
        subset.insert(len(subset.columns),'dist_w_impute',np.array([0]*subset[subset['Shot']!=1].shape[0] \
                  + (((x-a[0])**2 + (y-a[1])**2 + (z-a[2])**2)**.5).tolist()))
        subset.insert(len(subset.columns),'dist_diff',np.array([abs(subset['dist_w_impute'].values[j] - \
                  subset['Distance'].values[j]/12) if subset['Shot'].values[j]==1 else 0 for j in range(subset.shape[0])]))
        mean_err = subset[subset['dist_diff']>0]['dist_diff'].mean()
        std_err = subset[subset['dist_diff']>0]['dist_diff'].std()
    after_mean_and_size.append((mean_err,len(subset)))
    
    
#     if c==20:
#         continue
#     subset = subset.drop('dist_w_impute',axis=1)
#     subset = subset.drop('dist_diff',axis=1)
#     subset.insert(len(subset.columns),'Tee Box X Coordinate',np.array([a[0]]*subset.shape[0]))
#     subset.insert(len(subset.columns),'Tee Box Y Coordinate',np.array([a[1]]*subset.shape[0]))
#     subset.insert(len(subset.columns),'Tee Box Z Coordinate',np.array([a[2]]*subset.shape[0]))
#     after = subset.shape[0]
#     if before-after>0:
#         print u, before-after
#     newdata = newdata.append(subset)
    #print u,newdata.shape

# newdata.drop(newdata.head(2).index, inplace=True)
# print newdata.shape


1
51
75 284.45458454
75 286.587676602
75 287.143756742
75 299.983152734
101
151
201
217 427.716838367
217 441.325817595
217 439.94271655
217 444.381228536
217 450.00117106
217 459.896986796
217 11719.1204236
217 466.888944177
217 263.579024228
251
301
351
401
451
472 272.512266426
472 272.51248971
472 272.510983008
472 285.110483861
472 274.002282588
472 272.511908376
472 272.513613433
472 272.512241319
472 272.48971017
472 272.438243897
472 271.083360978
472 272.511939844
472 272.50832567
472 274.002282588
472 272.510915039
472 272.511908376
472 272.512783107
472 272.514459307
472 272.513102213
501
525 240.66095994
525 241.841662121
525 243.37091783
525 244.580407047
525 246.544984812
551
582 249.756647823
582 243.954381071
582 246.292805549
601
651
701
751
801
825 291.598989196
825 290.033171894
825 292.324497092
825 298.905313787
851
901
951
1001
1051
1101
1136 413.401755372
1136 236.861435944
1136 234.180207929
1147 276.599290379
1147 276.085202634
1147 275.980023402
1147 274.52397

In [45]:
%matplotlib inline
import matplotlib.pyplot as plt 
colors = ['red','blue']
x1 = np.transpose(np.matrix([i[0] for i in before_mean_and_size]))
x2 = np.transpose(np.matrix([i[0] for i in after_mean_and_size]))
x1 = pd.Series(np.array([i[0] for i in before_mean_and_size]))
print x1.describe()
x2 = pd.Series(np.array([i[0] for i in after_mean_and_size]))
print x2.describe()
# x = np.concatenate((x1,x2),axis=1)
# plt.hist(x, color=colors)
# plt.show()

count    2750.000000
mean       65.408402
std       280.083551
min         0.347339
25%        17.384910
50%        41.399734
75%        69.205315
max      7753.126810
dtype: float64
count    2745.000000
mean       61.833521
std       271.008378
min         0.347339
25%              NaN
50%              NaN
75%              NaN
max      7748.453207
dtype: float64




In [64]:
# initializing new data frame with two rows which will be deleted after
newdata = pd.DataFrame(data.loc[1:2,:])
for u,i in enumerate(uCRHtps):
    if u%50==1:
        print u, newdata.shape
    subset = data[(data['Course Name']==i[0]) & (data['Round']==int(i[1])) & (data['Hole']==int(i[2]))]
    before = subset.shape[0]
    if subset[subset['Distance to Hole after the Shot']!=0].shape[0] == 0:
        continue
    d = subset[subset['Shot']==1]['Distance'].values/12.0
    x = subset[subset['Shot']==1]['X Coordinate'].values
    y = subset[subset['Shot']==1]['Y Coordinate'].values
    z = subset[subset['Shot']==1]['Z Coordinate'].values
    rand_ind = np.random.choice(range(subset[subset['Shot']==1].shape[0]),size=1)
    rand_shot = subset[subset['Shot']==1][['X Coordinate','Y Coordinate','Z Coordinate']].values[rand_ind,:].tolist()[0]
    x0,y0,z0 = rand_shot[0],rand_shot[1],rand_shot[2]
    a = find_the_tee_box()
    subset.insert(len(subset.columns),'dist_w_impute',np.array([0]*subset[subset['Shot']!=1].shape[0] \
                  + (((x-a[0])**2 + (y-a[1])**2 + (z-a[2])**2)**.5).tolist()))
    subset.insert(len(subset.columns),'dist_diff',np.array([abs(subset['dist_w_impute'].values[j] - \
                  subset['Distance'].values[j]/12) if subset['Shot'].values[j]==1 else 0 for j in range(subset.shape[0])]))
    mean_err = subset[subset['dist_diff']>0]['dist_diff'].mean()
    std_err = subset[subset['dist_diff']>0]['dist_diff'].std()
    c=0
    while mean_err>252:
        c+=1
        if c>=25:
            break
        print u,mean_err
        subset = subset.drop(subset[subset['dist_diff'] > mean_err + 2.5*std_err].index,axis=0)
        subset = subset.drop('dist_w_impute',axis=1)
        subset = subset.drop('dist_diff',axis=1)
        d = subset[subset['Shot']==1]['Distance'].values/12.0
        x = subset[subset['Shot']==1]['X Coordinate'].values
        y = subset[subset['Shot']==1]['Y Coordinate'].values
        z = subset[subset['Shot']==1]['Z Coordinate'].values
        rand_ind = np.random.choice(range(subset[subset['Shot']==1].shape[0]),size=1)
        rand_shot = subset[subset['Shot']==1][['X Coordinate','Y Coordinate','Z Coordinate']].values[rand_ind,:].tolist()[0]
        x0,y0,z0 = rand_shot[0],rand_shot[1],rand_shot[2]
        a = find_the_tee_box()
        subset.insert(len(subset.columns),'dist_w_impute',np.array([0]*subset[subset['Shot']!=1].shape[0] \
                  + (((x-a[0])**2 + (y-a[1])**2 + (z-a[2])**2)**.5).tolist()))
        subset.insert(len(subset.columns),'dist_diff',np.array([abs(subset['dist_w_impute'].values[j] - \
                  subset['Distance'].values[j]/12) if subset['Shot'].values[j]==1 else 0 for j in range(subset.shape[0])]))
        mean_err = subset[subset['dist_diff']>0]['dist_diff'].mean()
        std_err = subset[subset['dist_diff']>0]['dist_diff'].std()
    
    if c==25:
        print 'Skipping ', u, len(subset)
        continue
    subset = subset.drop('dist_w_impute',axis=1)
    subset = subset.drop('dist_diff',axis=1)
    subset.insert(len(subset.columns),'Tee Box X Coordinate',np.array([a[0]]*subset.shape[0]))
    subset.insert(len(subset.columns),'Tee Box Y Coordinate',np.array([a[1]]*subset.shape[0]))
    subset.insert(len(subset.columns),'Tee Box Z Coordinate',np.array([a[2]]*subset.shape[0]))
    after = subset.shape[0]
    if before-after>0:
        print u, before-after
    newdata = newdata.append(subset)

newdata.drop(newdata.head(2).index, inplace=True)
print newdata.shape

1 (507, 45)
51 (21733, 45)
101 (36029, 45)
151 (50481, 45)
201 (70529, 45)
251 (91668, 45)
253 440.573111334
253 8654.9091417
253 262.164663191
253 262.127627326
253 262.186034
253 264.492223845
253 8654.9091417
253 263.352361992
253 262.132841583
253 265.345931912
253 269.816816537
253 259.832979323
253 262.085509086
253 262.340438361
253 1
301 (109989, 45)
351 (129797, 45)
401 (136930, 45)
451 (146175, 45)
501 (160541, 45)
551 (185411, 45)
601 (206125, 45)
651 (224986, 45)
701 (249541, 45)
751 (273220, 45)
801 (291417, 45)
851 (311283, 45)
901 (335657, 45)
951 (348880, 45)
1001 (360151, 45)
1006 279.246958183
1006 322.53585499
1006 325.248141684
1006 3
1051 (371299, 45)
1101 (380720, 45)
1151 (386613, 45)
1201 (413106, 45)
1251 (432732, 45)
1301 (450456, 45)
1351 (473254, 45)
1401 (498366, 45)
1451 (515841, 45)
1501 (535800, 45)
1551 (561246, 45)
1554 399.116202292
1554 390.331023933
1554 2
1559 428.620504828
1559 1
1601 (577184, 45)
1609 279.671371787
1609 1
1651 (593441, 45)
1676 2

In [65]:
# Write `newdata` to CSV; index=False keeps the DataFrame index out of the file.
newdata.to_csv('data/2014_with_teebox_coordinates.csv',index=False)

In [66]:
# Print the (rows, columns) shape of the input frame `data`.
print data.shape

(985714, 42)
