In [27]:
# Third-party dependencies for the whole notebook.
import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.neighbors import KernelDensity
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
import matplotlib.pylab as plt

# Batted-ball rows, read from a CSV expected in the working directory.
raw = pd.read_csv("battedballdata.csv")

In [2]:
# Build the modelling frame from the raw batted-ball rows.
batted_ball_cols = ["hc_x", "hc_y", "launch_speed", "launch_angle", "woba_value"]
# List indexer + .copy(): the column assignment below must write to an
# independent frame, not to a slice of `raw` (avoids SettingWithCopyWarning).
data = raw.loc[:, batted_ball_cols].copy()
# Horizontal direction in degrees derived from the hit coordinates, measured
# about the point (125.42, 198.27) -- presumably home plate in hc_x/hc_y
# units; confirm against the data source.
data["launch_direction"] = np.arctan((data["hc_x"] - 125.42) / (198.27 - data["hc_y"])) * 180 / np.pi
# Keep only complete rows, then drop the coordinates that were needed solely
# to derive launch_direction.
data = data.dropna()
data = data.drop(columns=['hc_x', 'hc_y'])

launch_speed        float64
launch_angle        float64
woba_value          float64
launch_direction    float64
dtype: object


In [17]:
# Regression target; every other column of `data` is used as a model input.
target = 'woba_value'
predictors = [col for col in data.columns if col != target]

# Even 50/50 train/test partition with a fixed seed so the split is repeatable.
features, labels = data[predictors], data[target]
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.5,
    random_state=346,
)

float64


In [18]:
# RBF-kernel support vector regressor mapping the predictors to woba_value.
svr_settings = {'kernel': 'rbf', 'gamma': 0.01, 'C': 0.5, 'epsilon': 0}
clf = svm.SVR(**svr_settings)
clf.fit(X_train, y_train)

# Predictions on the held-out half of the data.
y_pred = clf.predict(X_test)

In [15]:
def svc_param_selection(X, y, nfolds, Cs=(.5,), gammas=(.01,), eps=(0,)):
    """Grid-search RBF-kernel SVR hyper-parameters with cross-validation.

    Despite the ``svc`` in the name, the estimator being tuned is
    ``svm.SVR`` (a regressor). Candidates are scored by negative mean
    absolute error and the search uses all available cores (``n_jobs=-1``).

    Parameters
    ----------
    X, y : training features and target, passed straight to ``fit``.
    nfolds : value forwarded to ``GridSearchCV`` as ``cv``.
    Cs, gammas, eps : iterables of candidate values for ``C``, ``gamma`` and
        ``epsilon``. The defaults reproduce the original single-point grid,
        so existing three-argument calls behave exactly as before.

    Returns
    -------
    dict
        ``best_params_`` of the fitted search, keyed by ``'C'``, ``'gamma'``
        and ``'epsilon'``.
    """
    param_grid = {'C': list(Cs), 'gamma': list(gammas), 'epsilon': list(eps)}
    grid_search = GridSearchCV(svm.SVR(kernel='rbf'), param_grid, cv=nfolds,
                               scoring='neg_mean_absolute_error', n_jobs=-1)
    grid_search.fit(X, y)
    return grid_search.best_params_

In [16]:
#print(svc_param_selection(X_train, y_train, 5))

{'epsilon': 0, 'C': 0.5, 'gamma': 0.01}


In [170]:
def kde_xwoba(clf, batter, raw, predictors):
    """Estimate one batter's expected wOBA from a KDE of their batted balls.

    The batter's rows are pulled from ``raw``, a ``launch_direction`` column
    is derived from the hit coordinates, and a Gaussian kernel density
    estimate is fitted to ``predictors`` with the bandwidth chosen by 5-fold
    cross-validation. 5000 points are sampled from that density, samples
    whose launch direction is 45 degrees or more off centre are discarded,
    and the mean of ``clf.predict`` over the remaining samples is returned.

    Parameters
    ----------
    clf : fitted estimator exposing ``predict``; receives the sampled array.
    batter : value matched against ``raw['batter']``.
    raw : DataFrame with ``batter``, ``hc_x``, ``hc_y``, ``launch_speed``,
        ``launch_angle`` and ``woba_value`` columns.
    predictors : ordered column names the KDE is fitted on; must include
        ``"launch_direction"``.

    Returns
    -------
    float
        Mean predicted value, or ``0`` when the batter has fewer than five
        complete rows (too few for the 5-fold bandwidth search).

    Raises
    ------
    ValueError
        If ``"launch_direction"`` is not in ``predictors``.
    """
    columns = ["hc_x", "hc_y", "launch_speed", "launch_angle", "woba_value"]
    # .copy() so adding a column below never writes into a slice of `raw`.
    df = raw.loc[raw['batter'] == batter, columns].copy()
    df["launch_direction"] = np.arctan((df["hc_x"] - 125.42) / (198.27 - df["hc_y"])) * 180 / np.pi
    df = df.dropna().drop(columns=['hc_x', 'hc_y'])

    # NOTE(review): 0 is a sentinel, not a real estimate -- callers that
    # average the results should filter it out.
    if len(df) < 5:
        return 0

    params = {'bandwidth': np.logspace(.7, 1.5, 50)}
    grid = GridSearchCV(KernelDensity(kernel='gaussian'), params, cv=5, n_jobs=-1)
    grid.fit(df[predictors])
    kde = grid.best_estimator_

    new_data = kde.sample(5000, random_state=42)
    # Look the direction column up by name instead of assuming it is the
    # third predictor, so reordering `predictors` cannot silently filter on
    # the wrong feature.
    direction_idx = list(predictors).index("launch_direction")
    new_data = new_data[np.abs(new_data[:, direction_idx]) < 45]
    return np.mean(clf.predict(new_data))

In [66]:
# Rows per batter id, then the ids that appear more than 100 times.
idcount = raw['batter'].value_counts()
ids = idcount.loc[idcount.gt(100)].index

In [74]:
# Run the KDE estimate for every qualifying batter, echoing progress as we go.
# Values are gathered in a list and converted once: np.append inside the loop
# copies the whole array on every iteration.
estimates = []
for bid in ids:
    tmp = kde_xwoba(clf, bid, raw, predictors)
    estimates.append(tmp)
    print(bid, tmp)
# Float array aligned position-by-position with `ids`.
res = np.array(estimates, dtype=float)

606299 0.2841601379064288
596019 0.3704205658611457
455976 0.3652239630669354
592518 0.3900217250218549
607208 0.3455490173711124
516416 0.31451096753722535
645277 0.33052390453498565
593160 0.36989891463826746
488726 0.35295653908185404
542255 0.28425815934074444
592743 0.30096898830312835
608324 0.37212764816284877
543760 0.3291248755238807
608070 0.34270445507735
453568 0.3500587586432441
519203 0.35087304073199976
543829 0.25631978653014137
518692 0.4201534989585017
430945 0.3520357034270408
493329 0.2956197687633151
643217 0.3642425992475411
542303 0.3760155748084104
571448 0.37581061915594116
516770 0.3414579202174905
609280 0.3557708512014634
519058 0.35964753825063595
467793 0.3368191946602854
476704 0.37571668182205165
543333 0.3057284059396836
592206 0.4436708413152116
570560 0.30443704944659583
571697 0.3491670353526181
605119 0.3543741433265296
518934 0.33696218600003425
514888 0.3565303798761522
520471 0.30976118798902347
592696 0.3422461604431974
571745 0.4106103072107125

445988 0.2966893772544317
640447 0.3552602978196664
643603 0.3485963123101628
543484 0.32381157008274714
572073 0.3330752406811553
459964 0.3626603893619446
491696 0.28577294843251483
571437 0.3979757674525449
607054 0.3057067570504827
624585 0.3648241711343545
607345 0.2872268993557932
434658 0.2668421525818213
547172 0.2454312537490679
608654 0.32020543371926335
500135 0.28158373288474714
435263 0.30933308843906065
593934 0.3876030239937562
519346 0.4044708915617584
527043 0.2798764199754639
543543 0.4027350427687627
592122 0.4097280817230336
545358 0.3609764640145078
641583 0.33551114994990816
446386 0.323508747251354
641319 0.31462790904010157
594824 0.2880894030321675
448602 0.381685936794936
571875 0.34597118290313866
600303 0.2901396776078484
434670 0.3409452428367507
600474 0.25884079742523486
605170 0.31232510691026316
640457 0.3535719750491507
571974 0.34161479107803533
596847 0.38471888275971905
570267 0.3867289233682919
408236 0.37615080222612257
571912 0.3633889637534474
6

In [200]:
# Pair every batter id with its estimate. Building the frame from a column
# dict keeps `batter` in its original integer dtype; stacking ids and floats
# into one array would upcast the ids to float before the merge on `batter`.
ids = np.array(ids)
result = pd.DataFrame({"batter": ids, "kde_xwoba": res}, columns=["batter", "kde_xwoba"])

In [201]:
# One row per (player_name, batter) pair; BIP is the number of rows in `raw`
# for that pair.
batters = (
    raw.groupby(['player_name', 'batter'])
    .size()
    .reset_index(name='BIP')
)

In [206]:
# Attach player names and BIP counts to each batter's estimate.
final = result.merge(batters, on='batter')
final['batter'] = final['batter'].astype('int64')

# Observed woba_value per row, restricted to batters present in `batters`.
temp = raw[['batter', 'woba_value']].merge(batters, on='batter')

# Shift every estimate by a constant so the BIP-weighted mean of kde_xwoba
# equals the mean observed woba_value.
weighted_total = sum(final['kde_xwoba'] * final['BIP'])
diff = weighted_total / sum(final['BIP']) - np.mean(temp['woba_value'])
final['kde_xwoba'] = final['kde_xwoba'] - diff

In [207]:
# Rank batters from highest to lowest adjusted estimate and show the table.
final = final.sort_values(by='kde_xwoba', ascending=False)
print(final)

     batter  kde_xwoba           player_name  BIP
189  608336   0.506679            Joey Gallo  291
60   502110   0.505353         J.D. Martinez  428
210  571970   0.491091             Max Muncy  266
213  592450   0.483034           Aaron Judge  264
253  660271   0.479840         Shohei Ohtani  223
54   605141   0.478506          Mookie Betts  433
29   592206   0.472156  Nicholas Castellanos  466
83   572761   0.471545        Matt Carpenter  409
86   501981   0.467917           Khris Davis  405
142  545361   0.465941            Mike Trout  350
72   502671   0.464713      Paul Goldschmidt  419
132  502054   0.460879            Tommy Pham  357
74   621566   0.460711            Matt Olson  418
165  606192   0.457477     Teoscar Hernandez  315
169  660670   0.457092      Ronald Acuna Jr.  312
94   443558   0.456986           Nelson Cruz  399
40   543685   0.456349        Anthony Rendon  450
106  547180   0.455676          Bryce Harper  386
381  488671   0.454497            Alex Avila  106


In [208]:
# Persist the ranked table to the working directory. With pandas' default
# index=True the frame's index is written as an unnamed first column.
final.to_csv("kde_xwoba.csv")