# xBA and xxBA
Statcast's "expected batting average" aka xBA is a KNN model of "hit probability" as a function of exit velocity and launch angle. Batted balls are labelled 1 if the batter made it to 1st, or 0 otherwise. The probability that a batted ball results in the batter reaching 1st base is estimated by taking the average result of "similar balls" aka nearest neighbours.

Statcast's KNN model is not publicly available (to my knowledge), but we can approximate it by fitting a KNN to its output. I call this model xxBA, since xxBA estimates xBA in the same way xBA estimates BA. This should give better granularity than training my own KNN model on hit/no-hit labels: averaging Statcast's continuous xBA values lets the model express a finer range of values while keeping the number of neighbours quite small.


In [1]:
import pandas as pd
from sklearn.neighbors import KNeighborsRegressor as KNN
import matplotlib.pyplot as plt
# Load the swing-level CSV and keep only the columns used in this notebook.
columns_of_interest = [
    "description", "type", "events", "bb_type",
    "launch_speed", "launch_angle", "estimated_ba_using_speedangle",
]
df = (
    pd.read_csv("data/mlb-all-swings-no-bunts.csv")[columns_of_interest]
    # no launch speed recorded -> the swing missed, so drop the row
    .dropna(subset=["launch_speed"])
    .rename(columns={"launch_speed": "exit_velocity"})
    # there's only one catcher interference, but we should remove it anyway
    .loc[lambda swings: swings["events"] != "catcher_interf"]
)
df

Unnamed: 0,description,type,events,bb_type,exit_velocity,launch_angle,estimated_ba_using_speedangle
0,hit_into_play,X,field_out,line_drive,91.0,25.0,0.106
2,hit_into_play,X,field_out,line_drive,63.3,22.0,0.540
3,hit_into_play,X,field_out,ground_ball,51.9,-81.0,0.129
4,hit_into_play,X,field_out,fly_ball,94.1,31.0,0.177
5,foul,S,,,76.2,59.0,
...,...,...,...,...,...,...,...
49993,foul,S,,,84.6,55.0,
49995,hit_into_play,X,single,fly_ball,102.1,23.0,0.583
49996,foul,S,,,80.8,69.0,
49997,hit_into_play,X,double,ground_ball,99.0,-6.0,0.294


In [2]:
# Model inputs: the two feature columns for every row of df.
exit_launch = df.loc[:, ["exit_velocity", "launch_angle"]]

In [3]:
# Train xxBA only on rows where the Statcast xBA column is populated.
training_mask = df["estimated_ba_using_speedangle"].notna()
training_df = df.loc[training_mask]
xxba = KNN(n_neighbors=20)
xxba.fit(
    training_df[["exit_velocity", "launch_angle"]],
    # target is passed as a one-column frame; later cells squeeze the predictions
    training_df[["estimated_ba_using_speedangle"]],
)

In [4]:
import pickle

# Serialise the fitted xxBA model to disk.
with open("xxba.pickle", mode="wb") as model_file:
    pickle.dump(xxba, model_file)

# Training my own xBA
I'm curious about whether xxBA and xBA are meaningfully different, so I'm going to train my own xBA and compare. Since the model takes 2D input it is easily plotted, using colour to show output. 

In [7]:
# A row is labelled as a hit when its event is one of these four outcomes.
hit_events = ["single", "double", "triple", "home_run"]
label = df["events"].isin(hit_events)
xba = KNN(n_neighbors=50)
# same training rows as xxBA, but the target is the observed hit / no-hit label
xba.fit(exit_launch.loc[training_mask], label.loc[training_mask])

In [None]:
import pickle
# Serialise the fitted xBA model to disk.
with open("xba.pickle", mode="wb") as model_file:
    pickle.dump(xba, model_file)

# xxBA vs My xBA vs Statcast xBA

In [24]:
plt.close("all")
%matplotlib notebook
import matplotlib.pyplot as plt
from sklearn import metrics
fig1, ax = plt.subplots(1, 3)
ax[0].set_title("xxBA")
ax[0].scatter(training_df.exit_velocity, training_df.launch_angle, c=xxba.predict(exit_launch[training_mask]), cmap="inferno", vmax=1, vmin=0, s=1/10)
ax[1].set_title("xBA")
ax[1].scatter(training_df.exit_velocity, training_df.launch_angle, c=xba.predict(exit_launch[training_mask]), cmap="inferno", vmax=1, vmin=0, s=1/10)
ax[2].set_title("Statscast xBA")
ax[2].scatter(df.exit_velocity[training_mask], df.launch_angle[training_mask], c=df.estimated_ba_using_speedangle[training_mask], cmap="inferno", vmax=1, vmin=0, s=1/10)
fig1.show()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [55]:
# Predictions for every row of df; squeeze() flattens any trailing length-1 axis.
xx_pred = xxba.predict(exit_launch).squeeze()
x_pred = xba.predict(exit_launch).squeeze()
# The Statcast column is only populated on the training rows.
statscast_pred = training_df.estimated_ba_using_speedangle
train_label = label[training_mask]

fig, ax = plt.subplots(1, 2)
hits_ax, misses_ax = ax
hits_ax.set_title("Actual Hits (exceeding)")
misses_ax.set_title("Actual non-hits (non-exceeding)")

# (legend name, values on actual hits, values on actual non-hits)
curves = [
    ("xx", xx_pred[label], xx_pred[~label]),
    ("x", x_pred[label], x_pred[~label]),
    ("statscast", statscast_pred[train_label], statscast_pred[~train_label]),
]
for name, on_hits, on_misses in curves:
    hits_ax.ecdf(on_hits, label=name, complementary=True)
    misses_ax.ecdf(on_misses, label=name)

for axis in ax:
    axis.legend()
fig.show()

<IPython.core.display.Javascript object>

In [45]:
# Compare the two fitted models on the rows with exit velocity below 50.
dat = df.loc[df.exit_velocity < 50, ["exit_velocity", "launch_angle"]]
fig, ax = plt.subplots()
# ravel() always yields a 1-D array, even if only a single row is selected
ax.ecdf(xxba.predict(dat).ravel(), label="xx", complementary=True)
ax.ecdf(xba.predict(dat).ravel(), label="x", complementary=True)
ax.set_title("<50mph ECDF")
ax.set_xlabel("predicted value")
ax.set_ylabel("probability of exceeding")
# the curves carry labels, so draw the legend that tells them apart
ax.legend()
fig.show()

<IPython.core.display.Javascript object>

In [56]:
# Overlay the ROC curves of the three estimators on one axis.
fig, ax = plt.subplots()
fig.suptitle("ROC")
# note that the statscast data doesn't have to predict foul balls
roc_inputs = [
    ("xxBA", label, xxba.predict(exit_launch)),
    ("xBA", label, xba.predict(exit_launch)),
    ("StatsCast", label[training_mask], training_df.estimated_ba_using_speedangle),
]
for name, truth, score in roc_inputs:
    metrics.RocCurveDisplay.from_predictions(truth, score, name=name, ax=ax)
fig.show()

<IPython.core.display.Javascript object>

In [122]:
# Close all figures currently open in pyplot.
plt.close("all")

(5817,)

In [57]:
# Absolute gap between each model's prediction and the Statcast xBA column.
# Rows without a Statcast value are removed by training_mask before plotting.
statcast_xba = df.estimated_ba_using_speedangle
diff_x = abs(x_pred - statcast_xba)
diff_xx = abs(xx_pred - statcast_xba)

fig, ax = plt.subplots()
ax.ecdf(diff_x[training_mask], complementary=True, label="x")
ax.ecdf(diff_xx[training_mask], complementary=True, label="xx")
ax.set_title("Exceedance Curve of Absolute Difference from StatsCast\n(fouls not included)")
ax.set_xlabel("absolute difference from Statscast")
ax.set_ylabel("probability of exceeding")
# Add the legend before showing, and not as the last expression, so it is part
# of the rendered figure and its repr is not echoed as cell output.
ax.legend()
fig.show()

<IPython.core.display.Javascript object>

<matplotlib.legend.Legend at 0x22167193c50>

(35012, 1)