In [1]:
from pybaseball import statcast
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.preprocessing import StandardScaler
from scipy.stats import zscore
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor

df = statcast(start_dt="2023-03-28", end_dt="2025-09-28")
df = df[df["game_type"] == "R"].copy()

This is a large query, it may take a moment to complete


That's a nice request you got there. It'd be a shame if something were to happen to it.
We strongly recommend that you enable caching before running this. It's as simple as `pybaseball.cache.enable()`.
Since the Statcast requests can take a *really* long time to run, if something were to happen, like: a disconnect;
gremlins; computer repair by associates of Rudy Giuliani; electromagnetic interference from metal trash cans; etc.;
you could lose a lot of progress. Enabling caching will allow you to immediately recover all the successful
subqueries if that happens.


Skipping offseason dates
Skipping offseason dates


100%|████████████████████████████████████████████████████████████████████████████████| 677/677 [06:17<00:00,  1.79it/s]


In [61]:
df = df.dropna(subset=["player_name", "batter", "pitch_type", "description"])
df["swing"] = df["description"].isin([
    "swinging_strike", "foul", "hit_into_play", "foul_tip", "swinging_strike_blocked"
]).astype(int)
df["called_pitch"] = df["description"].isin(["called_strike", "ball", "blocked_ball", "hit_by_pitch", "pitchout"]).astype(int)
df["is_strike"] = df["description"].isin([
    "swinging_strike", "called_strike", "foul", "foul_tip", "hit_into_play", "swinging_strike_blocked", "foul_bunt", "missed_bunt", "bunt_foul_tip"
]).astype(int)

In [62]:
plate_features = [
    "pfx_x", "pfx_z", "plate_x", "plate_z",
    "release_speed", "balls", "strikes",
    "release_spin_rate", "release_extension",
    "release_pos_y", "effective_speed", "spin_axis"
]
df = df.dropna(subset=plate_features)

df["swing_miss"] = df["description"].isin([
    "swinging_strike", "swinging_strike_blocked", "foul_tip"
]).astype(int)

df["swing_foul"] = df["description"].isin([
    "foul", "foul_bunt", "bunt_foul_tip"
]).astype(int)

df["swing_inplay"] = df["description"].isin([
    "hit_into_play"
]).astype(int)

df["swing"] = (df["swing_miss"] + df["swing_foul"] + df["swing_inplay"]).clip(upper=1)

swing_model = DecisionTreeClassifier(max_depth=6)
X = df[plate_features]
y = df["swing"]
swing_model.fit(X, y)
df["swing_prob"] = swing_model.predict_proba(X)[:, 1]

called_df = df[df["called_pitch"] == 1]
strike_model = DecisionTreeClassifier(max_depth=6)
X_called = called_df[plate_features]
y_called = called_df["is_strike"]
strike_model.fit(X_called, y_called)
df["strike_prob"] = strike_model.predict_proba(df[plate_features])[:, 1]

df["expected_swing_rv"] = (
    df["swing_inplay"] * df["estimated_woba_using_speedangle"].fillna(0) +
    df["swing_miss"] * -0.15 +
    df["swing_foul"] * -0.02
)

df["pitch_score"] = (
    df["swing_prob"] * df["expected_swing_rv"] +
    df["strike_prob"] * -0.1 +
    (1 - df["strike_prob"]) * 0.05
)

df["decision_score"] = np.where(
    df["swing"] == 1,
    df["expected_swing_rv"] - df["pitch_score"],
    0.05 - df["pitch_score"]
)

plate_discipline = df.groupby("batter")["decision_score"].mean().rename("plate_discipline")

In [63]:
power_df = df.dropna(subset=["launch_speed", "launch_angle"])

power_features = [
    "release_speed", "plate_x", "plate_z", "launch_angle",
    "pfx_x", "pfx_z", "release_spin_rate", "release_extension",
    "release_pos_y", "effective_speed"
]

power_df["release_speed_x_launch_angle"] = power_df["release_speed"] * power_df["launch_angle"]
power_df["release_speed_squared"] = power_df["release_speed"] ** 2
power_df["launch_angle_squared"] = power_df["launch_angle"] ** 2
power_df["pfx_x_squared"] = power_df["pfx_x"] ** 2
power_df["pfx_z_squared"] = power_df["pfx_z"] ** 2

power_features_extended = power_features + [
    "release_speed_x_launch_angle",
    "release_speed_squared",
    "launch_angle_squared",
    "pfx_x_squared",
    "pfx_z_squared"
]

X_power = power_df[power_features_extended]
y_power = power_df["launch_speed"]

reg = DecisionTreeRegressor(max_depth=6)
reg.fit(X_power, y_power)

power_df["xEV"] = reg.predict(X_power)

power_df["EV_residual"] = power_df["launch_speed"] - power_df["xEV"]

power_df["bat_speed"] = (power_df["EV_residual"] / 1.2) + 72

power_metric = power_df.groupby("batter")["bat_speed"].median().rename("power")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  power_df["release_speed_x_launch_angle"] = power_df["release_speed"] * power_df["launch_angle"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  power_df["release_speed_squared"] = power_df["release_speed"] ** 2
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  power_df["launch_angle_squared"] = power_d

In [64]:
contact_features = [
    "pfx_x", "pfx_z", "plate_x", "plate_z",
    "release_speed", "balls", "strikes",
    "release_spin_rate", "release_extension",
    "release_pos_y", "effective_speed", "spin_axis",
    "attack_angle", "attack_direction", "swing_path_tilt",
    "intercept_ball_minus_batter_pos_x_inches",
    "intercept_ball_minus_batter_pos_y_inches"
]

contact_df = df[df["swing"] == 1].copy()

contact_df["made_contact"] = contact_df["description"].isin([
    "foul", "hit_into_play"
]).astype(int)

contact_df = contact_df.dropna(subset=contact_features)

contact_model = RandomForestClassifier(
    n_estimators=200,
    max_depth=None,
    min_samples_leaf=10,
    random_state=42,
    n_jobs=-1
)
X_c = contact_df[contact_features]
y_c = contact_df["made_contact"]
contact_model.fit(X_c, y_c)

contact_df["xContact"] = contact_model.predict_proba(X_c)[:, 1]

contact_df["contact_value"] = np.where(
    contact_df["made_contact"] == 1,
    1 - contact_df["xContact"],
    -contact_df["xContact"]
)

contact_metric = contact_df.groupby("batter")["contact_value"].mean().rename("contact")

In [65]:
speed_features = ["hit_distance_sc", "launch_speed", "launch_angle"]

speed_df["home_to_first"] = np.where(
    ((speed_df["events"] == "single") |
     ((speed_df["events"] == "field_out") & (speed_df["bb_type"] == "ground_ball"))),
    4.0 - (speed_df["hit_distance_sc"] / 300),
    np.nan
)

speed_metric = (
    speed_df.groupby("batter")["home_to_first"].median().rename("speed")
)

In [66]:
tools = pd.concat([plate_discipline, power_metric, contact_metric, speed_metric], axis=1)

for col in tools.columns:

    tools[col] = tools[col].fillna(tools[col].mean())

    median = tools[col].median()
    iqr = tools[col].quantile(0.75) - tools[col].quantile(0.25)
    
    if iqr == 0:
        iqr = 1e-6

    tools[col + "_grade"] = 50 + 20 * (tools[col] - median) / iqr

    tools[col + "_grade"] = tools[col + "_grade"].clip(20, 80)

batter_counts = df["batter"].value_counts()
qualified_batters = batter_counts[batter_counts >= 1000].index
tools = tools.loc[tools.index.isin(qualified_batters)]

clean_tools = tools.dropna()

woba_df = df.groupby("batter")["estimated_woba_using_speedangle"].mean()
merged = clean_tools.merge(woba_df, on="batter", how="inner")

merged = merged.dropna(subset=[
    "plate_discipline", "power", "contact", "speed", "estimated_woba_using_speedangle"
])

X_final = merged[["plate_discipline", "power", "contact", "speed"]]
y_final = merged["estimated_woba_using_speedangle"]

kf = KFold(n_splits=5, shuffle=True, random_state=42)
model = LinearRegression()
r2_scores = cross_val_score(model, X_final, y_final, cv=kf, scoring="r2")
print(f"Tool-based wOBA R² (5-fold CV): {np.mean(r2_scores):.3f}")

Tool-based wOBA R² (5-fold CV): 0.475


In [68]:
final_output = tools.copy()

filtered_df = df[df["batter"].isin(final_output.index)]

filtered_df["hitter_name"] = filtered_df["des"].str.extract(r"^(\w+\s\w+)")

name_map = (
    filtered_df[["batter", "hitter_name"]]
    .dropna(subset=["batter", "hitter_name"])
    .drop_duplicates(subset="batter")
    .set_index("batter")
)

final_output = final_output.reset_index(names="batter")

final_output["player_name"] = final_output["batter"].map(name_map["hitter_name"])

final_output = final_output.round(2)

ordered_cols = (
    ["batter", "player_name"]
    + [c for c in final_output.columns if "grade" in c]
    + [c for c in final_output.columns if c not in ["batter", "player_name"] and "grade" not in c]
)
final_output = final_output[ordered_cols]

final_output.to_csv("hitter_tools.csv", index=False)
print("✅ Finished grading hitters. Saved as hitter_tools.csv")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df["hitter_name"] = filtered_df["des"].str.extract(r"^(\w+\s\w+)")


✅ Finished grading hitters. Saved as hitter_tools.csv
