In [43]:
import pandas as pd
import json

# Strikeout lines file dated 2025-06-04; each row carries a player, a k_line and
# over/under prices (see the preview below).
lines_csv = "../data/lines/strikeouts_2025-06-04.csv"
lines = pd.read_csv(lines_csv)

# Preview up to the first 99 rows.
lines.iloc[:99]

Unnamed: 0,player,k_line,over_american_price,over_decimal_price,over_payout_multiplier,under_american_price,under_decimal_price,under_payout_multiplier
0,Kyle Freeland,3.5,-159,1.63,0.82,119,2.19,1.13
1,Andrew Abbott,4.5,-130,1.77,0.92,-103,1.98,1.04
2,DL Hall,2.5,103,2.03,1.07,-137,1.73,0.9
3,Lucas Giolito,5.5,-109,1.92,1.0,-122,1.82,1.0
4,José Soriano,4.5,-115,1.87,1.0,-115,1.87,1.0
5,Ryan Gusto,4.5,108,2.08,1.09,-143,1.7,0.88
6,Mike Burrows,3.5,-110,1.91,1.0,-121,1.83,1.0
7,Matthew Boyd,5.5,108,2.08,1.1,-143,1.7,0.88
8,MacKenzie Gore,6.5,-125,1.8,1.0,-107,1.94,1.0
9,Clarke Schmidt,4.5,-167,1.6,0.8,125,2.25,1.16


In [44]:

# Every distinct player name that appears in the lines file.
probables = lines['player'].unique().tolist()

df_2025 = pd.read_parquet("../data/processed/pitcher_game_data_2025.parquet")

# Keep only the most recent game row (by game_date) for each probable pitcher.
# The trailing .copy() makes df_latest an independent frame, so columns added to
# it later are written to its own data rather than to a filtered selection.
df_latest = (
    df_2025[df_2025['pitcher_name'].isin(probables)]
    .sort_values(["pitcher_name", 'game_date'])
    .groupby("pitcher_name")
    .tail(1)
    .copy()
)

# Names are matched by exact string equality, so a pitcher whose name is spelled
# differently in the two sources (accents, initials, suffixes) or who has no 2025
# rows would otherwise vanish from the predictions without any signal.
matched_names = set(df_latest['pitcher_name'])
unmatched = [name for name in probables if name not in matched_names]
if unmatched:
    print(
        f"WARNING: {len(unmatched)} pitcher(s) with a line have no row in the "
        f"2025 game data and will get no prediction: {unmatched}"
    )

# The model metadata file lists the feature columns, in order, under "features".
with open("../models/XGB_Tuned.json") as f:
    meta = json.load(f)

# Feature matrix: one row per matched pitcher, columns as listed in the metadata.
X_pred = df_latest[meta["features"]]

In [45]:
from joblib import load
# Load the saved model from disk.
model_path = "../models/xgb_tuned_pitcher_k_model.joblib"
model = load(model_path)

# Score the feature matrix and store the predictions alongside the rows they
# were computed from.
k_predictions = model.predict(X_pred)
df_latest['model_k_pred'] = k_predictions

In [46]:
# Attach the line data to each scored pitcher (left join keeps every df_latest
# row), then compute the edge: model prediction minus the posted line. A positive
# edge means the model predicts more strikeouts than the line.
merged = df_latest.merge(
    lines,
    how='left',
    left_on='pitcher_name',
    right_on='player',
).assign(edge=lambda frame: frame['model_k_pred'] - frame['k_line'])

# Compact view: who, the line, the prediction, and the gap between them.
pick_view = ['pitcher_name', 'k_line', 'model_k_pred', 'edge']
picks = merged[pick_view]
picks.iloc[:99]

Unnamed: 0,pitcher_name,k_line,model_k_pred,edge
0,Andrew Abbott,4.5,5.591453,1.091453
1,Chris Sale,6.5,4.943423,-1.556577
2,Clarke Schmidt,4.5,7.008631,2.508631
3,Emerson Hancock,4.5,2.011127,-2.488873
4,Griffin Canning,4.5,2.330023,-2.169977
5,Kyle Freeland,3.5,5.945942,2.445942
6,Lucas Giolito,5.5,3.335991,-2.164009
7,Luis L. Ortiz,4.5,6.520377,2.020377
8,MacKenzie Gore,6.5,6.582089,0.082089
9,Matthew Boyd,5.5,5.203145,-0.296855


In [47]:
# Size of the model-vs-line gap, ignoring direction.
merged['abs_edge'] = merged['edge'].abs()

# Rank rows by that size, largest first, with a fresh 0..n-1 index.
top = merged.sort_values(by='abs_edge', ascending=False, ignore_index=True)

# Trim to the columns needed to read a pick.
pick_columns = ['pitcher_name', 'k_line', 'model_k_pred', 'edge', 'abs_edge']
top_picks = top[pick_columns]


In [48]:
# Show up to the first 20 rows of top_picks.
top_picks.iloc[:20]

Unnamed: 0,pitcher_name,k_line,model_k_pred,edge,abs_edge
0,Clarke Schmidt,4.5,7.008631,2.508631,2.508631
1,Emerson Hancock,4.5,2.011127,-2.488873,2.488873
2,Kyle Freeland,3.5,5.945942,2.445942,2.445942
3,Griffin Canning,4.5,2.330023,-2.169977,2.169977
4,Lucas Giolito,5.5,3.335991,-2.164009,2.164009
5,Nick Pivetta,6.5,4.399947,-2.100053,2.100053
6,Luis L. Ortiz,4.5,6.520377,2.020377,2.020377
7,Shane Baz,4.5,2.529403,-1.970597,1.970597
8,Chris Sale,6.5,4.943423,-1.556577,1.556577
9,Andrew Abbott,4.5,5.591453,1.091453,1.091453
