In [8]:
import pandas as pd
import json

lines = pd.read_csv("../data/lines/strikeouts.csv")
lines.head(20)

Unnamed: 0,player,k_line,over_american_price,over_decimal_price,over_payout_multiplier,under_american_price,under_decimal_price,under_payout_multiplier
0,Tanner Bibee,4.5,-167,1.6,0.8,125,2.25,1.16
1,Dustin May,5.5,125,2.25,1.16,-167,1.6,0.8
2,Tomoyuki Sugano,3.5,116,2.16,1.11,-154,1.65,0.83
3,Andre Pallante,3.5,-164,1.61,0.81,123,2.23,1.15
4,Logan Webb,5.5,-173,1.58,0.77,130,2.3,1.16
5,Jack Flaherty,5.5,-159,1.63,0.82,119,2.19,1.13
6,Spencer Strider,6.5,113,2.13,1.12,-150,1.67,0.86
7,Ranger Suárez,5.5,113,2.13,1.12,-150,1.67,0.86
8,Taj Bradley,5.5,119,2.19,1.13,-159,1.63,0.82
9,Joe Ryan,5.5,-164,1.61,0.81,123,2.23,1.15


In [9]:

probables = lines['player'].unique().tolist()

df_2025 = pd.read_parquet("../data/processed/pitcher_game_data_2025.parquet")
df_latest = (
    df_2025[df_2025['pitcher_name'].isin(probables)]
    .sort_values(["pitcher_name", 'game_date'])
    .groupby("pitcher_name")
    .tail(1)
)

with open("../models/ridge_pitcher_k_model.json") as f:
    meta = json.load(f)

X_pred = df_latest[meta["features"]]

In [10]:
from joblib import load
model = load("../models/ridge_pitcher_k_model.joblib")
df_latest['model_k_pred'] = model.predict(X_pred)

In [11]:
merged = df_latest.merge(lines, left_on='pitcher_name', right_on='player', how='left')
merged['edge'] = merged['model_k_pred'] - merged['k_line']
picks = merged[['pitcher_name', 'k_line', 'model_k_pred', 'edge']]
picks.head(25)

Unnamed: 0,pitcher_name,k_line,model_k_pred,edge
0,Hunter Brown,6.5,7.050312,0.550312
1,Joe Ryan,5.5,5.823476,0.323476
2,Logan Webb,5.5,7.484013,1.984013
3,Nathan Eovaldi,5.5,0.602921,-4.897079
4,Tylor Megill,6.5,7.382911,0.882911


In [12]:
print(df_latest.shape[0])
print(X_pred.shape[0])
print("Expected probables:", len(probables))
print("Probables found in df_2025:", df_2025['pitcher_name'].isin(probables).sum())

import difflib
for name in probables:
    close = difflib.get_close_matches(name, df_2025['pitcher_name'].unique(), n=1)
    print(f"{name} -> {close}")

5
5
Expected probables: 29
Probables found in df_2025: 52
Tanner Bibee -> []
Dustin May -> []
Tomoyuki Sugano -> []
Andre Pallante -> []
Logan Webb -> ['Logan Webb']
Jack Flaherty -> []
Spencer Strider -> []
Ranger Suárez -> []
Taj Bradley -> []
Joe Ryan -> ['Joe Ryan']
Tylor Megill -> ['Tylor Megill']
Jonathan Cannon -> []
Brady Singer -> []
Daniel Lynch -> []
Aaron Civale -> []
Nathan Eovaldi -> ['Nathan Eovaldi']
Bowden Francis -> []
Germán Márquez -> []
Cade Horton -> []
Hunter Brown -> ['Hunter Brown']
JP Sears -> []
Tyler Anderson -> []
Carlos Rodón -> []
Corbin Burnes -> []
Mike Burrows -> []
Mitchell Parker -> ['Mitch Keller']
Logan Evans -> []
Max Meyer -> []
Stephen Kolek -> []


In [13]:
df_2025['pitcher_name'].unique()

array(['Tarik Skubal', 'Jesus Luzardo', 'Garrett Crochet', 'Zack Wheeler',
       'Kris Bubic', 'Logan Webb', 'Nathan Eovaldi', 'Max Fried',
       'Hunter Brown', 'MacKenzie Gore', 'Cole Ragans', 'Bryan Woo',
       'Yoshinobu Yamamoto', 'Pablo Lopez', 'Matthew Liberatore',
       'Nick Martinez', 'Mitch Keller', 'Tylor Megill', 'Luis Severino',
       'Chris Sale', 'Tyler Mahle', 'Framber Valdez', 'Michael King',
       'Kodai Senga', 'Nick Pivetta', 'Merrill Kelly', 'Joe Ryan',
       'Chris Bassitt'], dtype=object)

In [14]:
df_2025.describe()

Unnamed: 0,game_date,pitch_count,strikeouts,max_inning,num_pitch_types,rest_days,whiff_rate,csw_pct,whiff_rate_expanding,csw_pct_expanding,opponent_k_pct,park_factor_K,rolling_K_avg_3,rolling_K_avg_5,rolling_pitch_count_5,rolling_K_rate,pitcher_id
count,275,275.0,275.0,275.0,275.0,275.0,275.0,275.0,275.0,275.0,275.0,275.0,275.0,275.0,275.0,275.0,275.0
mean,2025-04-28 15:58:15.272727296,93.029091,6.345455,6.112727,5.287273,5.792727,0.0,0.0,0.0,0.0,0.223947,1.002311,5.922424,5.64,88.922182,0.048075,640037.32
min,2025-04-01 00:00:00,39.0,0.0,2.0,3.0,3.0,0.0,0.0,0.0,0.0,0.141026,0.848444,2.333333,3.0,80.4,0.01559,518876.0
25%,2025-04-14 00:00:00,89.0,5.0,6.0,5.0,5.0,0.0,0.0,0.0,0.0,0.203306,0.946159,5.0,5.0,85.0,0.042026,607259.0
50%,2025-04-29 00:00:00,93.0,6.0,6.0,5.0,6.0,0.0,0.0,0.0,0.0,0.223122,1.002838,5.333333,5.0,85.0,0.055,657277.0
75%,2025-05-13 00:00:00,99.0,8.0,7.0,6.0,6.0,0.0,0.0,0.0,0.0,0.236082,1.060868,7.0,6.2,93.1,0.055,669197.5
max,2025-05-27 00:00:00,112.0,13.0,9.0,8.0,17.0,0.0,0.0,0.0,0.0,0.428571,1.133173,10.333333,10.0,102.8,0.070136,808967.0
std,,9.330463,2.483656,1.013604,0.833394,0.987486,0.0,0.0,0.0,0.0,0.032477,0.069701,1.493308,1.194513,5.368513,0.010142,57669.086195
