# MLB 2025 Baselines (Paper-aligned): Logistic Regression + SVM

This notebook implements **Steps 1–8** to train **Logistic Regression** and **SVM** baselines on your `games_table_2025` table, using a **time-based cutoff** and dropping the columns we agreed to exclude.

**Assumptions**
- Your SQLite DB file is `mlb_scrape.sqlite`
- You have already built: `games_table_2025` (via step3)
- Cutoff date: **2025-04-07** (inclusive)


In [108]:
# Step 1: Imports + config
import sqlite3
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, roc_auc_score

DB_PATH = "mlb_scrape.sqlite"

In [109]:
START_YEAR, END_YEAR = 2015, 2025

conn = sqlite3.connect(DB_PATH)

# (Optional) verify which tables exist
existing = set(r[0] for r in conn.execute(
    "SELECT name FROM sqlite_master WHERE type='table' AND name LIKE 'games_table_%'"
).fetchall())

tables = [f"games_table_{y}" for y in range(START_YEAR, END_YEAR + 1) if f"games_table_{y}" in existing]
print("Found tables:", tables)

union_sql = "\nUNION ALL\n".join([f"SELECT * FROM {t}" for t in tables])
union_sql = f"""
SELECT *
FROM (
{union_sql}
)
ORDER BY gameDate ASC
"""

df_all = pd.read_sql(union_sql, conn)
print("Combined rows:", len(df_all))

# ---- NEW: merge park factors from context_game (by gamePk) ----
pf = pd.read_sql("SELECT gamePk, park_pf_runs FROM context_game", conn)
df_all = df_all.merge(pf, on="gamePk", how="left")
print("park_pf_runs non-null:", df_all["park_pf_runs"].notnull().sum(), "/", len(df_all))

# weather = pd.read_sql(
#     """
#     SELECT gamePk,
#            temp_f,
#            wind_mph,
#            humidity,
#            precip_mm
#     FROM context_game
#     """,
#     conn
# )

# df_all = df_all.merge(weather, on="gamePk", how="left")

hand = pd.read_sql(
    """
    SELECT gamePk, home_sp_throws, away_sp_throws
    FROM context_game
    """,
    conn
)
df_all = df_all.merge(hand, on="gamePk", how="left")

print("home_sp_throws non-null:", df_all["home_sp_throws"].notnull().sum(), "/", len(df_all))
print("away_sp_throws non-null:", df_all["away_sp_throws"].notnull().sum(), "/", len(df_all))


# --- Optional: drop games before April 7 in every season ---
df_all["gameDate_dt"] = pd.to_datetime(df_all["gameDate"], utc=True, errors="coerce")
cutoff_month, cutoff_day = 4, 7

mask_cutoff = ~(
    (df_all["gameDate_dt"].dt.month < cutoff_month) |
    ((df_all["gameDate_dt"].dt.month == cutoff_month) & (df_all["gameDate_dt"].dt.day < cutoff_day))
)

df_filt2 = df_all.loc[mask_cutoff].copy()
print("Rows after Apr 7 cutoff (all seasons):", len(df_filt2))

print(df_filt2.columns.tolist())

# --- Starter handedness features (context, not home/away-diff pairs) ---
# Encode L/R as binary; treat missing/unknown as 0 (or impute later via SimpleImputer)
df_filt2["home_sp_left"] = (df_filt2["home_sp_throws"].astype(str).str.upper() == "L").astype(int)
df_filt2["away_sp_left"] = (df_filt2["away_sp_throws"].astype(str).str.upper() == "L").astype(int)

# OPS sources you already have
HOME_SLG_L10 = "home_bat_last10_B8_SLG_mean"
AWAY_SLG_L10 = "away_bat_last10_B8_SLG_mean"

df_filt2["home_slg_x_away_sp_left"] = df_filt2[HOME_SLG_L10] * df_filt2["away_sp_left"]
df_filt2["away_slg_x_home_sp_left"] = df_filt2[AWAY_SLG_L10] * df_filt2["home_sp_left"]
df_filt2["diff_slg_x_sp_left"] = (
    df_filt2["home_slg_x_away_sp_left"]
    - df_filt2["away_slg_x_home_sp_left"]
)

# A compact single feature capturing relative platoon direction
# (-1 means away starter is L and home is not; +1 means home starter is L and away is not)
df_filt2["diff_sp_left"] = df_filt2["home_sp_left"] - df_filt2["away_sp_left"]




Found tables: ['games_table_2015', 'games_table_2016', 'games_table_2017', 'games_table_2018', 'games_table_2019', 'games_table_2020', 'games_table_2021', 'games_table_2022', 'games_table_2023', 'games_table_2024', 'games_table_2025']
Combined rows: 25193
park_pf_runs non-null: 24836 / 25193
home_sp_throws non-null: 25193 / 25193
away_sp_throws non-null: 25193 / 25193
Rows after Apr 7 cutoff (all seasons): 24437
['gamePk', 'season', 'gameDate', 'homeTeamId', 'awayTeamId', 'homeTeamName', 'awayTeamName', 'homeWin', 'home_bat_season_B1_AB_mean', 'home_bat_season_B1_AB_std', 'home_bat_season_B2_H_mean', 'home_bat_season_B2_H_std', 'home_bat_season_B3_BB_mean', 'home_bat_season_B3_BB_std', 'home_bat_season_B4_SO_mean', 'home_bat_season_B4_SO_std', 'home_bat_season_B5_PA_mean', 'home_bat_season_B5_PA_std', 'home_bat_season_B6_BA_mean', 'home_bat_season_B6_BA_std', 'home_bat_season_B7_OBP_mean', 'home_bat_season_B7_OBP_std', 'home_bat_season_B8_SLG_mean', 'home_bat_season_B8_SLG_std', 'home_

In [110]:
# Step 3: Drop columns you don't want (B14–B17 everywhere; career SP8/SP9; metadata)

# Drop B14-B17 columns anywhere in the dataframe (means/stds/blocks)
drop_b_cols = [c for c in df_filt2.columns if any(x in c for x in ["B14_", "B15_", "B16_", "B17_"])]

# Drop career SP8/SP9 columns only (we keep other career fields)
drop_sp_career_cols = [
    c for c in df_filt2.columns
    if ("sp_career_" in c) and (("SP8_" in c) or ("SP9_" in c))
]

# Drop non-feature columns (IDs/names/dates)
meta_cols = [
    "gamePk", "gameDate",
    "homeTeamName", "awayTeamName",
    "homeTeamId", "awayTeamId",
]

to_drop = drop_b_cols + drop_sp_career_cols + meta_cols
print("Dropping columns:", len(to_drop))
print("  - B14-B17:", len(drop_b_cols))
print("  - sp_career SP8/SP9:", len(drop_sp_career_cols))
print("  - meta:", len(meta_cols))

df_model = df_filt2.drop(columns=to_drop)
df_model.head(5)


Dropping columns: 58
  - B14-B17: 48
  - sp_career SP8/SP9: 4
  - meta: 6


Unnamed: 0,season,homeWin,home_bat_season_B1_AB_mean,home_bat_season_B1_AB_std,home_bat_season_B2_H_mean,home_bat_season_B2_H_std,home_bat_season_B3_BB_mean,home_bat_season_B3_BB_std,home_bat_season_B4_SO_mean,home_bat_season_B4_SO_std,...,park_pf_runs,home_sp_throws,away_sp_throws,gameDate_dt,home_sp_left,away_sp_left,home_slg_x_away_sp_left,away_slg_x_home_sp_left,diff_slg_x_sp_left,diff_sp_left
13,2015,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,100.373,R,R,2015-04-07 02:05:00+00:00,0,0,-0.0,-0.0,0.0,0
14,2015,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,107.453,R,L,2015-04-07 02:10:00+00:00,0,1,-1.0,-0.0,-1.0,-1
15,2015,0.0,31.0,0.0,8.0,0.0,1.0,0.0,7.0,0.0,...,91.084,R,L,2015-04-07 23:10:00+00:00,0,1,-1.0,-0.0,-1.0,-1
16,2015,0.0,32.0,0.0,8.0,0.0,3.0,0.0,8.0,0.0,...,90.31,R,L,2015-04-07 23:10:00+00:00,0,1,-1.0,-0.0,-1.0,-1
17,2015,0.0,33.0,0.0,8.0,0.0,0.0,0.0,9.0,0.0,...,106.246,R,R,2015-04-08 00:10:00+00:00,0,0,-0.0,-0.0,0.0,0


In [111]:
# --- Home–away differencing ---
import re

df_diff = df_model.copy()

home_cols = [c for c in df_diff.columns if c.startswith("home_")]
away_cols = [c for c in df_diff.columns if c.startswith("away_")]

# Map base feature name -> (home_col, away_col)
pairs = {}
for h in home_cols:
    base = h.replace("home_", "")
    a = "away_" + base
    if a in away_cols:
        pairs[base] = (h, a)

print("Paired features:", len(pairs))

skipped = []
diffed = 0

# Create differenced features
for base, (h, a) in pairs.items():
    # Only diff if both columns are numeric (or can be safely coerced)
    h_num = pd.to_numeric(df_diff[h], errors="coerce")
    a_num = pd.to_numeric(df_diff[a], errors="coerce")

    # If both were originally non-numeric, coercion will be mostly NaN -> skip
    # Heuristic: require at least some non-NaN in BOTH after coercion
    if h_num.notna().sum() == 0 or a_num.notna().sum() == 0:
        skipped.append(base)
        continue

    df_diff[f"diff_{base}"] = h_num - a_num
    diffed += 1


# Drop original home/away columns (whether diffed or skipped)
df_diff = df_diff.drop(columns=home_cols + away_cols)

print("Diffed numeric pairs:", diffed)
print("Skipped non-numeric pairs:", len(skipped))
if skipped:
    print("Examples skipped:", skipped[:20])

print("Shape after differencing:", df_diff.shape)
df_diff.head(3)


Paired features: 150
Diffed numeric pairs: 149
Skipped non-numeric pairs: 1
Examples skipped: ['sp_throws']
Shape after differencing: (24437, 154)


  df_diff[f"diff_{base}"] = h_num - a_num
  df_diff[f"diff_{base}"] = h_num - a_num
  df_diff[f"diff_{base}"] = h_num - a_num
  df_diff[f"diff_{base}"] = h_num - a_num
  df_diff[f"diff_{base}"] = h_num - a_num
  df_diff[f"diff_{base}"] = h_num - a_num
  df_diff[f"diff_{base}"] = h_num - a_num
  df_diff[f"diff_{base}"] = h_num - a_num
  df_diff[f"diff_{base}"] = h_num - a_num
  df_diff[f"diff_{base}"] = h_num - a_num
  df_diff[f"diff_{base}"] = h_num - a_num
  df_diff[f"diff_{base}"] = h_num - a_num
  df_diff[f"diff_{base}"] = h_num - a_num
  df_diff[f"diff_{base}"] = h_num - a_num
  df_diff[f"diff_{base}"] = h_num - a_num
  df_diff[f"diff_{base}"] = h_num - a_num
  df_diff[f"diff_{base}"] = h_num - a_num
  df_diff[f"diff_{base}"] = h_num - a_num
  df_diff[f"diff_{base}"] = h_num - a_num
  df_diff[f"diff_{base}"] = h_num - a_num
  df_diff[f"diff_{base}"] = h_num - a_num
  df_diff[f"diff_{base}"] = h_num - a_num
  df_diff[f"diff_{base}"] = h_num - a_num
  df_diff[f"diff_{base}"] = h_num 

Unnamed: 0,season,homeWin,park_pf_runs,gameDate_dt,diff_slg_x_sp_left,diff_sp_left,diff_bat_season_B1_AB_mean,diff_bat_season_B1_AB_std,diff_bat_season_B2_H_mean,diff_bat_season_B2_H_std,...,diff_rp_season_P18_IS_mean,diff_rp_season_P18_IS_std,diff_sp_career_SP1_IP,diff_sp_career_SP2_H,diff_sp_career_SP3_BB,diff_sp_career_SP4_SO,diff_sp_career_SP5_HR,diff_sp_career_SP6_ERA,diff_sp_career_SP7_BF,diff_sp_career_WHIP
13,2015,1.0,100.373,2015-04-07 02:05:00+00:00,0.0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,101.333333,-95.0,-89.0,341.0,-13.0,-0.48,159.0,-0.17
14,2015,0.0,107.453,2015-04-07 02:10:00+00:00,-1.0,-1,0.0,0.0,0.0,0.0,...,0.0,0.0,-1514.0,-1342.0,-376.0,-1576.0,-170.0,0.17,-6240.0,0.05
15,2015,0.0,91.084,2015-04-07 23:10:00+00:00,-1.0,-1,-2.0,0.0,2.0,0.0,...,0.0,0.0,-104.666667,-169.0,-21.0,-162.0,-11.0,-0.14,-510.0,-0.05


In [112]:
# df_diff["wind_mph_clip"] = df_diff["wind_mph"].clip(0, 25)
# df_diff["humidity_clip"] = df_diff["humidity"].clip(0, 100)
# df_diff["precip_mm_clip"] = df_diff["precip_mm"].clip(0, 20)

In [113]:
# Step 4 (differenced): Build X / y
import numpy as np

y = df_diff["homeWin"].astype(float).values
X = df_diff.drop(columns=["homeWin"])

# Drop datetime columns if present
dt_cols = X.select_dtypes(include=["datetime64[ns]", "datetime64[ns, UTC]"]).columns.tolist()
if dt_cols:
    X = X.drop(columns=dt_cols)

# Drop any non-numeric columns (safety)
non_num = X.select_dtypes(exclude=[np.number]).columns.tolist()
if non_num:
    X = X.drop(columns=non_num)

X = X.astype("float32")

print("X shape:", X.shape)
print("y shape:", y.shape)


X shape: (24437, 152)
y shape: (24437,)


In [114]:
# Step 5: Time-based train/validation split (80/20 on post-cutoff data)
n = len(X)
train_end = int(0.80 * n)

X_train, y_train = X.iloc[:train_end], y[:train_end]
X_val, y_val     = X.iloc[train_end:], y[train_end:]

print("Train rows:", len(X_train))
print("Val rows:", len(X_val))
print("Train homeWin mean:", y_train.mean())
print("Val homeWin mean:", y_val.mean())


Train rows: 19549
Val rows: 4888
Train homeWin mean: 0.5340938155404369
Val homeWin mean: 0.5317103109656302


In [115]:
# ---- Stage 1: manual pruning ----
import re

cols = X_train.columns.tolist()

drop_patterns = [
    "_std",                # drop all stds
    "last20",              # drop last20 windows
    # "bat_season",          # optional: comment out if you want season batting
    "sp_season",           # drop season SP stats
]

def should_drop(c):
    return any(p in c for p in drop_patterns)

keep_cols = [c for c in cols if not should_drop(c)]

Xtr_1 = X_train[keep_cols]
Xva_1 = X_val[keep_cols]

print("Features before:", X_train.shape[1])
print("Features after stage 1:", Xtr_1.shape[1])


Features before: 152
Features after stage 1: 59


In [116]:
# ---- Stage 2: correlation pruning ----
import numpy as np

corr = Xtr_1.corr().abs()

upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
to_drop = [col for col in upper.columns if any(upper[col] > 0.95)]

Xtr_2 = Xtr_1.drop(columns=to_drop)
Xva_2 = Xva_1.drop(columns=to_drop)

print("Dropped due to correlation:", len(to_drop))
print("Remaining features:", Xtr_2.shape[1])


Dropped due to correlation: 11
Remaining features: 48


In [117]:
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression

l1 = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
    ("clf", LogisticRegression(
        penalty="l1",
        solver="saga",
        C=0.1,        # try 0.05–0.2
        max_iter=4000
    ))
])

l1.fit(Xtr_2, y_train)

coef = l1.named_steps["clf"].coef_[0]
selected = np.abs(coef) > 1e-6

selected_cols = Xtr_2.columns[selected]
print("Selected features:", len(selected_cols))


Selected features: 40


In [118]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, accuracy_score

# --- ensure handedness feature is included (even if L1 drops it) ---
selected_cols_final = list(selected_cols)
# for c in ["diff_sp_left", "diff_slg_x_sp_left", "park_pf_runs"]:
# for c in ["diff_sp_left", "park_pf_runs"]:
for c in ["park_pf_runs"]:
    if c in Xtr_2.columns and c not in selected_cols_final:
        selected_cols_final.append(c)

logreg_diff = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
    ("clf", LogisticRegression(
        penalty="l2",
        C=1.0,
        solver="lbfgs",
        max_iter=1000
    ))
])

Xtr_3 = Xtr_2[selected_cols_final]
Xva_3 = Xva_2[selected_cols_final]

logreg_diff.fit(Xtr_3, y_train)
pred = logreg_diff.predict_proba(Xva_3)[:,1]

print("Final pruned LogReg AUC:", roc_auc_score(y_val, pred))
print("Final pruned LogReg ACC:", accuracy_score(y_val, pred > 0.5))
print("Used cols:", len(selected_cols_final), "| includes diff_sp_left =", "diff_sp_left" in selected_cols_final)

Final pruned LogReg AUC: 0.6023358784194816
Final pruned LogReg ACC: 0.5752864157119476
Used cols: 40 | includes diff_sp_left = True


In [119]:
# ANN baseline (paper-aligned scaling: MinMax) for clean comparison
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import roc_auc_score, accuracy_score

import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam

imputer = SimpleImputer(strategy="median")

Xtr_imp = imputer.fit_transform(Xtr_3.values)
Xva_imp = imputer.transform(Xva_3.values)

# 1) Scale (fit on train only)
scaler = MinMaxScaler()
Xtr_ann = scaler.fit_transform(Xtr_imp)
Xva_ann = scaler.transform(Xva_imp)

ytr = y_train.astype("float32")
yva = y_val.astype("float32")

# 2) Build a compact MLP (keep it small to avoid overfitting)
tf.keras.backend.clear_session()
tf.random.set_seed(42)

model = Sequential([
    Dense(64, input_shape=(Xtr_ann.shape[1],), activation="relu"),
    BatchNormalization(),
    Dropout(0.25),

    Dense(32, activation="relu"),
    BatchNormalization(),
    Dropout(0.15),

    Dense(1, activation="sigmoid")
])

model.compile(
    optimizer=Adam(learning_rate=1e-3),
    loss="binary_crossentropy",
    metrics=[tf.keras.metrics.AUC(name="auc"), "accuracy"]
)

# 3) Train with early stopping
es = EarlyStopping(
    monitor="val_auc",
    mode="max",
    patience=10,
    restore_best_weights=True
)

hist = model.fit(
    Xtr_ann, ytr,
    validation_data=(Xva_ann, yva),
    epochs=100,
    batch_size=256,
    callbacks=[es],
    verbose=1
)

# 4) Evaluate
pred = model.predict(Xva_ann, verbose=0).reshape(-1)
auc = roc_auc_score(yva, pred)
acc = accuracy_score(yva, pred > 0.5)

print("ANN AUC:", round(auc, 4))
print("ANN ACC:", round(acc, 4))




Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
ANN AUC: 0.6031
ANN ACC: 0.5636


In [120]:
selected_cols_final

['season',
 'park_pf_runs',
 'diff_slg_x_sp_left',
 'diff_sp_left',
 'diff_bat_season_B1_AB_mean',
 'diff_bat_season_B2_H_mean',
 'diff_bat_season_B3_BB_mean',
 'diff_bat_season_B4_SO_mean',
 'diff_bat_season_B7_OBP_mean',
 'diff_bat_season_B8_SLG_mean',
 'diff_bat_season_B10_Pit_mean',
 'diff_bat_season_B11_Str_mean',
 'diff_bat_season_B12_PO_mean',
 'diff_bat_season_B13_A_mean',
 'diff_bat_last10_B1_AB_mean',
 'diff_bat_last10_B2_H_mean',
 'diff_bat_last10_B4_SO_mean',
 'diff_bat_last10_B10_Pit_mean',
 'diff_bat_last10_B11_Str_mean',
 'diff_bat_last10_B13_A_mean',
 'diff_sp_last3_SP1_IP_mean',
 'diff_sp_last3_SP2_H_mean',
 'diff_sp_last3_SP3_BB_mean',
 'diff_sp_last3_SP4_SO_mean',
 'diff_sp_last3_SP5_HR_mean',
 'diff_sp_last3_SP7_BF_mean',
 'diff_sp_last3_SP9_Str_mean',
 'diff_sp_last3_WHIP_mean',
 'diff_rp_season_P2_H_mean',
 'diff_rp_season_P3_BB_mean',
 'diff_rp_season_P4_SO_mean',
 'diff_rp_season_P5_HR_mean',
 'diff_rp_season_P6_ERA_mean',
 'diff_rp_season_P7_BF_mean',
 'diff_rp