# MLB Total Runs Regression Baselines (Multi‑Season)

This notebook builds **baseline regression models** to predict **total runs scored in a game**
using the same **pruned, differenced feature set (~37 features)** used in classification.

Targets:
- `totalRuns = homeScore + awayScore`

Models:
- Ridge Regression (linear baseline)
- ElasticNet (sparse linear baseline)

Assumptions:
- SQLite DB: `mlb_scrape.sqlite`
- Tables: `games_table_YYYY` (2015–2025)
- Feature engineering already done up through differencing


In [14]:
# Step 1: Imports & config
import sqlite3
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge, ElasticNet
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

DB_PATH = "mlb_scrape.sqlite"
START_YEAR, END_YEAR = 2015, 2025
CUTOFF_MONTH, CUTOFF_DAY = 4, 7


In [15]:
# Step 2: Union all seasons (2015–2025) AND join scores from schedule_games
conn = sqlite3.connect(DB_PATH)

existing = set(r[0] for r in conn.execute(
    "SELECT name FROM sqlite_master WHERE type='table' AND name LIKE 'games_table_%'"
).fetchall())

tables = [f"games_table_{y}" for y in range(START_YEAR, END_YEAR + 1)
          if f"games_table_{y}" in existing]

union_sql = " UNION ALL ".join([f"SELECT * FROM {t}" for t in tables])

query = f"""
SELECT
    g.*,
    s.homeScore,
    s.awayScore
FROM (
    {union_sql}
) g
JOIN schedule_games s
  ON g.gamePk = s.gamePk
WHERE s.homeScore IS NOT NULL
  AND s.awayScore IS NOT NULL
ORDER BY g.gameDate ASC
"""

df = pd.read_sql(query, conn)

print("Rows with scores:", len(df))


Rows with scores: 25193


In [16]:
# Step 3: Derive regression target (total runs)
df["totalRuns"] = df["homeScore"] + df["awayScore"]

print(df[["homeScore", "awayScore", "totalRuns"]].describe())


          homeScore     awayScore     totalRuns
count  25193.000000  25193.000000  25193.000000
mean       4.543921      4.451832      8.995753
std        3.141230      3.227924      4.525818
min        0.000000      0.000000      1.000000
25%        2.000000      2.000000      6.000000
50%        4.000000      4.000000      9.000000
75%        6.000000      6.000000     12.000000
max       29.000000     28.000000     38.000000


In [17]:
# Step 4: Apply April 7 cutoff (per season)
df["gameDate_dt"] = pd.to_datetime(df["gameDate"], utc=True, errors="coerce")

mask = (
    (df["gameDate_dt"].dt.month > CUTOFF_MONTH) |
    ((df["gameDate_dt"].dt.month == CUTOFF_MONTH) & (df["gameDate_dt"].dt.day >= CUTOFF_DAY))
)

df = df.loc[mask].copy()
print("Rows after cutoff:", len(df))


Rows after cutoff: 24437


In [18]:
# Step 5: Drop non-feature columns
meta_cols = [
    "gamePk","season","gameDate","gameDate_dt",
    "homeTeamId","awayTeamId","homeTeamName","awayTeamName",
    "homeWin","homeScore","awayScore"
]

drop_adv = [c for c in df.columns if any(x in c for x in ["B14_","B15_","B16_","B17_"])]
drop_sp = [c for c in df.columns if ("sp_career_" in c and ("SP8_" in c or "SP9_" in c))]

df_model = df.drop(columns=meta_cols + drop_adv + drop_sp, errors="ignore")
print("Columns remaining:", df_model.shape[1])


Columns remaining: 297


In [None]:
# Step 6: Home–away differencing
# home_cols = [c for c in df_model.columns if c.startswith("home_")]
# away_cols = [c for c in df_model.columns if c.startswith("away_")]

# pairs = {}
# for h in home_cols:
#     base = h.replace("home_","")
#     a = "away_" + base
#     if a in away_cols:
#         pairs[base] = (h,a)

# for base,(h,a) in pairs.items():
#     df_model[f"diff_{base}"] = df_model[h] - df_model[a]

# df_model = df_model.drop(columns=home_cols + away_cols)
# print("Shape after differencing:", df_model.shape)


Shape after differencing: (24437, 149)


  df_model[f"diff_{base}"] = df_model[h] - df_model[a]
  df_model[f"diff_{base}"] = df_model[h] - df_model[a]
  df_model[f"diff_{base}"] = df_model[h] - df_model[a]
  df_model[f"diff_{base}"] = df_model[h] - df_model[a]
  df_model[f"diff_{base}"] = df_model[h] - df_model[a]
  df_model[f"diff_{base}"] = df_model[h] - df_model[a]
  df_model[f"diff_{base}"] = df_model[h] - df_model[a]
  df_model[f"diff_{base}"] = df_model[h] - df_model[a]
  df_model[f"diff_{base}"] = df_model[h] - df_model[a]
  df_model[f"diff_{base}"] = df_model[h] - df_model[a]
  df_model[f"diff_{base}"] = df_model[h] - df_model[a]
  df_model[f"diff_{base}"] = df_model[h] - df_model[a]
  df_model[f"diff_{base}"] = df_model[h] - df_model[a]
  df_model[f"diff_{base}"] = df_model[h] - df_model[a]
  df_model[f"diff_{base}"] = df_model[h] - df_model[a]
  df_model[f"diff_{base}"] = df_model[h] - df_model[a]
  df_model[f"diff_{base}"] = df_model[h] - df_model[a]
  df_model[f"diff_{base}"] = df_model[h] - df_model[a]
  df_model

In [20]:
# Step 7: Build X / y
y = df_model["totalRuns"].astype("float32").values
X = df_model.drop(columns=["totalRuns"])

X = X.select_dtypes(include=[np.number]).astype("float32")

print("X shape:", X.shape)
print("y mean:", y.mean())


X shape: (24437, 148)
y mean: 8.999591


In [21]:
# Step 8: Time-based split
n = len(X)
train_end = int(0.8 * n)

X_train, X_val = X.iloc[:train_end], X.iloc[train_end:]
y_train, y_val = y[:train_end], y[train_end:]

print("Train rows:", len(X_train))
print("Val rows:", len(X_val))


Train rows: 19549
Val rows: 4888


In [22]:
# ---- Stage 1: manual pruning ----
import re

cols = X_train.columns.tolist()

drop_patterns = [
    "_std",                # drop all stds
    "last20",              # drop last20 windows
    # "bat_season",          # optional: comment out if you want season batting
    "sp_season",           # drop season SP stats
]

def should_drop(c):
    return any(p in c for p in drop_patterns)

keep_cols = [c for c in cols if not should_drop(c)]

Xtr_1 = X_train[keep_cols]
Xva_1 = X_val[keep_cols]

print("Features before:", X_train.shape[1])
print("Features after stage 1:", Xtr_1.shape[1])

# ---- Stage 2: correlation pruning ----
import numpy as np

corr = Xtr_1.corr().abs()

upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
to_drop = [col for col in upper.columns if any(upper[col] > 0.95)]

Xtr_2 = Xtr_1.drop(columns=to_drop)
Xva_2 = Xva_1.drop(columns=to_drop)

print("Dropped due to correlation:", len(to_drop))
print("Remaining features:", Xtr_2.shape[1])


Features before: 148
Features after stage 1: 55
Dropped due to correlation: 11
Remaining features: 44


In [23]:
# Step 9: Ridge regression baseline
ridge = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", Ridge(alpha=1.0))
])

ridge.fit(Xtr_2, y_train)
pred = ridge.predict(Xva_2)

print("Ridge MAE:", mean_absolute_error(y_val, pred))
print("Ridge RMSE:", mean_squared_error(y_val, pred, squared=False))
print("Ridge R2:", r2_score(y_val, pred))


Ridge MAE: 3.545036
Ridge RMSE: 4.464936
Ridge R2: -0.0023581981658935547




In [24]:
# Step 10: ElasticNet baseline
enet = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", ElasticNet(alpha=0.01, l1_ratio=0.5, max_iter=5000))
])

enet.fit(Xtr_2, y_train)
pred = enet.predict(Xva_2)

print("ElasticNet MAE:", mean_absolute_error(y_val, pred))
print("ElasticNet RMSE:", mean_squared_error(y_val, pred, squared=False))
print("ElasticNet R2:", r2_score(y_val, pred))


ElasticNet MAE: 3.542916
ElasticNet RMSE: 4.463938
ElasticNet R2: -0.001910090446472168


