# 2026 Batter WAR and HR Projections

In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns
from pybaseball import batting_stats

In [2]:
# Pull multi-year data (one row per player per season)
data = batting_stats(2021, 2025, qual=0)  # qual=0 for all players with any PA

# Key columns we'll use
data = data[['Name', 'Season', 'Age', 'G', 'PA', 'HR', 'wOBA', 'WAR', 'Team']]

# Rename for consistency
data = data.rename(columns={'Season': 'year'})

# Filter for reasonable playing time (at least 100 PA) to reduce noise
data = data[data['PA'] >= 100]

# Sort for lagged features
data = data.sort_values(['Name', 'year']).reset_index(drop=True)

print(data.head())
print(f"Total player-seasons: {len(data)}")

           Name  year  Age    G   PA  HR   wOBA  WAR   Team
0  A.J. Pollock  2021   33  117  422  21  0.375  3.2    LAD
1  A.J. Pollock  2022   34  138  527  14  0.297  0.6    CHW
2  A.J. Pollock  2023   35   54  144   5  0.227 -0.7  - - -
3   Aaron Hicks  2021   31   32  126   4  0.279  0.0    NYY
4   Aaron Hicks  2022   32  130  453   8  0.292  0.9    NYY
Total player-seasons: 2309


In [3]:
# Existing lagged features...
data['prev_WAR'] = data.groupby('Name')['WAR'].shift(1)
data['prev_Age'] = data.groupby('Name')['Age'].shift(1)
data['prev_PA'] = data.groupby('Name')['PA'].shift(1)
data['prev_HR'] = data.groupby('Name')['HR'].shift(1)
data['prev_wOBA'] = data.groupby('Name')['wOBA'].shift(1)

# Weighted recent WAR (as before)
def weighted_recent_war(group):
    return (group['WAR'].shift(1) * 0.5 +
            group['WAR'].shift(2) * 0.3 +
            group['WAR'].shift(3) * 0.2)

data['recent_WAR'] = data.groupby('Name').apply(weighted_recent_war).reset_index(level=0, drop=True)

# New: Weighted recent HR (power persists differently)
def weighted_recent_hr(group):
    return (group['HR'].shift(1) * 0.5 +
            group['HR'].shift(2) * 0.3 +
            group['HR'].shift(3) * 0.2)

data['recent_HR'] = data.groupby('Name').apply(weighted_recent_hr).reset_index(level=0, drop=True)

# Drop rows missing lagged data
model_data = data.dropna(subset=['prev_WAR', 'prev_Age', 'prev_PA', 'prev_HR', 'prev_wOBA', 'recent_WAR', 'recent_HR', 'WAR', 'HR'])

# Features for both models
features = ['prev_Age', 'prev_PA', 'prev_WAR', 'prev_HR', 'prev_wOBA', 'recent_WAR', 'recent_HR']

  data['recent_WAR'] = data.groupby('Name').apply(weighted_recent_war).reset_index(level=0, drop=True)
  data['recent_HR'] = data.groupby('Name').apply(weighted_recent_hr).reset_index(level=0, drop=True)


In [4]:
X = model_data[features]
y_war = model_data['WAR']
y_hr = model_data['HR']

X_train, X_test, y_train_war, y_test_war = train_test_split(X, y_war, test_size=0.2, random_state=42)
X_train_hr, X_test_hr, y_train_hr, y_test_hr = train_test_split(X, y_hr, test_size=0.2, random_state=42)

# WAR model (as before)
model_war = RandomForestRegressor(n_estimators=300, random_state=42)
model_war.fit(X_train, y_train_war)

# New HR model
model_hr = RandomForestRegressor(n_estimators=300, random_state=42)
model_hr.fit(X_train_hr, y_train_hr)

# Evaluation
print("WAR Model MAE:", mean_absolute_error(y_test_war, model_war.predict(X_test)))
print("HR Model MAE:", mean_absolute_error(y_test_hr, model_hr.predict(X_test_hr)))

WAR Model MAE: 1.2152473118279568
HR Model MAE: 6.524767025089605


In [5]:
proj_2025 = data[data['year'] == 2025].copy()

# Setup previous season features
proj_2025['prev_Age'] = proj_2025['Age'] + 1
proj_2025['prev_PA'] = proj_2025['PA']
proj_2025['prev_WAR'] = proj_2025['WAR']
proj_2025['prev_HR'] = proj_2025['HR']
proj_2025['prev_wOBA'] = proj_2025['wOBA']
proj_2025['recent_WAR'] = proj_2025['WAR']  # Proxy; improve if desired
proj_2025['recent_HR'] = proj_2025['HR']    # Proxy

proj_features = proj_2025[features]

# Predictions
proj_2025['proj_2026_WAR'] = model_war.predict(proj_features)
proj_2025['proj_2026_HR'] = model_hr.predict(proj_features)

# Simple playing time projection: regress toward 550 PA
proj_2025['proj_2026_PA'] = 0.7 * proj_2025['PA'] + 0.3 * 550

# Scale HR by projected PA relative to 2025 PA (basic adjustment)
proj_2025['proj_2026_HR'] = proj_2025['proj_2026_HR'] * (proj_2025['proj_2026_PA'] / proj_2025['PA'].replace(0, 550))

# Round nicely
proj_2025['proj_2026_WAR'] = proj_2025['proj_2026_WAR'].round(2)
proj_2025['proj_2026_HR'] = proj_2025['proj_2026_HR'].round(0).astype(int)
proj_2025['proj_2026_PA'] = proj_2025['proj_2026_PA'].round(0).astype(int)
proj_2025['WAR'] = proj_2025['WAR'].round(2)

# Top 20 projections
top_proj = (proj_2025[['Name', 'Team', 'Age', 'PA', 'HR', 'WAR', 
                       'proj_2026_PA', 'proj_2026_HR', 'proj_2026_WAR']]
            .sort_values('proj_2026_WAR', ascending=False)
            .head(20))

top_proj

Unnamed: 0,Name,Team,Age,PA,HR,WAR,proj_2026_PA,proj_2026_HR,proj_2026_WAR
10,Aaron Judge,NYY,33,679,53,10.1,640,45,8.36
1799,Pete Crow-Armstrong,CHC,23,647,31,5.4,618,39,7.18
354,Cal Raleigh,SEA,28,705,60,9.1,658,38,7.1
1979,Shohei Ohtani,LAD,30,727,55,7.5,674,41,6.68
1223,Julio Rodriguez,SEA,24,710,32,5.7,662,28,6.43
242,Bobby Witt Jr.,KCR,25,687,23,8.0,646,19,6.22
707,Francisco Lindor,NYM,31,732,31,6.3,677,24,6.06
478,Corbin Carroll,ARI,24,642,31,6.5,614,27,5.94
1441,Maikel Garcia,KCR,25,666,16,5.6,631,15,5.49
697,Fernando Tatis Jr.,SDP,26,691,25,6.1,649,19,5.46


In [6]:
export_df = proj_2025[['Name', 'Team', 'Age', 'PA', 'HR', 'WAR', 
                       'proj_2026_PA', 'proj_2026_HR', 'proj_2026_WAR']] \
            .sort_values('proj_2026_WAR', ascending=False)

export_df.to_csv('mlb_2026_projections_war_hr.csv', index=False)
print("Exported full projections including HR!")

Exported full projections including HR!
