In [None]:
## This is the 3rd of 3 notebooks exploring xwOBA, its various components, and how they can be used to predict performance.

## This notebook is focused on wOBACON/xwOBACON. 

## Main Research Question: How predictive is xwOBACON of future performance, specifically, wOBACON? 

## Data Used: Baseball Savant CSV with various statistics from 2021 to 2024 and a PA threshold of 400. 


In [49]:
## Importing Libraries

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from xgboost import XGBRegressor


In [50]:
## View and Inspect Data

df_original = pd.read_csv('~/Desktop/savant.csv')
print(df_original.columns)
print(df_original.head())
print(df_original.columns.tolist())

Index(['last_name, first_name', 'player_id', 'year', 'player_age', 'ab', 'pa',
       'k_percent', 'bb_percent', 'babip', 'xba', 'xslg', 'woba', 'xwoba',
       'wobacon', 'xwobacon', 'exit_velocity_avg', 'launch_angle_avg',
       'sweet_spot_percent', 'barrel_batted_rate', 'solidcontact_percent',
       'flareburner_percent', 'poorlyunder_percent', 'poorlytopped_percent',
       'poorlyweak_percent', 'hard_hit_percent', 'avg_best_speed',
       'avg_hyper_speed', 'z_swing_percent', 'oz_swing_percent',
       'oz_swing_miss_percent', 'meatball_swing_percent', 'iz_contact_percent',
       'whiff_percent', 'swing_percent', 'pull_percent',
       'straightaway_percent', 'groundballs_percent', 'flyballs_percent',
       'linedrives_percent', 'popups_percent', 'sprint_speed'],
      dtype='object')
  last_name, first_name  player_id  year  player_age   ab   pa  k_percent  \
0       Cabrera, Miguel     408234  2021          38  472  526       22.4   
1        Molina, Yadier     425877  2021

In [51]:
## Creating Lagged Variables

df = df_original.sort_values(by=['player_id', 'year'])
columns_to_lag = df.columns[df.columns.get_loc('player_id') + 1:] 
lagged_df = df.groupby('player_id')[columns_to_lag].shift(1).add_prefix('lagged_')
df = pd.concat([df, lagged_df], axis=1)
df = df.dropna()
print(df.columns.tolist())
print(df)


['last_name, first_name', 'player_id', 'year', 'player_age', 'ab', 'pa', 'k_percent', 'bb_percent', 'babip', 'xba', 'xslg', 'woba', 'xwoba', 'wobacon', 'xwobacon', 'exit_velocity_avg', 'launch_angle_avg', 'sweet_spot_percent', 'barrel_batted_rate', 'solidcontact_percent', 'flareburner_percent', 'poorlyunder_percent', 'poorlytopped_percent', 'poorlyweak_percent', 'hard_hit_percent', 'avg_best_speed', 'avg_hyper_speed', 'z_swing_percent', 'oz_swing_percent', 'oz_swing_miss_percent', 'meatball_swing_percent', 'iz_contact_percent', 'whiff_percent', 'swing_percent', 'pull_percent', 'straightaway_percent', 'groundballs_percent', 'flyballs_percent', 'linedrives_percent', 'popups_percent', 'sprint_speed', 'lagged_year', 'lagged_player_age', 'lagged_ab', 'lagged_pa', 'lagged_k_percent', 'lagged_bb_percent', 'lagged_babip', 'lagged_xba', 'lagged_xslg', 'lagged_woba', 'lagged_xwoba', 'lagged_wobacon', 'lagged_xwobacon', 'lagged_exit_velocity_avg', 'lagged_launch_angle_avg', 'lagged_sweet_spot_per

In [52]:
## Linear Regression: Predicting wOBACON using last year's xwOBACON

model_results = {}

# Model 1
X = df[['lagged_xwobacon']]  
y = df['wobacon']  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = LinearRegression()
model.fit(X_train, y_train)
print("Intercept:", model.intercept_)
print("Coefficients:", model.coef_)
y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

rmse = np.sqrt(mse)

print("Mean Squared Error:", mse)
print("R-squared:", r2)
print("Root Mean Squared Error (RMSE):", rmse)

model_results["Model 1"] = {
    "R2": r2,
    "RMSE": rmse,
    "Intercept": model.intercept_,
    "Coefficients": dict(zip(X.columns, model.coef_)),
}

print("\nModel 1 Results Saved:")
print(f"R2: {model_results['Model 1']['R2']}")
print(f"RMSE: {model_results['Model 1']['RMSE']}")
print(f"Intercept: {model_results['Model 1']['Intercept']}")
print(f"Coefficients: {model_results['Model 1']['Coefficients']}")



Intercept: 0.12756657561444773
Coefficients: [0.65219411]
Mean Squared Error: 0.0013380591790067371
R-squared: 0.5008130615088404
Root Mean Squared Error (RMSE): 0.036579491234935695

Model 1 Results Saved:
R2: 0.5008130615088404
RMSE: 0.036579491234935695
Intercept: 0.12756657561444773
Coefficients: {'lagged_xwobacon': 0.6521941066418766}


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [53]:
## Linear Regression: Adding in Age

# Model 2
X = df[['lagged_xwobacon', 'lagged_player_age']]  
y = df['wobacon']  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = LinearRegression()
model.fit(X_train, y_train)
print("Intercept:", model.intercept_)
print("Coefficients:", model.coef_)
y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R-squared:", r2)

rmse = np.sqrt(mse)
print("Root Mean Squared Error (RMSE):", rmse)


model_results["Model 2"] = {
    "R2": r2,
    "RMSE": rmse,
    "Intercept": model.intercept_,
    "Coefficients": dict(zip(X.columns, model.coef_)),
}

print("\nModel 2 Results Saved:")
print(f"R2: {model_results['Model 2']['R2']}")
print(f"RMSE: {model_results['Model 2']['RMSE']}")
print(f"Intercept: {model_results['Model 2']['Intercept']}")
print(f"Coefficients: {model_results['Model 2']['Coefficients']}")


Intercept: 0.13542757700585528
Coefficients: [ 6.52085974e-01 -2.81000905e-04]
Mean Squared Error: 0.0013275952951131067
R-squared: 0.5047168007810195
Root Mean Squared Error (RMSE): 0.03643618112691157

Model 2 Results Saved:
R2: 0.5047168007810195
RMSE: 0.03643618112691157
Intercept: 0.13542757700585528
Coefficients: {'lagged_xwobacon': 0.6520859740432968, 'lagged_player_age': -0.00028100090511286904}


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [54]:
## Linear Regression: Adding in Year

# Model 3

X = df[['lagged_xwobacon', 'lagged_year', 'lagged_player_age']]  
y = df['wobacon']  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = LinearRegression()
model.fit(X_train, y_train)
print("Intercept:", model.intercept_)
print("Coefficients:", model.coef_)
y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R-squared:", r2)

rmse = np.sqrt(mse)
print("Root Mean Squared Error (RMSE):", rmse)

model_results["Model 3"] = {
    "R2": r2,
    "RMSE": rmse,
    "Intercept": model.intercept_,
    "Coefficients": dict(zip(X.columns, model.coef_)),
}

print("\nModel 3 Results Saved:")
print(f"R2: {model_results['Model 3']['R2']}")
print(f"RMSE: {model_results['Model 3']['RMSE']}")
print(f"Intercept: {model_results['Model 3']['Intercept']}")
print(f"Coefficients: {model_results['Model 3']['Coefficients']}")


Intercept: -8.445740331737033
Coefficients: [ 6.48427820e-01  4.24412703e-03 -2.40941058e-04]
Mean Squared Error: 0.0013291575502763597
R-squared: 0.5041339735157373
Root Mean Squared Error (RMSE): 0.03645761306334192

Model 3 Results Saved:
R2: 0.5041339735157373
RMSE: 0.03645761306334192
Intercept: -8.445740331737033
Coefficients: {'lagged_xwobacon': 0.6484278203010726, 'lagged_year': 0.004244127032920034, 'lagged_player_age': -0.0002409410576827305}


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [55]:
## Linear Regression with all selected variables

# Model 4

X = df[['lagged_xwobacon', 'lagged_player_age', 'lagged_year', 'lagged_pa', 'lagged_babip', 'lagged_xba', 'lagged_xslg', 'lagged_exit_velocity_avg', 'lagged_launch_angle_avg', 'lagged_sweet_spot_percent', 'lagged_barrel_batted_rate', 'lagged_flareburner_percent', 'lagged_poorlyunder_percent', 'lagged_poorlytopped_percent', 'lagged_poorlyweak_percent', 'lagged_hard_hit_percent', 'lagged_meatball_swing_percent', 'lagged_pull_percent', 'lagged_straightaway_percent', 'lagged_groundballs_percent', 'lagged_flyballs_percent', 'lagged_linedrives_percent', 'lagged_popups_percent', 'lagged_sprint_speed']]  
y = df['wobacon']  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = LinearRegression()
model.fit(X_train, y_train)
print("Intercept:", model.intercept_)
print("Coefficients:", model.coef_)
y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R-squared:", r2)

rmse = np.sqrt(mse)
print("Root Mean Squared Error (RMSE):", rmse)

model_results["Model 4"] = {
    "R2": r2,
    "RMSE": rmse,
    "Intercept": model.intercept_,
    "Coefficients": dict(zip(X.columns, model.coef_)),
}

print("\nModel 4 Results Saved:")
print(f"R2: {model_results['Model 4']['R2']}")
print(f"RMSE: {model_results['Model 4']['RMSE']}")
print(f"Intercept: {model_results['Model 4']['Intercept']}")
print(f"Coefficients: {model_results['Model 4']['Coefficients']}")



Intercept: -10.657410760102502
Coefficients: [ 3.04138587e-01  9.72439567e-05  5.18188252e-03 -6.96137778e-06
  6.43643275e-02 -2.76967016e-01  1.00343303e-01 -2.91478359e-03
  1.55728918e-03 -3.12123230e-04  3.70293676e-03 -4.82007381e-04
  2.34390176e-03 -1.87838342e-03 -2.06449077e-03  1.63508639e-03
  1.68627357e-04 -7.92888461e-04 -6.94867532e-04  8.04720322e-03
  3.32026631e-03  4.84905615e-03  2.86540559e-03  3.51029511e-03]
Mean Squared Error: 0.0013739385256397079
R-squared: 0.4874276287254631
Root Mean Squared Error (RMSE): 0.037066676754730896

Model 4 Results Saved:
R2: 0.4874276287254631
RMSE: 0.037066676754730896
Intercept: -10.657410760102502
Coefficients: {'lagged_xwobacon': 0.30413858676726824, 'lagged_player_age': 9.724395668759796e-05, 'lagged_year': 0.005181882524998328, 'lagged_pa': -6.961377781341139e-06, 'lagged_babip': 0.06436432752718732, 'lagged_xba': -0.2769670155571108, 'lagged_xslg': 0.10034330276040762, 'lagged_exit_velocity_avg': -0.0029147835871212906, '

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [56]:
## VIF Calculation to check for multicollinearity and find a better linear model 

# Model 5
def calculate_vif(df):
    vif_data = pd.DataFrame()
    vif_data['feature'] = df.columns
    vif_data['VIF'] = [variance_inflation_factor(df.values, i) for i in range(df.shape[1])]
    return vif_data

def reduce_vif(df, threshold=5.0):
    while True:
        vif_data = calculate_vif(df)
        max_vif = vif_data['VIF'].max()
        if max_vif > threshold:
            feature_to_remove = vif_data.sort_values('VIF', ascending=False).iloc[0]['feature']
            print(f'Removing {feature_to_remove} with VIF={max_vif}')
            df = df.drop(columns=[feature_to_remove])
        else:
            break
    return df

X_train_reduced = reduce_vif(X_train)

model_reduced = LinearRegression()
model_reduced.fit(X_train_reduced, y_train)

X_test_reduced = X_test[X_train_reduced.columns]
y_pred_reduced = model_reduced.predict(X_test_reduced)

print("Reduced Model Results:")
print("Intercept:", model_reduced.intercept_)
print("Coefficients:", dict(zip(X_train_reduced.columns, model_reduced.coef_)))

mse_reduced = mean_squared_error(y_test, y_pred_reduced)
r2_reduced = r2_score(y_test, y_pred_reduced)
rmse_reduced = np.sqrt(mse_reduced)

print("Mean Squared Error (Reduced):", mse_reduced)
print("R-squared (Reduced):", r2_reduced)
print("Root Mean Squared Error (RMSE) (Reduced):", rmse_reduced)

final_vif = calculate_vif(X_train_reduced)
print("Final VIF values:")
print(final_vif)

model_results["Model 5"] = {
    "R2": r2_reduced,
    "RMSE": rmse_reduced,
    "Intercept": model_reduced.intercept_,
    "Coefficients": dict(zip(X_train_reduced.columns, model_reduced.coef_)),
    "Final VIF": final_vif.to_dict(orient='records'), 
}

print("\nModel 5 Results Saved:")
print(f"R2: {model_results['Model 5']['R2']}")
print(f"RMSE: {model_results['Model 5']['RMSE']}")
print(f"Intercept: {model_results['Model 5']['Intercept']}")
print(f"Coefficients: {model_results['Model 5']['Coefficients']}")
print(f"Final VIF values: {model_results['Model 5']['Final VIF']}")


Removing lagged_year with VIF=2208509.7945275703
Removing lagged_exit_velocity_avg with VIF=27676.088420891363
Removing lagged_groundballs_percent with VIF=2764.2983560798903
Removing lagged_xslg with VIF=2612.661006110662
Removing lagged_xwobacon with VIF=1502.8048979604712
Removing lagged_sprint_speed with VIF=759.176401660102
Removing lagged_flyballs_percent with VIF=617.0452167788594
Removing lagged_sweet_spot_percent with VIF=492.58590213007227
Removing lagged_xba with VIF=308.9877057205745
Removing lagged_poorlyunder_percent with VIF=262.85312818966815
Removing lagged_straightaway_percent with VIF=238.1422718960806
Removing lagged_babip with VIF=171.4292353662694
Removing lagged_meatball_swing_percent with VIF=151.29556573878403
Removing lagged_flareburner_percent with VIF=135.97662810264967
Removing lagged_hard_hit_percent with VIF=123.13641623795431
Removing lagged_poorlytopped_percent with VIF=84.8000303675056
Removing lagged_pull_percent with VIF=67.27627289469518
Removing la

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [57]:
## Stepwise Regression

# Model 6
def stepwise_selection(X, y, initial_features=[], threshold_in=0.05, threshold_out=0.05, verbose=True):
    included = list(initial_features)
    while True:
        changed = False
        
        excluded = list(set(X.columns) - set(included))
        new_pvals = pd.Series(index=excluded, dtype=float)
        for new_column in excluded:
            model = sm.OLS(y, sm.add_constant(X[included + [new_column]])).fit()
            new_pvals[new_column] = model.pvalues[new_column]
        best_pval = new_pvals.min()
        if best_pval < threshold_in:
            best_feature = new_pvals.idxmin()
            included.append(best_feature)
            changed = True
            if verbose:
                print(f'Adding feature {best_feature} with p-value {best_pval}')
        
        model = sm.OLS(y, sm.add_constant(X[included])).fit()
        pvals = model.pvalues.iloc[1:]  
        worst_pval = pvals.max()
        if worst_pval > threshold_out:
            worst_feature = pvals.idxmax()
            included.remove(worst_feature)
            changed = True
            if verbose:
                print(f'Removing feature {worst_feature} with p-value {worst_pval}')
        
        if not changed:
            break
    
    return included

selected_features = stepwise_selection(X_train, y_train)

model_stepwise = LinearRegression()
X_train_stepwise = X_train[selected_features]
X_test_stepwise = X_test[selected_features]
model_stepwise.fit(X_train_stepwise, y_train)

y_pred_stepwise = model_stepwise.predict(X_test_stepwise)

print("\nStepwise Model Results:")
print("Intercept:", model_stepwise.intercept_)
print("Coefficients:", dict(zip(X_train_stepwise.columns, model_stepwise.coef_)))

mse_stepwise = mean_squared_error(y_test, y_pred_stepwise)
r2_stepwise = r2_score(y_test, y_pred_stepwise)
rmse_stepwise = np.sqrt(mse_stepwise)

print("Mean Squared Error (Stepwise):", mse_stepwise)
print("R-squared (Stepwise):", r2_stepwise)
print("Root Mean Squared Error (RMSE) (Stepwise):", rmse_stepwise)

model_results["Model 6"] = {
    "R2": r2_stepwise,
    "RMSE": rmse_stepwise,
    "Intercept": model_stepwise.intercept_,
    "Coefficients": dict(zip(X_train_stepwise.columns, model_stepwise.coef_)),
    "Selected Features": selected_features,
}

print("\nModel 6 Results Saved:")
print(f"R2: {model_results['Model 6']['R2']}")
print(f"RMSE: {model_results['Model 6']['RMSE']}")
print(f"Intercept: {model_results['Model 6']['Intercept']}")
print(f"Coefficients: {model_results['Model 6']['Coefficients']}")
print(f"Selected Features: {model_results['Model 6']['Selected Features']}")



Adding feature lagged_xwobacon with p-value 1.3890888253440606e-46
Adding feature lagged_linedrives_percent with p-value 0.002095959339445563
Adding feature lagged_hard_hit_percent with p-value 0.020861488028135637

Stepwise Model Results:
Intercept: 0.18169125496488325
Coefficients: {'lagged_xwobacon': 0.5122713781397815, 'lagged_linedrives_percent': -0.0019860070694882777, 'lagged_hard_hit_percent': 0.0011555335963402834}
Mean Squared Error (Stepwise): 0.0013858478088129374
R-squared (Stepwise): 0.4829846573680364
Root Mean Squared Error (RMSE) (Stepwise): 0.03722697689596803

Model 6 Results Saved:
R2: 0.4829846573680364
RMSE: 0.03722697689596803
Intercept: 0.18169125496488325
Coefficients: {'lagged_xwobacon': 0.5122713781397815, 'lagged_linedrives_percent': -0.0019860070694882777, 'lagged_hard_hit_percent': 0.0011555335963402834}
Selected Features: ['lagged_xwobacon', 'lagged_linedrives_percent', 'lagged_hard_hit_percent']


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [58]:
## Lasso Regression

# Model 7

X = df[['lagged_xwobacon', 'lagged_player_age', 'lagged_year', 'lagged_pa', 'lagged_babip', 'lagged_xba', 'lagged_xslg', 'lagged_exit_velocity_avg', 'lagged_launch_angle_avg', 'lagged_sweet_spot_percent', 'lagged_barrel_batted_rate', 'lagged_flareburner_percent', 'lagged_poorlyunder_percent', 'lagged_poorlytopped_percent', 'lagged_poorlyweak_percent', 'lagged_hard_hit_percent', 'lagged_meatball_swing_percent', 'lagged_pull_percent', 'lagged_straightaway_percent', 'lagged_groundballs_percent', 'lagged_flyballs_percent', 'lagged_linedrives_percent', 'lagged_popups_percent', 'lagged_sprint_speed']]  
y = df['wobacon']  

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

lasso_model = Lasso(alpha=0.01)  

lasso_model.fit(X_train, y_train)

print("Intercept:", lasso_model.intercept_)
print("Coefficients:", lasso_model.coef_)

y_pred = lasso_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)

print("Mean Squared Error (MSE):", mse)
print("R-squared (R2):", r2)
print("Root Mean Squared Error (RMSE):", rmse)

model_results["Model 7"] = {
    "R2": r2,
    "RMSE": rmse,
    "Intercept": model.intercept_,
    "Coefficients": dict(zip(X.columns, model.coef_)),
}

print("\nModel 7 Results Saved:")
print(f"R2: {model_results['Model 7']['R2']}")
print(f"RMSE: {model_results['Model 7']['RMSE']}")
print(f"Intercept: {model_results['Model 7']['Intercept']}")
print(f"Coefficients: {model_results['Model 7']['Coefficients']}")



Intercept: 0.25567460742188597
Coefficients: [ 0.00000000e+00 -0.00000000e+00  0.00000000e+00 -4.86625944e-06
  0.00000000e+00 -0.00000000e+00  0.00000000e+00  0.00000000e+00
 -0.00000000e+00  0.00000000e+00  5.13335759e-03  0.00000000e+00
 -5.94753742e-05  0.00000000e+00  0.00000000e+00  1.75479928e-03
  1.22641850e-04 -0.00000000e+00 -0.00000000e+00  0.00000000e+00
  0.00000000e+00 -0.00000000e+00 -0.00000000e+00  0.00000000e+00]
Mean Squared Error (MSE): 0.0015119962712988844
R-squared (R2): 0.4359227143899451
Root Mean Squared Error (RMSE): 0.03888439624449484

Model 7 Results Saved:
R2: 0.4359227143899451
RMSE: 0.03888439624449484
Intercept: -10.657410760102502
Coefficients: {'lagged_xwobacon': 0.30413858676726824, 'lagged_player_age': 9.724395668759796e-05, 'lagged_year': 0.005181882524998328, 'lagged_pa': -6.961377781341139e-06, 'lagged_babip': 0.06436432752718732, 'lagged_xba': -0.2769670155571108, 'lagged_xslg': 0.10034330276040762, 'lagged_exit_velocity_avg': -0.0029147835871

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [59]:
## Ridge Regression

# Model 8

X = df[['lagged_xwobacon', 'lagged_player_age', 'lagged_year', 'lagged_pa', 'lagged_babip', 'lagged_xba', 'lagged_xslg', 'lagged_exit_velocity_avg', 'lagged_launch_angle_avg', 'lagged_sweet_spot_percent', 'lagged_barrel_batted_rate', 'lagged_flareburner_percent', 'lagged_poorlyunder_percent', 'lagged_poorlytopped_percent', 'lagged_poorlyweak_percent', 'lagged_hard_hit_percent', 'lagged_meatball_swing_percent', 'lagged_pull_percent', 'lagged_straightaway_percent', 'lagged_groundballs_percent', 'lagged_flyballs_percent', 'lagged_linedrives_percent', 'lagged_popups_percent', 'lagged_sprint_speed']]  
y = df['wobacon']  

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

ridge_model = Ridge(alpha=1.0) 

ridge_model.fit(X_train, y_train)

print("Intercept:", ridge_model.intercept_)
print("Coefficients:", ridge_model.coef_)

y_pred = ridge_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)

print("Mean Squared Error (MSE):", mse)
print("R-squared (R2):", r2)
print("Root Mean Squared Error (RMSE):", rmse)

model_results["Model 8"] = {
    "R2": r2,
    "RMSE": rmse,
    "Intercept": model.intercept_,
    "Coefficients": dict(zip(X.columns, model.coef_)),
}

print("\nModel 8 Results Saved:")
print(f"R2: {model_results['Model 8']['R2']}")
print(f"RMSE: {model_results['Model 8']['RMSE']}")
print(f"Intercept: {model_results['Model 8']['Intercept']}")
print(f"Coefficients: {model_results['Model 8']['Coefficients']}")


Intercept: -9.179593163085425
Coefficients: [ 1.18412292e-02  1.18693131e-04  4.63563329e-03 -1.14044509e-05
  1.06190875e-02 -4.17076897e-03  8.98757531e-04 -2.12499106e-03
  2.03992991e-03  3.22239080e-06  7.34568616e-03 -2.19173916e-04
  1.11031418e-03 -2.15151987e-03 -1.93504879e-03  1.65691371e-03
  1.26451719e-04 -8.13585594e-04 -6.93020838e-04  3.99789794e-03
 -2.14068218e-04  7.69628996e-04 -1.24091221e-03  4.91534707e-03]
Mean Squared Error (MSE): 0.0013944542869134596
R-squared (R2): 0.4797738565891202
Root Mean Squared Error (RMSE): 0.03734239262438147

Model 8 Results Saved:
R2: 0.4797738565891202
RMSE: 0.03734239262438147
Intercept: -10.657410760102502
Coefficients: {'lagged_xwobacon': 0.30413858676726824, 'lagged_player_age': 9.724395668759796e-05, 'lagged_year': 0.005181882524998328, 'lagged_pa': -6.961377781341139e-06, 'lagged_babip': 0.06436432752718732, 'lagged_xba': -0.2769670155571108, 'lagged_xslg': 0.10034330276040762, 'lagged_exit_velocity_avg': -0.00291478358712

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [60]:
## Predicting the mean for next year's BB% to see if the above models add predictive value

# Model 9

mean_wobacon = y_train.mean()

y_pred_mean = np.full_like(y_test, fill_value=mean_wobacon, dtype=np.float64)

mse_mean = mean_squared_error(y_test, y_pred_mean)
r2_mean = r2_score(y_test, y_pred_mean)
rmse_mean = np.sqrt(mse_mean)

print("Mean-Based Model Metrics:")
print("Mean Squared Error (MSE):", mse_mean)
print("R-squared (R2):", r2_mean)
print("Root Mean Squared Error (RMSE):", rmse_mean)

model_results["Model 9"] = {
    "R2": r2,
    "RMSE": rmse,
    "Intercept": model.intercept_,
    "Coefficients": dict(zip(X.columns, model.coef_)),
}

print("\nModel 9 Results Saved:")
print(f"R2: {model_results['Model 9']['R2']}")
print(f"RMSE: {model_results['Model 9']['RMSE']}")
print(f"Intercept: {model_results['Model 9']['Intercept']}")
print(f"Coefficients: {model_results['Model 9']['Coefficients']}")


Mean-Based Model Metrics:
Mean Squared Error (MSE): 0.0026879967913668065
R-squared (R2): -0.002805339261979123
Root Mean Squared Error (RMSE): 0.05184589464332549

Model 9 Results Saved:
R2: 0.4797738565891202
RMSE: 0.03734239262438147
Intercept: -10.657410760102502
Coefficients: {'lagged_xwobacon': 0.30413858676726824, 'lagged_player_age': 9.724395668759796e-05, 'lagged_year': 0.005181882524998328, 'lagged_pa': -6.961377781341139e-06, 'lagged_babip': 0.06436432752718732, 'lagged_xba': -0.2769670155571108, 'lagged_xslg': 0.10034330276040762, 'lagged_exit_velocity_avg': -0.0029147835871212906, 'lagged_launch_angle_avg': 0.0015572891802703606, 'lagged_sweet_spot_percent': -0.00031212322991251885, 'lagged_barrel_batted_rate': 0.0037029367578655724, 'lagged_flareburner_percent': -0.00048200738118547783, 'lagged_poorlyunder_percent': 0.002343901759032171, 'lagged_poorlytopped_percent': -0.0018783834235430947, 'lagged_poorlyweak_percent': -0.002064490770152615, 'lagged_hard_hit_percent': 0.

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [61]:
## Random Forest

# Model 10

X = df[['lagged_xwobacon', 'lagged_player_age', 'lagged_year', 'lagged_pa', 'lagged_babip', 'lagged_xba', 'lagged_xslg', 'lagged_exit_velocity_avg', 'lagged_launch_angle_avg', 'lagged_sweet_spot_percent', 'lagged_barrel_batted_rate', 'lagged_flareburner_percent', 'lagged_poorlyunder_percent', 'lagged_poorlytopped_percent', 'lagged_poorlyweak_percent', 'lagged_hard_hit_percent', 'lagged_meatball_swing_percent', 'lagged_pull_percent', 'lagged_straightaway_percent', 'lagged_groundballs_percent', 'lagged_flyballs_percent', 'lagged_linedrives_percent', 'lagged_popups_percent', 'lagged_sprint_speed']]  
y = df['wobacon']  

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

rf_model.fit(X_train, y_train)

y_pred = rf_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)

print("\nRandom Forest Model Evaluation:")
print("Mean Squared Error (MSE):", mse)
print("R-squared (R2):", r2)
print("Root Mean Squared Error (RMSE):", rmse)

feature_importance = rf_model.feature_importances_
features = X.columns
print("\nFeature Importances:")
for feature, importance in sorted(zip(features, feature_importance), key=lambda x: x[1], reverse=True):
    print(f"{feature}: {importance:.4f}")

model_results["Model 10"] = {
    "R2": r2,
    "RMSE": rmse,
    "Intercept": model.intercept_,
    "Coefficients": dict(zip(X.columns, model.coef_)),
}

print("\nModel 10 Results Saved:")
print(f"R2: {model_results['Model 10']['R2']}")
print(f"RMSE: {model_results['Model 10']['RMSE']}")
print(f"Intercept: {model_results['Model 10']['Intercept']}")
print(f"Coefficients: {model_results['Model 10']['Coefficients']}")



  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):



Random Forest Model Evaluation:
Mean Squared Error (MSE): 0.0016999636806818196
R-squared (R2): 0.3657981062275213
Root Mean Squared Error (RMSE): 0.041230615817397384

Feature Importances:
lagged_barrel_batted_rate: 0.2254
lagged_xwobacon: 0.1718
lagged_exit_velocity_avg: 0.0907
lagged_hard_hit_percent: 0.0692
lagged_xslg: 0.0504
lagged_meatball_swing_percent: 0.0427
lagged_pa: 0.0298
lagged_linedrives_percent: 0.0275
lagged_poorlyweak_percent: 0.0269
lagged_pull_percent: 0.0242
lagged_babip: 0.0236
lagged_popups_percent: 0.0234
lagged_sprint_speed: 0.0234
lagged_straightaway_percent: 0.0217
lagged_sweet_spot_percent: 0.0212
lagged_player_age: 0.0212
lagged_flareburner_percent: 0.0175
lagged_xba: 0.0173
lagged_groundballs_percent: 0.0141
lagged_flyballs_percent: 0.0138
lagged_poorlytopped_percent: 0.0133
lagged_poorlyunder_percent: 0.0114
lagged_year: 0.0104
lagged_launch_angle_avg: 0.0091

Model 10 Results Saved:
R2: 0.3657981062275213
RMSE: 0.041230615817397384
Intercept: -10.65741

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [62]:
## XGBoost 

# Model 11

X = df[['lagged_xwobacon', 'lagged_player_age', 'lagged_year', 'lagged_pa', 'lagged_babip', 'lagged_xba', 'lagged_xslg', 'lagged_exit_velocity_avg', 'lagged_launch_angle_avg', 'lagged_sweet_spot_percent', 'lagged_barrel_batted_rate', 'lagged_flareburner_percent', 'lagged_poorlyunder_percent', 'lagged_poorlytopped_percent', 'lagged_poorlyweak_percent', 'lagged_hard_hit_percent', 'lagged_meatball_swing_percent', 'lagged_pull_percent', 'lagged_straightaway_percent', 'lagged_groundballs_percent', 'lagged_flyballs_percent', 'lagged_linedrives_percent', 'lagged_popups_percent', 'lagged_sprint_speed']]  
y = df['wobacon']  

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

xgb_model = XGBRegressor(
    n_estimators=100,      
    learning_rate=0.1,      
    max_depth=3,           
    subsample=0.8,        
    colsample_bytree=0.8,   
    random_state=42
)

xgb_model.fit(X_train, y_train)

y_pred = xgb_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)

print("\nXGBoost Model Evaluation:")
print("Mean Squared Error (MSE):", mse)
print("R-squared (R2):", r2)
print("Root Mean Squared Error (RMSE):", rmse)

feature_importances = xgb_model.feature_importances_
features = X.columns
print("\nFeature Importances:")
for feature, importance in sorted(zip(features, feature_importances), key=lambda x: x[1], reverse=True):
    print(f"{feature}: {importance:.4f}")

model_results["Model 11"] = {
    "R2": r2,
    "RMSE": rmse,
    "Intercept": model.intercept_,
    "Coefficients": dict(zip(X.columns, model.coef_)),
}

print("\nModel 11 Results Saved:")
print(f"R2: {model_results['Model 11']['R2']}")
print(f"RMSE: {model_results['Model 11']['RMSE']}")
print(f"Intercept: {model_results['Model 11']['Intercept']}")
print(f"Coefficients: {model_results['Model 11']['Coefficients']}")





XGBoost Model Evaluation:
Mean Squared Error (MSE): 0.001673248995218954
R-squared (R2): 0.37576449804201606
Root Mean Squared Error (RMSE): 0.040905366337669606

Feature Importances:
lagged_barrel_batted_rate: 0.1662
lagged_xwobacon: 0.1226
lagged_exit_velocity_avg: 0.1054
lagged_hard_hit_percent: 0.0605
lagged_xslg: 0.0564
lagged_meatball_swing_percent: 0.0370
lagged_year: 0.0352
lagged_flareburner_percent: 0.0324
lagged_flyballs_percent: 0.0280
lagged_linedrives_percent: 0.0272
lagged_sweet_spot_percent: 0.0264
lagged_popups_percent: 0.0264
lagged_pull_percent: 0.0263
lagged_babip: 0.0262
lagged_straightaway_percent: 0.0259
lagged_groundballs_percent: 0.0256
lagged_sprint_speed: 0.0249
lagged_poorlyweak_percent: 0.0234
lagged_pa: 0.0231
lagged_poorlyunder_percent: 0.0228
lagged_player_age: 0.0201
lagged_launch_angle_avg: 0.0197
lagged_xba: 0.0196
lagged_poorlytopped_percent: 0.0187

Model 11 Results Saved:
R2: 0.37576449804201606
RMSE: 0.040905366337669606
Intercept: -10.6574107601

  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(data):
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [63]:
## Finding the best model thus far

print("All Model Results:")
for model_label, results in model_results.items():
    print(f"\n{model_label}:")
    print(f"  R2: {results['R2']:.4f}")
    print(f"  RMSE: {results['RMSE']:.4f}")
    print(f"  Intercept: {results['Intercept']:.4f}")
    print("  Coefficients:")
    for feature, coef in results["Coefficients"].items():
        print(f"    {feature}: {coef:.4f}")

## Model 2 has the highest R2 and lowest RMSE, so it is the best model thus far


All Model Results:

Model 1:
  R2: 0.5008
  RMSE: 0.0366
  Intercept: 0.1276
  Coefficients:
    lagged_xwobacon: 0.6522

Model 2:
  R2: 0.5047
  RMSE: 0.0364
  Intercept: 0.1354
  Coefficients:
    lagged_xwobacon: 0.6521
    lagged_player_age: -0.0003

Model 3:
  R2: 0.5041
  RMSE: 0.0365
  Intercept: -8.4457
  Coefficients:
    lagged_xwobacon: 0.6484
    lagged_year: 0.0042
    lagged_player_age: -0.0002

Model 4:
  R2: 0.4874
  RMSE: 0.0371
  Intercept: -10.6574
  Coefficients:
    lagged_xwobacon: 0.3041
    lagged_player_age: 0.0001
    lagged_year: 0.0052
    lagged_pa: -0.0000
    lagged_babip: 0.0644
    lagged_xba: -0.2770
    lagged_xslg: 0.1003
    lagged_exit_velocity_avg: -0.0029
    lagged_launch_angle_avg: 0.0016
    lagged_sweet_spot_percent: -0.0003
    lagged_barrel_batted_rate: 0.0037
    lagged_flareburner_percent: -0.0005
    lagged_poorlyunder_percent: 0.0023
    lagged_poorlytopped_percent: -0.0019
    lagged_poorlyweak_percent: -0.0021
    lagged_hard_hit_perc

In [64]:
## Predictions for 2025: Largest Improvement and Decline in wOBACON using Model 2

## Rerun the model 
X = df[['lagged_xwobacon', 'lagged_player_age']]  
y = df['wobacon']  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = LinearRegression()
model.fit(X_train, y_train)
print("Intercept:", model.intercept_)
print("Coefficients:", model.coef_)
y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R-squared:", r2)

rmse = np.sqrt(mse)
print("Root Mean Squared Error (RMSE):", rmse)

df_original[['last_name', 'first_name']] = df_original['last_name, first_name'].str.split(', ', expand=True)

df_2024 = df_original[df_original['year'] == 2024].copy()


df_2024['lagged_xwobacon'] = df_2024['xwobacon']  
df_2024['lagged_player_age'] = df_2024['player_age']   

X_2025 = df_2024[['lagged_xwobacon', 'lagged_player_age']]
df_2024['predicted_wobacon_2025'] = model.predict(X_2025)

df_2024['wobacon_difference'] = df_2024['predicted_wobacon_2025'] - df_2024['wobacon']

top_5_players = df_2024.nlargest(5, 'wobacon_difference')[
    ['last_name', 'first_name', 'player_age', 'wobacon', 'xwobacon', 'predicted_wobacon_2025', 'wobacon_difference']
]

bot_5_players = df_2024.nsmallest(5, 'wobacon_difference')[
    ['last_name', 'first_name', 'player_age', 'wobacon', 'xwobacon', 'predicted_wobacon_2025', 'wobacon_difference']
]

print("Top 5 Players with Largest Predicted Increase in wOBACON (2025):")
print(top_5_players)

print("Bottom 5 Players with Largest Predicted Decrease in wOBACON (2025):")
print(bot_5_players)



Intercept: 0.13542757700585528
Coefficients: [ 6.52085974e-01 -2.81000905e-04]
Mean Squared Error: 0.0013275952951131067
R-squared: 0.5047168007810195
Root Mean Squared Error (RMSE): 0.03643618112691157
Top 5 Players with Largest Predicted Increase in wOBACON (2025):
    last_name   first_name  player_age  wobacon  xwobacon  \
728     Morel  Christopher          25    0.324     0.381   
759    Bailey      Patrick          25    0.321     0.376   
761    Garcia       Maikel          24    0.292     0.331   
659  Crawford         J.P.          29    0.301     0.341   
686   Verdugo         Alex          28    0.296     0.320   

     predicted_wobacon_2025  wobacon_difference  
728                0.376847            0.052847  
759                0.373587            0.052587  
761                0.344524            0.052524  
659                0.349640            0.048640  
686                0.336227            0.040227  
Bottom 5 Players with Largest Predicted Decrease in wOBACON (2025

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
