## This is the first of 3 notebooks exploring xwOBA, its various components, and how they can be used to predict performance. 

## This notebook is focused on xwOBA overall. 

## Main research question: How predictive is xwOBA of future performance, especially wOBA?

## Data Used: Baseball Savant CSV with various statistics from 2021 to 2024 and a PA threshold of 400. 
     

In [34]:
## Import libraries

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from xgboost import XGBRegressor

In [35]:
## Read and examine data
## The savant csv is saved on the github repo

df_original = pd.read_csv('~/Desktop/savant.csv')
df_original.columns
print(df_original.columns.tolist())

['last_name, first_name', 'player_id', 'year', 'player_age', 'ab', 'pa', 'k_percent', 'bb_percent', 'babip', 'xba', 'xslg', 'woba', 'xwoba', 'wobacon', 'xwobacon', 'exit_velocity_avg', 'launch_angle_avg', 'sweet_spot_percent', 'barrel_batted_rate', 'solidcontact_percent', 'flareburner_percent', 'poorlyunder_percent', 'poorlytopped_percent', 'poorlyweak_percent', 'hard_hit_percent', 'avg_best_speed', 'avg_hyper_speed', 'z_swing_percent', 'oz_swing_percent', 'oz_swing_miss_percent', 'meatball_swing_percent', 'iz_contact_percent', 'whiff_percent', 'swing_percent', 'pull_percent', 'straightaway_percent', 'groundballs_percent', 'flyballs_percent', 'linedrives_percent', 'popups_percent', 'sprint_speed']


In [None]:
## Create laggged variables  

df = df_original.sort_values(by=['player_id', 'year'])
columns_to_lag = df.columns[df.columns.get_loc('player_id') + 1:]
lagged_df = df.groupby('player_id')[columns_to_lag].shift(1).add_prefix('lagged_')
df = pd.concat([df, lagged_df], axis=1)
df = df.dropna()
print(df)

    last_name, first_name  player_id  year  player_age   ab   pa  k_percent  \
188       Cabrera, Miguel     408234  2022          39  397  433       23.3   
189      Cruz Jr., Nelson     443558  2022          41  448  507       23.5   
190        Peralta, David     444482  2022          34  439  490       23.3   
393        Peralta, David     444482  2023          35  394  422       17.1   
191     Blackmon, Charlie     453568  2022          35  530  577       18.9   
..                    ...        ...   ...         ...  ...  ...        ...   
392        Vaughn, Andrew     683734  2022          24  510  555       17.3   
598        Vaughn, Andrew     683734  2023          25  566  615       21.0   
796        Vaughn, Andrew     683734  2024          26  570  619       21.3   
800        Doyle, Brenton     686668  2024          26  542  603       25.4   
811     Yoshida, Masataka     807799  2024          30  378  421       12.4   

     bb_percent  babip    xba  ...  lagged_iz_conta

In [37]:
## Simple linear regression: predicting next year wOBA based on previous year xwOBA

model_results = {}

# Model 1
X = df[['lagged_xwoba']]  
y = df['woba']  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = LinearRegression()
model.fit(X_train, y_train)

print("Intercept:", model.intercept_)
print("Coefficients:", model.coef_)

y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)

print("Mean Squared Error:", mse)
print("R-squared:", r2)
print("Root Mean Squared Error (RMSE):", rmse)

model_results["Model 1"] = {
    "R2": r2,
    "RMSE": rmse,
    "Intercept": model.intercept_,
    "Coefficients": dict(zip(X.columns, model.coef_)),
}

print("\nModel 1 Results Saved:")
print(f"R2: {model_results['Model 1']['R2']}")
print(f"RMSE: {model_results['Model 1']['RMSE']}")
print(f"Intercept: {model_results['Model 1']['Intercept']}")
print(f"Coefficients: {model_results['Model 1']['Coefficients']}")


Intercept: 0.1441255310523383
Coefficients: [0.55360409]
Mean Squared Error: 0.0007799340731601359
R-squared: 0.35038684795689223
Root Mean Squared Error (RMSE): 0.02792729978283142

Model 1 Results Saved:
R2: 0.35038684795689223
RMSE: 0.02792729978283142
Intercept: 0.1441255310523383
Coefficients: {'lagged_xwoba': 0.5536040861955068}


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [38]:
## Simple Linear Regression: Adding in Age

# Model 2
X = df[['lagged_xwoba', 'lagged_player_age']]  
y = df['woba']  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = LinearRegression()
model.fit(X_train, y_train)
print("Intercept:", model.intercept_)
print("Coefficients:", model.coef_)
y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R-squared:", r2)

rmse = np.sqrt(mse)
print("Root Mean Squared Error (RMSE):", rmse)

model_results["Model 2"] = {
    "R2": r2,
    "RMSE": rmse,
    "Intercept": model.intercept_,
    "Coefficients": dict(zip(X.columns, model.coef_)),
}

print("\nModel 2 Results Saved:")
print(f"R2: {model_results['Model 2']['R2']}")
print(f"RMSE: {model_results['Model 2']['RMSE']}")
print(f"Intercept: {model_results['Model 2']['Intercept']}")
print(f"Coefficients: {model_results['Model 2']['Coefficients']}")


Intercept: 0.15877012410160668
Coefficients: [ 0.55962444 -0.0005986 ]
Mean Squared Error: 0.0007586052301976316
R-squared: 0.3681518070516593
Root Mean Squared Error (RMSE): 0.02754278907804421

Model 2 Results Saved:
R2: 0.3681518070516593
RMSE: 0.02754278907804421
Intercept: 0.15877012410160668
Coefficients: {'lagged_xwoba': 0.5596244375244788, 'lagged_player_age': -0.0005986046707188332}


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [39]:
## Adding in year

X = df[['lagged_xwoba', 'lagged_year', 'lagged_player_age']]  
y = df['woba']  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = LinearRegression()
model.fit(X_train, y_train)
print("Intercept:", model.intercept_)
print("Coefficients:", model.coef_)
y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R-squared:", r2)

rmse = np.sqrt(mse)
print("Root Mean Squared Error (RMSE):", rmse)

model_results["Model 3"] = {
    "R2": r2,
    "RMSE": rmse,
    "Intercept": model.intercept_,
    "Coefficients": dict(zip(X.columns, model.coef_)),
}

print("\nModel 3 Results Saved:")
print(f"R2: {model_results['Model 3']['R2']}")
print(f"RMSE: {model_results['Model 3']['RMSE']}")
print(f"Intercept: {model_results['Model 3']['Intercept']}")
print(f"Coefficients: {model_results['Model 3']['Coefficients']}")



Intercept: -4.547965724298453
Coefficients: [ 0.55767506  0.0023278  -0.00057462]
Mean Squared Error: 0.0007603900053781419
R-squared: 0.3666652539305706
Root Mean Squared Error (RMSE): 0.02757517008792769

Model 3 Results Saved:
R2: 0.3666652539305706
RMSE: 0.02757517008792769
Intercept: -4.547965724298453
Coefficients: {'lagged_xwoba': 0.5576750575504529, 'lagged_year': 0.0023277973212030745, 'lagged_player_age': -0.0005746178874669729}


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [40]:
## Another Linear Regression: Adding in a bunch of potentially relevant predictors

X = df[['lagged_xwoba','lagged_player_age', 'lagged_year', 'lagged_pa', 'lagged_xwobacon', 'lagged_exit_velocity_avg', 'lagged_launch_angle_avg', 'lagged_sweet_spot_percent', 'lagged_barrel_batted_rate', 'lagged_solidcontact_percent', 'lagged_flareburner_percent', 'lagged_poorlyunder_percent', 'lagged_poorlytopped_percent', 'lagged_poorlyweak_percent', 'lagged_hard_hit_percent', 'lagged_z_swing_percent', 'lagged_oz_swing_percent', 'lagged_meatball_swing_percent', 'lagged_iz_contact_percent', 'lagged_pull_percent', 'lagged_straightaway_percent', 'lagged_groundballs_percent', 'lagged_linedrives_percent', 'lagged_popups_percent', 'lagged_sprint_speed']]  
y = df['woba']  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = LinearRegression()
model.fit(X_train, y_train)
print("Intercept:", model.intercept_)
print("Coefficients:", model.coef_)
y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R-squared:", r2)

rmse = np.sqrt(mse)
print("Root Mean Squared Error (RMSE):", rmse)

model_results["Model 4"] = {
    "R2": r2,
    "RMSE": rmse,
    "Intercept": model.intercept_,
    "Coefficients": dict(zip(X.columns, model.coef_)),
}

print("\nModel 4 Results Saved:")
print(f"R2: {model_results['Model 4']['R2']}")
print(f"RMSE: {model_results['Model 4']['RMSE']}")
print(f"Intercept: {model_results['Model 4']['Intercept']}")
print(f"Coefficients: {model_results['Model 4']['Coefficients']}")



Intercept: -3.9337358475015876
Coefficients: [ 4.52423162e-01 -3.15147586e-04  1.85616257e-03  2.91332375e-06
 -1.28140640e-01  2.76845956e-03  2.85548237e-03  1.57129720e-04
 -6.00866698e-04 -3.22372162e-03 -3.51617013e-03 -1.72042544e-03
 -4.40236202e-03 -4.20483524e-03  3.14117848e-04  4.80577496e-04
  2.63465102e-05 -1.12423377e-04  1.59637253e-03 -1.50867935e-04
  5.21616032e-04  3.85134186e-03  2.67882303e-04 -5.44719609e-04
  2.96654920e-03]
Mean Squared Error: 0.0007877262990555925
R-squared: 0.3438966424387905
Root Mean Squared Error (RMSE): 0.0280664621756215

Model 4 Results Saved:
R2: 0.3438966424387905
RMSE: 0.0280664621756215
Intercept: -3.9337358475015876
Coefficients: {'lagged_xwoba': 0.45242316155508083, 'lagged_player_age': -0.0003151475856129229, 'lagged_year': 0.001856162565675328, 'lagged_pa': 2.9133237512221784e-06, 'lagged_xwobacon': -0.12814064047778898, 'lagged_exit_velocity_avg': 0.0027684595649454906, 'lagged_launch_angle_avg': 0.00285548236905071, 'lagged_sw

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [None]:
## The above model likely has a lot of multicollinearity. This uses VIF to check for multicollinearity and remove highly correlated variables
## The goal is to still incorporate many of the above predictors to create the most predictive model possible

# Model 5
def calculate_vif(df):
    vif_data = pd.DataFrame()
    vif_data['feature'] = df.columns
    vif_data['VIF'] = [variance_inflation_factor(df.values, i) for i in range(df.shape[1])]
    return vif_data

def reduce_vif(df, threshold=5.0):
    while True:
        vif_data = calculate_vif(df)
        max_vif = vif_data['VIF'].max()
        if max_vif > threshold:
            feature_to_remove = vif_data.sort_values('VIF', ascending=False).iloc[0]['feature']
            print(f'Removing {feature_to_remove} with VIF={max_vif}')
            df = df.drop(columns=[feature_to_remove])
        else:
            break
    return df

X_train_reduced = reduce_vif(X_train)

model_reduced = LinearRegression()
model_reduced.fit(X_train_reduced, y_train)

X_test_reduced = X_test[X_train_reduced.columns]
y_pred_reduced = model_reduced.predict(X_test_reduced)

print("Reduced Model Results:")
print("Intercept:", model_reduced.intercept_)
print("Coefficients:", dict(zip(X_train_reduced.columns, model_reduced.coef_)))

mse_reduced = mean_squared_error(y_test, y_pred_reduced)
r2_reduced = r2_score(y_test, y_pred_reduced)
rmse_reduced = np.sqrt(mse_reduced)

print("Mean Squared Error (Reduced):", mse_reduced)
print("R-squared (Reduced):", r2_reduced)
print("Root Mean Squared Error (RMSE) (Reduced):", rmse_reduced)

final_vif = calculate_vif(X_train_reduced)
print("Final VIF values:")
print(final_vif)

model_results["Model 5"] = {
    "R2": r2_reduced,
    "RMSE": rmse_reduced,
    "Intercept": model_reduced.intercept_,
    "Coefficients": dict(zip(X_train_reduced.columns, model_reduced.coef_)),
    "Final VIF": final_vif.to_dict(orient='records'), 
}

print("\nModel 5 Results Saved:")
print(f"R2: {model_results['Model 5']['R2']}")
print(f"RMSE: {model_results['Model 5']['RMSE']}")
print(f"Intercept: {model_results['Model 5']['Intercept']}")
print(f"Coefficients: {model_results['Model 5']['Coefficients']}")
print(f"Final VIF values: {model_results['Model 5']['Final VIF']}")

Removing lagged_year with VIF=109093.34202677768
Removing lagged_exit_velocity_avg with VIF=22291.66760459724
Removing lagged_groundballs_percent with VIF=1974.344817055038
Removing lagged_xwobacon with VIF=1793.473163577033
Removing lagged_iz_contact_percent with VIF=951.7505316377337
Removing lagged_z_swing_percent with VIF=770.9798574803996
Removing lagged_sprint_speed with VIF=649.0374709937787
Removing lagged_sweet_spot_percent with VIF=549.0817406350403
Removing lagged_xwoba with VIF=419.6249535329412
Removing lagged_poorlyunder_percent with VIF=338.0760496098787
Removing lagged_straightaway_percent with VIF=247.1256016482887
Removing lagged_hard_hit_percent with VIF=208.4753129814638
Removing lagged_meatball_swing_percent with VIF=166.63605414326236
Removing lagged_linedrives_percent with VIF=150.9195258896654
Removing lagged_poorlytopped_percent with VIF=88.78505180799326
Removing lagged_pull_percent with VIF=73.10222451878488
Removing lagged_player_age with VIF=65.560420406570

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [42]:
## Trying a stepwise model to see how it compares to all of the above models

# Model 6
def stepwise_selection(X, y, initial_features=[], threshold_in=0.05, threshold_out=0.05, verbose=True):
    included = list(initial_features)
    while True:
        changed = False
        
        excluded = list(set(X.columns) - set(included))
        new_pvals = pd.Series(index=excluded, dtype=float)
        for new_column in excluded:
            model = sm.OLS(y, sm.add_constant(X[included + [new_column]])).fit()
            new_pvals[new_column] = model.pvalues[new_column]
        best_pval = new_pvals.min()
        if best_pval < threshold_in:
            best_feature = new_pvals.idxmin()
            included.append(best_feature)
            changed = True
            if verbose:
                print(f'Adding feature {best_feature} with p-value {best_pval}')
        
        model = sm.OLS(y, sm.add_constant(X[included])).fit()
        pvals = model.pvalues.iloc[1:]  
        worst_pval = pvals.max()
        if worst_pval > threshold_out:
            worst_feature = pvals.idxmax()
            included.remove(worst_feature)
            changed = True
            if verbose:
                print(f'Removing feature {worst_feature} with p-value {worst_pval}')
        
        if not changed:
            break
    
    return included

selected_features = stepwise_selection(X_train, y_train)

model_stepwise = LinearRegression()
X_train_stepwise = X_train[selected_features]
X_test_stepwise = X_test[selected_features]
model_stepwise.fit(X_train_stepwise, y_train)

y_pred_stepwise = model_stepwise.predict(X_test_stepwise)

print("\nStepwise Model Results:")
print("Intercept:", model_stepwise.intercept_)
print("Coefficients:", dict(zip(X_train_stepwise.columns, model_stepwise.coef_)))

mse_stepwise = mean_squared_error(y_test, y_pred_stepwise)
r2_stepwise = r2_score(y_test, y_pred_stepwise)
rmse_stepwise = np.sqrt(mse_stepwise)

print("Mean Squared Error (Stepwise):", mse_stepwise)
print("R-squared (Stepwise):", r2_stepwise)
print("Root Mean Squared Error (RMSE) (Stepwise):", rmse_stepwise)

model_results["Model 6"] = {
    "R2": r2_stepwise,
    "RMSE": rmse_stepwise,
    "Intercept": model_stepwise.intercept_,
    "Coefficients": dict(zip(X_train_stepwise.columns, model_stepwise.coef_)),
    "Selected Features": selected_features,
}

print("\nModel 6 Results Saved:")
print(f"R2: {model_results['Model 6']['R2']}")
print(f"RMSE: {model_results['Model 6']['RMSE']}")
print(f"Intercept: {model_results['Model 6']['Intercept']}")
print(f"Coefficients: {model_results['Model 6']['Coefficients']}")
print(f"Selected Features: {model_results['Model 6']['Selected Features']}")



Adding feature lagged_xwoba with p-value 2.4331900907232218e-26
Adding feature lagged_exit_velocity_avg with p-value 0.007314474360284081
Adding feature lagged_iz_contact_percent with p-value 0.01714424974483817
Adding feature lagged_linedrives_percent with p-value 0.006994089234937503

Stepwise Model Results:
Intercept: -0.23091130924157227
Coefficients: {'lagged_xwoba': 0.40707166539245854, 'lagged_exit_velocity_avg': 0.004051256487085949, 'lagged_iz_contact_percent': 0.0011929646647692342, 'lagged_linedrives_percent': -0.001571877597359122}
Mean Squared Error (Stepwise): 0.0008382202508276731
R-squared (Stepwise): 0.30183983751312704
Root Mean Squared Error (RMSE) (Stepwise): 0.02895203362162446

Model 6 Results Saved:
R2: 0.30183983751312704
RMSE: 0.02895203362162446
Intercept: -0.23091130924157227
Coefficients: {'lagged_xwoba': 0.40707166539245854, 'lagged_exit_velocity_avg': 0.004051256487085949, 'lagged_iz_contact_percent': 0.0011929646647692342, 'lagged_linedrives_percent': -0.

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [43]:
## Lasso regression

X = df[['lagged_xwoba', 'lagged_player_age', 'lagged_year', 'lagged_pa', 'lagged_xwobacon',
        'lagged_exit_velocity_avg', 'lagged_launch_angle_avg', 'lagged_sweet_spot_percent',
        'lagged_barrel_batted_rate', 'lagged_solidcontact_percent', 'lagged_flareburner_percent',
        'lagged_poorlyunder_percent', 'lagged_poorlytopped_percent', 'lagged_poorlyweak_percent',
        'lagged_hard_hit_percent', 'lagged_z_swing_percent', 'lagged_oz_swing_percent',
        'lagged_meatball_swing_percent', 'lagged_iz_contact_percent', 'lagged_pull_percent',
        'lagged_straightaway_percent', 'lagged_groundballs_percent', 'lagged_linedrives_percent',
        'lagged_popups_percent', 'lagged_sprint_speed']]
y = df['woba']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

lasso_model = Lasso(alpha=0.01)  

lasso_model.fit(X_train, y_train)

print("Intercept:", lasso_model.intercept_)
print("Coefficients:", lasso_model.coef_)

y_pred = lasso_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)

print("Mean Squared Error (MSE):", mse)
print("R-squared (R2):", r2)
print("Root Mean Squared Error (RMSE):", rmse)

model_results["Model 7"] = {
    "R2": r2,
    "RMSE": rmse,
    "Intercept": model.intercept_,
    "Coefficients": dict(zip(X.columns, model.coef_)),
}

print("\nModel 7 Results Saved:")
print(f"R2: {model_results['Model 7']['R2']}")
print(f"RMSE: {model_results['Model 7']['RMSE']}")
print(f"Intercept: {model_results['Model 7']['Intercept']}")
print(f"Coefficients: {model_results['Model 7']['Coefficients']}")




Intercept: 0.1658031688510414
Coefficients: [ 0.00000000e+00 -0.00000000e+00  0.00000000e+00  3.78045575e-05
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  1.34832016e-03 -0.00000000e+00 -0.00000000e+00  0.00000000e+00
 -0.00000000e+00 -0.00000000e+00  1.44782276e-03  0.00000000e+00
 -1.76011261e-04  0.00000000e+00  8.99970208e-04  0.00000000e+00
  0.00000000e+00 -0.00000000e+00 -0.00000000e+00 -0.00000000e+00
  0.00000000e+00]
Mean Squared Error (MSE): 0.0009031377887676867
R-squared (R2): 0.24776951555216853
Root Mean Squared Error (RMSE): 0.030052250976718646

Model 7 Results Saved:
R2: 0.24776951555216853
RMSE: 0.030052250976718646
Intercept: -3.9337358475015876
Coefficients: {'lagged_xwoba': 0.45242316155508083, 'lagged_player_age': -0.0003151475856129229, 'lagged_year': 0.001856162565675328, 'lagged_pa': 2.9133237512221784e-06, 'lagged_xwobacon': -0.12814064047778898, 'lagged_exit_velocity_avg': 0.0027684595649454906, 'lagged_launch_angle_avg': 0.0028554823690

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [44]:
## Ridge Regression

X = df[['lagged_xwoba', 'lagged_player_age', 'lagged_year', 'lagged_pa', 'lagged_xwobacon',
        'lagged_exit_velocity_avg', 'lagged_launch_angle_avg', 'lagged_sweet_spot_percent',
        'lagged_barrel_batted_rate', 'lagged_solidcontact_percent', 'lagged_flareburner_percent',
        'lagged_poorlyunder_percent', 'lagged_poorlytopped_percent', 'lagged_poorlyweak_percent',
        'lagged_hard_hit_percent', 'lagged_z_swing_percent', 'lagged_oz_swing_percent',
        'lagged_meatball_swing_percent', 'lagged_iz_contact_percent', 'lagged_pull_percent',
        'lagged_straightaway_percent', 'lagged_groundballs_percent', 'lagged_linedrives_percent',
        'lagged_popups_percent', 'lagged_sprint_speed']]
y = df['woba']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

ridge_model = Ridge(alpha=1.0) 

ridge_model.fit(X_train, y_train)

print("Intercept:", ridge_model.intercept_)
print("Coefficients:", ridge_model.coef_)

y_pred = ridge_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)

print("Mean Squared Error (MSE):", mse)
print("R-squared (R2):", r2)
print("Root Mean Squared Error (RMSE):", rmse)

model_results["Model 8"] = {
    "R2": r2,
    "RMSE": rmse,
    "Intercept": model.intercept_,
    "Coefficients": dict(zip(X.columns, model.coef_)),
}

print("\nModel 8 Results Saved:")
print(f"R2: {model_results['Model 8']['R2']}")
print(f"RMSE: {model_results['Model 8']['RMSE']}")
print(f"Intercept: {model_results['Model 8']['Intercept']}")
print(f"Coefficients: {model_results['Model 8']['Coefficients']}")



Intercept: -3.9927600963881176
Coefficients: [ 2.34294651e-02 -1.82735153e-04  1.79144972e-03  1.80474015e-05
  4.67353534e-03  3.01691905e-03  3.27392324e-03  2.74999887e-04
  1.89285900e-03 -1.99392302e-03 -2.08908081e-03 -9.56982299e-04
 -3.40634882e-03 -2.82097970e-03  4.46812074e-04  8.29170871e-04
 -5.64879403e-04 -1.53432240e-04  2.81052796e-03 -7.58094017e-05
  7.31468843e-04  4.12752742e-03  4.63101738e-04 -5.12351701e-04
  2.23572186e-03]
Mean Squared Error (MSE): 0.000823784532584361
R-squared (R2): 0.3138634594483092
Root Mean Squared Error (RMSE): 0.028701646861885137

Model 8 Results Saved:
R2: 0.3138634594483092
RMSE: 0.028701646861885137
Intercept: -3.9337358475015876
Coefficients: {'lagged_xwoba': 0.45242316155508083, 'lagged_player_age': -0.0003151475856129229, 'lagged_year': 0.001856162565675328, 'lagged_pa': 2.9133237512221784e-06, 'lagged_xwobacon': -0.12814064047778898, 'lagged_exit_velocity_avg': 0.0027684595649454906, 'lagged_launch_angle_avg': 0.002855482369050

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [45]:
## Predicting the mean for all player's for next year's wOBA 
## This is a validation technique to compare to our prior models

mean_woba = y_train.mean()

y_pred_mean = np.full_like(y_test, fill_value=mean_woba, dtype=np.float64)

mse_mean = mean_squared_error(y_test, y_pred_mean)
r2_mean = r2_score(y_test, y_pred_mean)
rmse_mean = np.sqrt(mse_mean)

print("Mean-Based Model Metrics:")
print("Mean Squared Error (MSE):", mse_mean)
print("R-squared (R2):", r2_mean)
print("Root Mean Squared Error (RMSE):", rmse_mean)

model_results["Model 9"] = {
    "R2": r2,
    "RMSE": rmse,
    "Intercept": model.intercept_,
    "Coefficients": dict(zip(X.columns, model.coef_)),
}

print("\nModel 9 Results Saved:")
print(f"R2: {model_results['Model 9']['R2']}")
print(f"RMSE: {model_results['Model 9']['RMSE']}")
print(f"Intercept: {model_results['Model 9']['Intercept']}")
print(f"Coefficients: {model_results['Model 9']['Coefficients']}")



Mean-Based Model Metrics:
Mean Squared Error (MSE): 0.0012007087130712487
R-squared (R2): -7.962034976860544e-05
Root Mean Squared Error (RMSE): 0.03465124403353

Model 9 Results Saved:
R2: 0.3138634594483092
RMSE: 0.028701646861885137
Intercept: -3.9337358475015876
Coefficients: {'lagged_xwoba': 0.45242316155508083, 'lagged_player_age': -0.0003151475856129229, 'lagged_year': 0.001856162565675328, 'lagged_pa': 2.9133237512221784e-06, 'lagged_xwobacon': -0.12814064047778898, 'lagged_exit_velocity_avg': 0.0027684595649454906, 'lagged_launch_angle_avg': 0.00285548236905071, 'lagged_sweet_spot_percent': 0.00015712972032026746, 'lagged_barrel_batted_rate': -0.0006008666981693386, 'lagged_solidcontact_percent': -0.003223721620194792, 'lagged_flareburner_percent': -0.0035161701292285475, 'lagged_poorlyunder_percent': -0.0017204254371608275, 'lagged_poorlytopped_percent': -0.0044023620154284545, 'lagged_poorlyweak_percent': -0.004204835242971004, 'lagged_hard_hit_percent': 0.000314117847806470

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [46]:
## Random Forest Model

X = df[['lagged_xwoba', 'lagged_player_age', 'lagged_year', 'lagged_pa', 'lagged_xwobacon',
        'lagged_exit_velocity_avg', 'lagged_launch_angle_avg', 'lagged_sweet_spot_percent',
        'lagged_barrel_batted_rate', 'lagged_solidcontact_percent', 'lagged_flareburner_percent',
        'lagged_poorlyunder_percent', 'lagged_poorlytopped_percent', 'lagged_poorlyweak_percent',
        'lagged_hard_hit_percent', 'lagged_z_swing_percent', 'lagged_oz_swing_percent',
        'lagged_meatball_swing_percent', 'lagged_iz_contact_percent', 'lagged_pull_percent',
        'lagged_straightaway_percent', 'lagged_groundballs_percent', 'lagged_linedrives_percent',
        'lagged_popups_percent', 'lagged_sprint_speed']]
y = df['woba']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

rf_model.fit(X_train, y_train)

y_pred = rf_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)

print("\nRandom Forest Model Evaluation:")
print("Mean Squared Error (MSE):", mse)
print("R-squared (R2):", r2)
print("Root Mean Squared Error (RMSE):", rmse)


feature_importance = rf_model.feature_importances_
features = X.columns
print("\nFeature Importances:")
for feature, importance in sorted(zip(features, feature_importance), key=lambda x: x[1], reverse=True):
    print(f"{feature}: {importance:.4f}")

model_results["Model 10"] = {
    "R2": r2,
    "RMSE": rmse,
    "Intercept": model.intercept_,
    "Coefficients": dict(zip(X.columns, model.coef_)),
}

print("\nModel 10 Results Saved:")
print(f"R2: {model_results['Model 10']['R2']}")
print(f"RMSE: {model_results['Model 10']['RMSE']}")
print(f"Intercept: {model_results['Model 10']['Intercept']}")
print(f"Coefficients: {model_results['Model 10']['Coefficients']}")


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):



Random Forest Model Evaluation:
Mean Squared Error (MSE): 0.0008640152568181809
R-squared (R2): 0.28035497651638985
Root Mean Squared Error (RMSE): 0.029394136435999967

Feature Importances:
lagged_xwoba: 0.2980
lagged_exit_velocity_avg: 0.0560
lagged_poorlyweak_percent: 0.0484
lagged_iz_contact_percent: 0.0478
lagged_pa: 0.0475
lagged_xwobacon: 0.0411
lagged_linedrives_percent: 0.0376
lagged_hard_hit_percent: 0.0330
lagged_meatball_swing_percent: 0.0300
lagged_barrel_batted_rate: 0.0299
lagged_flareburner_percent: 0.0279
lagged_pull_percent: 0.0273
lagged_player_age: 0.0266
lagged_sprint_speed: 0.0250
lagged_poorlyunder_percent: 0.0249
lagged_oz_swing_percent: 0.0243
lagged_popups_percent: 0.0237
lagged_solidcontact_percent: 0.0236
lagged_z_swing_percent: 0.0236
lagged_sweet_spot_percent: 0.0228
lagged_straightaway_percent: 0.0224
lagged_poorlytopped_percent: 0.0186
lagged_groundballs_percent: 0.0153
lagged_launch_angle_avg: 0.0138
lagged_year: 0.0107

Model 10 Results Saved:
R2: 0.2

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [47]:
## XGBoost Model

X = df[['lagged_xwoba', 'lagged_player_age', 'lagged_year', 'lagged_pa', 'lagged_xwobacon',
        'lagged_exit_velocity_avg', 'lagged_launch_angle_avg', 'lagged_sweet_spot_percent',
        'lagged_barrel_batted_rate', 'lagged_solidcontact_percent', 'lagged_flareburner_percent',
        'lagged_poorlyunder_percent', 'lagged_poorlytopped_percent', 'lagged_poorlyweak_percent',
        'lagged_hard_hit_percent', 'lagged_z_swing_percent', 'lagged_oz_swing_percent',
        'lagged_meatball_swing_percent', 'lagged_iz_contact_percent', 'lagged_pull_percent',
        'lagged_straightaway_percent', 'lagged_groundballs_percent', 'lagged_linedrives_percent',
        'lagged_popups_percent', 'lagged_sprint_speed']]
y = df['woba']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

xgb_model = XGBRegressor(
    n_estimators=100,       
    learning_rate=0.1,    
    max_depth=3,            
    subsample=0.8,          
    colsample_bytree=0.8,   
    random_state=42
)

xgb_model.fit(X_train, y_train)

y_pred = xgb_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)

print("\nXGBoost Model Evaluation:")
print("Mean Squared Error (MSE):", mse)
print("R-squared (R2):", r2)
print("Root Mean Squared Error (RMSE):", rmse)

feature_importances = xgb_model.feature_importances_
features = X.columns
print("\nFeature Importances:")
for feature, importance in sorted(zip(features, feature_importances), key=lambda x: x[1], reverse=True):
    print(f"{feature}: {importance:.4f}")

model_results["Model 11"] = {
    "R2": r2,
    "RMSE": rmse,
    "Intercept": model.intercept_,
    "Coefficients": dict(zip(X.columns, model.coef_)),
}

print("\nModel 11 Results Saved:")
print(f"R2: {model_results['Model 11']['R2']}")
print(f"RMSE: {model_results['Model 11']['RMSE']}")
print(f"Intercept: {model_results['Model 11']['Intercept']}")
print(f"Coefficients: {model_results['Model 11']['Coefficients']}")




XGBoost Model Evaluation:
Mean Squared Error (MSE): 0.0009842124860792493
R-squared (R2): 0.18024176995938002
Root Mean Squared Error (RMSE): 0.03137216100429247

Feature Importances:
lagged_xwoba: 0.1183
lagged_exit_velocity_avg: 0.0798
lagged_iz_contact_percent: 0.0530
lagged_barrel_batted_rate: 0.0485
lagged_flareburner_percent: 0.0423
lagged_xwobacon: 0.0418
lagged_hard_hit_percent: 0.0401
lagged_linedrives_percent: 0.0396
lagged_year: 0.0384
lagged_poorlyweak_percent: 0.0384
lagged_solidcontact_percent: 0.0360
lagged_meatball_swing_percent: 0.0351
lagged_poorlytopped_percent: 0.0351
lagged_pa: 0.0333
lagged_groundballs_percent: 0.0328
lagged_straightaway_percent: 0.0315
lagged_z_swing_percent: 0.0310
lagged_poorlyunder_percent: 0.0309
lagged_player_age: 0.0306
lagged_popups_percent: 0.0288
lagged_pull_percent: 0.0288
lagged_launch_angle_avg: 0.0287
lagged_sprint_speed: 0.0274
lagged_oz_swing_percent: 0.0253
lagged_sweet_spot_percent: 0.0244

Model 11 Results Saved:
R2: 0.18024176

  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(data):
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [48]:
## Finding the best model thus far

print("All Model Results:")
for model_label, results in model_results.items():
    print(f"\n{model_label}:")
    print(f"  R2: {results['R2']:.4f}")
    print(f"  RMSE: {results['RMSE']:.4f}")
    print(f"  Intercept: {results['Intercept']:.4f}")
    print("  Coefficients:")
    for feature, coef in results["Coefficients"].items():
        print(f"    {feature}: {coef:.4f}")

## Model 2 has the highest R2 and lowest RMSE, so it is the best model thus far


All Model Results:

Model 1:
  R2: 0.3504
  RMSE: 0.0279
  Intercept: 0.1441
  Coefficients:
    lagged_xwoba: 0.5536

Model 2:
  R2: 0.3682
  RMSE: 0.0275
  Intercept: 0.1588
  Coefficients:
    lagged_xwoba: 0.5596
    lagged_player_age: -0.0006

Model 3:
  R2: 0.3667
  RMSE: 0.0276
  Intercept: -4.5480
  Coefficients:
    lagged_xwoba: 0.5577
    lagged_year: 0.0023
    lagged_player_age: -0.0006

Model 4:
  R2: 0.3439
  RMSE: 0.0281
  Intercept: -3.9337
  Coefficients:
    lagged_xwoba: 0.4524
    lagged_player_age: -0.0003
    lagged_year: 0.0019
    lagged_pa: 0.0000
    lagged_xwobacon: -0.1281
    lagged_exit_velocity_avg: 0.0028
    lagged_launch_angle_avg: 0.0029
    lagged_sweet_spot_percent: 0.0002
    lagged_barrel_batted_rate: -0.0006
    lagged_solidcontact_percent: -0.0032
    lagged_flareburner_percent: -0.0035
    lagged_poorlyunder_percent: -0.0017
    lagged_poorlytopped_percent: -0.0044
    lagged_poorlyweak_percent: -0.0042
    lagged_hard_hit_percent: 0.0003
    

In [49]:
## Predictions for 2025: Largest Improvement and Decline in wOBA using Model 2

## Rerun the model
X = df[['lagged_xwoba', 'lagged_player_age']]  
y = df['woba']  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = LinearRegression()
model.fit(X_train, y_train)
print("Intercept:", model.intercept_)
print("Coefficients:", model.coef_)
y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R-squared:", r2)

rmse = np.sqrt(mse)
print("Root Mean Squared Error (RMSE):", rmse)

df_original[['last_name', 'first_name']] = df_original['last_name, first_name'].str.split(', ', expand=True)

df_2024 = df_original[df_original['year'] == 2024].copy()

df_2024['lagged_xwoba'] = df_2024['xwoba']  
df_2024['lagged_player_age'] = df_2024['player_age']   

X_2025 = df_2024[['lagged_xwoba', 'lagged_player_age']]
df_2024['predicted_woba_2025'] = model.predict(X_2025)

df_2024['woba_difference'] = df_2024['predicted_woba_2025'] - df_2024['woba']

top_5_players = df_2024.nlargest(5, 'woba_difference')[
    ['last_name', 'first_name', 'player_age', 'woba', 'xwoba', 'predicted_woba_2025', 'woba_difference']
]

bot_5_players = df_2024.nsmallest(5, 'woba_difference')[
    ['last_name', 'first_name', 'player_age', 'woba', 'xwoba', 'predicted_woba_2025', 'woba_difference']
]

print("Top 5 Players with Largest Predicted Increase in wOBA (2025):")
print(top_5_players)

print("Bottom 5 Players with Largest Predicted Decrease in wOBA (2025):")
print(bot_5_players)



Intercept: 0.15877012410160668
Coefficients: [ 0.55962444 -0.0005986 ]
Mean Squared Error: 0.0007586052301976316
R-squared: 0.3681518070516593
Root Mean Squared Error (RMSE): 0.02754278907804421
Top 5 Players with Largest Predicted Increase in wOBA (2025):
    last_name   first_name  player_age   woba  xwoba  predicted_woba_2025  \
761    Garcia       Maikel          24  0.270  0.300             0.312291   
759    Bailey      Patrick          25  0.281  0.319             0.322325   
728     Morel  Christopher          25  0.280  0.316             0.320646   
665      Siri         Jose          28  0.271  0.297             0.308218   
691      Ruiz      Keibert          25  0.268  0.279             0.299940   

     woba_difference  
761         0.042291  
759         0.041325  
728         0.040646  
665         0.037218  
691         0.031940  
Bottom 5 Players with Largest Predicted Decrease in wOBA (2025):
    last_name first_name  player_age   woba  xwoba  predicted_woba_2025  \
62

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
