In [71]:
## This is the 2nd of 3 notebooks exploring xwOBA, its various components, and how they can be used to predict performance.

## This notebook is focused on BB% and models xBB% as a function of various plate discipline metrics.

## While xwOBACON is a descriptive stat, it can be used in a predictive fashion, often predicting future performance quite well. Its inclusion of exit velocity and launch angle helps its predictive power.

## On the other hand, xwOBA as a stat does not attempt to make BB% predictive; instead, it simply uses actual BB%.

## Main Research Question: Can we create xBB% using plate discipline metrics, and how well does it predict future BB%?

## Data Used: Baseball Savant CSV with various statistics from 2021 to 2024 and a PA threshold of 400. 

In [111]:
## Import Libraries

import numpy as np
import pandas as pd 
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from xgboost import XGBRegressor

In [112]:
## Load and Inspect Data

df_original = pd.read_csv('~/Desktop/savant.csv')
print(df_original.columns)
print(df_original.head())
print(df_original.columns.tolist())

Index(['last_name, first_name', 'player_id', 'year', 'player_age', 'ab', 'pa',
       'k_percent', 'bb_percent', 'babip', 'xba', 'xslg', 'woba', 'xwoba',
       'wobacon', 'xwobacon', 'exit_velocity_avg', 'launch_angle_avg',
       'sweet_spot_percent', 'barrel_batted_rate', 'solidcontact_percent',
       'flareburner_percent', 'poorlyunder_percent', 'poorlytopped_percent',
       'poorlyweak_percent', 'hard_hit_percent', 'avg_best_speed',
       'avg_hyper_speed', 'z_swing_percent', 'oz_swing_percent',
       'oz_swing_miss_percent', 'meatball_swing_percent', 'iz_contact_percent',
       'whiff_percent', 'swing_percent', 'pull_percent',
       'straightaway_percent', 'groundballs_percent', 'flyballs_percent',
       'linedrives_percent', 'popups_percent', 'sprint_speed'],
      dtype='object')
  last_name, first_name  player_id  year  player_age   ab   pa  k_percent  \
0       Cabrera, Miguel     408234  2021          38  472  526       22.4   
1        Molina, Yadier     425877  2021

In [113]:
## Creating Lagged Dataset

df = df_original.sort_values(by=['player_id', 'year'])
columns_to_lag = df.columns[df.columns.get_loc('player_id') + 1:] 
lagged_df = df.groupby('player_id')[columns_to_lag].shift(1).add_prefix('lagged_')
df = pd.concat([df, lagged_df], axis=1)
df = df.dropna()
print(df.columns.tolist())
print(df)


['last_name, first_name', 'player_id', 'year', 'player_age', 'ab', 'pa', 'k_percent', 'bb_percent', 'babip', 'xba', 'xslg', 'woba', 'xwoba', 'wobacon', 'xwobacon', 'exit_velocity_avg', 'launch_angle_avg', 'sweet_spot_percent', 'barrel_batted_rate', 'solidcontact_percent', 'flareburner_percent', 'poorlyunder_percent', 'poorlytopped_percent', 'poorlyweak_percent', 'hard_hit_percent', 'avg_best_speed', 'avg_hyper_speed', 'z_swing_percent', 'oz_swing_percent', 'oz_swing_miss_percent', 'meatball_swing_percent', 'iz_contact_percent', 'whiff_percent', 'swing_percent', 'pull_percent', 'straightaway_percent', 'groundballs_percent', 'flyballs_percent', 'linedrives_percent', 'popups_percent', 'sprint_speed', 'lagged_year', 'lagged_player_age', 'lagged_ab', 'lagged_pa', 'lagged_k_percent', 'lagged_bb_percent', 'lagged_babip', 'lagged_xba', 'lagged_xslg', 'lagged_woba', 'lagged_xwoba', 'lagged_wobacon', 'lagged_xwobacon', 'lagged_exit_velocity_avg', 'lagged_launch_angle_avg', 'lagged_sweet_spot_per

In [114]:
## Linear Regression: Last Year's BB% to Predict Next Year's BB%

model_results = {}

# Model 1
X = df[['lagged_bb_percent']]  
y = df['bb_percent']  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = LinearRegression()
model.fit(X_train, y_train)
print("Intercept:", model.intercept_)
print("Coefficients:", model.coef_)
y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R-squared:", r2)

rmse = np.sqrt(mse)
print("Root Mean Squared Error (RMSE):", rmse)

model_results["Model 1"] = {
    "R2": r2,
    "RMSE": rmse,
    "Intercept": model.intercept_,
    "Coefficients": dict(zip(X.columns, model.coef_)),
}

print("\nModel 1 Results Saved:")
print(f"R2: {model_results['Model 1']['R2']}")
print(f"RMSE: {model_results['Model 1']['RMSE']}")
print(f"Intercept: {model_results['Model 1']['Intercept']}")
print(f"Coefficients: {model_results['Model 1']['Coefficients']}")


Intercept: 2.3639447578618853
Coefficients: [0.71111456]
Mean Squared Error: 5.091403089831084
R-squared: 0.35307885280342644
Root Mean Squared Error (RMSE): 2.25641376742633

Model 1 Results Saved:
R2: 0.35307885280342644
RMSE: 2.25641376742633
Intercept: 2.3639447578618853
Coefficients: {'lagged_bb_percent': 0.7111145617876665}


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [115]:
## Linear Regression: Adding in Age

# Model 2

X = df[['lagged_bb_percent', 'lagged_player_age']]  
y = df['bb_percent']  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = LinearRegression()
model.fit(X_train, y_train)
print("Intercept:", model.intercept_)
print("Coefficients:", model.coef_)
y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R-squared:", r2)

rmse = np.sqrt(mse)
print("Root Mean Squared Error (RMSE):", rmse)

model_results["Model 2"] = {
    "R2": r2,
    "RMSE": rmse,
    "Intercept": model.intercept_,
    "Coefficients": dict(zip(X.columns, model.coef_)),
}

print("\nModel 2 Results Saved:")
print(f"R2: {model_results['Model 2']['R2']}")
print(f"RMSE: {model_results['Model 2']['RMSE']}")
print(f"Intercept: {model_results['Model 2']['Intercept']}")
print(f"Coefficients: {model_results['Model 2']['Coefficients']}")



Intercept: 2.7462298775964724
Coefficients: [ 0.71357973 -0.01453892]
Mean Squared Error: 5.07492678239251
R-squared: 0.35517235660221513
Root Mean Squared Error (RMSE): 2.2527598146257204

Model 2 Results Saved:
R2: 0.35517235660221513
RMSE: 2.2527598146257204
Intercept: 2.7462298775964724
Coefficients: {'lagged_bb_percent': 0.7135797300619657, 'lagged_player_age': -0.014538915817847534}


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [116]:
## Linear Regression: Adding in Year

# Model 3

X = df[['lagged_bb_percent', 'lagged_year', 'lagged_player_age']]  
y = df['bb_percent']  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = LinearRegression()
model.fit(X_train, y_train)
print("Intercept:", model.intercept_)
print("Coefficients:", model.coef_)
y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R-squared:", r2)

rmse = np.sqrt(mse)
print("Root Mean Squared Error (RMSE):", rmse)

model_results["Model 3"] = {
    "R2": r2,
    "RMSE": rmse,
    "Intercept": model.intercept_,
    "Coefficients": dict(zip(X.columns, model.coef_)),
}

print("\nModel 3 Results Saved:")
print(f"R2: {model_results['Model 3']['R2']}")
print(f"RMSE: {model_results['Model 3']['RMSE']}")
print(f"Intercept: {model_results['Model 3']['Intercept']}")
print(f"Coefficients: {model_results['Model 3']['Coefficients']}")


Intercept: -215.39192328435004
Coefficients: [ 0.71402749  0.10786902 -0.01357088]
Mean Squared Error: 5.091214655052115
R-squared: 0.3531027956029321
Root Mean Squared Error (RMSE): 2.2563720116709733

Model 3 Results Saved:
R2: 0.3531027956029321
RMSE: 2.2563720116709733
Intercept: -215.39192328435004
Coefficients: {'lagged_bb_percent': 0.7140274902798202, 'lagged_year': 0.10786902333552492, 'lagged_player_age': -0.01357088244013245}


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [117]:
## Linear Regression: All Selected Predictors
## Hypothesis that HH% is a signal for hitters that get pitched around 

# Model 4

X = df[['lagged_bb_percent', 'lagged_player_age', 'lagged_year', 'lagged_pa', 'lagged_hard_hit_percent', 'lagged_z_swing_percent', 'lagged_z_swing_percent', 'lagged_oz_swing_percent', 'lagged_oz_swing_miss_percent', 'lagged_meatball_swing_percent', 'lagged_iz_contact_percent']]  
y = df['bb_percent']  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = LinearRegression()
model.fit(X_train, y_train)
print("Intercept:", model.intercept_)
print("Coefficients:", model.coef_)
y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R-squared:", r2)

rmse = np.sqrt(mse)
print("Root Mean Squared Error (RMSE):", rmse)

model_results["Model 4"] = {
    "R2": r2,
    "RMSE": rmse,
    "Intercept": model.intercept_,
    "Coefficients": dict(zip(X.columns, model.coef_)),
}

print("\nModel 4 Results Saved:")
print(f"R2: {model_results['Model 4']['R2']}")
print(f"RMSE: {model_results['Model 4']['RMSE']}")
print(f"Intercept: {model_results['Model 4']['Intercept']}")
print(f"Coefficients: {model_results['Model 4']['Coefficients']}")



Intercept: -233.29779276475446
Coefficients: [ 0.483548   -0.0156657   0.11922611  0.00232121  0.0374682   0.00569477
  0.00569477 -0.13645079 -0.00649789 -0.00726859 -0.02283108]
Mean Squared Error: 4.927383547829594
R-squared: 0.37391941647558713
Root Mean Squared Error (RMSE): 2.219771057525887

Model 4 Results Saved:
R2: 0.37391941647558713
RMSE: 2.219771057525887
Intercept: -233.29779276475446
Coefficients: {'lagged_bb_percent': 0.48354799914368196, 'lagged_player_age': -0.01566570447784367, 'lagged_year': 0.11922611119210685, 'lagged_pa': 0.0023212071002631854, 'lagged_hard_hit_percent': 0.03746820271216754, 'lagged_z_swing_percent': 0.0056947654068871985, 'lagged_oz_swing_percent': -0.13645079186539655, 'lagged_oz_swing_miss_percent': -0.006497890050379642, 'lagged_meatball_swing_percent': -0.007268587800101391, 'lagged_iz_contact_percent': -0.022831083207025406}


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [118]:
## VIF Calculation to check for multicollinearity and find a better linear model 

# Model 5
def calculate_vif(df):
    vif_data = pd.DataFrame()
    vif_data['feature'] = df.columns
    vif_data['VIF'] = [variance_inflation_factor(df.values, i) for i in range(df.shape[1])]
    return vif_data

def reduce_vif(df, threshold=5.0):
    while True:
        vif_data = calculate_vif(df)
        max_vif = vif_data['VIF'].max()
        if max_vif > threshold:
            feature_to_remove = vif_data.sort_values('VIF', ascending=False).iloc[0]['feature']
            print(f'Removing {feature_to_remove} with VIF={max_vif}')
            df = df.drop(columns=[feature_to_remove])
        else:
            break
    return df

X_train_reduced = reduce_vif(X_train)

model_reduced = LinearRegression()
model_reduced.fit(X_train_reduced, y_train)

X_test_reduced = X_test[X_train_reduced.columns]
y_pred_reduced = model_reduced.predict(X_test_reduced)

print("Reduced Model Results:")
print("Intercept:", model_reduced.intercept_)
print("Coefficients:", dict(zip(X_train_reduced.columns, model_reduced.coef_)))

mse_reduced = mean_squared_error(y_test, y_pred_reduced)
r2_reduced = r2_score(y_test, y_pred_reduced)
rmse_reduced = np.sqrt(mse_reduced)

print("Mean Squared Error (Reduced):", mse_reduced)
print("R-squared (Reduced):", r2_reduced)
print("Root Mean Squared Error (RMSE) (Reduced):", rmse_reduced)

final_vif = calculate_vif(X_train_reduced)
print("Final VIF values:")
print(final_vif)

model_results["Model 5"] = {
    "R2": r2_reduced,
    "RMSE": rmse_reduced,
    "Intercept": model_reduced.intercept_,
    "Coefficients": dict(zip(X_train_reduced.columns, model_reduced.coef_)),
    "Final VIF": final_vif.to_dict(orient='records'), 
}

print("\nModel 5 Results Saved:")
print(f"R2: {model_results['Model 5']['R2']}")
print(f"RMSE: {model_results['Model 5']['RMSE']}")
print(f"Intercept: {model_results['Model 5']['Intercept']}")
print(f"Coefficients: {model_results['Model 5']['Coefficients']}")
print(f"Final VIF values: {model_results['Model 5']['Final VIF']}")



Removing lagged_z_swing_percent with VIF=inf
Removing lagged_year with VIF=1402.0691006251905
Removing lagged_meatball_swing_percent with VIF=150.23064621300756
Removing lagged_iz_contact_percent with VIF=114.3698786045578
Removing lagged_player_age with VIF=54.62171566888954
Removing lagged_hard_hit_percent with VIF=50.88488067379316
Removing lagged_pa with VIF=33.933700432325786
Removing lagged_oz_swing_miss_percent with VIF=25.003545863705018
Reduced Model Results:
Intercept: 6.484392417280757
Coefficients: {'lagged_bb_percent': 0.5622407708052628, 'lagged_oz_swing_percent': -0.09863692149546227}
Mean Squared Error (Reduced): 5.062117540291651
R-squared (Reduced): 0.35679991769851815
Root Mean Squared Error (RMSE) (Reduced): 2.2499150073484224
Final VIF values:
                   feature       VIF
0        lagged_bb_percent  4.522927
1  lagged_oz_swing_percent  4.522927

Model 5 Results Saved:
R2: 0.35679991769851815
RMSE: 2.2499150073484224
Intercept: 6.484392417280757
Coefficients

  vif = 1. / (1. - r_squared_i)
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [119]:
# Stepwise Regression

def stepwise_selection(X, y, initial_features=[], threshold_in=0.05, threshold_out=0.05, verbose=True):
    included = list(initial_features)
    while True:
        changed = False
        
        excluded = list(set(X.columns) - set(included))
        new_pvals = pd.Series(index=excluded, dtype=np.float64)
        for new_column in excluded:
            try:
                model = sm.OLS(y, sm.add_constant(X[included + [new_column]])).fit()
                new_pvals.loc[new_column] = model.pvalues.get(new_column, float('nan'))
            except Exception as e:
                print(f"Error adding {new_column}: {e}")
        
        best_pval = new_pvals.min()
        if best_pval < threshold_in:
            best_feature = new_pvals.idxmin()
            included.append(best_feature)
            changed = True
            if verbose:
                print(f"Adding feature {best_feature} with p-value {best_pval}")
        
        try:
            model = sm.OLS(y, sm.add_constant(X[included])).fit()
            pvals = model.pvalues.iloc[1:]  
        except Exception as e:
            print(f"Error in backward step: {e}")
            break
        
        worst_pval = pvals.max()
        if worst_pval > threshold_out:
            worst_feature = pvals.idxmax()
            included.remove(worst_feature)
            changed = True
            if verbose:
                print(f"Removing feature {worst_feature} with p-value {worst_pval}")
        
        if not changed:
            break
    
    return included

selected_features = stepwise_selection(X_train, y_train)
print("Selected Features:", selected_features)

X_train_stepwise = X_train[selected_features]
X_test_stepwise = X_test[selected_features]

model_stepwise = LinearRegression()
model_stepwise.fit(X_train_stepwise, y_train)

y_pred_stepwise = model_stepwise.predict(X_test_stepwise)

mse_stepwise = mean_squared_error(y_test, y_pred_stepwise)
r2_stepwise = r2_score(y_test, y_pred_stepwise)
rmse_stepwise = np.sqrt(mse_stepwise)

print("\nStepwise Model Results:")
print("Intercept:", model_stepwise.intercept_)
print("Coefficients:", dict(zip(X_train_stepwise.columns, model_stepwise.coef_)))
print("Mean Squared Error (Stepwise):", mse_stepwise)
print("R-squared (Stepwise):", r2_stepwise)
print("Root Mean Squared Error (RMSE) (Stepwise):", rmse_stepwise)

model_results["Model 6"] = {
    "R2": r2_stepwise,
    "RMSE": rmse_stepwise,
    "Intercept": model_stepwise.intercept_,
    "Coefficients": dict(zip(X_train_stepwise.columns, model_stepwise.coef_)),
    "Selected Features": selected_features,
}

print("\nModel 6 Results Saved:")
print(f"R2: {model_results['Model 6']['R2']}")
print(f"RMSE: {model_results['Model 6']['RMSE']}")
print(f"Intercept: {model_results['Model 6']['Intercept']}")
print(f"Coefficients: {model_results['Model 6']['Coefficients']}")
print(f"Selected Features: {model_results['Model 6']['Selected Features']}")


Error adding lagged_z_swing_percent: cannot reindex on an axis with duplicate labels
Adding feature lagged_bb_percent with p-value 2.2647124221303622e-59
Error adding lagged_z_swing_percent: cannot reindex on an axis with duplicate labels
Adding feature lagged_oz_swing_percent with p-value 0.00031033377843364424
Error adding lagged_z_swing_percent: cannot reindex on an axis with duplicate labels
Adding feature lagged_hard_hit_percent with p-value 0.005078487912547929
Error adding lagged_z_swing_percent: cannot reindex on an axis with duplicate labels
Adding feature lagged_pa with p-value 0.04754933840122885
Error adding lagged_z_swing_percent: cannot reindex on an axis with duplicate labels
Selected Features: ['lagged_bb_percent', 'lagged_oz_swing_percent', 'lagged_hard_hit_percent', 'lagged_pa']

Stepwise Model Results:
Intercept: 4.8623738718458185
Coefficients: {'lagged_bb_percent': 0.49045629544152136, 'lagged_oz_swing_percent': -0.12538323051181297, 'lagged_hard_hit_percent': 0.04

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [120]:
## Lasso Regression

# Model 7

X = df[['lagged_bb_percent', 'lagged_player_age', 'lagged_year', 'lagged_pa', 'lagged_hard_hit_percent', 'lagged_z_swing_percent', 'lagged_z_swing_percent', 'lagged_oz_swing_percent', 'lagged_oz_swing_miss_percent', 'lagged_meatball_swing_percent', 'lagged_iz_contact_percent']]  
y = df['bb_percent']  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

lasso_model = Lasso(alpha=0.01)  

lasso_model.fit(X_train, y_train)

print("Intercept:", lasso_model.intercept_)
print("Coefficients:", lasso_model.coef_)

y_pred = lasso_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)

print("Mean Squared Error (MSE):", mse)
print("R-squared (R2):", r2)
print("Root Mean Squared Error (RMSE):", rmse)

model_results["Model 7"] = {
    "R2": r2,
    "RMSE": rmse,
    "Intercept": model.intercept_,
    "Coefficients": dict(zip(X.columns, model.coef_)),
}

print("\nModel 7 Results Saved:")
print(f"R2: {model_results['Model 7']['R2']}")
print(f"RMSE: {model_results['Model 7']['RMSE']}")
print(f"Intercept: {model_results['Model 7']['Intercept']}")
print(f"Coefficients: {model_results['Model 7']['Coefficients']}")




Intercept: -200.73156829490262
Coefficients: [ 0.48212113 -0.01468189  0.10307111  0.00234485  0.03751512  0.00830278
  0.         -0.13519682 -0.00587968 -0.00553127 -0.02182633]
Mean Squared Error (MSE): 4.931271721347501
R-squared (R2): 0.3734253794432949
Root Mean Squared Error (RMSE): 2.220646689896324

Model 7 Results Saved:
R2: 0.3734253794432949
RMSE: 2.220646689896324
Intercept: -233.29779276475446
Coefficients: {'lagged_bb_percent': 0.48354799914368196, 'lagged_player_age': -0.01566570447784367, 'lagged_year': 0.11922611119210685, 'lagged_pa': 0.0023212071002631854, 'lagged_hard_hit_percent': 0.03746820271216754, 'lagged_z_swing_percent': 0.0056947654068871985, 'lagged_oz_swing_percent': -0.13645079186539655, 'lagged_oz_swing_miss_percent': -0.006497890050379642, 'lagged_meatball_swing_percent': -0.007268587800101391, 'lagged_iz_contact_percent': -0.022831083207025406}


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [121]:
## Ridge Regression 

# Model 8 

X = df[['lagged_bb_percent', 'lagged_player_age', 'lagged_year', 'lagged_pa', 'lagged_hard_hit_percent', 'lagged_z_swing_percent', 'lagged_z_swing_percent', 'lagged_oz_swing_percent', 'lagged_oz_swing_miss_percent', 'lagged_meatball_swing_percent', 'lagged_iz_contact_percent']]  
y = df['bb_percent']  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

ridge_model = Ridge(alpha=1.0)  

ridge_model.fit(X_train, y_train)

print("Intercept:", ridge_model.intercept_)
print("Coefficients:", ridge_model.coef_)

y_pred = ridge_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)

print("Mean Squared Error (MSE):", mse)
print("R-squared (R2):", r2)
print("Root Mean Squared Error (RMSE):", rmse)

model_results["Model 8"] = {
    "R2": r2,
    "RMSE": rmse,
    "Intercept": model.intercept_,
    "Coefficients": dict(zip(X.columns, model.coef_)),
}

print("\nModel 8 Results Saved:")
print(f"R2: {model_results['Model 8']['R2']}")
print(f"RMSE: {model_results['Model 8']['RMSE']}")
print(f"Intercept: {model_results['Model 8']['Intercept']}")
print(f"Coefficients: {model_results['Model 8']['Coefficients']}")


Intercept: -232.22039089131687
Coefficients: [ 0.48313469 -0.01564132  0.11869775  0.00232229  0.03750417  0.00567893
  0.00567893 -0.13657557 -0.0064917  -0.00727144 -0.02286108]
Mean Squared Error (MSE): 4.927438470454066
R-squared (R2): 0.3739124379263339
Root Mean Squared Error (RMSE): 2.2197834287276916

Model 8 Results Saved:
R2: 0.3739124379263339
RMSE: 2.2197834287276916
Intercept: -233.29779276475446
Coefficients: {'lagged_bb_percent': 0.48354799914368196, 'lagged_player_age': -0.01566570447784367, 'lagged_year': 0.11922611119210685, 'lagged_pa': 0.0023212071002631854, 'lagged_hard_hit_percent': 0.03746820271216754, 'lagged_z_swing_percent': 0.0056947654068871985, 'lagged_oz_swing_percent': -0.13645079186539655, 'lagged_oz_swing_miss_percent': -0.006497890050379642, 'lagged_meatball_swing_percent': -0.007268587800101391, 'lagged_iz_contact_percent': -0.022831083207025406}


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [122]:
## Predicting the mean for next year's BB% to see if the above models add value

#  Model 9

mean_bb_per = y_train.mean()

y_pred_mean = np.full_like(y_test, fill_value=mean_bb_per, dtype=np.float64)

mse_mean = mean_squared_error(y_test, y_pred_mean)
r2_mean = r2_score(y_test, y_pred_mean)
rmse_mean = np.sqrt(mse_mean)

print("Mean-Based Model Metrics:")
print("Mean Squared Error (MSE):", mse_mean)
print("R-squared (R2):", r2_mean)
print("Root Mean Squared Error (RMSE):", rmse_mean)

model_results["Model 9"] = {
    "R2": r2,
    "RMSE": rmse,
    "Intercept": model.intercept_,
    "Coefficients": dict(zip(X.columns, model.coef_)),
}

print("\nModel 9 Results Saved:")
print(f"R2: {model_results['Model 9']['R2']}")
print(f"RMSE: {model_results['Model 9']['RMSE']}")
print(f"Intercept: {model_results['Model 9']['Intercept']}")
print(f"Coefficients: {model_results['Model 9']['Coefficients']}")



Mean-Based Model Metrics:
Mean Squared Error (MSE): 7.912289505102897
R-squared (R2): -0.005347114200379366
Root Mean Squared Error (RMSE): 2.8128792197858226

Model 9 Results Saved:
R2: 0.3739124379263339
RMSE: 2.2197834287276916
Intercept: -233.29779276475446
Coefficients: {'lagged_bb_percent': 0.48354799914368196, 'lagged_player_age': -0.01566570447784367, 'lagged_year': 0.11922611119210685, 'lagged_pa': 0.0023212071002631854, 'lagged_hard_hit_percent': 0.03746820271216754, 'lagged_z_swing_percent': 0.0056947654068871985, 'lagged_oz_swing_percent': -0.13645079186539655, 'lagged_oz_swing_miss_percent': -0.006497890050379642, 'lagged_meatball_swing_percent': -0.007268587800101391, 'lagged_iz_contact_percent': -0.022831083207025406}


  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [123]:
## Random Forest

# Model 10

X = df[['lagged_bb_percent', 'lagged_player_age', 'lagged_year', 'lagged_pa', 'lagged_hard_hit_percent', 'lagged_z_swing_percent', 'lagged_z_swing_percent', 'lagged_oz_swing_percent', 'lagged_oz_swing_miss_percent', 'lagged_meatball_swing_percent', 'lagged_iz_contact_percent']]  
y = df['bb_percent']  

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

rf_model.fit(X_train, y_train)

y_pred = rf_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)

print("\nRandom Forest Model Evaluation:")
print("Mean Squared Error (MSE):", mse)
print("R-squared (R2):", r2)
print("Root Mean Squared Error (RMSE):", rmse)

feature_importance = rf_model.feature_importances_
features = X.columns
print("\nFeature Importances:")
for feature, importance in sorted(zip(features, feature_importance), key=lambda x: x[1], reverse=True):
    print(f"{feature}: {importance:.4f}")

model_results["Model 10"] = {
    "R2": r2,
    "RMSE": rmse,
    "Intercept": model.intercept_,
    "Coefficients": dict(zip(X.columns, model.coef_)),
}

print("\nModel 10 Results Saved:")
print(f"R2: {model_results['Model 10']['R2']}")
print(f"RMSE: {model_results['Model 10']['RMSE']}")
print(f"Intercept: {model_results['Model 10']['Intercept']}")
print(f"Coefficients: {model_results['Model 10']['Coefficients']}")





Random Forest Model Evaluation:
Mean Squared Error (MSE): 5.5566379545454545
R-squared (R2): 0.29396542825490013
Root Mean Squared Error (RMSE): 2.3572522042720534

Feature Importances:
lagged_bb_percent: 0.5462
lagged_oz_swing_percent: 0.1098
lagged_hard_hit_percent: 0.0815
lagged_pa: 0.0597
lagged_iz_contact_percent: 0.0462
lagged_oz_swing_miss_percent: 0.0422
lagged_meatball_swing_percent: 0.0362
lagged_player_age: 0.0292
lagged_z_swing_percent: 0.0187
lagged_z_swing_percent: 0.0178
lagged_year: 0.0123

Model 10 Results Saved:
R2: 0.29396542825490013
RMSE: 2.3572522042720534
Intercept: -233.29779276475446
Coefficients: {'lagged_bb_percent': 0.48354799914368196, 'lagged_player_age': -0.01566570447784367, 'lagged_year': 0.11922611119210685, 'lagged_pa': 0.0023212071002631854, 'lagged_hard_hit_percent': 0.03746820271216754, 'lagged_z_swing_percent': 0.0056947654068871985, 'lagged_oz_swing_percent': -0.13645079186539655, 'lagged_oz_swing_miss_percent': -0.006497890050379642, 'lagged_me

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [124]:
## XGBoost

# Model 11

X = df[['lagged_bb_percent', 'lagged_player_age', 'lagged_year', 'lagged_pa', 'lagged_hard_hit_percent', 'lagged_z_swing_percent', 'lagged_oz_swing_percent', 'lagged_oz_swing_miss_percent', 'lagged_meatball_swing_percent', 'lagged_iz_contact_percent']]  
y = df['bb_percent'] 

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

xgb_model = XGBRegressor(
    n_estimators=100,       
    learning_rate=0.1,      
    max_depth=3,            
    subsample=0.8,         
    colsample_bytree=0.8,   
    random_state=42
)

xgb_model.fit(X_train, y_train)

y_pred = xgb_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)

print("\nXGBoost Model Evaluation:")
print("Mean Squared Error (MSE):", mse)
print("R-squared (R2):", r2)
print("Root Mean Squared Error (RMSE):", rmse)

feature_importances = xgb_model.feature_importances_
features = X.columns
print("\nFeature Importances:")
for feature, importance in sorted(zip(features, feature_importances), key=lambda x: x[1], reverse=True):
    print(f"{feature}: {importance:.4f}")

model_results["Model 11"] = {
    "R2": r2,
    "RMSE": rmse,
    "Intercept": model.intercept_,
    "Coefficients": dict(zip(X.columns, model.coef_)),
}

print("\nModel 11 Results Saved:")
print(f"R2: {model_results['Model 11']['R2']}")
print(f"RMSE: {model_results['Model 11']['RMSE']}")
print(f"Intercept: {model_results['Model 11']['Intercept']}")
print(f"Coefficients: {model_results['Model 11']['Coefficients']}")



XGBoost Model Evaluation:
Mean Squared Error (MSE): 5.654403779421897
R-squared (R2): 0.2815431590945564
Root Mean Squared Error (RMSE): 2.3778990263301547

Feature Importances:
lagged_bb_percent: 0.3352
lagged_oz_swing_percent: 0.1858
lagged_z_swing_percent: 0.0821
lagged_hard_hit_percent: 0.0726
lagged_pa: 0.0631
lagged_meatball_swing_percent: 0.0613
lagged_iz_contact_percent: 0.0589
lagged_oz_swing_miss_percent: 0.0588
lagged_year: 0.0461
lagged_player_age: 0.0362

Model 11 Results Saved:
R2: 0.2815431590945564
RMSE: 2.3778990263301547
Intercept: -233.29779276475446
Coefficients: {'lagged_bb_percent': 0.48354799914368196, 'lagged_player_age': -0.01566570447784367, 'lagged_year': 0.11922611119210685, 'lagged_pa': 0.0023212071002631854, 'lagged_hard_hit_percent': 0.03746820271216754, 'lagged_z_swing_percent': 0.0056947654068874275, 'lagged_oz_swing_percent': 0.0056947654068871985, 'lagged_oz_swing_miss_percent': -0.13645079186539655, 'lagged_meatball_swing_percent': -0.00649789005037

  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(data):
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [125]:
## Finding the best model thus far

print("All Model Results:")
for model_label, results in model_results.items():
    print(f"\n{model_label}:")
    print(f"  R2: {results['R2']:.4f}")
    print(f"  RMSE: {results['RMSE']:.4f}")
    print(f"  Intercept: {results['Intercept']:.4f}")
    print("  Coefficients:")
    for feature, coef in results["Coefficients"].items():
        print(f"    {feature}: {coef:.4f}")


All Model Results:

Model 1:
  R2: 0.3531
  RMSE: 2.2564
  Intercept: 2.3639
  Coefficients:
    lagged_bb_percent: 0.7111

Model 2:
  R2: 0.3552
  RMSE: 2.2528
  Intercept: 2.7462
  Coefficients:
    lagged_bb_percent: 0.7136
    lagged_player_age: -0.0145

Model 3:
  R2: 0.3531
  RMSE: 2.2564
  Intercept: -215.3919
  Coefficients:
    lagged_bb_percent: 0.7140
    lagged_year: 0.1079
    lagged_player_age: -0.0136

Model 4:
  R2: 0.3739
  RMSE: 2.2198
  Intercept: -233.2978
  Coefficients:
    lagged_bb_percent: 0.4835
    lagged_player_age: -0.0157
    lagged_year: 0.1192
    lagged_pa: 0.0023
    lagged_hard_hit_percent: 0.0375
    lagged_z_swing_percent: 0.0057
    lagged_oz_swing_percent: -0.1365
    lagged_oz_swing_miss_percent: -0.0065
    lagged_meatball_swing_percent: -0.0073
    lagged_iz_contact_percent: -0.0228

Model 5:
  R2: 0.3568
  RMSE: 2.2499
  Intercept: 6.4844
  Coefficients:
    lagged_bb_percent: 0.5622
    lagged_oz_swing_percent: -0.0986

Model 6:
  R2: 0.3614


In [126]:
## Predictions for 2025: Largest Improvement and Decline in BB% using Model 8

X = df[['lagged_bb_percent', 'lagged_player_age', 'lagged_year', 'lagged_pa', 
        'lagged_hard_hit_percent', 'lagged_z_swing_percent', 'lagged_oz_swing_percent', 
        'lagged_oz_swing_miss_percent', 'lagged_meatball_swing_percent', 'lagged_iz_contact_percent']]
y = df['bb_percent']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

ridge_model = Ridge(alpha=1.0)
ridge_model.fit(X_train, y_train)

print("Intercept:", ridge_model.intercept_)
print("Coefficients:", ridge_model.coef_)

y_pred = ridge_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)

print("Mean Squared Error (MSE):", mse)
print("R-squared (R2):", r2)
print("Root Mean Squared Error (RMSE):", rmse)

df_original[['last_name', 'first_name']] = df_original['last_name, first_name'].str.split(', ', expand=True)

df_2024 = df_original[df_original['year'] == 2024].copy()

df_2024['lagged_bb_percent'] = df_2024['bb_percent']
df_2024['lagged_player_age'] = df_2024['player_age']
df_2024['lagged_year'] = df_2024['year']
df_2024['lagged_pa'] = df_2024['pa']
df_2024['lagged_hard_hit_percent'] = df_2024['hard_hit_percent']
df_2024['lagged_z_swing_percent'] = df_2024['z_swing_percent']
df_2024['lagged_oz_swing_percent'] = df_2024['oz_swing_percent']
df_2024['lagged_oz_swing_miss_percent'] = df_2024['oz_swing_miss_percent']
df_2024['lagged_meatball_swing_percent'] = df_2024['meatball_swing_percent']
df_2024['lagged_iz_contact_percent'] = df_2024['iz_contact_percent']

X_2025 = df_2024[X_train.columns]

df_2024['predicted_bb_percent_2025'] = ridge_model.predict(X_2025)

df_2024['bb_percent_difference'] = df_2024['predicted_bb_percent_2025'] - df_2024['lagged_bb_percent']

top_5_players = df_2024.nlargest(5, 'bb_percent_difference')[
    ['last_name', 'first_name', 'player_age', 'bb_percent', 
     'predicted_bb_percent_2025', 'bb_percent_difference']
]

bot_5_players = df_2024.nsmallest(5, 'bb_percent_difference')[
    ['last_name', 'first_name', 'player_age', 'bb_percent', 
     'predicted_bb_percent_2025', 'bb_percent_difference']
]

print("Top 5 Players with Largest Predicted Increase in BB% (2025):")
print(top_5_players)

print("Bottom 5 Players with Largest Predicted Decrease in BB% (2025):")
print(bot_5_players)


Intercept: -232.21995391976785
Coefficients: [ 0.48313468 -0.01564122  0.11869754  0.0023223   0.03750421  0.01135549
 -0.13657466 -0.0064917  -0.00727003 -0.02286103]
Mean Squared Error (MSE): 4.927439381609083
R-squared (R2): 0.37391232215364056
Root Mean Squared Error (RMSE): 2.2197836339627974
Top 5 Players with Largest Predicted Increase in BB% (2025):
        last_name first_name  player_age  bb_percent  \
761        Garcia     Maikel          24         6.7   
669  Kiner-Falefa      Isiah          29         3.2   
678    De La Cruz      Bryan          27         4.7   
768      Westburg     Jordan          25         4.9   
609          Pham      Tommy          36         7.3   

     predicted_bb_percent_2025  bb_percent_difference  
761                   9.001140               2.301140  
669                   5.226222               2.026222  
678                   6.574978               1.874978  
768                   6.706838               1.806838  
609                   9

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
