## Setup

In [2]:
import pandas as pd

In [70]:
baseball = pd.read_csv('data/baseball.csv')
baseball = baseball.drop(['Name', 'Age', 'Name-additional'], axis = 1)
baseball['Salary'] = baseball['Salary'].str.replace('$', '').astype(float)

baseball['C'] = baseball['Position'].apply(lambda x: 1 if 'C' in x else 0)
baseball['1B'] = baseball['Position'].apply(lambda x: 1 if '1B' in x else 0)
baseball['2B'] = baseball['Position'].apply(lambda x: 1 if '2B' in x else 0)
baseball['3B'] = baseball['Position'].apply(lambda x: 1 if '3B' in x else 0)
baseball['SS'] = baseball['Position'].apply(lambda x: 1 if 'SS' in x else 0)
baseball['OF'] = baseball['Position'].apply(lambda x: 1 if 'OF' in x else 0)

baseball['Num_Pos'] = baseball[['C', '1B', '2B', '3B', 'SS', 'OF']].sum(axis = 1)
baseball = baseball.drop(['Position'], axis = 1)

In [4]:
# packages used in each section below
from sklearn.feature_selection import RFE
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
import numpy as np

import shap
from sklearn.decomposition import PCA

Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)


In [71]:
X = baseball.drop(['Salary'], axis = 1)
y = baseball['Salary']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .25, random_state = 621)

In [6]:
cat_columns = ['Tm', 'Lg', 'Acquired', 'Bat']
num_columns = [col for col in X.columns if col not in cat_columns + ['C', '1B', '2B', '3B', 'SS', 'OF']]

cat_transformer = Pipeline(
    steps = [
        ('onehot', OneHotEncoder(handle_unknown = 'ignore'))
    ]
)

num_transformer = Pipeline(
    steps = [
        ('scale', StandardScaler())
    ]
)

preprocessor = ColumnTransformer(
    transformers = [
        ('cont', num_transformer, num_columns),
        ('cat', cat_transformer, cat_columns)
    ], remainder = 'passthrough'
)

X_transform = preprocessor.fit_transform(X)

selected_features = np.concatenate([
    np.array(num_columns),
    np.array(preprocessor.transformers_[1][1]['onehot'].get_feature_names_out(cat_columns)),
    np.array(['C', '1B', '2B', '3B', 'SS', 'OF'])
])


## Random Forest

In [7]:
from sklearn.ensemble import RandomForestRegressor

In [8]:
rfe_rf = RFE(estimator = RandomForestRegressor(random_state = 621), step = 1, n_features_to_select = None)
rfe_rf.fit(X_transform, y)

In [9]:
selected_features[rfe_rf.support_]

array(['Def-Inn', 'PO', 'A', 'E', 'DP', 'Fld%', 'Rdrs', 'Season', 'RAA',
       'WAA', 'RAR', 'WAR', 'PA', 'AB', 'R', 'H', 'HR', 'RBI', 'SB', 'CS',
       'BB', 'SO', 'BA', 'OBP', 'SLG', 'OPS', 'OPS+', 'TB', 'GDP', 'HBP',
       'SH', 'SF', 'IBB', 'Num_Pos', 'Tm_LAA', 'Tm_LAD', 'Tm_MULTIPLE',
       'Tm_NYM', 'Tm_NYY', 'Tm_SFG', 'Tm_STL', 'Acquired_Free Agency',
       'Acquired_Traded'], dtype=object)

In [10]:
red_num_columns = ['Def-Inn', 'PO', 'A', 'E', 'DP', 'Fld%', 'Rdrs', 'RAA', 'WAA', 'RAR',
               'WAR', 'PA', 'AB', 'R', 'H', 'HR', 'RBI', 'SB', 'CS', 'BB', 'SO', 'BA', 'OBP',
               'SLG', 'OPS', 'OPS+', 'TB', 'GDP', 'HBP', 'SH', 'SF', 'IBB', 'Num_Pos', 'Season']
red_cat_columns = ['Tm', 'Acquired']
X_rf = X[red_num_columns + red_cat_columns]

In [11]:
X_rf_red_train, X_rf_red_test, y_rf_red_train, y_rf_red_test = train_test_split(X_rf, y, test_size = .25, random_state = 621)

In [12]:
red_preprocessor = ColumnTransformer(
    transformers = [
        ('cont', num_transformer, red_num_columns),
        ('cat', cat_transformer, red_cat_columns)
    ], remainder = 'passthrough'
)

rf_red_pipe = Pipeline(
    steps = [
        ('preprocessor', red_preprocessor),
        ('model', RandomForestRegressor(n_estimators = 150, min_samples_leaf = 10))
    ]
)

In [13]:
rf_red_pipe.fit(X_rf_red_train, y_rf_red_train)
rf_red_train_mse = mean_squared_error(y_rf_red_train, rf_red_pipe.predict(X_rf_red_train))
rf_red_test_mse = mean_squared_error(y_rf_red_test, rf_red_pipe.predict(X_rf_red_test))
print('Reduced Random Forest Metrics:')
print(f'Training RMSE: {np.sqrt(rf_red_train_mse)}')
print(f'Test RMSE: {np.sqrt(rf_red_test_mse)}')

Reduced Random Forest Metrics:
Training RMSE: 4000195.7570256125
Test RMSE: 5380843.936012433


In [14]:
rf_pipe = Pipeline(
    steps = [
        ('preprocessor', preprocessor),
        ('model', RandomForestRegressor(n_estimators = 150, min_samples_leaf = 10))
    ]
)

In [15]:
rf_pipe.fit(X_train, y_train)
rf_train_mse = mean_squared_error(y_train, rf_pipe.predict(X_train))
rf_test_mse = mean_squared_error(y_test, rf_pipe.predict(X_test))
print('Random Forest Metrics:')
print(f'Training RMSE: {np.sqrt(rf_train_mse)}')
print(f'Test RMSE: {np.sqrt(rf_test_mse)}')

Random Forest Metrics:
Training RMSE: 3985306.945110512
Test RMSE: 5387223.611190923


## XGBoost 

In [16]:
from xgboost import XGBRegressor

In [17]:
rfe_xg = RFE(estimator = XGBRegressor(random_state = 621), step = 1, n_features_to_select = None)
rfe_xg.fit(X_transform, y)

In [18]:
selected_features[rfe_xg.support_]

array(['Season', 'RBI', 'CS', 'BB', 'SO', 'BA', 'OBP', 'SLG', 'OPS+',
       'TB', 'GDP', 'HBP', 'SH', 'SF', 'IBB', 'Num_Pos', 'Tm_ARI',
       'Tm_BOS', 'Tm_CHC', 'Tm_CIN', 'Tm_DET', 'Tm_HOU', 'Tm_KCR',
       'Tm_LAA', 'Tm_LAD', 'Tm_MIA', 'Tm_MIN', 'Tm_MULTIPLE', 'Tm_NYM',
       'Tm_NYY', 'Tm_OAK', 'Tm_PHI', 'Tm_STL', 'Tm_TBR', 'Tm_TEX',
       'Acquired_Amateur Draft', 'Acquired_Amateur Free Agent',
       'Acquired_Free Agency', 'Acquired_Traded', 'Bat_L', 'Bat_R', '1B',
       '2B'], dtype=object)

In [19]:
red_num_columns = ['Season', 'RBI', 'CS', 'BB', 'SO', 'BA', 'OBP', 'SLG', 'OPS+', 
                   'TB', 'GDP', 'HBP', 'SH', 'SF', 'IBB', 'Num_Pos']
red_cat_columns = ['Tm', 'Acquired', 'Bat', 'C', '1B', '2B', '3B', 'SS', 'OF']
X_xg = X[red_num_columns + red_cat_columns]

In [20]:
X_xg_red_train, X_xg_red_test, y_xg_red_train, y_xg_red_test = train_test_split(X_xg, y, test_size = .25, random_state = 621)

In [21]:
red_preprocessor = ColumnTransformer(
    transformers = [
        ('cont', num_transformer, red_num_columns),
        ('cat', cat_transformer, red_cat_columns)
    ], remainder = 'passthrough'
)

xg_red_pipe = Pipeline(
    steps = [
        ('preprocessor', red_preprocessor),
        ('model', XGBRegressor())
    ]
)

In [22]:
xg_red_pipe.fit(X_xg_red_train, y_xg_red_train)
xg_red_train_mse = mean_squared_error(y_xg_red_train, xg_red_pipe.predict(X_xg_red_train))
xg_red_test_mse = mean_squared_error(y_xg_red_test, xg_red_pipe.predict(X_xg_red_test))
print('Reduced Random Forest Metrics:')
print(f'Training RMSE: {np.sqrt(xg_red_train_mse)}')
print(f'Test RMSE: {np.sqrt(xg_red_test_mse)}')

Reduced Random Forest Metrics:
Training RMSE: 1304272.8356999753
Test RMSE: 5238201.958554674


In [23]:
xg_pipe = Pipeline(
    steps = [
        ('preprocessor', preprocessor),
        ('model', XGBRegressor())
    ]
)

In [24]:
xg_pipe.fit(X_train, y_train)
xg_train_mse = mean_squared_error(y_train, xg_pipe.predict(X_train))
xg_test_mse = mean_squared_error(y_test, xg_pipe.predict(X_test))
print('Random Forest Metrics:')
print(f'Training RMSE: {np.sqrt(xg_train_mse)}')
print(f'Test RMSE: {np.sqrt(xg_test_mse)}')

Random Forest Metrics:
Training RMSE: 996000.2897536749
Test RMSE: 5174242.088274057


## Gradient Boosting

In [25]:
from sklearn.ensemble import GradientBoostingRegressor

In [26]:
rfe_gb = RFE(estimator = GradientBoostingRegressor(random_state = 621), step = 1, n_features_to_select = None)
rfe_gb.fit(X_transform, y)

In [27]:
selected_features[rfe_gb.support_]

array(['Def-Inn', 'PO', 'A', 'E', 'DP', 'Fld%', 'Rdrs', 'Season', 'RAA',
       'WAA', 'RAR', 'WAR', 'PA', 'AB', 'R', 'H', 'HR', 'RBI', 'SB', 'CS',
       'BB', 'SO', 'BA', 'OBP', 'SLG', 'OPS', 'OPS+', 'TB', 'GDP', 'SH',
       'SF', 'IBB', 'Num_Pos', 'Tm_LAA', 'Tm_LAD', 'Tm_MIN',
       'Tm_MULTIPLE', 'Tm_NYY', 'Tm_OAK', 'Tm_STL',
       'Acquired_Free Agency', 'Acquired_Traded', '2B'], dtype=object)

In [28]:
red_num_columns = ['Def-Inn', 'PO', 'A', 'E', 'DP', 'Fld%', 'Rdrs', 'Season', 'RAA',
       'WAA', 'RAR', 'WAR', 'PA', 'AB', 'R', 'H', 'HR', 'RBI', 'SB', 'CS',
       'BB', 'SO', 'BA', 'OBP', 'SLG', 'OPS', 'OPS+', 'TB', 'GDP', 'SH',
       'SF', 'IBB', 'Num_Pos']
red_cat_columns = ['Tm', 'Acquired', 'C', '1B', '2B', '3B', 'SS', 'OF']
X_gb = X[red_num_columns + red_cat_columns]

In [29]:
X_gb_red_train, X_gb_red_test, y_gb_red_train, y_gb_red_test = train_test_split(X_gb, y, test_size = .25, random_state = 621)

In [30]:
red_preprocessor = ColumnTransformer(
    transformers = [
        ('cont', num_transformer, red_num_columns),
        ('cat', cat_transformer, red_cat_columns)
    ], remainder = 'passthrough'
)

gb_red_pipe = Pipeline(
    steps = [
        ('preprocessor', red_preprocessor),
        ('model', GradientBoostingRegressor())
    ]
)

In [31]:
gb_red_pipe.fit(X_gb_red_train, y_gb_red_train)
gb_red_train_mse = mean_squared_error(y_gb_red_train, gb_red_pipe.predict(X_gb_red_train))
gb_red_test_mse = mean_squared_error(y_gb_red_test, gb_red_pipe.predict(X_gb_red_test))
print('Reduced Random Forest Metrics:')
print(f'Training RMSE: {np.sqrt(gb_red_train_mse)}')
print(f'Test RMSE: {np.sqrt(gb_red_test_mse)}')

Reduced Random Forest Metrics:
Training RMSE: 4300166.982328629
Test RMSE: 5159717.79693106


In [32]:
gb_pipe = Pipeline(
    steps = [
        ('preprocessor', preprocessor),
        ('model', GradientBoostingRegressor())
    ]
)

In [33]:
gb_pipe.fit(X_train, y_train)
gb_train_mse = mean_squared_error(y_train, gb_pipe.predict(X_train))
gb_test_mse = mean_squared_error(y_test, gb_pipe.predict(X_test))
print('Random Forest Metrics:')
print(f'Training RMSE: {np.sqrt(gb_train_mse)}')
print(f'Test RMSE: {np.sqrt(gb_test_mse)}')

Random Forest Metrics:
Training RMSE: 4296144.0886627985
Test RMSE: 5153455.823894265


## ADA Boosting

In [34]:
from sklearn.ensemble import AdaBoostRegressor

In [35]:
rfe_ada = RFE(estimator = AdaBoostRegressor(random_state = 621), step = 1, n_features_to_select = None)
rfe_ada.fit(X_transform, y)

In [36]:
selected_features[rfe_ada.support_]

array(['Def-Inn', 'PO', 'A', 'E', 'DP', 'Fld%', 'Rdrs', 'Season', 'RAA',
       'WAA', 'RAR', 'WAR', 'PA', 'AB', 'R', 'H', 'HR', 'RBI', 'CS', 'BB',
       'SO', 'BA', 'OBP', 'SLG', 'OPS', 'OPS+', 'GDP', 'HBP', 'SH', 'SF',
       'IBB', 'Num_Pos', 'Tm_CHC', 'Tm_COL', 'Tm_DET', 'Tm_HOU', 'Tm_LAA',
       'Tm_NYM', 'Tm_NYY', 'Acquired_Free Agency', 'Bat_L', 'Bat_R', '2B'],
      dtype=object)

In [37]:
red_num_columns = ['PO', 'A', 'E', 'DP', 'Fld%', 'Rdrs', 'Season', 'RAA', 'WAA',
       'RAR', 'WAR', 'AB', 'R', 'H', 'HR', 'RBI', 'SB', 'BB', 'SO', 'BA',
       'OBP', 'SLG', 'OPS', 'OPS+', 'TB', 'GDP', 'HBP', 'SH', 'SF', 'IBB',
       'Num_Pos']
red_cat_columns = ['Tm', 'Acquired', 'Bat', 'C', '1B', '2B', '3B', 'SS', 'OF']
X_ada = X[red_num_columns + red_cat_columns]

In [38]:
X_ada_red_train, X_ada_red_test, y_ada_red_train, y_ada_red_test = train_test_split(X_ada, y, test_size = .25, random_state = 621)

In [39]:
red_preprocessor = ColumnTransformer(
    transformers = [
        ('cont', num_transformer, red_num_columns),
        ('cat', cat_transformer, red_cat_columns)
    ], remainder = 'passthrough'
)

ada_red_pipe = Pipeline(
    steps = [
        ('preprocessor', red_preprocessor),
        ('model', AdaBoostRegressor())
    ]
)

In [40]:
ada_red_pipe.fit(X_ada_red_train, y_ada_red_train)
ada_red_train_mse = mean_squared_error(y_ada_red_train, ada_red_pipe.predict(X_ada_red_train))
ada_red_test_mse = mean_squared_error(y_ada_red_test, ada_red_pipe.predict(X_ada_red_test))
print('Reduced Random Forest Metrics:')
print(f'Training RMSE: {np.sqrt(ada_red_train_mse)}')
print(f'Test RMSE: {np.sqrt(ada_red_test_mse)}')

Reduced Random Forest Metrics:
Training RMSE: 6879873.626559459
Test RMSE: 7021047.164716296


In [41]:
ada_pipe = Pipeline(
    steps = [
        ('preprocessor', preprocessor),
        ('pca', PCA(n_components = 30, random_state = 621)),
        ('model', AdaBoostRegressor(random_state = 621))
    ]
)

In [42]:
ada_pipe.fit(X_train, y_train)
ada_train_mse = mean_squared_error(y_train, ada_pipe.predict(X_train))
ada_test_mse = mean_squared_error(y_test, ada_pipe.predict(X_test))
print('Random Forest Metrics:')
print(f'Training RMSE: {np.sqrt(ada_train_mse)}')
print(f'Test RMSE: {np.sqrt(ada_test_mse)}')

Random Forest Metrics:
Training RMSE: 6005706.449179022
Test RMSE: 6434392.497233446


## Support Vector Machine

In [43]:
from sklearn.svm import SVC

In [44]:
svm_pipe = Pipeline(
    steps = [
        ('preprocessor', preprocessor),
        ('model', SVC())
    ]
)

In [45]:
svm_pipe.fit(X_train, y_train)
svm_train_mse = mean_squared_error(y_train, svm_pipe.predict(X_train))
svm_test_mse = mean_squared_error(y_test, svm_pipe.predict(X_test))
print('Random Forest Metrics:')
print(f'Training RMSE: {np.sqrt(svm_train_mse)}')
print(f'Test RMSE: {np.sqrt(svm_test_mse)}')

Random Forest Metrics:
Training RMSE: 5958921.78673805
Test RMSE: 6496242.391000514


## KNN

In [46]:
from sklearn.neighbors import KNeighborsRegressor

In [47]:
knn_pipe = Pipeline(
    steps = [
        ('preprocessor', preprocessor),
        ('pca', PCA(n_components = 63)),
        ('model', KNeighborsRegressor())
    ]
)

In [48]:
knn_pipe.fit(X_train, y_train)
knn_train_mse = mean_squared_error(y_train, knn_pipe.predict(X_train))
knn_test_mse = mean_squared_error(y_test, knn_pipe.predict(X_test))
print('Random Forest Metrics:')
print(f'Training RMSE: {np.sqrt(knn_train_mse)}')
print(f'Test RMSE: {np.sqrt(knn_test_mse)}')

Random Forest Metrics:
Training RMSE: 4357478.9227963155
Test RMSE: 5627354.873436867


## DNN

In [73]:
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from keras.models import Sequential
from keras.layers import Dropout, Dense
from bayes_opt import BayesianOptimization
import warnings

warnings.filterwarnings('ignore')

X_train_scaled = preprocessor.fit_transform(X_train)
X_test_scaled = preprocessor.transform(X_test)

# Define the objective function
def nn_cv_score(units, dropout_rate, learning_rate):
    model = Sequential()
    model.add(Dense(units=int(units), activation='relu', input_shape=(X_train_scaled.shape[1],)))
    model.add(Dropout(dropout_rate))
    model.add(Dense(1, activation='linear'))

    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(loss='mean_squared_error', optimizer=optimizer)

    model.fit(X_train_scaled, y_train, epochs=50, batch_size=32, verbose=0)

    y_pred = model.predict(X_test_scaled)
    mse = mean_squared_error(y_test, y_pred)

    return -mse

# Define the search space
pbounds = {'units': (16, 256),
           'dropout_rate': (0.0, 0.5),
           'learning_rate': (1e-5, 1e-2)}

# Initialize the Bayesian optimization object
optimizer = BayesianOptimization(f=nn_cv_score, pbounds=pbounds, random_state=42)

# Run the optimization
optimizer.maximize(init_points=5, n_iter=10)

# Get the best hyperparameters
best_params = optimizer.max['params']
print("Best Hyperparameters:", best_params)


|   iter    |  target   | dropou... | learni... |   units   |
-------------------------------------------------------------
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
| [0m1        [0m | [0m-3.645e+1[0m | [0m0.1873   [0m | [0m0.009508 [0m | [0m191.7    [0m |
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
| [0m2        [0m | [0m-6.86e+13[0m | [0m0.2993   [0m | [0m0.001569 [0m | [0m53.44    [0m |
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
| [0m3        [0m | [0m-3.849e+1[0m | [0m0.02904  [0m | [0m0.008663 [0m | [0m160.3    [0m |
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
| [0m4        [0m | [0m-6.927e+1[0m | [0m0.354    [0m | [0m0.0002156[0m | [0m248.8    [0m |
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
| [0m5        [0m | [0m-6.811e+1[0m | [0m0.4162   [0m | [0m0.002131 [0m | [0m59.64    [0m |


In [77]:


# Define the objective function
def nn_cv_score(units, dropout_rate, learning_rate, num_layers, **layer_units):
    model = Sequential()
    model.add(Dense(units=int(units), activation='relu', input_shape=(X_train_scaled.shape[1],)))
    model.add(Dropout(dropout_rate))

    for i in range(1, int(num_layers) + 1):
        model.add(Dense(units=int(layer_units[f'layer_units_{i}']), activation='relu'))
        model.add(Dropout(dropout_rate))

    model.add(Dense(1, activation='linear'))

    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(loss='mean_squared_error', optimizer=optimizer)

    model.fit(X_train_scaled, y_train, epochs=50, batch_size=32, verbose=0)

    y_pred = model.predict(X_test_scaled)
    mse = mean_squared_error(y_test, y_pred)

    return -mse

# Define the search space
pbounds = {'units': (16, 256),
           'dropout_rate': (0.0, 0.5),
           'learning_rate': (1e-5, 1e-2),
           'num_layers': (1, 5)}

for i in range(1, 6):
    pbounds[f'layer_units_{i}'] = (16, 256)

# Initialize the Bayesian optimization object
optimizer = BayesianOptimization(f=nn_cv_score, pbounds=pbounds, random_state=42)

# Run the optimization
optimizer.maximize(init_points=5, n_iter=10)

# Get the best hyperparameters
best_params = optimizer.max['params']
print("Best Hyperparameters:", best_params)


|   iter    |  target   | dropou... | layer_... | layer_... | layer_... | layer_... | layer_... | learni... | num_la... |   units   |
-------------------------------------------------------------------------------------------------------------------------------------
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
| [0m1        [0m | [0m-2.577e+1[0m | [0m0.1873   [0m | [0m244.2    [0m | [0m191.7    [0m | [0m159.7    [0m | [0m53.44    [0m | [0m53.44    [0m | [0m0.0005903[0m | [0m4.465    [0m | [0m160.3    [0m |
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step
| [0m2        [0m | [0m-2.677e+1[0m | [0m0.354    [0m | [0m20.94    [0m | [0m248.8    [0m | [0m215.8    [0m | [0m66.96    [0m | [0m59.64    [0m | [0m0.001842 [0m | [0m2.217    [0m | [0m141.9    [0m |
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
| [95m3        [0m | [95m-2.471e+1[0m | [95m0.216    [0m | [95