In [50]:
import pandas as pd
from sklearn.model_selection import train_test_split

from sklearn.feature_selection import RFE
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
import numpy as np

import shap
from sklearn.decomposition import PCA

In [51]:
baseball = pd.read_csv('data/baseball.csv')
baseball = baseball.drop(['Name', 'Age', 'Name-additional'], axis = 1)
baseball['Salary'] = baseball['Salary'].str.replace('$', '').astype(float)

baseball['Pos_C'] = baseball['Position'].apply(lambda x: 1 if 'C' in x else 0)
baseball['Pos_1B'] = baseball['Position'].apply(lambda x: 1 if '1B' in x else 0)
baseball['Pos_2B'] = baseball['Position'].apply(lambda x: 1 if '2B' in x else 0)
baseball['Pos_3B'] = baseball['Position'].apply(lambda x: 1 if '3B' in x else 0)
baseball['Pos_SS'] = baseball['Position'].apply(lambda x: 1 if 'SS' in x else 0)
baseball['Pos_OF'] = baseball['Position'].apply(lambda x: 1 if 'OF' in x else 0)

baseball['Num_Pos'] = baseball[['Pos_C', 'Pos_1B', 'Pos_2B', 'Pos_3B', 'Pos_SS', 'Pos_OF']].sum(axis = 1)
baseball['R/AB'] = baseball['R'] / baseball['AB']
baseball['2B/AB'] = baseball['2B'] / baseball['AB']
baseball['3B/AB'] = baseball['3B'] / baseball['AB']
baseball['HR/AB'] = baseball['HR'] / baseball['AB']
baseball['RBI/AB'] = baseball['RBI'] / baseball['AB']
baseball['BB/PA'] = baseball['BB'] / baseball['PA']
baseball['SB - CS'] = baseball['SB'] - baseball['CS']
baseball['BB - SO'] = baseball['BB'] - baseball['SO'] # measures a batters eye
baseball['E/Def-Inn'] = baseball['E'] / baseball['Def-Inn']
baseball['DP/Def-Inn'] = baseball['DP'] / baseball['Def-Inn']

baseball = baseball.drop(['Position', 'Def-Inn', 'PO', 'A', 'E', 'DP', 'PA', 'AB', 'R', 'H', 
                          '2B', '3B', 'HR', 'RBI', 'SB', 'CS', 'BB', 'SO', 'TB', 'GDP', 'HBP', 'SH', 'SF', 'IBB'], axis = 1)


In [53]:
X = baseball.drop(['Salary'], axis = 1)
y = baseball['Salary']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .25, random_state = 621)

In [54]:
cat_columns = ['Tm', 'Lg', 'Acquired', 'Bat']
num_columns = [col for col in X.columns if col not in cat_columns + ['Pos_C', 'Pos_1B', 'Pos_2B', 'Pos_3B', 'Pos_SS', 'Pos_OF']]

cat_transformer = Pipeline(
    steps = [
        ('onehot', OneHotEncoder(handle_unknown = 'ignore'))
    ]
)

num_transformer = Pipeline(
    steps = [
        ('scale', StandardScaler())
    ]
)

preprocessor = ColumnTransformer(
    transformers = [
        ('cont', num_transformer, num_columns),
        ('cat', cat_transformer, cat_columns)
    ], remainder = 'passthrough'
)

X_transform = preprocessor.fit_transform(X)

selected_features = np.concatenate([
    np.array(num_columns),
    np.array(preprocessor.transformers_[1][1]['onehot'].get_feature_names_out(cat_columns)),
    np.array(['Pos_C', 'Pos_1B', 'Pos_2B', 'Pos_3B', 'Pos_SS', 'Pos_OF'])
])


In [55]:
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from keras.models import Sequential
from keras.layers import Dropout, Dense
from bayes_opt import BayesianOptimization
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping
import warnings
import joblib

warnings.filterwarnings('ignore')


In [56]:
X_train, X_test, y_train, y_test = train_test_split(X_transform, y, test_size = .2, random_state = 621)

In [57]:
def dnn_model_score(neurons, dropout_rate, learning_rate, epochs, batch_size, patience, num_layers, **layer_neurons):
    model = Sequential()
    model.add(Dense(int(neurons), activation='relu', input_shape = (X_train.shape[1],)))
    model.add(Dropout(dropout_rate))

    for i in range(1, int(num_layers) + 1):
        model.add(Dense(int(layer_neurons[f'layer_neurons_{i}']), activation='relu'))
        model.add(Dropout(dropout_rate))

    model.add(Dense(1, activation = 'linear'))

    optimizer = Adam(learning_rate = learning_rate)
    model.compile(loss = 'mean_squared_error', optimizer = optimizer)

    es = EarlyStopping(monitor = 'val_loss', patience = int(patience), restore_best_weights = True)

    model.fit(X_train, y_train, validation_split = .2, epochs = int(epochs), batch_size = int(batch_size), callbacks = es, verbose = 0)

    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)

    return -mse

pbounds = {'neurons': (32, 256),
           'dropout_rate': (0.0, 0.5),
           'learning_rate': (0.01, 1),
           'epochs' : (100, 500),
           'batch_size' : (32, 500),
           'patience' : (20, 50),
           'num_layers': (1, 5)}

for i in range(1, 6):
    pbounds[f'layer_neurons_{i}'] = (32, 256)

optimizer = BayesianOptimization(f = dnn_model_score, pbounds = pbounds, random_state = 42)

optimizer.maximize(init_points = 5, n_iter = 10)

best_params = optimizer.max['params']
print("Best Hyperparameters:", best_params)


|   iter    |  target   | batch_... | dropou... |  epochs   | layer_... | layer_... | layer_... | layer_... | layer_... | learni... |  neurons  | num_la... | patience  |
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step
| [0m1        [0m | [0m-2.342e+1[0m | [0m207.3    [0m | [0m0.4754   [0m | [0m392.8    [0m | [0m166.1    [0m | [0m66.95    [0m | [0m66.94    [0m | [0m45.01    [0m | [0m226.0    [0m | [0m0.6051   [0m | [0m190.6    [0m | [0m1.082    [0m | [0m49.1     [0m |
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step
| [0m2        [0m | [0m-2.362e+1[0m | [0m421.6    [0m | [0m0.1062   [0m | [0m172.7    [0m | [0m73.08    [0m | [0m100.2    [0m | [0m149.5    [0m | [0m128.8    [0m | [0m97.24    [0m | [0m0.6157   [0m | [0m63.25 

In [58]:
batch_size = int(best_params['batch_size'])
dropout_rate = best_params['dropout_rate']
epochs = int(best_params['epochs'])
neurons = []
neurons.append(int(best_params['neurons']))
neurons.append(int(best_params['layer_neurons_1']))
neurons.append(int(best_params['layer_neurons_2']))
neurons.append(int(best_params['layer_neurons_3']))
neurons.append(int(best_params['layer_neurons_4']))
neurons.append(int(best_params['layer_neurons_5']))
learning_rate = best_params['learning_rate']
num_layers = int(best_params['num_layers'])
patience = int(best_params['patience'])

In [59]:
model = Sequential()

model.add(Dense(name = 'Dense1', units = neurons[0], input_dim = X_train.shape[1], activation = 'relu'))
model.add(Dropout(name = 'Dropout1', rate = dropout_rate))

for i in range(1, num_layers + 1):
    model.add(Dense(name = f'Dense{i + 1}', units = neurons[i], activation = 'relu'))
    model.add(Dropout(name = f'Dropout{i + 1}', rate = dropout_rate))

model.add(Dense(name = 'Output', units = 1, activation = 'linear'))

optimizer = Adam(learning_rate = learning_rate)

model.compile(optimizer = optimizer, loss = 'mean_squared_error')

model.summary()

In [60]:
es = EarlyStopping(monitor = 'val_loss', patience = int(patience), restore_best_weights = True)

history = model.fit(X_train, y_train, validation_split = .2, batch_size = batch_size, epochs = epochs, callbacks = es, verbose = 2)

Epoch 1/237
14/14 - 3s - 213ms/step - loss: 50376845819904.0000 - val_loss: 35216148135936.0000
Epoch 2/237
14/14 - 0s - 19ms/step - loss: 33823595167744.0000 - val_loss: 27806905925632.0000
Epoch 3/237
14/14 - 0s - 20ms/step - loss: 31751099383808.0000 - val_loss: 24726487433216.0000
Epoch 4/237
14/14 - 0s - 17ms/step - loss: 29393344790528.0000 - val_loss: 24126066524160.0000
Epoch 5/237
14/14 - 0s - 16ms/step - loss: 28801050345472.0000 - val_loss: 24942865285120.0000
Epoch 6/237
14/14 - 0s - 17ms/step - loss: 27835355889664.0000 - val_loss: 23157991800832.0000
Epoch 7/237
14/14 - 0s - 16ms/step - loss: 27041095221248.0000 - val_loss: 23205297258496.0000
Epoch 8/237
14/14 - 0s - 17ms/step - loss: 27106700427264.0000 - val_loss: 22662537543680.0000
Epoch 9/237
14/14 - 0s - 17ms/step - loss: 26786547105792.0000 - val_loss: 22838920609792.0000
Epoch 10/237
14/14 - 0s - 17ms/step - loss: 25471475515392.0000 - val_loss: 22688535937024.0000
Epoch 11/237
14/14 - 0s - 17ms/step - loss: 2550

In [61]:
test_preds = model.predict(X_test)
train_preds = model.predict(X_train)

[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step
[1m127/127[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step


In [62]:
train_rmse = mean_squared_error(y_train, train_preds)
test_rmse = mean_squared_error(y_test, test_preds)

print('Deep Neural Network Metrics:')
print(f'Train RMSE: {np.sqrt(train_rmse)}')
print(f'Test RMSE: {np.sqrt(test_rmse)}')

Deep Neural Network Metrics:
Train RMSE: 3031693.1268392135
Test RMSE: 4818131.924759854


In [63]:
# save the model, pca, and preprocessor so that new data can be fit using the same criteria
#model.save('dnn_new/best_model.keras')
#joblib.dump(pca, 'dnn/pca45.joblib')
#joblib.dump(preprocessor, 'dnn_new/best_preprocssor.joblib')

['dnn_new/best_preprocssor.joblib']