## Setup

In [85]:
import pandas as pd

In [86]:
baseball = pd.read_csv('data/baseball.csv')
baseball = baseball.drop(['Name', 'Age', 'Name-additional'], axis = 1)
baseball['Salary'] = baseball['Salary'].str.replace('$', '').astype(float)

baseball['C'] = baseball['Position'].apply(lambda x: 1 if 'C' in x else 0)
baseball['1B'] = baseball['Position'].apply(lambda x: 1 if '1B' in x else 0)
baseball['2B'] = baseball['Position'].apply(lambda x: 1 if '2B' in x else 0)
baseball['3B'] = baseball['Position'].apply(lambda x: 1 if '3B' in x else 0)
baseball['SS'] = baseball['Position'].apply(lambda x: 1 if 'SS' in x else 0)
baseball['OF'] = baseball['Position'].apply(lambda x: 1 if 'OF' in x else 0)

baseball['Num_Pos'] = baseball[['C', '1B', '2B', '3B', 'SS', 'OF']].sum(axis = 1)
baseball = baseball.drop(['Position'], axis = 1)

In [4]:
# packages used in each section below
from sklearn.feature_selection import RFE
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
import numpy as np

import shap
from sklearn.decomposition import PCA

Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)


In [87]:
X = baseball.drop(['Salary'], axis = 1)
y = baseball['Salary']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .25, random_state = 621)

In [88]:
cat_columns = ['Tm', 'Lg', 'Acquired', 'Bat']
num_columns = [col for col in X.columns if col not in cat_columns + ['C', '1B', '2B', '3B', 'SS', 'OF']]

cat_transformer = Pipeline(
    steps = [
        ('onehot', OneHotEncoder(handle_unknown = 'ignore'))
    ]
)

num_transformer = Pipeline(
    steps = [
        ('scale', StandardScaler())
    ]
)

preprocessor = ColumnTransformer(
    transformers = [
        ('cont', num_transformer, num_columns),
        ('cat', cat_transformer, cat_columns)
    ], remainder = 'passthrough'
)

X_transform = preprocessor.fit_transform(X)

selected_features = np.concatenate([
    np.array(num_columns),
    np.array(preprocessor.transformers_[1][1]['onehot'].get_feature_names_out(cat_columns)),
    np.array(['C', '1B', '2B', '3B', 'SS', 'OF'])
])


## Random Forest

In [7]:
from sklearn.ensemble import RandomForestRegressor

In [8]:
rfe_rf = RFE(estimator = RandomForestRegressor(random_state = 621), step = 1, n_features_to_select = None)
rfe_rf.fit(X_transform, y)

In [9]:
selected_features[rfe_rf.support_]

array(['Def-Inn', 'PO', 'A', 'E', 'DP', 'Fld%', 'Rdrs', 'Season', 'RAA',
       'WAA', 'RAR', 'WAR', 'PA', 'AB', 'R', 'H', 'HR', 'RBI', 'SB', 'CS',
       'BB', 'SO', 'BA', 'OBP', 'SLG', 'OPS', 'OPS+', 'TB', 'GDP', 'HBP',
       'SH', 'SF', 'IBB', 'Num_Pos', 'Tm_LAA', 'Tm_LAD', 'Tm_MULTIPLE',
       'Tm_NYM', 'Tm_NYY', 'Tm_SFG', 'Tm_STL', 'Acquired_Free Agency',
       'Acquired_Traded'], dtype=object)

In [10]:
red_num_columns = ['Def-Inn', 'PO', 'A', 'E', 'DP', 'Fld%', 'Rdrs', 'RAA', 'WAA', 'RAR',
               'WAR', 'PA', 'AB', 'R', 'H', 'HR', 'RBI', 'SB', 'CS', 'BB', 'SO', 'BA', 'OBP',
               'SLG', 'OPS', 'OPS+', 'TB', 'GDP', 'HBP', 'SH', 'SF', 'IBB', 'Num_Pos', 'Season']
red_cat_columns = ['Tm', 'Acquired']
X_rf = X[red_num_columns + red_cat_columns]

In [11]:
X_rf_red_train, X_rf_red_test, y_rf_red_train, y_rf_red_test = train_test_split(X_rf, y, test_size = .25, random_state = 621)

In [12]:
red_preprocessor = ColumnTransformer(
    transformers = [
        ('cont', num_transformer, red_num_columns),
        ('cat', cat_transformer, red_cat_columns)
    ], remainder = 'passthrough'
)

rf_red_pipe = Pipeline(
    steps = [
        ('preprocessor', red_preprocessor),
        ('model', RandomForestRegressor(n_estimators = 150, min_samples_leaf = 10))
    ]
)

In [13]:
rf_red_pipe.fit(X_rf_red_train, y_rf_red_train)
rf_red_train_mse = mean_squared_error(y_rf_red_train, rf_red_pipe.predict(X_rf_red_train))
rf_red_test_mse = mean_squared_error(y_rf_red_test, rf_red_pipe.predict(X_rf_red_test))
print('Reduced Random Forest Metrics:')
print(f'Training RMSE: {np.sqrt(rf_red_train_mse)}')
print(f'Test RMSE: {np.sqrt(rf_red_test_mse)}')

Reduced Random Forest Metrics:
Training RMSE: 4000195.7570256125
Test RMSE: 5380843.936012433


In [14]:
rf_pipe = Pipeline(
    steps = [
        ('preprocessor', preprocessor),
        ('model', RandomForestRegressor(n_estimators = 150, min_samples_leaf = 10))
    ]
)

In [15]:
rf_pipe.fit(X_train, y_train)
rf_train_mse = mean_squared_error(y_train, rf_pipe.predict(X_train))
rf_test_mse = mean_squared_error(y_test, rf_pipe.predict(X_test))
print('Random Forest Metrics:')
print(f'Training RMSE: {np.sqrt(rf_train_mse)}')
print(f'Test RMSE: {np.sqrt(rf_test_mse)}')

Random Forest Metrics:
Training RMSE: 3985306.945110512
Test RMSE: 5387223.611190923


## XGBoost 

In [16]:
from xgboost import XGBRegressor

In [17]:
rfe_xg = RFE(estimator = XGBRegressor(random_state = 621), step = 1, n_features_to_select = None)
rfe_xg.fit(X_transform, y)

In [18]:
selected_features[rfe_xg.support_]

array(['Season', 'RBI', 'CS', 'BB', 'SO', 'BA', 'OBP', 'SLG', 'OPS+',
       'TB', 'GDP', 'HBP', 'SH', 'SF', 'IBB', 'Num_Pos', 'Tm_ARI',
       'Tm_BOS', 'Tm_CHC', 'Tm_CIN', 'Tm_DET', 'Tm_HOU', 'Tm_KCR',
       'Tm_LAA', 'Tm_LAD', 'Tm_MIA', 'Tm_MIN', 'Tm_MULTIPLE', 'Tm_NYM',
       'Tm_NYY', 'Tm_OAK', 'Tm_PHI', 'Tm_STL', 'Tm_TBR', 'Tm_TEX',
       'Acquired_Amateur Draft', 'Acquired_Amateur Free Agent',
       'Acquired_Free Agency', 'Acquired_Traded', 'Bat_L', 'Bat_R', '1B',
       '2B'], dtype=object)

In [19]:
red_num_columns = ['Season', 'RBI', 'CS', 'BB', 'SO', 'BA', 'OBP', 'SLG', 'OPS+', 
                   'TB', 'GDP', 'HBP', 'SH', 'SF', 'IBB', 'Num_Pos']
red_cat_columns = ['Tm', 'Acquired', 'Bat', 'C', '1B', '2B', '3B', 'SS', 'OF']
X_xg = X[red_num_columns + red_cat_columns]

In [20]:
X_xg_red_train, X_xg_red_test, y_xg_red_train, y_xg_red_test = train_test_split(X_xg, y, test_size = .25, random_state = 621)

In [21]:
red_preprocessor = ColumnTransformer(
    transformers = [
        ('cont', num_transformer, red_num_columns),
        ('cat', cat_transformer, red_cat_columns)
    ], remainder = 'passthrough'
)

xg_red_pipe = Pipeline(
    steps = [
        ('preprocessor', red_preprocessor),
        ('model', XGBRegressor())
    ]
)

In [22]:
xg_red_pipe.fit(X_xg_red_train, y_xg_red_train)
xg_red_train_mse = mean_squared_error(y_xg_red_train, xg_red_pipe.predict(X_xg_red_train))
xg_red_test_mse = mean_squared_error(y_xg_red_test, xg_red_pipe.predict(X_xg_red_test))
print('Reduced Random Forest Metrics:')
print(f'Training RMSE: {np.sqrt(xg_red_train_mse)}')
print(f'Test RMSE: {np.sqrt(xg_red_test_mse)}')

Reduced Random Forest Metrics:
Training RMSE: 1304272.8356999753
Test RMSE: 5238201.958554674


In [23]:
xg_pipe = Pipeline(
    steps = [
        ('preprocessor', preprocessor),
        ('model', XGBRegressor())
    ]
)

In [24]:
xg_pipe.fit(X_train, y_train)
xg_train_mse = mean_squared_error(y_train, xg_pipe.predict(X_train))
xg_test_mse = mean_squared_error(y_test, xg_pipe.predict(X_test))
print('Random Forest Metrics:')
print(f'Training RMSE: {np.sqrt(xg_train_mse)}')
print(f'Test RMSE: {np.sqrt(xg_test_mse)}')

Random Forest Metrics:
Training RMSE: 996000.2897536749
Test RMSE: 5174242.088274057


## Gradient Boosting

In [25]:
from sklearn.ensemble import GradientBoostingRegressor

In [26]:
rfe_gb = RFE(estimator = GradientBoostingRegressor(random_state = 621), step = 1, n_features_to_select = None)
rfe_gb.fit(X_transform, y)

In [27]:
selected_features[rfe_gb.support_]

array(['Def-Inn', 'PO', 'A', 'E', 'DP', 'Fld%', 'Rdrs', 'Season', 'RAA',
       'WAA', 'RAR', 'WAR', 'PA', 'AB', 'R', 'H', 'HR', 'RBI', 'SB', 'CS',
       'BB', 'SO', 'BA', 'OBP', 'SLG', 'OPS', 'OPS+', 'TB', 'GDP', 'SH',
       'SF', 'IBB', 'Num_Pos', 'Tm_LAA', 'Tm_LAD', 'Tm_MIN',
       'Tm_MULTIPLE', 'Tm_NYY', 'Tm_OAK', 'Tm_STL',
       'Acquired_Free Agency', 'Acquired_Traded', '2B'], dtype=object)

In [28]:
red_num_columns = ['Def-Inn', 'PO', 'A', 'E', 'DP', 'Fld%', 'Rdrs', 'Season', 'RAA',
       'WAA', 'RAR', 'WAR', 'PA', 'AB', 'R', 'H', 'HR', 'RBI', 'SB', 'CS',
       'BB', 'SO', 'BA', 'OBP', 'SLG', 'OPS', 'OPS+', 'TB', 'GDP', 'SH',
       'SF', 'IBB', 'Num_Pos']
red_cat_columns = ['Tm', 'Acquired', 'C', '1B', '2B', '3B', 'SS', 'OF']
X_gb = X[red_num_columns + red_cat_columns]

In [29]:
X_gb_red_train, X_gb_red_test, y_gb_red_train, y_gb_red_test = train_test_split(X_gb, y, test_size = .25, random_state = 621)

In [30]:
red_preprocessor = ColumnTransformer(
    transformers = [
        ('cont', num_transformer, red_num_columns),
        ('cat', cat_transformer, red_cat_columns)
    ], remainder = 'passthrough'
)

gb_red_pipe = Pipeline(
    steps = [
        ('preprocessor', red_preprocessor),
        ('model', GradientBoostingRegressor())
    ]
)

In [31]:
gb_red_pipe.fit(X_gb_red_train, y_gb_red_train)
gb_red_train_mse = mean_squared_error(y_gb_red_train, gb_red_pipe.predict(X_gb_red_train))
gb_red_test_mse = mean_squared_error(y_gb_red_test, gb_red_pipe.predict(X_gb_red_test))
print('Reduced Random Forest Metrics:')
print(f'Training RMSE: {np.sqrt(gb_red_train_mse)}')
print(f'Test RMSE: {np.sqrt(gb_red_test_mse)}')

Reduced Random Forest Metrics:
Training RMSE: 4300166.982328629
Test RMSE: 5159717.79693106


In [32]:
gb_pipe = Pipeline(
    steps = [
        ('preprocessor', preprocessor),
        ('model', GradientBoostingRegressor())
    ]
)

In [33]:
gb_pipe.fit(X_train, y_train)
gb_train_mse = mean_squared_error(y_train, gb_pipe.predict(X_train))
gb_test_mse = mean_squared_error(y_test, gb_pipe.predict(X_test))
print('Random Forest Metrics:')
print(f'Training RMSE: {np.sqrt(gb_train_mse)}')
print(f'Test RMSE: {np.sqrt(gb_test_mse)}')

Random Forest Metrics:
Training RMSE: 4296144.0886627985
Test RMSE: 5153455.823894265


## ADA Boosting

In [34]:
from sklearn.ensemble import AdaBoostRegressor

In [35]:
rfe_ada = RFE(estimator = AdaBoostRegressor(random_state = 621), step = 1, n_features_to_select = None)
rfe_ada.fit(X_transform, y)

In [36]:
selected_features[rfe_ada.support_]

array(['Def-Inn', 'PO', 'A', 'E', 'DP', 'Fld%', 'Rdrs', 'Season', 'RAA',
       'WAA', 'RAR', 'WAR', 'PA', 'AB', 'R', 'H', 'HR', 'RBI', 'CS', 'BB',
       'SO', 'BA', 'OBP', 'SLG', 'OPS', 'OPS+', 'GDP', 'HBP', 'SH', 'SF',
       'IBB', 'Num_Pos', 'Tm_CHC', 'Tm_COL', 'Tm_DET', 'Tm_HOU', 'Tm_LAA',
       'Tm_NYM', 'Tm_NYY', 'Acquired_Free Agency', 'Bat_L', 'Bat_R', '2B'],
      dtype=object)

In [37]:
red_num_columns = ['PO', 'A', 'E', 'DP', 'Fld%', 'Rdrs', 'Season', 'RAA', 'WAA',
       'RAR', 'WAR', 'AB', 'R', 'H', 'HR', 'RBI', 'SB', 'BB', 'SO', 'BA',
       'OBP', 'SLG', 'OPS', 'OPS+', 'TB', 'GDP', 'HBP', 'SH', 'SF', 'IBB',
       'Num_Pos']
red_cat_columns = ['Tm', 'Acquired', 'Bat', 'C', '1B', '2B', '3B', 'SS', 'OF']
X_ada = X[red_num_columns + red_cat_columns]

In [38]:
X_ada_red_train, X_ada_red_test, y_ada_red_train, y_ada_red_test = train_test_split(X_ada, y, test_size = .25, random_state = 621)

In [39]:
red_preprocessor = ColumnTransformer(
    transformers = [
        ('cont', num_transformer, red_num_columns),
        ('cat', cat_transformer, red_cat_columns)
    ], remainder = 'passthrough'
)

ada_red_pipe = Pipeline(
    steps = [
        ('preprocessor', red_preprocessor),
        ('model', AdaBoostRegressor())
    ]
)

In [40]:
ada_red_pipe.fit(X_ada_red_train, y_ada_red_train)
ada_red_train_mse = mean_squared_error(y_ada_red_train, ada_red_pipe.predict(X_ada_red_train))
ada_red_test_mse = mean_squared_error(y_ada_red_test, ada_red_pipe.predict(X_ada_red_test))
print('Reduced Random Forest Metrics:')
print(f'Training RMSE: {np.sqrt(ada_red_train_mse)}')
print(f'Test RMSE: {np.sqrt(ada_red_test_mse)}')

Reduced Random Forest Metrics:
Training RMSE: 6879873.626559459
Test RMSE: 7021047.164716296


In [41]:
ada_pipe = Pipeline(
    steps = [
        ('preprocessor', preprocessor),
        ('pca', PCA(n_components = 30, random_state = 621)),
        ('model', AdaBoostRegressor(random_state = 621))
    ]
)

In [42]:
ada_pipe.fit(X_train, y_train)
ada_train_mse = mean_squared_error(y_train, ada_pipe.predict(X_train))
ada_test_mse = mean_squared_error(y_test, ada_pipe.predict(X_test))
print('Random Forest Metrics:')
print(f'Training RMSE: {np.sqrt(ada_train_mse)}')
print(f'Test RMSE: {np.sqrt(ada_test_mse)}')

Random Forest Metrics:
Training RMSE: 6005706.449179022
Test RMSE: 6434392.497233446


## Support Vector Machine

In [43]:
from sklearn.svm import SVC

In [44]:
svm_pipe = Pipeline(
    steps = [
        ('preprocessor', preprocessor),
        ('model', SVC())
    ]
)

In [45]:
svm_pipe.fit(X_train, y_train)
svm_train_mse = mean_squared_error(y_train, svm_pipe.predict(X_train))
svm_test_mse = mean_squared_error(y_test, svm_pipe.predict(X_test))
print('Random Forest Metrics:')
print(f'Training RMSE: {np.sqrt(svm_train_mse)}')
print(f'Test RMSE: {np.sqrt(svm_test_mse)}')

Random Forest Metrics:
Training RMSE: 5958921.78673805
Test RMSE: 6496242.391000514


## KNN

In [46]:
from sklearn.neighbors import KNeighborsRegressor

In [47]:
knn_pipe = Pipeline(
    steps = [
        ('preprocessor', preprocessor),
        ('pca', PCA(n_components = 63)),
        ('model', KNeighborsRegressor())
    ]
)

In [48]:
knn_pipe.fit(X_train, y_train)
knn_train_mse = mean_squared_error(y_train, knn_pipe.predict(X_train))
knn_test_mse = mean_squared_error(y_test, knn_pipe.predict(X_test))
print('Random Forest Metrics:')
print(f'Training RMSE: {np.sqrt(knn_train_mse)}')
print(f'Test RMSE: {np.sqrt(knn_test_mse)}')

Random Forest Metrics:
Training RMSE: 4357478.9227963155
Test RMSE: 5627354.873436867


## DNN

In [89]:
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from keras.models import Sequential
from keras.layers import Dropout, Dense
from bayes_opt import BayesianOptimization
from keras.optimizers import Adam
import warnings

warnings.filterwarnings('ignore')

X_train_scaled = preprocessor.fit_transform(X_train)
X_test_scaled = preprocessor.transform(X_test)


NOTES:
- Should look at different dropout rates for the different layers we add (similar code to adding more layers)
- Should look into different EPOCHS or optimizers in order to optimally fit and train the data
- Should also look into different batch sizes or maybe different activations for the different layers and run more iterations to find the best fit

In [94]:
def dnn_cv_score(neurons, dropout_rate, learning_rate, num_layers, **layer_neurons):
    model = Sequential()
    model.add(Dense(int(neurons), activation='relu', input_shape = (X_train_scaled.shape[1],)))
    model.add(Dropout(dropout_rate))

    for i in range(1, int(num_layers) + 1):
        model.add(Dense(int(layer_neurons[f'layer_neurons_{i}']), activation='relu'))
        model.add(Dropout(dropout_rate))

    model.add(Dense(1, activation = 'linear'))

    optimizer = Adam(learning_rate = learning_rate)
    model.compile(loss = 'mean_squared_error', optimizer = optimizer)

    model.fit(X_train_scaled, y_train, epochs = 50, batch_size = 32, verbose = 0)

    y_pred = model.predict(X_test_scaled)
    mse = mean_squared_error(y_test, y_pred)

    return -mse

num = 5
pbounds = {'neurons': (32, 256),
           'dropout_rate': (0.0, 0.5),
           'learning_rate': (0.01, 1),
           'num_layers': (1, num)}

for i in range(1, num + 1):
    pbounds[f'layer_neurons_{i}'] = (32, 256)

optimizer = BayesianOptimization(f = dnn_cv_score, pbounds = pbounds, random_state = 42)

optimizer.maximize(init_points = 5, n_iter = 10)

best_params = optimizer.max['params']
print("Best Hyperparameters:", best_params)


|   iter    |  target   | dropou... | layer_... | layer_... | layer_... | layer_... | layer_... | learni... |  neurons  | num_la... |
-------------------------------------------------------------------------------------------------------------------------------------
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
| [0m1        [0m | [0m-2.555e+1[0m | [0m0.1873   [0m | [0m245.0    [0m | [0m196.0    [0m | [0m166.1    [0m | [0m66.95    [0m | [0m66.94    [0m | [0m0.0675   [0m | [0m226.0    [0m | [0m3.404    [0m |
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
| [0m2        [0m | [0m-4.232e+1[0m | [0m0.354    [0m | [0m36.61    [0m | [0m249.3    [0m | [0m218.5    [0m | [0m79.56    [0m | [0m72.73    [0m | [0m0.1916   [0m | [0m100.2    [0m | [0m3.099    [0m |
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
| [95m3        [0m | [95m-2.325e+1[0m | [95m0.216    [0m | [95

In [102]:
model = Sequential()

model.add(Dense(name = 'Dense1', units = 208, input_dim = X_train_scaled.shape[1], activation = 'relu'))
model.add(Dropout(name = 'Dropout1', rate = 0.21597250932105788))
model.add(Dense(name = 'Dense2', units = 97, activation = 'relu'))
model.add(Dropout(name = 'Dropout2', rate = 0.21597250932105788))
model.add(Dense(name = 'Dense3', units = 169, activation = 'relu'))
model.add(Dropout(name = 'Dropout3', rate = 0.21597250932105788))

model.add(Dense(name = 'Output', units = 1, activation = 'linear'))

optimizer = Adam(learning_rate = 0.46150928437486555)

model.compile(optimizer = optimizer, loss = 'mean_squared_error')

model.summary()

In [105]:
history = model.fit(X_train_scaled, y_train, validation_split = .2, batch_size = 32, epochs = 50, verbose = 2)

Epoch 1/50
95/95 - 0s - 3ms/step - loss: 18338015084544.0000 - val_loss: 20678656917504.0000
Epoch 2/50
95/95 - 0s - 3ms/step - loss: 21794148843520.0000 - val_loss: 18322819121152.0000
Epoch 3/50
95/95 - 0s - 3ms/step - loss: 17304932843520.0000 - val_loss: 18819330342912.0000
Epoch 4/50
95/95 - 0s - 3ms/step - loss: 19888938680320.0000 - val_loss: 17620600356864.0000
Epoch 5/50
95/95 - 0s - 3ms/step - loss: 17964841566208.0000 - val_loss: 17325166166016.0000
Epoch 6/50
95/95 - 0s - 3ms/step - loss: 19538691227648.0000 - val_loss: 19871941263360.0000
Epoch 7/50
95/95 - 0s - 3ms/step - loss: 17414075973632.0000 - val_loss: 18772473675776.0000
Epoch 8/50
95/95 - 0s - 3ms/step - loss: 17859868622848.0000 - val_loss: 23820754747392.0000
Epoch 9/50
95/95 - 0s - 3ms/step - loss: 15389833560064.0000 - val_loss: 18521371181056.0000
Epoch 10/50
95/95 - 0s - 3ms/step - loss: 16033118158848.0000 - val_loss: 16381464543232.0000
Epoch 11/50
95/95 - 0s - 4ms/step - loss: 18599884357632.0000 - val_l

In [108]:
test_preds = model.predict(X_test_scaled)
train_preds = model.predict(X_train_scaled)

[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
[1m119/119[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step


In [109]:
train_rmse = mean_squared_error(y_train, train_preds)
test_rmse = mean_squared_error(y_test, test_preds)

print(f'Train RMSE: {np.sqrt(train_rmse)}')
print(f'Test RMSE: {np.sqrt(test_rmse)}')

Train RMSE: 3290079.899915252
Test RMSE: 4962892.118275316
