## Setup

In [5]:
import pandas as pd

from sklearn.feature_selection import RFE
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
import numpy as np

In [6]:
baseball = pd.read_csv('data/baseball.csv')
baseball = baseball.drop(['Name', 'Age', 'Name-additional'], axis = 1)
baseball['Salary'] = baseball['Salary'].str.replace('$', '').astype(float)

baseball['Pos_C'] = baseball['Position'].apply(lambda x: 1 if 'C' in x else 0)
baseball['Pos_1B'] = baseball['Position'].apply(lambda x: 1 if '1B' in x else 0)
baseball['Pos_2B'] = baseball['Position'].apply(lambda x: 1 if '2B' in x else 0)
baseball['Pos_3B'] = baseball['Position'].apply(lambda x: 1 if '3B' in x else 0)
baseball['Pos_SS'] = baseball['Position'].apply(lambda x: 1 if 'SS' in x else 0)
baseball['Pos_OF'] = baseball['Position'].apply(lambda x: 1 if 'OF' in x else 0)

baseball['Num_Pos'] = baseball[['Pos_C', 'Pos_1B', 'Pos_2B', 'Pos_3B', 'Pos_SS', 'Pos_OF']].sum(axis = 1)
baseball['R/AB'] = baseball['R'] / baseball['AB']
baseball['2B/AB'] = baseball['2B'] / baseball['AB']
baseball['3B/AB'] = baseball['3B'] / baseball['AB']
baseball['HR/AB'] = baseball['HR'] / baseball['AB']
baseball['RBI/AB'] = baseball['RBI'] / baseball['AB']
baseball['BB/PA'] = baseball['BB'] / baseball['PA']
baseball['SB - CS'] = baseball['SB'] - baseball['CS']
baseball['BB - SO'] = baseball['BB'] - baseball['SO'] # measures a batters eye
baseball['E/Def-Inn'] = baseball['E'] / baseball['Def-Inn']
baseball['DP/Def-Inn'] = baseball['DP'] / baseball['Def-Inn']

baseball = baseball.drop(['Position', 'Def-Inn', 'PO', 'A', 'E', 'DP', 'PA', 'AB', 'R', 'H', 
                          '2B', '3B', 'HR', 'RBI', 'SB', 'CS', 'BB', 'SO', 'TB', 'GDP', 'HBP', 'SH', 'SF', 'IBB'], axis = 1)

baseball.to_csv('best_model/engineered_data.csv', index = False)

In [7]:
X = baseball.drop(['Salary'], axis = 1)
y = baseball['Salary']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .25, random_state = 621)

In [8]:
cat_columns = ['Tm', 'Lg', 'Acquired', 'Bat']
num_columns = [col for col in X.columns if col not in cat_columns + ['C', '1B', '2B', '3B', 'SS', 'OF']]

cat_transformer = Pipeline(
    steps = [
        ('onehot', OneHotEncoder(handle_unknown = 'ignore'))
    ]
)

num_transformer = Pipeline(
    steps = [
        ('scale', StandardScaler())
    ]
)

preprocessor = ColumnTransformer(
    transformers = [
        ('cont', num_transformer, num_columns),
        ('cat', cat_transformer, cat_columns)
    ], remainder = 'passthrough'
)

X_transform = preprocessor.fit_transform(X)

selected_features = np.concatenate([
    np.array(num_columns),
    np.array(preprocessor.transformers_[1][1]['onehot'].get_feature_names_out(cat_columns)),
    np.array(['Pos_C', 'Pos_1B', 'Pos_2B', 'Pos_3B', 'Pos_SS', 'Pos_OF'])
])


## Random Forest

In [9]:
from sklearn.ensemble import RandomForestRegressor

In [10]:
rf_pipe = Pipeline(
    steps = [
        ('preprocessor', preprocessor),
        ('model', RandomForestRegressor(n_estimators = 200, min_samples_split = 14))
    ]
)

In [11]:
rf_pipe.fit(X_train, y_train)
rf_train_mse = mean_squared_error(y_train, rf_pipe.predict(X_train))
rf_test_mse = mean_squared_error(y_test, rf_pipe.predict(X_test))
print('Random Forest Metrics:')
print(f'Training RMSE: {np.sqrt(rf_train_mse)}')
print(f'Test RMSE: {np.sqrt(rf_test_mse)}')

Random Forest Metrics:
Training RMSE: 3096922.999504919
Test RMSE: 5403629.740016949


## XGBoost

In [13]:
from xgboost import XGBRegressor

In [14]:
xg_pipe = Pipeline(
    steps = [
        ('preprocessor', preprocessor),
        ('model', XGBRegressor(n_estimators = 100, learning_rate = .03, max_depth = 6))
    ]
)

In [15]:
xg_pipe.fit(X_train, y_train)
xg_train_mse = mean_squared_error(y_train, xg_pipe.predict(X_train))
xg_test_mse = mean_squared_error(y_test, xg_pipe.predict(X_test))
print('XGBoost Metrics:')
print(f'Training RMSE: {np.sqrt(xg_train_mse)}')
print(f'Test RMSE: {np.sqrt(xg_test_mse)}')

XGBoost Metrics:
Training RMSE: 3845982.7038474297
Test RMSE: 5302389.606622183


## Gradient Boosting

In [16]:
from sklearn.ensemble import GradientBoostingRegressor

In [17]:
gb_pipe = Pipeline(
    steps = [
        ('preprocessor', preprocessor),
        ('model', GradientBoostingRegressor(learning_rate = 0.075, n_estimators = 200, max_depth = 4))
    ]
)

In [18]:
gb_pipe.fit(X_train, y_train)
gb_train_mse = mean_squared_error(y_train, gb_pipe.predict(X_train))
gb_test_mse = mean_squared_error(y_test, gb_pipe.predict(X_test))
print('Gradient Boosting Metrics:')
print(f'Training RMSE: {np.sqrt(gb_train_mse)}')
print(f'Test RMSE: {np.sqrt(gb_test_mse)}')

Gradient Boosting Metrics:
Training RMSE: 3563290.8473157436
Test RMSE: 5213203.4697976615


## ADA Boost

In [19]:
from sklearn.ensemble import AdaBoostRegressor

In [21]:
ada_pipe = Pipeline(
    steps = [
        ('preprocessor', preprocessor),
        ('model', AdaBoostRegressor(n_estimators = 200, learning_rate = 0.01))
    ]
)

In [22]:
ada_pipe.fit(X_train, y_train)
ada_train_mse = mean_squared_error(y_train, ada_pipe.predict(X_train))
ada_test_mse = mean_squared_error(y_test, ada_pipe.predict(X_test))
print('ADABoost Metrics:')
print(f'Training RMSE: {np.sqrt(ada_train_mse)}')
print(f'Test RMSE: {np.sqrt(ada_test_mse)}')

ADABoost Metrics:
Training RMSE: 5627181.076502853
Test RMSE: 5837697.75206595


## KNN

In [23]:
from sklearn.neighbors import KNeighborsRegressor

In [24]:
knn_pipe = Pipeline(
    steps = [
        ('preprocessor', preprocessor),
        ('model', KNeighborsRegressor(n_neighbors = 5, weights = 'uniform'))
    ]
)

In [25]:
knn_pipe.fit(X_train, y_train)
knn_train_mse = mean_squared_error(y_train, knn_pipe.predict(X_train))
knn_test_mse = mean_squared_error(y_test, knn_pipe.predict(X_test))
print('K-Nearest Neighbors Metrics:')
print(f'Training RMSE: {np.sqrt(knn_train_mse)}')
print(f'Test RMSE: {np.sqrt(knn_test_mse)}')

K-Nearest Neighbors Metrics:
Training RMSE: 4378889.69760937
Test RMSE: 5573251.811634824


## ADA Boost PCA

In [28]:
from sklearn.decomposition import PCA

In [51]:
ada_pipe = Pipeline(
    steps = [
        ('preprocessor', preprocessor),
        ('pca', PCA(n_components = 37)),
        ('model', AdaBoostRegressor(n_estimators = 200, learning_rate = 0.01))
    ]
)

In [52]:
ada_pipe.fit(X_train, y_train)
ada_train_mse = mean_squared_error(y_train, ada_pipe.predict(X_train))
ada_test_mse = mean_squared_error(y_test, ada_pipe.predict(X_test))
print('ADABoost with PCA Metrics:')
print(f'Training RMSE: {np.sqrt(ada_train_mse)}')
print(f'Test RMSE: {np.sqrt(ada_test_mse)}')

ADABoost with PCA Metrics:
Training RMSE: 5539579.804091425
Test RMSE: 5876331.874261794


## KNN with PCA

In [53]:
knn_pipe = Pipeline(
    steps = [
        ('preprocessor', preprocessor),
        ('pca', PCA(n_components = 63)),
        ('model', KNeighborsRegressor(n_neighbors = 5, weights = 'uniform'))
    ]
)

In [54]:
knn_pipe.fit(X_train, y_train)
knn_train_mse = mean_squared_error(y_train, knn_pipe.predict(X_train))
knn_test_mse = mean_squared_error(y_test, knn_pipe.predict(X_test))
print('K-Nearest Neighbors with PCA Metrics:')
print(f'Training RMSE: {np.sqrt(knn_train_mse)}')
print(f'Test RMSE: {np.sqrt(knn_test_mse)}')

K-Nearest Neighbors with PCA Metrics:
Training RMSE: 4376254.285284927
Test RMSE: 5570590.870175524
