In [51]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

## Read and Preprocess Data

In [52]:
# Load the raw player table (path is relative to the notebook's working directory).
baseball = pd.read_csv('data/baseball.csv')

# Remove the columns that are not carried forward into the feature matrix.
baseball = baseball.drop(columns = ['Name', 'Age', 'Name-additional'])

# Strip the literal dollar sign, then cast to float. regex = False is explicit
# because '$' is also the regex end-of-string anchor and the default value of
# `regex` differs between pandas versions.
baseball['Salary'] = baseball['Salary'].str.replace('$', '', regex = False).astype(float)

# One 0/1 indicator column per position code. A code counts as present when it
# occurs anywhere inside the Position string (plain substring test).
# NOTE(review): a substring test would also set 'C' for a value such as 'CF' --
# confirm the Position column never contains codes like that.
position_codes = ['C', '1B', '2B', '3B', 'SS', 'OF']
for code in position_codes:
    baseball[code] = baseball['Position'].apply(lambda pos: int(code in pos))

# Count the indicator columns that are set for each row, then drop the raw string column.
baseball['Num_Pos'] = baseball[position_codes].sum(axis = 1)
baseball = baseball.drop(columns = ['Position'])

In [53]:
# Target: salary as a raw float (not standardised). Features: every other column.
y = baseball['Salary']
X = baseball.drop(columns = ['Salary'])

# Columns listed as categorical, and the six position indicator columns, are
# excluded from the numeric list; every other column of X is treated as numeric.
cat_columns = ['Tm', 'Lg', 'Acquired', 'Bat']
position_flags = ['C', '1B', '2B', '3B', 'SS', 'OF']
non_numeric = set(cat_columns) | set(position_flags)
num_columns = [col for col in X.columns if col not in non_numeric]

In [29]:
# NOTE(review): this cell's recorded execution count (29) is lower than that of
# every other cell in the notebook (51-74), so it may not have been part of the
# run that produced the saved outputs. On a top-to-bottom run it replaces the
# column lists defined in the previous cell and narrows X to the columns named
# here -- confirm that this reduced feature set is intended.
num_columns = [
    'Def-Inn', 'PO', 'A', 'E', 'DP', 'Fld%', 'Rdrs', 'RAA', 'WAA', 'RAR',
    'WAR', 'PA', 'AB', 'R', 'H', 'HR', 'RBI', 'SB', 'CS', 'BB', 'SO', 'BA', 'OBP',
    'SLG', 'OPS', 'OPS+', 'TB', 'GDP', 'HBP', 'SH', 'SF', 'IBB', 'Num_Pos',
]
cat_columns = ['Tm', 'Acquired']

# Keep only the selected columns, numeric first and categorical last.
selected_columns = num_columns + cat_columns
X = X.loc[:, selected_columns]

In [54]:
# Categorical branch: one-hot encode; handle_unknown = 'ignore' means categories
# not seen during fit do not raise at transform time.
cat_transformer = Pipeline([('onehot', OneHotEncoder(handle_unknown = 'ignore'))])

# Numeric branch: standardise with StandardScaler.
num_transformer = Pipeline([('scale', StandardScaler())])

# Route each column list to its branch; columns in neither list are passed
# through unchanged (remainder = 'passthrough').
column_branches = [
    ('cont', num_transformer, num_columns),
    ('cat', cat_transformer, cat_columns),
]
preprocessor = ColumnTransformer(column_branches, remainder = 'passthrough')

In [55]:
# Hold out 25% of the rows as a test set; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.25,
    random_state = 621,
)

## Create and Fit Random Forest Pipeline

In [56]:
# Random forest on top of the shared preprocessing step.
# random_state is fixed so that refitting reproduces the same forest, and
# therefore the same reported errors; without it each fit draws different
# bootstrap samples and feature subsets.
pipe = Pipeline(
    steps = [
        ('preprocessor', preprocessor),
        ('model', RandomForestRegressor(n_estimators = 150, min_samples_leaf = 10, random_state = 621))
    ]
)

In [57]:
# Fit the preprocessing step and the random forest on the training split only.
pipe.fit(X_train, y_train)

In [58]:
# Mean absolute error on each split.
# NOTE: despite the `_mse` suffix, both variables hold MAE values.
train_pred = pipe.predict(X_train)
test_pred = pipe.predict(X_test)
train_mse = mean_absolute_error(y_train, train_pred)
test_mse = mean_absolute_error(y_test, test_pred)

In [59]:
# Report both errors next to the spread of the target for a sense of scale.
print(f'Train MAE: {train_mse}')
print(f'Test MAE: {test_mse}')
# np.std(y) is the standard deviation of the target, so the label says so.
print(f'Std of Y: {np.std(y)}')

Train MAE: 2699791.0581118637
Test MAE: 3636496.7966094958
Mean of Y: 6395365.8881033715


## Create and Fit XGBoost Pipeline

In [60]:
from xgboost import XGBRegressor

In [61]:
# XGBoost regressor with its default hyperparameters behind the same preprocessing step.
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('model', XGBRegressor()),
])

In [62]:
# Fit the preprocessing step and the XGBoost regressor on the training split only.
pipe.fit(X_train, y_train)

In [63]:
# Mean absolute error on each split.
# NOTE: despite the `_mse` suffix, both variables hold MAE values.
train_pred = pipe.predict(X_train)
test_pred = pipe.predict(X_test)
train_mse = mean_absolute_error(y_train, train_pred)
test_mse = mean_absolute_error(y_test, test_pred)

In [64]:
# The values come from mean_absolute_error, so they are labelled MAE (not MSE).
print(f'Train MAE: {train_mse}')
print(f'Test MAE: {test_mse}')
# np.std(y) is the standard deviation of the target, so the label says so.
print(f'Std of Y: {np.std(y)}')

Train MSE: 706301.9455093243
Test MSE: 3551142.690736695
Mean of Y: 6395365.8881033715


## Create and Fit Linear Regression

In [65]:
from sklearn.linear_model import LinearRegression

In [66]:
# Ordinary least-squares baseline behind the same preprocessing step.
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('model', LinearRegression()),
])

In [67]:
# Fit the preprocessing step and the linear regression on the training split only.
pipe.fit(X_train, y_train)

In [68]:
# Mean absolute error on each split.
# NOTE: despite the `_mse` suffix, both variables hold MAE values.
train_pred = pipe.predict(X_train)
test_pred = pipe.predict(X_test)
train_mse = mean_absolute_error(y_train, train_pred)
test_mse = mean_absolute_error(y_test, test_pred)

In [69]:
# The values come from mean_absolute_error, so they are labelled MAE (not MSE).
print(f'Train MAE: {train_mse}')
print(f'Test MAE: {test_mse}')
# np.std(y) is the standard deviation of the target, so the label says so.
print(f'Std of Y: {np.std(y)}')

Train MSE: 3606662.989986825
Test MSE: 3837288.3341232226
Mean of Y: 6395365.8881033715


## Create and Fit KNN

In [70]:
from sklearn.neighbors import KNeighborsRegressor

In [71]:
# k-nearest-neighbours regressor with its default settings behind the same preprocessing step.
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('model', KNeighborsRegressor()),
])

In [72]:
# Fit the preprocessing step and the KNN regressor on the training split only.
pipe.fit(X_train, y_train)

In [73]:
# Mean absolute error on each split.
# NOTE: despite the `_mse` suffix, both variables hold MAE values.
train_pred = pipe.predict(X_train)
test_pred = pipe.predict(X_test)
train_mse = mean_absolute_error(y_train, train_pred)
test_mse = mean_absolute_error(y_test, test_pred)

In [74]:
# The values come from mean_absolute_error, so they are labelled MAE (not MSE).
print(f'Train MAE: {train_mse}')
print(f'Test MAE: {test_mse}')
# np.std(y) is the standard deviation of the target, so the label says so.
print(f'Std of Y: {np.std(y)}')

Train MSE: 2923013.0613438734
Test MSE: 3831372.763665086
Mean of Y: 6395365.8881033715
