In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

## Read and Preprocess Data

In [2]:
baseball = pd.read_csv('data/baseball.csv')

baseball = baseball.drop(['Name', 'Age', 'Name-additional'], axis = 1)
baseball['Salary'] = baseball['Salary'].str.replace('$', '').astype(float)

baseball['C'] = baseball['Position'].apply(lambda x: 1 if 'C' in x else 0)
baseball['1B'] = baseball['Position'].apply(lambda x: 1 if '1B' in x else 0)
baseball['2B'] = baseball['Position'].apply(lambda x: 1 if '2B' in x else 0)
baseball['3B'] = baseball['Position'].apply(lambda x: 1 if '3B' in x else 0)
baseball['SS'] = baseball['Position'].apply(lambda x: 1 if 'SS' in x else 0)
baseball['OF'] = baseball['Position'].apply(lambda x: 1 if 'OF' in x else 0)

baseball['Num_Pos'] = baseball[['C', '1B', '2B', '3B', 'SS', 'OF']].sum(axis = 1)
baseball = baseball.drop(['Position'], axis = 1)

In [3]:
X = baseball.drop(['Salary'], axis = 1)
y = baseball['Salary']
#y = (baseball['Salary'] - np.mean(baseball['Salary'])) / np.std(baseball['Salary'])

cat_columns = ['Tm', 'Lg', 'Acquired', 'Bat']
num_columns = [col for col in X.columns if col not in cat_columns + ['C', '1B', '2B', '3B', 'SS', 'OF']]

In [23]:
num_columns = ['Def-Inn', 'PO', 'A', 'E', 'DP', 'Fld%', 'Rdrs', 'RAA', 'WAA', 'RAR',
               'WAR', 'PA', 'AB', 'R', 'H', 'HR', 'RBI', 'SB', 'CS', 'BB', 'SO', 'BA', 'OBP',
               'SLG', 'OPS', 'OPS+', 'TB', 'GDP', 'HBP', 'SH', 'SF', 'IBB', 'Num_Pos']
cat_columns = ['Tm', 'Acquired']
X = X[num_columns + cat_columns]

In [24]:
cat_transformer = Pipeline(
    steps = [
        ('onehot', OneHotEncoder(handle_unknown = 'ignore'))
    ]
)

num_transformer = Pipeline(
    steps = [
        ('scale', StandardScaler())
    ]
)

preprocessor = ColumnTransformer(
    transformers = [
        ('cont', num_transformer, num_columns),
        ('cat', cat_transformer, cat_columns)
    ], remainder = 'passthrough'
)

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .25, random_state = 621)

## Create and Fit Pipeline Random Forest

In [26]:
pipe = Pipeline(
    steps = [
        ('preprocessor', preprocessor),
        ('model', RandomForestRegressor(n_estimators = 150, min_samples_leaf = 10))
    ]
)

In [27]:
pipe.fit(X_train, y_train)

In [28]:
train_mse = mean_absolute_error(y_train, pipe.predict(X_train))
test_mse = mean_absolute_error(y_test, pipe.predict(X_test))

In [29]:
print(f'Train MAE: {np.sqrt(train_mse)}')
print(f'Test MAE: {np.sqrt(test_mse)}')
print(f'Mean of Y: {np.std(y)}')

Train MAE: 1668.9069644665165
Test MAE: 1928.3255068324122
Mean of Y: 6395365.8881033715


In [30]:
y_test['predicted'] = pipe.predict(X_test)

In [41]:
np.max(pipe.predict(X_test))

19422081.38522924

## Create and Fit Pipeline XGBoost

In [10]:
from xgboost import XGBRegressor

In [11]:
pipe = Pipeline(
    steps = [
        ('preprocessor', preprocessor),
        ('model', XGBRegressor())
    ]
)

In [12]:
pipe.fit(X_train, y_train)

In [13]:
train_mse = mean_absolute_error(y_train, pipe.predict(X_train))
test_mse = mean_absolute_error(y_test, pipe.predict(X_test))

In [14]:
print(f'Train MSE: {np.sqrt(train_mse)}')
print(f'Test MSE: {np.sqrt(test_mse)}')
print(f'Mean of Y: {np.std(y)}')

Train MSE: 840.417720844417
Test MSE: 1884.4475823796997
Mean of Y: 6395365.8881033715


## Create and Fit Linear Regression

In [15]:
from sklearn.linear_model import LinearRegression

In [16]:
pipe = Pipeline(
    steps = [
        ('preprocessor', preprocessor),
        ('model', LinearRegression())
    ]
)

In [17]:
pipe.fit(X_train, y_train)

In [18]:
train_mse = mean_absolute_error(y_train, pipe.predict(X_train))
test_mse = mean_absolute_error(y_test, pipe.predict(X_test))

In [19]:
print(f'Train MSE: {np.sqrt(train_mse)}')
print(f'Test MSE: {np.sqrt(test_mse)}')
print(f'Mean of Y: {np.std(y)}')

Train MSE: 1899.1216364379677
Test MSE: 1958.8997764365645
Mean of Y: 6395365.8881033715


## Create and Fit KNN

In [43]:
from sklearn.neighbors import KNeighborsRegressor

In [None]:
pipe = Pipeline(
    steps = [
        ('preprocessor', preprocessor),
        ('model', KNeighborsRegressor())
    ]
)

In [None]:
pipe.fit(X_train, y_train)

In [None]:
train_mse = mean_absolute_error(y_train, pipe.predict(X_train))
test_mse = mean_absolute_error(y_test, pipe.predict(X_test))

In [None]:
print(f'Train MSE: {np.sqrt(train_mse)}')
print(f'Test MSE: {np.sqrt(test_mse)}')
print(f'Mean of Y: {np.std(y)}')