# KBO Projections Modeling

In [1]:
# import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Train Test Split

Because the goal of the project is to project full-season performance from partial-season performance, I split the data on the `Season` column, withholding the partial 2020 season for testing. I then drop the `Season` column, since it serves only as a nominal label for defining the projections. I perform this split before fitting and applying the standard scaler so that no information from the withheld data leaks into the scaler. Next, I apply `train_test_split` to the pre-2020 data to build models that capture the relationships between the features and our three response variables. After that, I will examine the question of projecting from partial to full season.

In [2]:
# Load the modeling dataset from a CSV given by a relative path. Later cells
# expect it to contain a 'Season' column plus the response columns
# 'RBI_rate', 'HR_rate' and 'BA'.
df = pd.read_csv("KBO_Projections_Data_Modeling.csv")

In [3]:
# Split by season, then drop the season column from both parts.
# NOTE(review): the chained-assignment warning suppression is kept only in
# case later cells rely on it; the drops below return new frames instead of
# mutating a filtered slice in place, so this cell no longer needs it.
pd.set_option('mode.chained_assignment', None)

# Withheld partial 2020 season (final evaluation) vs. every other season
# (model building). `.drop(columns=...)` returns a new DataFrame each time.
season_2020 = df[df['Season'] == 2020].drop(columns=['Season'])
split = df[df['Season'] != 2020].drop(columns=['Season'])

In [4]:
# Separate the predictors (X) from the three response columns (y), first for
# the pre-2020 modeling frame and then for the withheld 2020 frame.
X = split.drop(columns=['RBI_rate', 'HR_rate', 'BA'])
y = split[['RBI_rate', 'HR_rate', 'BA']]

X_2020 = season_2020.drop(columns=['RBI_rate', 'HR_rate', 'BA'])
y_2020 = season_2020[['RBI_rate', 'HR_rate', 'BA']]

In [5]:
from sklearn.model_selection import train_test_split

# Hold out part of the pre-2020 rows for validation. test_size=0.25 is the
# library default written out explicitly; the fixed seed keeps the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=409,
)

In [6]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training rows only, then apply that
# same fitted scaler to the training, validation and 2020 feature sets.
scaler = StandardScaler().fit(X_train)

# NOTE: the three inputs are overwritten with their scaled versions, so this
# cell should run exactly once after the split cell; re-running it would
# scale data that has already been scaled.
X_train, X_test, X_2020 = (
    scaler.transform(features) for features in (X_train, X_test, X_2020)
)

## Modeling

This is a regression problem. I plan to try the following models out-of-the-box, and see how they perform. If one is well ahead in terms of performance, I will use that one and tune it. If more than one is very close in terms of performance, I will tune them a bit and see how much performance improvement we can get.

1. Linear Regression
2. Lasso
3. ElasticNet
4. RidgeRegression
5. SVR(kernel='linear')
6. SVR(kernel='rbf')

In terms of evaluation metrics, I will be using RMSE as my leading indicator.

In [7]:
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [21]:
# Collects results per model: the model cells store, under each model's key,
# a dict mapping response variable -> tuple returned by initial_scores.
scores_dict = dict()

In [22]:
def initial_scores(reg, y):
    '''Fit an already-instantiated regressor on one response variable and
    score it on both the training and the test split.

    Parameters
    ----------
    reg : estimator exposing ``fit`` and ``predict``; it is fitted in place
        on the module-level ``X_train`` / ``y_train``.
    y : name of the response column to select from ``y_train`` / ``y_test``.

    Returns
    -------
    tuple
        ``(RMSE_train, r2_train, MAE_train, RMSE_test, r2_test, MAE_test)``.
    '''
    def _metrics(features, y_true):
        # RMSE, r2 and MAE of the fitted regressor on one data split.
        y_pred = reg.predict(features)
        # RMSE is taken as the square root of the MSE instead of passing
        # squared=False, which newer scikit-learn releases no longer accept.
        rmse = np.sqrt(mean_squared_error(y_true, y_pred))
        return rmse, r2_score(y_true, y_pred), mean_absolute_error(y_true, y_pred)

    reg.fit(X_train, y_train[y])
    # Training metrics first, then test metrics, as one flat 6-tuple.
    return _metrics(X_train, y_train[y]) + _metrics(X_test, y_test[y])

In [10]:
def initial_train_scores(reg):
    '''Print training-set RMSE, r2 and MAE for each response variable.

    The regressor ``reg`` is refitted in place on the module-level
    ``X_train`` / ``y_train`` once per response column ('RBI_rate',
    'HR_rate', 'BA') and scored on that same training data. Each metric is
    rounded to three decimals. Nothing is returned.
    '''
    print("Training Set Scores: ")
    for target in ['RBI_rate', 'HR_rate', 'BA']:
        reg.fit(X_train, y_train[target])
        y_pred = reg.predict(X_train)
        # Square root of the MSE instead of squared=False, which newer
        # scikit-learn releases no longer accept.
        RMSE = np.sqrt(mean_squared_error(y_train[target], y_pred))
        r2 = r2_score(y_train[target], y_pred)
        MAE = mean_absolute_error(y_train[target], y_pred)
        print(f"{target}: RMSE: {round(RMSE, 3)} r2: {round(r2, 3)} MAE: {round(MAE, 3)}")

In [11]:
def initial_test_scores(reg):
    '''Print test-set RMSE, r2 and MAE for each response variable.

    The regressor ``reg`` is refitted in place on the module-level
    ``X_train`` / ``y_train`` once per response column ('RBI_rate',
    'HR_rate', 'BA') and scored on ``X_test`` / ``y_test``. Each metric is
    rounded to three decimals. Nothing is returned.
    '''
    print("Test Set Scores: ")
    for target in ['RBI_rate', 'HR_rate', 'BA']:
        reg.fit(X_train, y_train[target])
        y_pred = reg.predict(X_test)
        # Square root of the MSE instead of squared=False, which newer
        # scikit-learn releases no longer accept.
        RMSE = np.sqrt(mean_squared_error(y_test[target], y_pred))
        r2 = r2_score(y_test[target], y_pred)
        MAE = mean_absolute_error(y_test[target], y_pred)
        print(f"{target}: RMSE: {round(RMSE, 3)} r2: {round(r2, 3)} MAE: {round(MAE, 3)}")

### Linear Regression

In [26]:
from sklearn.linear_model import LinearRegression
reg = LinearRegression()

# Score the out-of-the-box linear regression on each response variable.
# A comprehension keeps the iteration variable local; a module-level
# `for y in ...` loop would rebind the global target frame `y` to a string.
scores_dict["reg"] = {
    target: initial_scores(reg, target)
    for target in ['RBI_rate', 'HR_rate', 'BA']
}

Training Set Scores: 
RBI_rate: RMSE: 0.056 r2: 0.532 MAE: 0.028
HR_rate: RMSE: 0.005 r2: 0.944 MAE: 0.002
BA: RMSE: 0.011 r2: 0.99 MAE: 0.005
Test Set Scores: 
RBI_rate: RMSE: 0.081 r2: 0.385 MAE: 0.031
HR_rate: RMSE: 0.007 r2: 0.95 MAE: 0.002
BA: RMSE: 0.014 r2: 0.987 MAE: 0.006


### Lasso

In [61]:
from sklearn.linear_model import Lasso
lasso = Lasso()

# Score the out-of-the-box Lasso on each response variable.
# A comprehension keeps the iteration variable local; a module-level
# `for y in ...` loop would rebind the global target frame `y` to a string.
scores_dict["lasso"] = {
    target: initial_scores(lasso, target)
    for target in ['RBI_rate', 'HR_rate', 'BA']
}

Training Set Scores: 
RBI_rate: RMSE: 0.082 r2: 0.0 MAE: 0.051
HR_rate: RMSE: 0.019 r2: 0.0 MAE: 0.014
BA: RMSE: 0.112 r2: 0.0 MAE: 0.078
Test Set Scores: 
RBI_rate: RMSE: 0.104 r2: -0.002 MAE: 0.053
HR_rate: RMSE: 0.029 r2: -0.001 MAE: 0.015
BA: RMSE: 0.118 r2: -0.002 MAE: 0.078


### ElasticNet

In [62]:
from sklearn.linear_model import ElasticNet
enet = ElasticNet()

# Score the out-of-the-box ElasticNet on each response variable.
# A comprehension keeps the iteration variable local; a module-level
# `for y in ...` loop would rebind the global target frame `y` to a string.
scores_dict["enet"] = {
    target: initial_scores(enet, target)
    for target in ['RBI_rate', 'HR_rate', 'BA']
}

Training Set Scores: 
RBI_rate: RMSE: 0.082 r2: 0.0 MAE: 0.051
HR_rate: RMSE: 0.019 r2: 0.0 MAE: 0.014
BA: RMSE: 0.112 r2: 0.0 MAE: 0.078
Test Set Scores: 
RBI_rate: RMSE: 0.104 r2: -0.002 MAE: 0.053
HR_rate: RMSE: 0.029 r2: -0.001 MAE: 0.015
BA: RMSE: 0.118 r2: -0.002 MAE: 0.078


### RidgeRegression

In [63]:
from sklearn.linear_model import Ridge
ridge = Ridge()

# Score the out-of-the-box Ridge regression on each response variable.
# A comprehension keeps the iteration variable local; a module-level
# `for y in ...` loop would rebind the global target frame `y` to a string.
scores_dict["ridge"] = {
    target: initial_scores(ridge, target)
    for target in ['RBI_rate', 'HR_rate', 'BA']
}

Training Set Scores: 
RBI_rate: RMSE: 0.056 r2: 0.532 MAE: 0.028
HR_rate: RMSE: 0.005 r2: 0.944 MAE: 0.002
BA: RMSE: 0.011 r2: 0.99 MAE: 0.005
Test Set Scores: 
RBI_rate: RMSE: 0.081 r2: 0.385 MAE: 0.031
HR_rate: RMSE: 0.007 r2: 0.949 MAE: 0.002
BA: RMSE: 0.014 r2: 0.987 MAE: 0.006


### SVR(kernel='linear')

In [64]:
from sklearn.svm import SVR
svr_lin = SVR(kernel='linear')

# Score the out-of-the-box linear-kernel SVR on each response variable.
# A comprehension keeps the iteration variable local; a module-level
# `for y in ...` loop would rebind the global target frame `y` to a string.
scores_dict["svr_lin"] = {
    target: initial_scores(svr_lin, target)
    for target in ['RBI_rate', 'HR_rate', 'BA']
}

Training Set Scores: 
RBI_rate: RMSE: 0.07 r2: 0.288 MAE: 0.045
HR_rate: RMSE: 0.084 r2: -18.151 MAE: 0.083
BA: RMSE: 0.063 r2: 0.678 MAE: 0.06
Test Set Scores: 
RBI_rate: RMSE: 0.097 r2: 0.122 MAE: 0.046
HR_rate: RMSE: 0.084 r2: -7.295 MAE: 0.083
BA: RMSE: 0.064 r2: 0.708 MAE: 0.06


### SVR(kernel='rbf')

In [65]:
svr_rbf = SVR(kernel='rbf')

# Score the out-of-the-box RBF-kernel SVR on each response variable.
# A comprehension keeps the iteration variable local; a module-level
# `for y in ...` loop would rebind the global target frame `y` to a string.
scores_dict["svr_rbf"] = {
    target: initial_scores(svr_rbf, target)
    for target in ['RBI_rate', 'HR_rate', 'BA']
}

Training Set Scores: 
RBI_rate: RMSE: 0.063 r2: 0.425 MAE: 0.046
HR_rate: RMSE: 0.086 r2: -19.195 MAE: 0.085
BA: RMSE: 0.039 r2: 0.877 MAE: 0.029
Test Set Scores: 
RBI_rate: RMSE: 0.099 r2: 0.088 MAE: 0.049
HR_rate: RMSE: 0.088 r2: -8.095 MAE: 0.085
BA: RMSE: 0.054 r2: 0.793 MAE: 0.031


In [66]:
# Display names for the six fitted models, in the order they were trained.
# The first model is sklearn's LinearRegression, so it is labelled as linear
# (not logistic) regression.
myLabels = ['Linear Regression', 'Lasso', 'ElasticNet',
            'RidgeRegression', 'SVR(kernel-linear)', 'SVR(kernel-rbf)']