# KBO Projections Modeling

In [1]:
# import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Train Test Split

Because the goal of the project is to project full season performance from partial season performance, I will be splitting based on the season column, withholding the partial 2020 season for testing. I will then drop the season column, since it serves more as a nominal column for the sake of the specified projections. I am performing this split before fitting and applying the standard scaler to the rest of the data to avoid data leakage. Then I will apply train_test_split to the pre-2020 data to build models that define the feature relationships for our three response variables. After that, I will examine the question of projecting from partial to full season.

In [2]:
# Load the modeling dataset; the relative path is resolved against the
# notebook's working directory.
df = pd.read_csv(filepath_or_buffer="KBO_Projections_Data_Modeling.csv")

In [3]:
# Split by season: rows from 2020 are withheld, every other season is kept
# for model building. 'Season' is only needed to make this split, so it is
# dropped from both frames.
#
# The drops return new frames instead of mutating a filtered slice in place,
# so this cell no longer triggers pandas' chained-assignment warning itself.
# The option below is kept only so later cells run under the same pandas
# settings as before.
pd.set_option('mode.chained_assignment', None)
season_2020 = df[df['Season'] == 2020].drop(columns=['Season'])
split = df[df['Season'] != 2020].drop(columns=['Season'])

In [4]:
# Responses (y): the three columns being modelled.
# Features (X): every remaining column of the same frame.
y = split[['RBI_rate', 'HR_rate', 'BA']]
y_2020 = season_2020[['RBI_rate', 'HR_rate', 'BA']]
X = split.drop(columns=y.columns)
X_2020 = season_2020.drop(columns=y_2020.columns)

In [5]:
from sklearn.model_selection import train_test_split

# Random train/test partition of the non-2020 rows; the fixed seed keeps the
# partition reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=409,
)

In [6]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features only, then apply that same fitted
# transform to the training, test and 2020 feature sets.
scaler = StandardScaler().fit(X_train)
X_train, X_test, X_2020 = (
    scaler.transform(features) for features in (X_train, X_test, X_2020)
)

## Modeling

This is a regression problem. I plan to try the following models out-of-the-box, and see how they perform. If one is well ahead in terms of performance, I will use that one and tune it. If more than one is very close in terms of performance, I will tune them a bit and see how much performance improvement we can get.

1. Linear Regression
2. Lasso
3. ElasticNet
4. RidgeRegression
5. SVR(kernel='linear')
6. SVR(kernel='rbf')

In terms of evaluation metrics, I will be using RMSE as my leading indicator.

In [7]:
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [8]:
# Collects results as scores_dict[model_name][response] = tuple of metrics.
scores_dict = dict()

In [9]:
def initial_scores(reg, y):
    '''Fit ``reg`` for one response variable and score it on both splits.

    The regressor is fitted in place on the notebook-level ``X_train`` and
    ``y_train[y]``, then evaluated on the training data and on the
    ``X_test`` / ``y_test[y]`` hold-out split.

    Parameters
    ----------
    reg : estimator
        An already-instantiated regressor exposing ``fit`` and ``predict``.
    y : column label
        Selects the response column from ``y_train`` and ``y_test``
        (the notebook passes 'RBI_rate', 'HR_rate' or 'BA').

    Returns
    -------
    tuple
        ``(RMSE_train, r2_train, MAE_train, RMSE_test, r2_test, MAE_test)``.
    '''
    reg.fit(X_train, y_train[y])

    def _metrics(features, actual):
        # One place to compute the metric triple for a given split.
        predicted = reg.predict(features)
        # RMSE is computed as sqrt(MSE) instead of passing ``squared=False``:
        # that keyword was deprecated and later removed from
        # mean_squared_error, so this form works across scikit-learn versions.
        rmse = np.sqrt(mean_squared_error(actual, predicted))
        return rmse, r2_score(actual, predicted), mean_absolute_error(actual, predicted)

    # Training metrics first, then test metrics, concatenated into one tuple.
    return _metrics(X_train, y_train[y]) + _metrics(X_test, y_test[y])

In [10]:
def initial_train_scores(reg):
    '''Print training-set RMSE, r2 and MAE of ``reg`` for each response.

    For each of 'RBI_rate', 'HR_rate' and 'BA', the regressor is refitted in
    place on the notebook-level ``X_train`` / ``y_train`` and scored on that
    same training data. Nothing is returned; results are printed.

    Parameters
    ----------
    reg : estimator
        An already-instantiated regressor exposing ``fit`` and ``predict``.
    '''
    print("Training Set Scores: ")
    for target in ['RBI_rate', 'HR_rate', 'BA']:
        reg.fit(X_train, y_train[target])
        predicted = reg.predict(X_train)
        # sqrt(MSE) replaces the ``squared=False`` keyword, which was
        # deprecated and later removed from mean_squared_error.
        rmse = np.sqrt(mean_squared_error(y_train[target], predicted))
        r2 = r2_score(y_train[target], predicted)
        mae = mean_absolute_error(y_train[target], predicted)
        print(f"{target}: RMSE: {round(rmse, 3)} r2: {round(r2, 3)} MAE: {round(mae, 3)}")

In [11]:
def initial_test_scores(reg):
    '''Print test-set RMSE, r2 and MAE of ``reg`` for each response.

    For each of 'RBI_rate', 'HR_rate' and 'BA', the regressor is refitted in
    place on the notebook-level ``X_train`` / ``y_train`` and scored on the
    ``X_test`` / ``y_test`` hold-out split. Nothing is returned; results are
    printed.

    Parameters
    ----------
    reg : estimator
        An already-instantiated regressor exposing ``fit`` and ``predict``.
    '''
    print("Test Set Scores: ")
    for target in ['RBI_rate', 'HR_rate', 'BA']:
        reg.fit(X_train, y_train[target])
        predicted = reg.predict(X_test)
        # sqrt(MSE) replaces the ``squared=False`` keyword, which was
        # deprecated and later removed from mean_squared_error.
        rmse = np.sqrt(mean_squared_error(y_test[target], predicted))
        r2 = r2_score(y_test[target], predicted)
        mae = mean_absolute_error(y_test[target], predicted)
        print(f"{target}: RMSE: {round(rmse, 3)} r2: {round(r2, 3)} MAE: {round(mae, 3)}")

### Linear Regression

In [12]:
from sklearn.linear_model import LinearRegression
reg = LinearRegression()

# Score the out-of-the-box model on each response variable.
# A comprehension is used instead of a top-level `for y in ...` loop so the
# loop variable stays local and does not rebind the notebook-level `y`
# (the response DataFrame) to a column-name string.
scores_dict["reg"] = {
    target: initial_scores(reg, target)
    for target in ['RBI_rate', 'HR_rate', 'BA']
}

### Lasso

In [13]:
from sklearn.linear_model import Lasso
lasso = Lasso()

# Score the out-of-the-box model on each response variable.
# A comprehension is used instead of a top-level `for y in ...` loop so the
# loop variable stays local and does not rebind the notebook-level `y`
# (the response DataFrame) to a column-name string.
scores_dict["lasso"] = {
    target: initial_scores(lasso, target)
    for target in ['RBI_rate', 'HR_rate', 'BA']
}

### ElasticNet

In [14]:
from sklearn.linear_model import ElasticNet
enet = ElasticNet()

# Score the out-of-the-box model on each response variable.
# A comprehension is used instead of a top-level `for y in ...` loop so the
# loop variable stays local and does not rebind the notebook-level `y`
# (the response DataFrame) to a column-name string.
scores_dict["enet"] = {
    target: initial_scores(enet, target)
    for target in ['RBI_rate', 'HR_rate', 'BA']
}

### RidgeRegression

In [15]:
from sklearn.linear_model import Ridge
ridge = Ridge()

# Score the out-of-the-box model on each response variable.
# A comprehension is used instead of a top-level `for y in ...` loop so the
# loop variable stays local and does not rebind the notebook-level `y`
# (the response DataFrame) to a column-name string.
scores_dict["ridge"] = {
    target: initial_scores(ridge, target)
    for target in ['RBI_rate', 'HR_rate', 'BA']
}

### SVR(kernel='linear')

In [16]:
from sklearn.svm import SVR
svr_lin = SVR(kernel='linear')

# Score the out-of-the-box model on each response variable.
# A comprehension is used instead of a top-level `for y in ...` loop so the
# loop variable stays local and does not rebind the notebook-level `y`
# (the response DataFrame) to a column-name string.
scores_dict["svr_lin"] = {
    target: initial_scores(svr_lin, target)
    for target in ['RBI_rate', 'HR_rate', 'BA']
}

### SVR(kernel='rbf')

In [17]:
svr_rbf = SVR(kernel='rbf')

# Score the out-of-the-box model on each response variable.
# A comprehension is used instead of a top-level `for y in ...` loop so the
# loop variable stays local and does not rebind the notebook-level `y`
# (the response DataFrame) to a column-name string.
scores_dict["svr_rbf"] = {
    target: initial_scores(svr_rbf, target)
    for target in ['RBI_rate', 'HR_rate', 'BA']
}

In [22]:

# Flatten the nested {model: {response: metrics}} mapping into one table with
# a row per (model, response) pair; each metrics tuple becomes that row's values.
pd.DataFrame.from_dict(
    {
        (model_name, response): metrics
        for model_name, per_response in scores_dict.items()
        for response, metrics in per_response.items()
    },
    orient='index',
)

Unnamed: 0,0,1,2,3,4,5
"(reg, RBI_rate)",0.056393,0.532466,0.02806,0.081449,0.384631,0.03124
"(reg, HR_rate)",0.004552,0.943516,0.002232,0.00655,0.949742,0.002413
"(reg, BA)",0.011277,0.989798,0.005163,0.013601,0.986775,0.005579
"(lasso, RBI_rate)",0.082475,0.0,0.051031,0.103927,-0.001906,0.05251
"(lasso, HR_rate)",0.019151,0.0,0.014098,0.029228,-0.000757,0.014577
"(lasso, BA)",0.111648,0.0,0.07805,0.1184,-0.002186,0.077888
"(enet, RBI_rate)",0.082475,0.0,0.051031,0.103927,-0.001906,0.05251
"(enet, HR_rate)",0.019151,0.0,0.014098,0.029228,-0.000757,0.014577
"(enet, BA)",0.111648,0.0,0.07805,0.1184,-0.002186,0.077888
"(ridge, RBI_rate)",0.056393,0.532465,0.028062,0.081437,0.384798,0.03124


In [66]:
# Display names for the six models, in the order they were fitted above.
# The first entry is 'Linear Regression': the model fitted is sklearn's
# LinearRegression, not a logistic (classification) model.
myLabels = ['Linear Regression', 'Lasso', 'ElasticNet',
            'RidgeRegression', 'SVR(kernel-linear)', 'SVR(kernel-rbf)']