In [17]:
import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import LeaveOneOut
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import RandomizedSearchCV, cross_val_score, KFold
from sklearn.preprocessing import MinMaxScaler
from sklearn import model_selection
from sklearn import metrics
from skopt import dump, load
from scipy.stats import reciprocal
import skopt
import numpy as np
import pandas as pd
import keras_tuner as kt
from sklearn import metrics
import datetime
import re
import os
import seaborn as sns
import matplotlib.pyplot as plt

# Load Data from Sources

In [3]:
df = pd.read_excel('mlb_team_batting_wp.xlsx')

In [4]:
df.columns

Index(['Season', 'Team', 'AB', 'PA', 'H', '1B', '2B', '3B', 'HR', 'R', 'RBI',
       'BB', 'IBB', 'SO', 'HBP', 'SF', 'SH', 'SB', 'CS', 'AVG', 'W', 'L',
       'win_per'],
      dtype='object')

In [5]:
df.shape

(1468, 23)

# Data Pre-Processing

The target for this model will be predicting winning percentage using offensive statistics provided by pybaseball. The following pre-processings must be done:
- Remove wins and loss column, this is part of the prediction
- Remove the average column, batting average is an aggregate statistic and I want to focus on continous values
- Removing hits (H) as it is the sum of the different types of hits
- Remove runs category, I want to focus on other statistics as Runs is known to be an indicator of wins
- Remove RBI for the same reason

In [6]:
mlb_df = df.drop(labels=['Season', 'Team', 'H', 'R', 'RBI', 'AVG', 'W', 'L', 'AB'], axis=1)
mlb_df.columns

Index(['PA', '1B', '2B', '3B', 'HR', 'BB', 'IBB', 'SO', 'HBP', 'SF', 'SH',
       'SB', 'CS', 'win_per'],
      dtype='object')

In [7]:
mlb_df.shape

(1468, 14)

### Normalizing Input Vector

Since the values range widely, apply a scaler function to the input vector. Output vector can remain the same as it will always be a value between 0 and 1

In [8]:
x = mlb_df[['PA', '1B', '2B', '3B', 'HR', 'BB', 'IBB', 'SO', 'HBP', 'SF', 'SH', 'SB', 'CS']].to_numpy()
y = mlb_df[['win_per']].to_numpy()

In [9]:
x_scaler = MinMaxScaler().fit(x)
x_scaled = x_scaler.transform(x)

In [10]:
# Split last 25% of data for test data
split_val = int(x.shape[0] * .75)
x_train_full, x_test = x_scaled[:split_val], x_scaled[split_val:]
y_train_full, y_test = y[:split_val], y[split_val:]

# Simple Model Build

In [19]:
def build_simple_model():
    model = keras.models.Sequential()
    
    model.add(keras.layers.InputLayer(input_shape=(13,), name='input'))
    
    model.add(keras.layers.Dense(units=8, activation='relu', name='hidden'))
    
    model.add(keras.layers.Dense(units=1, name='output'))
    
    model.compile(
        loss='mean_absolute_error',
        optimizer='sgd',
        metrics=['mean_squared_error', 'mean_absolute_error']
    )
    
    return model

In [20]:
sm_model = build_simple_model()
sm_model.fit(x_train_full, y_train_full, epochs=30, verbose=0)

<tensorflow.python.keras.callbacks.History at 0x7fe11f7dfe10>

In [21]:
predicted = sm_model.predict(x_test)
mean_absolute_error(predicted, y_test)

0.06247961965178924

# Impact of Initialization

Look for the differences in the simple model using different starting values for synaptic weights.

# Impact of Learning Algorithm

Impact of learning algorithm on training time, MSE, and complexity.

# Hyper-Parameter Tuning using SKOPT

Hyper-Parameter Tuning for the following values:
- Number of layers
- Number of neurons in each layer
- Learning algorithm (?)
- Learning rate
- Dropout rate
- L1, L2, L1_L2 regularization

# Model Performance & Evaluation

Evaluate the $R^2$, MSE, MAE, and other metrics for the final tuned model