In [1]:
# Import our dependencies
import sklearn as skl
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder,LabelBinarizer
from sklearn.metrics import accuracy_score
import pandas as pd
import tensorflow as tf

2023-09-14 20:09:42.909487: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [71]:
# PREDICTION INPUT DATA PREP

# Read in input data
input_df = pd.read_csv('STARTERS_input_data.csv')

# Take out columns that we'll need in order to read our predictions
id_info = input_df[['pitcher','code']]

# Drop columns that will not be included in training
input_df = input_df.drop(columns = ['Date','pitcher','code'])

# Dropping this column separately just b/c I maybe want to come back to considering NOT dropping it
# Yeah, the opponent performance is still here, but sometimes it's just the TEAM itself that seems
# to make a difference for some teams/pitchers (i.e., Devers vs. NYY)
input_df = input_df.drop(columns = 'Opp')

# Separate target as y and features as X
Xp = input_df

# Get list of non-numerical columns
Xp_cat = Xp.dtypes[Xp.dtypes == 'object'].index.tolist()

# Get dummies for those columns
one_hot = pd.get_dummies(Xp[Xp_cat])

# Drop the original non-numerical columns
Xp = Xp.drop(columns = Xp_cat)

# Put in ALL the possible buckets
Xp[['bucket_1_-10<.<=0',
       'bucket_1_.<=-10', 'bucket_1_0<.<=10', 'bucket_1_10<.<=20',
       'bucket_1_20<.<=30', 'bucket_1_30<.', 'bucket_5_-10<.<=0',
       'bucket_5_.<=-10', 'bucket_5_0<.<=10', 'bucket_5_10<.<=20',
       'bucket_5_20<.<=30', 'bucket_5_30<.', 'bucket_all_-10<.<=0',
       'bucket_all_.<=-10', 'bucket_all_0<.<=10', 'bucket_all_10<.<=20',
       'bucket_all_20<.<=30', 'bucket_all_30<.']] = 0

# update the bucket columns with the numerical dummies for those non-numerical columns
Xp.update(one_hot)

# This is officially the data we're going to put into our predictor
Xp.head(2)


Unnamed: 0,Dec_1,DR_1,H_1,ER_1,BB_1,SO_1,HR_1,HBP_1,FIP_1,Outs_1,...,bucket_5_0<.<=10,bucket_5_10<.<=20,bucket_5_20<.<=30,bucket_5_30<.,bucket_all_-10<.<=0,bucket_all_.<=-10,bucket_all_0<.<=10,bucket_all_10<.<=20,bucket_all_20<.<=30,bucket_all_30<.
0,-1.0,5.0,4.0,4.0,4.0,2.0,0.0,0.0,4.93,6.0,...,0,0,0,0,0,0,1,0,0,0
1,0.0,6.0,5.0,1.0,3.0,8.0,1.0,0.0,3.66,18.0,...,0,1,0,0,0,0,0,1,0,0


In [3]:
## TESTING DATA PREP

# Read in training data
training_df = pd.read_csv('training_data.csv')

# Just have to do this once b/c I forgot to put this in the first time
training_df = training_df.drop(columns = 'Unnamed: 0')

# Drop columns that will not be included in training
training_df = training_df.drop(columns = ['Date','pitcher','code'])

# Dropping this column separately just b/c I maybe want to come back to considering NOT dropping it
# Yeah, the opponent performance is still here, but sometimes it's just the TEAM itself that seems
# to make a difference for some teams/pitchers (i.e., Devers vs. NYY)
training_df = training_df.drop(columns = 'Opp')


# Separate target as y and features as X
y = training_df['bucket']
X = training_df.drop(columns = 'bucket')

# Get list of non-numerical columns
X_cat = X.dtypes[X.dtypes == 'object'].index.tolist()

# Get dummies for those columns
one_hot = pd.get_dummies(X[X_cat])

# Drop the original non-numerical columns
X = X.drop(columns = X_cat)

# join in the numerical dummies for those non-numerical columns
X = X.join(one_hot)

X.head(2)

Unnamed: 0,Dec_1,DR_1,H_1,ER_1,BB_1,SO_1,HR_1,HBP_1,FIP_1,Outs_1,...,bucket_5_0<.<=10,bucket_5_10<.<=20,bucket_5_20<.<=30,bucket_5_30<.,bucket_all_-10<.<=0,bucket_all_.<=-10,bucket_all_0<.<=10,bucket_all_10<.<=20,bucket_all_20<.<=30,bucket_all_30<.
0,0.0,5.0,10.0,6.0,2.0,3.0,3.0,0.0,5.63,15.0,...,0,0,0,0,1,0,0,0,0,0
1,1.0,1.0,1.0,0.0,1.0,4.0,0.0,0.0,1.83,5.0,...,1,0,0,0,0,0,1,0,0,0


In [34]:
# This is us converting one vector of 6 distinct classes into basically get-dummies
from tensorflow.keras.utils import to_categorical

outcomes = ['.<=-10','-10<.<=0','0<.<=10','10<.<=20','20<.<=30','30<.']

y_factors = to_categorical(pd.Categorical(y,categories = outcomes).codes,6)


# Split training/test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y_factors, random_state=42)

## Preprocess numerical data for neural network
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)


In [45]:
# Define the deep learning model 
nn_model = tf.keras.models.Sequential()
nn_model.add(tf.keras.layers.Dense(units=22, activation="relu", input_dim = X_train.shape[1]))
nn_model.add(tf.keras.layers.Dense(units=18, activation="tanh"))
nn_model.add(tf.keras.layers.Dense(units=14, activation="relu"))
nn_model.add(tf.keras.layers.Dense(units=10, activation="tanh"))
nn_model.add(tf.keras.layers.Dense(units=6, activation="softmax"))


# Compile the Sequential model together and customize metrics
nn_model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

# Train the model
fit_model = nn_model.fit(X_train_scaled, y_train, epochs=50)

# Evaluate the model using the test data
model_loss, model_accuracy = nn_model.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
58/58 - 0s - loss: 1.3420 - accuracy: 0.4541 - 256ms/epoch - 4ms/step
Loss: 1.342038869857788, Accuracy: 0.454054057598114


In [67]:
predicted_perf = nn_model.predict(Xp)
predicted_perf = pd.DataFrame(predicted_perf)
predicted_perf.columns = outcomes
predicted_perf




Unnamed: 0,.<=-10,-10<.<=0,0<.<=10,10<.<=20,20<.<=30,30<.
0,0.067397,0.011343,0.762461,0.082313,0.07647,1.7e-05
1,0.017994,0.19632,0.452361,0.299309,0.033976,4e-05
2,0.016433,0.441355,0.487411,0.054218,0.000496,8.7e-05
3,0.011617,0.17244,0.447317,0.308852,0.059725,4.9e-05
4,0.004951,0.029847,0.892041,0.068495,0.004662,3e-06
5,0.044725,0.067399,0.793044,0.078402,0.016365,6.4e-05
6,0.041856,0.819933,0.10149,0.030972,0.001055,0.004694
7,0.006956,0.14483,0.693658,0.143972,0.010562,2.2e-05
8,0.140477,0.231873,0.228246,0.348826,0.050215,0.000363
9,0.024714,0.17826,0.352399,0.388218,0.056368,4.2e-05


In [72]:
predicted_perf = predicted_perf.round(1)

predicted_perf

output = id_info.join(predicted_perf)
output


Unnamed: 0,pitcher,code,.<=-10,-10<.<=0,0<.<=10,10<.<=20,20<.<=30,30<.
0,Josiah Gray,grayjo03,0.1,0.0,0.8,0.1,0.1,0.0
1,Mitch Keller,kellemi03,0.0,0.2,0.4,0.3,0.0,0.0
2,Derek Law,lawde01,0.0,0.4,0.5,0.0,0.0,0.0
3,Reese Olson,olsonre01,0.0,0.2,0.4,0.3,0.1,0.0
4,Michael King,kingmi01,0.0,0.0,0.9,0.1,0.0,0.0
5,Clarke Schmidt,schmicl01,0.0,0.1,0.8,0.1,0.0,0.0
6,Tanner Houck,houckta01,0.0,0.8,0.1,0.0,0.0,0.0
7,Eury Pérez,perezeu02,0.0,0.1,0.7,0.1,0.0,0.0
8,Adrian Houser,housead01,0.1,0.2,0.2,0.4,0.0,0.0
9,Merrill Kelly,kellyme01,0.0,0.2,0.4,0.4,0.1,0.0


In [73]:

print("--- 28.12 minutes ---")


--- 28.12 minutes ---
