In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, GRU, LSTM
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Input, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
# from tensorflow.keras.optimizers.schedules import ExponentialDecay

In [2]:
# Read the pitch log, flip the row order, and renumber the index from zero.
# NOTE(review): presumably the CSV is stored newest-first, so the flip yields
# chronological order — confirm against the file.
data = (
    pd.read_csv('shota_imanaga_2024.csv')
    .iloc[::-1]
    .reset_index(drop=True)
)

# Swap the pitch-type labels (the prediction target) for integer class ids
label_encoder = LabelEncoder()
data['Pitch type'] = label_encoder.fit_transform(data['Pitch type'])

# Expand 'Description' into one indicator column per distinct value
one_hot_encoder = OneHotEncoder()
description_encoded = one_hot_encoder.fit_transform(data[['Description']]).toarray()

# Wrap the indicator matrix in a DataFrame with positional column names
# (Description_0, Description_1, ...)
description_columns = [
    f"Description_{i}" for i in range(description_encoded.shape[1])
]
description_df = pd.DataFrame(description_encoded, columns=description_columns)

# Attach the indicator columns side by side, then discard the raw
# 'Description' column they were derived from
data = pd.concat(
    [data.reset_index(drop=True), description_df.reset_index(drop=True)],
    axis=1,
).drop(columns=['Description'])


FileNotFoundError: [Errno 2] No such file or directory: 'shota_imanaga_2024.csv'

In [None]:
# Preview only: report the frame's dimensions and show its first rows rather
# than rendering the whole DataFrame in the cell output.
print(data.shape)
data.head()

In [None]:
def create_sequences(df, sequence_length, feature_columns, target_column, group_column='Date'):
    """Build sliding-window samples for next-step prediction, one group at a time.

    Rows are grouped by ``group_column`` so that no window spans two groups.
    Within each group, every run of ``sequence_length`` consecutive rows becomes
    one input sequence, and the ``target_column`` value of the row immediately
    after that run becomes its target.

    Args:
        df: DataFrame holding the group, feature, and target columns. Rows are
            used in the order in which they appear within each group.
        sequence_length: Number of consecutive rows per input sequence; must
            be at least 1.
        feature_columns: List of column names used as the per-step features.
        target_column: Name of the column to predict.
        group_column: Name of the column whose values delimit the groups
            (defaults to 'Date').

    Returns:
        A tuple ``(sequences, targets)`` of NumPy arrays. ``sequences`` has
        shape ``(n_samples, sequence_length, len(feature_columns))`` and
        ``targets`` has shape ``(n_samples,)``. Groups with ``sequence_length``
        rows or fewer contribute no samples; if no group contributes any, both
        arrays are empty but keep those shapes.

    Raises:
        ValueError: If ``sequence_length`` is smaller than 1.
    """
    if sequence_length < 1:
        raise ValueError("sequence_length must be at least 1")

    sequences = []
    targets = []

    for _, group in df.groupby(group_column):
        features = group[feature_columns].values
        targets_group = group[target_column].values

        # Slide a fixed-size window over the group; the row right after the
        # window supplies the target.
        for start in range(len(features) - sequence_length):
            stop = start + sequence_length
            sequences.append(features[start:stop])
            targets.append(targets_group[stop])

    if not sequences:
        # np.array([]) would collapse to shape (0,); keep the 3-D layout so
        # callers can still read the window length and feature count.
        empty_sequences = np.empty((0, sequence_length, len(feature_columns)))
        empty_targets = np.asarray(df[target_column])[:0]
        return empty_sequences, empty_targets

    return np.array(sequences), np.array(targets)

# Per-pitch input features and the column whose next value is predicted
feature_columns = [
    'Batter ID',
    'isStrike',
    'Zone',
    'Strike Detail',
]
target_column = 'Pitch type'
sequence_length = 5

# Build the (window, next-pitch) samples from the prepared frame
X, y = create_sequences(
    df=data,
    sequence_length=sequence_length,
    feature_columns=feature_columns,
    target_column=target_column,
)

# Pass the windows through pad_sequences so every sample has exactly
# `sequence_length` steps, stored as float32
X = pad_sequences(
    X,
    maxlen=sequence_length,
    dtype='float32',
    padding='post',
    truncating='post',
)

In [None]:
# Report the dimensions of the window array and of the target vector
print(X.shape, y.shape, sep="\n")
# print(data.groupby('Date').size())

In [None]:
# Intended share of the samples for each subset
train_ratio = 0.68
val_ratio = 0.12
test_ratio = 0.2

# Stage 1: carve off the training subset; the remainder is held out for now.
# NOTE(review): the windows built earlier in this notebook overlap, and the
# split is made per window, so neighbouring windows may land in different
# subsets — confirm this is acceptable for the validation/test scores.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=(1 - train_ratio),
    random_state=42,
)

# Stage 2: divide the held-out part between validation and testing; the test
# share is rescaled so it is relative to the held-out part only
val_test_ratio = test_ratio / (test_ratio + val_ratio)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=val_test_ratio,
    random_state=42,
)

# Show the shape of each resulting subset
print(f"Training set: {X_train.shape}, {y_train.shape}")
print(f"Validation set: {X_val.shape}, {y_val.shape}")
print(f"Testing set: {X_test.shape}, {y_test.shape}")


In [None]:
# VERSION: simple RNN
# # Number of unique pitch types (replace with actual number from your data)
# num_classes = len(set(y))  # Assuming `y` is label-encoded

# # Build the model
# model = Sequential([
#     Input(shape=(X_train.shape[1], X_train.shape[2])),  # Input: (sequence_length, num_features)
#     SimpleRNN(64, return_sequences=False),  # RNN with 64 units
#     Dense(32, activation='relu'),           # Fully connected layer
#     Dense(num_classes, activation='softmax')  # Output layer for classification
# ])

# # Compile the model
# model.compile(
#     optimizer='adam',
#     loss='sparse_categorical_crossentropy',  # Use sparse_categorical_crossentropy for label-encoded targets
#     metrics=['accuracy']
# )

# model.summary()

# # VERSION: GRU
# num_classes = len(set(y))  # Assuming `y` is label-encoded
# # Hyperparameters
# embedding_dim = 16
# gru_units = 128
# dense_units = 64
# dropout_rate = 0.3
# learning_rate = 0.001

# # Model architecture
# model = Sequential([
#     # # Embedding layer for Batter ID if needed (only for categorical IDs)
#     # Embedding(input_dim=1000,  # Adjust based on unique values in Batter ID
#     #           output_dim=embedding_dim,
#     #           input_length=X_train.shape[1]),  # Only if Batter ID is a primary feature

#     # GRU layer
#     GRU(gru_units, return_sequences=False),

#     # Batch Normalization
#     BatchNormalization(),

#     # Fully connected dense layer
#     Dense(dense_units, activation='relu'),

#     # Dropout for regularization
#     Dropout(dropout_rate),

#     # Output layer for classification
#     Dense(num_classes, activation='softmax')
# ])

# # Compile the model
# model.compile(
#     optimizer=Adam(learning_rate=learning_rate),
#     loss='sparse_categorical_crossentropy',  # Use for integer-encoded targets
#     metrics=['accuracy']
# )

# model.summary()

# VERSION: LSTM
# Number of target classes. Taken from the fitted label encoder rather than
# from the labels that happen to occur in `y`: the encoded ids run from 0 to
# len(classes_) - 1, and a pitch type that never shows up as a target would
# otherwise leave the softmax layer narrower than the largest label id.
num_classes = len(label_encoder.classes_)

# Hyperparameters
lstm_units = 128
dense_units_1 = 64
dense_units_2 = 32
dropout_rate = 0.4
learning_rate = 0.0001

# Model architecture
model = Sequential([
    # Explicit input spec: (sequence_length, num_features). Declared with an
    # Input object instead of an `input_shape` argument on the first layer.
    Input(shape=(X_train.shape[1], X_train.shape[2])),

    # First LSTM layer; returns the full sequence so it can feed the next LSTM
    LSTM(lstm_units, return_sequences=True, recurrent_dropout=0.2),

    # Batch Normalization
    BatchNormalization(),

    # Second LSTM layer (stacked); returns only its final output
    LSTM(lstm_units // 2, return_sequences=False, recurrent_dropout=0.2),

    # Batch Normalization
    BatchNormalization(),

    # Fully connected dense layer 1
    Dense(dense_units_1, activation='relu'),

    # Dropout for regularization
    Dropout(dropout_rate),

    # Fully connected dense layer 2
    Dense(dense_units_2, activation='relu'),

    # Dropout for regularization
    Dropout(dropout_rate),

    # Output layer for classification: one probability per pitch type
    Dense(num_classes, activation='softmax')
])

# Compile the model
model.compile(
    # optimizer=Adam(learning_rate=ExponentialDecay(initial_learning_rate=0.001, decay_steps=1000, decay_rate=0.96)),
    optimizer=Adam(learning_rate=learning_rate),
    loss='sparse_categorical_crossentropy',  # Use for integer-encoded targets
    metrics=['accuracy']
)

model.summary()


In [None]:
# history = model.fit(
#     X_train, y_train,          # Training data
#     validation_data=(X_val, y_val),  # Validation data
#     epochs=20,                 # Number of epochs
#     batch_size=32,             # Batch size
#     verbose=1                  # Verbosity level
# )


# # VERSION: GRU
# # Callbacks for better training
# reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=1e-5, verbose=1)
# early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True, verbose=1)

# # Train the model
# history = model.fit(
#     X_train, y_train,
#     validation_data=(X_val, y_val),
#     epochs=30,                   # Increase if no overfitting
#     batch_size=32,               # Adjust for GPU/CPU memory limits
#     callbacks=[reduce_lr, early_stopping],
#     verbose=1
# )


# VERSION: LSTM
# Callbacks, both watching the validation loss:
#   reduce_lr      - scales the learning rate by 0.5 after 3 epochs without
#                    improvement, with 1e-5 as the floor
#   early_stopping - ends training after 5 epochs without improvement and
#                    restores the best weights
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=3,
    min_lr=1e-5,
    verbose=1,
)
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
    verbose=1,
)

# Fit on the training subset, scoring the validation subset after every epoch
history = model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_val, y_val),
    epochs=30,
    batch_size=32,
    callbacks=[reduce_lr, early_stopping],
    verbose=1,
)

Epoch 1/30
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 25ms/step - accuracy: 0.0431 - loss: 2.1805 - val_accuracy: 0.0000e+00 - val_loss: 2.1309 - learning_rate: 1.0000e-04
Epoch 2/30
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 15ms/step - accuracy: 0.0924 - loss: 2.1125 - val_accuracy: 0.0000e+00 - val_loss: 2.0836 - learning_rate: 1.0000e-04
Epoch 3/30
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - accuracy: 0.1364 - loss: 2.0751 - val_accuracy: 0.2662 - val_loss: 2.0458 - learning_rate: 1.0000e-04
Epoch 4/30
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - accuracy: 0.2200 - loss: 2.0409 - val_accuracy: 0.2662 - val_loss: 2.0217 - learning_rate: 1.0000e-04
Epoch 5/30
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 26ms/step - accuracy: 0.2972 - loss: 2.0136 - val_accuracy: 0.5290 - val_loss: 1.9961 - learning_rate: 1.0000e-04
Epoch 6/30
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━

In [None]:
# Score the trained model on the training and validation subsets. Each
# evaluate() call is unpacked into a loss value and an accuracy value.
train_loss, train_accuracy = model.evaluate(X_train, y_train, verbose=1)
print(f"Training Loss: {train_loss}")
print(f"Training Accuracy: {train_accuracy}")
val_loss, val_accuracy = model.evaluate(X_val, y_val, verbose=1)
print(f"Validation Loss: {val_loss}")
print(f"Validation Accuracy: {val_accuracy}")

[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.4910 - loss: 1.5671
training Loss: 1.5489451885223389
training Accuracy: 0.5042117834091187
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.5247 - loss: 1.5483 
valing Loss: 1.551474690437317
valing Accuracy: 0.5290102362632751


In [None]:
# Final check on the held-out test subset
test_loss, test_accuracy = model.evaluate(x=X_test, y=y_test, verbose=1)
print(f"Testing Loss: {test_loss}", f"Testing Accuracy: {test_accuracy}", sep="\n")

[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.4994 - loss: 1.5370
Testing Loss: 1.5520493984222412
Testing Accuracy: 0.4938775599002838
