# Final Version

In [153]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Input, Dense

## Data Preprocess

### Transform data
NOTE: the value range and number of unique values below are just an estimation, it's merely listed to help you understand each feature.

Abbreviations: b.n. before normalization, b.e. before encoding.

Normalized features:
- release_speed (value 0~105)
- launch_speed (value 0~120, empty=>0 b.n.)
- launch_angle (value -90~90, empty=>0 b.n.)
- hit_distance_sc (value 0~400, empty=>0 b.n.)

One-hot encoded features:
- pitch_type (unique 7) (as an input feature, don't use one-hot for output)
- strikes (unique 3)
- balls (unique 4)
- zone (unique 14)
- description (unique 11)
- bb_type (unique 4, empty=>str b.e)

Embedded features:
- batter (integer ID)

In [154]:
# Read data and reverse it (sorted by Game Date)
data = pd.read_csv('shota_imanaga_all_data_2024.csv')
data = data.iloc[::-1].reset_index(drop=True)

print(len(set(data['bb_type'])))
print(len(set(data['description'])))
print(len(set(data['bb_type'])))
print(len(set(data['pitch_type'])))

5
11
5
8


We will store raw data in `data` and processed data in `processed_data` with only the target features.

In [155]:
processed_data = pd.DataFrame()
processed_output = pd.DataFrame()

### CONFIGURE ###
normalized_features = ['release_speed', 'launch_speed', 'launch_angle', 'hit_distance_sc']
normalized_features_possibly_missing = ['launch_speed', 'launch_angle', 'hit_distance_sc']

one_hot_features = ['pitch_type', 'strikes', 'balls', 'zone', 'description', 'bb_type']
one_hot_features_possibly_missing = ['bb_type']

embedded_features = ['batter']

output_feature = 'pitch_type'
### CONFIGURE ###

# Normalization
for feature in normalized_features:
    processed_data[feature] = data[feature]
    # Fill empty entries with 0
    if feature in normalized_features_possibly_missing:
        processed_data[feature] = processed_data[feature].fillna(0)
    # Normalize
    scaler = MinMaxScaler(feature_range=(-1, 1))
    processed_data[feature] = scaler.fit_transform(processed_data[[feature]])

# One-Hot Encode
ont_hot_tmp_df = pd.DataFrame()
for feature in one_hot_features:
    ont_hot_tmp_df[feature] = data[feature]
    # Fill empty entries with 'Unknown'
    if feature in one_hot_features_possibly_missing:
        ont_hot_tmp_df[feature] = ont_hot_tmp_df[feature].fillna('Unknown')
one_hot_encoder = OneHotEncoder(sparse_output=False)
encoded = one_hot_encoder.fit_transform(ont_hot_tmp_df[one_hot_features])
processed_data[one_hot_encoder.get_feature_names_out(one_hot_features)] = encoded

# Label encode for embedding
for feature in embedded_features:
    processed_data[feature] = data[feature]
    label_encoder = LabelEncoder()
    processed_data[feature] = label_encoder.fit_transform(processed_data[feature])

# Label output
label_encoder = LabelEncoder()
processed_output[output_feature] = label_encoder.fit_transform(data[output_feature])

# Final processed dataset
print("Processed Data Shape:", processed_data.shape)
processed_data, processed_output

Processed Data Shape: (2590, 49)


(      release_speed  launch_speed  launch_angle  hit_distance_sc  \
 0          0.891892     -1.000000     -0.086420        -1.000000   
 1          0.945946     -1.000000     -0.086420        -1.000000   
 2          0.906634      0.715035      0.209877         0.596372   
 3          0.936118      0.386364      0.061728        -0.301587   
 4          0.484029     -1.000000     -0.086420        -1.000000   
 ...             ...           ...           ...              ...   
 2585       0.778870     -1.000000     -0.086420        -1.000000   
 2586       0.837838     -1.000000     -0.086420        -1.000000   
 2587       0.690418      0.253497     -0.740741        -0.995465   
 2588       0.818182     -1.000000     -0.086420        -1.000000   
 2589       0.429975      0.625874     -0.222222        -0.959184   
 
       pitch_type_CH  pitch_type_CS  pitch_type_CU  pitch_type_FC  \
 0               0.0            0.0            0.0            0.0   
 1               0.0            

### Create Subsequences

In [156]:
# Step 3: Initialize variables for creating sequences
# numeric_features = ['isStrike', 'Zone', 'Strike Detail']
# categorical_features = ['Pitch type']
batter_column = 'batter'

# List to store sequences
sequences = []
current_sequence = []
current_batter_id = None

# Step 4: Iterate through the dataset row by row
for index, row in processed_data.iterrows():
    batter_id = row[batter_column]
    row_features = row.values

    if batter_id != current_batter_id and current_sequence:
        # If batter changes or we're at the end of a sequence, save the current sequence
        sequences.append({
            'Batter ID': current_batter_id,
            'Sequence': np.array(current_sequence)
        })
        current_sequence = []  # Reset sequence for the new batter

    # Add the current row to the sequence
    current_sequence.append(row_features)
    current_batter_id = batter_id

# Save the last sequence if it exists
if current_sequence:
    sequences.append({
        'Batter ID': current_batter_id,
        'Sequence': np.array(current_sequence)
    })

# Convert sequences into a structured format if needed
sequence_array = [entry['Sequence'] for entry in sequences]

In [157]:
def pad_sequence(sequence, target_length=12, padding_value=0):
    """
    對序列進行填充，使其長度達到 target_length。

    :param sequence: 需要填充的序列
    :param target_length: 目標長度
    :param padding_value: 填充的數值，預設為 0
    :return: 填充後的序列
    """
    current_length = len(sequence)

    # 如果當前序列長度小於目標長度，則進行填充
    if current_length < target_length:
        padding_needed = target_length - current_length
        # 使用 np.pad 來填充序列，填充行的數量
        padded_sequence = np.pad(sequence,
                                 ((0, padding_needed), (0, 0)),  # 填充的行和列
                                 mode='constant', constant_values=padding_value)
        return padded_sequence
    else:
        return sequence  # 如果序列長度已經足夠長，則返回原始序列

In [158]:
all_sequences = []
for batter in sequences:
    batter_id = batter['Batter ID']  # 獲取打者 ID
    # print(batter_id)

    batter_sequences = batter['Sequence']  # 獲取該打者的所有序列
    sequence_length=len(batter_sequences)
    for length in range(1, sequence_length):  # 子序列的長度從 1 到 sequence_length
      for start_idx in range(sequence_length - length ):  # 每個起始位置
        sub_sequence = batter_sequences[start_idx:start_idx + length]  # 取得子序列
        padded_sub_sequence = pad_sequence(sub_sequence, target_length=12)
        all_sequences.append(padded_sub_sequence)


X=np.array(all_sequences)

print(X[1][0])
print(processed_data.iloc[[1]])

[ 0.94594595 -1.         -0.08641975 -1.          0.          0.
  0.          0.          1.          0.          0.          0.
  0.          1.          0.          1.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          1.          0.
  0.          0.          1.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.          1.          0.          0.          0.          0.
  0.        ]
   release_speed  launch_speed  launch_angle  hit_distance_sc  pitch_type_CH  \
1       0.945946          -1.0      -0.08642             -1.0            0.0   

   pitch_type_CS  pitch_type_CU  pitch_type_FC  pitch_type_FF  pitch_type_FS  \
1            0.0            0.0            0.0            1.0            0.0   

   ...  description_hit_into_play  description_missed_bunt  \
1  ...                        0.0                      0.0   

   description_swin

In [159]:
batter_column = 'batter'

# List to store sequences
sequences = []
current_sequence = []
current_batter_id = None

# Step 4: Iterate through the dataset row by row
for index, row in processed_data.iterrows():
    batter_id = row[batter_column]
    row_features = processed_output.iloc[index].values

    if batter_id != current_batter_id and current_sequence:
        # If batter changes or we're at the end of a sequence, save the current sequence
        sequences.append({
            'Batter ID': current_batter_id,
            'Sequence': np.array(current_sequence)
        })
        current_sequence = []  # Reset sequence for the new batter

    # Add the current row to the sequence
    current_sequence.append(row_features)
    current_batter_id = batter_id

# Save the last sequence if it exists
if current_sequence:
    sequences.append({
        'Batter ID': current_batter_id,
        'Sequence': np.array(current_sequence)
    })

# Convert sequences into a structured format if needed
sequence_array = [entry['Sequence'] for entry in sequences]
y_sequences = []
for batter in sequences:
    batter_id = batter['Batter ID']  # 獲取打者 ID

    batter_sequences = batter['Sequence']  # 獲取該打者的所有序列
    sequence_length=len(batter_sequences)
    for length in range(1, sequence_length):  # 子序列的長度從 1 到 sequence_length
      for start_idx in range(1,sequence_length - length+1):  # 每個起始位置
        sub_sequence = batter_sequences[start_idx:start_idx + length]  # 取得子序列
        padded_sub_sequence = pad_sequence(sub_sequence, target_length=12)
        y_sequences.append(padded_sub_sequence)

y=np.array(y_sequences)
y[10:13], processed_output.iloc[3:8]

(array([[[5],
         [4],
         [4],
         [0],
         [0],
         [0],
         [0],
         [0],
         [0],
         [0],
         [0],
         [0]],
 
        [[4],
         [4],
         [5],
         [0],
         [0],
         [0],
         [0],
         [0],
         [0],
         [0],
         [0],
         [0]],
 
        [[5],
         [4],
         [4],
         [5],
         [0],
         [0],
         [0],
         [0],
         [0],
         [0],
         [0],
         [0]]]),
    pitch_type
 3           4
 4           5
 5           4
 6           4
 7           5)

### Split data

In [160]:
# Define split ratios
train_ratio = 0.64
val_ratio = 0.16
test_ratio = 0.2

# First, split into training and temp (validation + testing)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=(1 - train_ratio), random_state=42)

# Then, split temp into validation and testing
val_test_ratio = test_ratio / (test_ratio + val_ratio)  # Adjust split ratio for remaining data
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=val_test_ratio, random_state=42)

# Print dataset sizes
print(f"Training set: {X_train.shape}, {y_train.shape}")
print(f"Validation set: {X_val.shape}, {y_val.shape}")
print(f"Testing set: {X_test.shape}, {y_test.shape}")

Training set: (3116, 12, 49), (3116, 12, 1)
Validation set: (779, 12, 49), (779, 12, 1)
Testing set: (975, 12, 49), (975, 12, 1)


## Model

In [176]:
# VERSION: simple RNN
# Number of unique pitch types (replace with actual number from your data)
num_classes = processed_output[output_feature].nunique()  # Assuming `y` is label-encoded
print(num_classes)

# Build the model
print(X_train.shape)
model = Sequential([
    Input(shape=(X_train.shape[1], X_train.shape[2])),  # Input: (sequence_length, num_features)
    SimpleRNN(64, return_sequences=True),  # RNN with 64 units
    Dense(32, activation='relu'),           # Fully connected layer
    Dense(num_classes, activation='softmax')  # Output layer for classification
])

# Compile the model
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',  # Use sparse_categorical_crossentropy for label-encoded targets
    metrics=['accuracy']
)

model.summary()

8
(3116, 12, 49)


In [178]:
history = model.fit(
    X_train, y_train,          # Training data
    validation_data=(X_val, y_val),  # Validation data
    epochs=10,                 # Number of epochs
    batch_size=32,             # Batch size
    verbose=1                  # Verbosity level
)

Epoch 1/10
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - accuracy: 0.9132 - loss: 0.2052 - val_accuracy: 0.9086 - val_loss: 0.2144
Epoch 2/10
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.9121 - loss: 0.2053 - val_accuracy: 0.9099 - val_loss: 0.2104
Epoch 3/10
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.9160 - loss: 0.1993 - val_accuracy: 0.9115 - val_loss: 0.2087
Epoch 4/10
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.9135 - loss: 0.2023 - val_accuracy: 0.9150 - val_loss: 0.2063
Epoch 5/10
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.9160 - loss: 0.1971 - val_accuracy: 0.9157 - val_loss: 0.2044
Epoch 6/10
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9140 - loss: 0.1943 - val_accuracy: 0.9141 - val_loss: 0.2038
Epoch 7/10
[1m98/98[0m [32m━━━━━━━━━

In [179]:
train_loss, train_accuracy = model.evaluate(X_train, y_train, verbose=1)
print(f"training Loss: {train_loss}")
print(f"training Accuracy: {train_accuracy}")
val_loss, val_accuracy = model.evaluate(X_val, y_val, verbose=1)
print(f"valing Loss: {val_loss}")
print(f"valing Accuracy: {val_accuracy}")

[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9201 - loss: 0.1997
training Loss: 0.19192101061344147
training Accuracy: 0.921748161315918
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9173 - loss: 0.2097
valing Loss: 0.20322538912296295
valing Accuracy: 0.9175223708152771


In [180]:
test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=1)
print(f"Testing Loss: {test_loss}")
print(f"Testing Accuracy: {test_accuracy}")

[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9173 - loss: 0.1980
Testing Loss: 0.201390340924263
Testing Accuracy: 0.9168375730514526
