In [722]:
### Import Packages

import pandas as pd
import numpy as np
import os
import pickle

from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE
from tcn import TCN
from tensorflow.keras.layers import Input, Normalization, Conv1D, MaxPooling1D, Flatten, Dense, LSTM, GRU, Dropout
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Sequential

In [723]:
### Read in team data

with open('all_teams_data.pkl', 'rb') as file:
    team_data = pickle.load(file)

print("Data loaded from 'all_teams_data.pkl'")

Data loaded from 'all_teams_data.pkl'


In [724]:
team_data.shape

(266, 163, 51)

In [725]:
### Read in classes data

# Determine whether to have success defined by three categories or five categories. Setting map=True will result in three categories and map=False will result in five.
map = True

playoff_teams = pd.read_csv('playoff_teams.csv')
success = playoff_teams['Level of Success']
if map:
    mappings = {'WC': 'First Round', 'DS': 'First Round', 'LS': 'League Series', 'WS': 'World Series', 'C': 'World Series'}
    success = success.map(mappings)
success = success.to_numpy()

# Compute class weights
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(success), y=success)
class_weights = dict(enumerate(class_weights))

In [726]:
### One-hot encode class labels

categories, inverse = np.unique(success, return_inverse=True)
y_all = np.zeros((success.size, categories.size))
y_all[np.arange(success.size), inverse] = 1

In [727]:
categories

array(['First Round', 'League Series', 'World Series'], dtype=object)

In [728]:
y_all.shape

(266, 3)

In [729]:
### Split data into train, validation, and test sets

# Train/Test split
X_train, X_test, y_train, y_test = train_test_split(team_data, y_all, test_size = 0.2, random_state = 98)

# Ensure consistent and appropriate data
X_train = X_train.astype(np.float64)
X_test = X_test.astype(np.float64)
X_train = np.nan_to_num(X_train)
X_test = np.nan_to_num(X_test)

# Train/Val Split
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.2, random_state = 98*2)

In [730]:
### Enhance minority classes with oversampling

n_samples, time_steps, features = X_train.shape
X_flattened = X_train.reshape(n_samples, -1)
smote = SMOTE(sampling_strategy='not majority')
X_resampled, y_resampled = smote.fit_resample(X_flattened, y_train)
X_resampled = X_resampled.reshape(-1, time_steps, features)

In [731]:
### Create Normalization layer for model

norm_layer = Normalization()
norm_layer.adapt(X_resampled)

In [732]:
### Create architecture for team model

team_model = Sequential()
team_model.add(Input(shape=(team_data.shape[1], team_data.shape[2])))
team_model.add(norm_layer)
# team_model.add(TCN(nb_filters=10, dilations=[1, 2, 4, 8, 16, 32], dropout_rate=0.2, use_skip_connections=True, use_batch_norm=True))
# team_model.add(LSTM(256, return_sequences=True))
value = 128
for i in range(5):
    team_model.add(Conv1D(value, 3, activation='relu'))
    # value = value // 2
    team_model.add(Dropout(0.1))
team_model.add(MaxPooling1D(2))
# team_model.add(LSTM(256, return_sequences=True))
team_model.add(Conv1D(256, 3, activation='relu'))
team_model.add(Dropout(0.1))
team_model.add(MaxPooling1D(2))
# team_model.add(LSTM(256, return_sequences=True))
team_model.add(Conv1D(64, 3, activation='relu'))
team_model.add(Dropout(0.1))
team_model.add(LSTM(512))
team_model.add(Flatten())
team_model.add(Dense(128, activation='relu'))
# team_model.add(Dropout(0.5))
team_model.add(Dense(y_all.shape[1], activation='softmax'))
team_model.summary()

In [733]:
### Compile team model

optimizer = Adam(learning_rate=0.00001, clipvalue=1.0)
early_stopping = EarlyStopping(monitor='accuracy', patience=10, restore_best_weights=True)

team_model.compile(optimizer=optimizer,
                   loss='categorical_crossentropy',
                   metrics=['accuracy'])

In [734]:
### Fit team model

team_model.fit(X_resampled, y_resampled, batch_size=2, epochs=10, validation_data=(X_val, y_val), callbacks=early_stopping)

Epoch 1/10
[1m144/144[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 117ms/step - accuracy: 0.3959 - loss: 1.0978 - val_accuracy: 0.5814 - val_loss: 1.0911
Epoch 2/10
[1m144/144[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 109ms/step - accuracy: 0.3357 - loss: 1.0986 - val_accuracy: 0.5814 - val_loss: 1.0878
Epoch 3/10
[1m144/144[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 111ms/step - accuracy: 0.3975 - loss: 1.0927 - val_accuracy: 0.5814 - val_loss: 1.0923
Epoch 4/10
[1m144/144[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 111ms/step - accuracy: 0.3090 - loss: 1.0963 - val_accuracy: 0.5814 - val_loss: 1.0848
Epoch 5/10
[1m144/144[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 113ms/step - accuracy: 0.2832 - loss: 1.0984 - val_accuracy: 0.5814 - val_loss: 1.0815
Epoch 6/10
[1m144/144[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 128ms/step - accuracy: 0.3224 - loss: 1.0964 - val_accuracy: 0.5814 - val_loss: 1.0787
Epoch 7/10

<keras.src.callbacks.history.History at 0x19aa65790>

In [735]:
### Evaluate team model

team_model.evaluate(X_test, y_test)

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step - accuracy: 0.5247 - loss: 1.0773 


[1.069587230682373, 0.5370370149612427]

In [736]:
### Create and save model predictions for all teams

X_all = team_data
X_all = X_all.astype(np.float64)
X_all = np.nan_to_num(X_all)

team_predictions = team_model.predict(X_all)
team_predictions = pd.DataFrame(team_predictions)
team_predictions.columns = categories

# Save predictions from team model to .csv file
if map:
    team_predictions.to_csv('team_predictions_mapped.csv')
else:
    team_predictions.to_csv('team_predictions.csv')

[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 122ms/step


In [737]:
### Examine model predictions for test set

team_predictions = team_model.predict(X_test)
team_predictions = pd.DataFrame(team_predictions)
team_predictions.columns = categories

y_test_temp = pd.DataFrame(y_test)
y_test_temp.columns = categories

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step


In [738]:
### Glance at model predictions for test set

team_predictions.head()

Unnamed: 0,First Round,League Series,World Series
0,0.351023,0.328422,0.320554
1,0.347522,0.324383,0.328095
2,0.350409,0.328674,0.320918
3,0.350378,0.325685,0.323937
4,0.354731,0.324372,0.320897


In [739]:
### Glance at ground-truth for test set

y_test_temp.head()

Unnamed: 0,First Round,League Series,World Series
0,0.0,0.0,1.0
1,1.0,0.0,0.0
2,1.0,0.0,0.0
3,0.0,1.0,0.0
4,1.0,0.0,0.0
