In [61]:
import pandas as pd
import numpy as np

# # import tensorflow as tf
from tensorflow.python.keras.layers import Input, LSTM, Dense, Masking, Embedding, concatenate
from tensorflow.python.keras.models import Model
from tensorflow.python.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import pad_sequences

In [62]:
df = pd.read_csv("./processed_data/dynamic_feats.csv")
df.sort_values(["year","HomeTeam"],inplace=True)

__Feature List__

In [63]:
other_league_cols = ["nba_season_flag", "nfl_season_flag", "nhl_season_flag"]

common_features = ["year",
                    "DayofWeek",
                    "HomeTeamGameNumber",
                    "VisitingTeamGameNumber",
                    "park_age",
                    "Month",
                    "DayofMonth",
                    "DayFlag",
                    "NonRegular_ParkFlag",
                    "StadiumCapacity"]+\
                    ["home_payroll", "home_top_salary", "visiting_payroll", "visiting_top_salary"] +\
                    other_league_cols

rolling_feats = ['HomeTeamOffense_Homeruns-1',
                'HomeTeamOffense_Homeruns-2',
                'HomeTeamOffense_Homeruns-3',
                'HomeTeamOffense_Homeruns-4',
                'HomeTeamOffense_Homeruns-5',
                'HomeTeamOffense_Homeruns-6',
                'HomeTeamOffense_Homeruns-7',
                'HomeTeamOffense_Strickouts-1',
                'HomeTeamOffense_Strickouts-2',
                'HomeTeamOffense_Strickouts-3',
                'HomeTeamOffense_Strickouts-4',
                'HomeTeamOffense_Strickouts-5',
                'HomeTeamOffense_Strickouts-6',
                'HomeTeamOffense_Strickouts-7',
                'HomeTeamPitchers_TeamEarnedRuns-1',
                'HomeTeamPitchers_TeamEarnedRuns-2',
                'HomeTeamPitchers_TeamEarnedRuns-3',
                'HomeTeamPitchers_TeamEarnedRuns-4',
                'HomeTeamPitchers_TeamEarnedRuns-5',
                'HomeTeamPitchers_TeamEarnedRuns-6',
                'HomeTeamPitchers_TeamEarnedRuns-7']

prev_match_features = ['Homewin_rate', 'Homeday_league_rank']

categorical_feat_cols = ["VisitingTeam",
                        "VisitingTeamLeague",
                        "HomeTeam",
                        "HomeTeamLeague",
                        "BallParkID"]

continuous_feat_cols = common_features + rolling_feats + prev_match_features

__Constants and Helper Functions__

In [64]:
TARGET_COL = "AttendanceRatio"

# 84 games at most in a season, 22 seasons
LEN_TS = 84*22
NUM_CONT_FEATURES = len(continuous_feat_cols)
NUM_CAT_FEATURES = len(categorical_feat_cols)
VALIDATION_YEARS = [2022]

ranking_map_cols = ["Date","Team"]
ranking_info_cols = ["total_wins","day_league_rank","win_rate"]

def pad_with_bfill(arr_list, max_len):
    # Pad each array with backfill to the length of the longest array
    padded_arr_list = []
    for arr in arr_list:
        padded_arr = np.pad(arr, pad_width=[(max_len - len(arr), 0),(0,0),(0,0)], mode='edge')
        padded_arr_list.append(padded_arr)
    return padded_arr_list

In [65]:
from sklearn.preprocessing import MinMaxScaler

val_idx = df["year"].isin(VALIDATION_YEARS)
train_idx = ~val_idx
normalizers = {}

for col in continuous_feat_cols+[TARGET_COL]:
    norm_arr_train = df.loc[train_idx, col].values.reshape(-1,1)
    norm_arr_val = df.loc[val_idx, col].values.reshape(-1,1)
    normalizers[col] = MinMaxScaler().fit(norm_arr_train)
    df.loc[train_idx, col] = normalizers[col].transform(norm_arr_train)
    df.loc[val_idx, col] = normalizers[col].transform(norm_arr_val)

In [66]:
from sklearn.preprocessing import LabelEncoder

# preprocess categorical columns
encoders = []

for col in categorical_feat_cols:
    encoder = LabelEncoder()
    data_cat = df[col].fillna('N/A').astype(str)
    df[col+"_encoded"] = encoder.fit_transform(data_cat)
    encoders.append(encoder)

categorical_feat_cols = [col+"_encoded" for col in categorical_feat_cols]

## Model Training

### Reshaping data for RNN

In [67]:
x_train = []
y_train = []
x_val = []
y_val = []
x_train_cat = []
x_val_cat = []

num_teams = df["HomeTeam"].nunique()

for team, team_df in df.groupby(["HomeTeam"]):
    team_train_df = team_df.loc[train_idx,:]
    team_val_df = team_df.loc[val_idx,:]
    train_games = len(team_train_df)
    val_games = len(team_val_df)
    train_features = team_train_df.loc[:, continuous_feat_cols
                                ].values.reshape(train_games, NUM_CONT_FEATURES, 1)
    train_labels = team_train_df.loc[:, TARGET_COL
                               ].values.reshape(train_games, 1)
    val_features = team_val_df.loc[:, continuous_feat_cols
                                ].values.reshape(val_games, NUM_CONT_FEATURES, 1)
    val_labels = team_val_df.loc[:, 
                           TARGET_COL].values.reshape(val_games, 1)
    
    # Categorical Features
    train_cat_features = team_train_df.loc[:, categorical_feat_cols
                                ].values.reshape(train_games, NUM_CAT_FEATURES, 1)
    val_cat_features = team_val_df.loc[:, categorical_feat_cols
                                    ].values.reshape(val_games, NUM_CAT_FEATURES, 1)
    
    x_train.append(train_features)
    y_train.append(train_labels)
    x_val.append(val_features)
    y_val.append(val_labels)
    x_train_cat.append(train_cat_features)
    x_val_cat.append(val_cat_features)

len_ts_train = df.loc[train_idx,:].groupby("HomeTeam")["HomeTeamGameNumber"].size().max()
len_ts_val = df.loc[val_idx,:].groupby("HomeTeam")["HomeTeamGameNumber"].size().max()

x_train = pad_sequences(x_train, value=-1, maxlen=len_ts_train).reshape(len_ts_train, num_teams, NUM_CONT_FEATURES, 1)
y_train = pad_sequences(y_train, value=-1, maxlen=len_ts_train).reshape(len_ts_train, num_teams, 1)

x_val = pad_sequences(x_val, value=-1, maxlen=len_ts_val).reshape(len_ts_val, num_teams, NUM_CONT_FEATURES, 1)
y_val = pad_sequences(y_val, value=-1, maxlen=len_ts_val).reshape(len_ts_val, num_teams, 1)

x_train_cat = np.array(pad_with_bfill(x_train_cat, len_ts_train))
x_val_cat = np.array(pad_with_bfill(x_val_cat, len_ts_val))

x_train_cat = [arr.reshape(len_ts_train, num_teams) for arr in np.split(x_train_cat, NUM_CAT_FEATURES, axis=2)]
x_val_cat = [arr.reshape(len_ts_val, num_teams) for arr in np.split(x_val_cat, NUM_CAT_FEATURES, axis=2)]

### Basic Model
- Implement Batch Normalization! --> Data Leakage otherwise
    - Len of TS: 84*20 (number of games per season)*(number of seasons)
    - Number of TS: 30 (number of teams)
    - Number of Features: 15
- Add masking: DONE
- Perform pre-padding: DONE
- Add normalization: DONE but not batch?!
- Target Variable change: Use %age attendance instead of absolute capacity --> DONE but using different logic
- Add early stopping: DONE
- Add embeddings DONE

In [68]:
# Constants for Training
EPOCHS = 50
MIN_DELTA = 0.001
PATIENCE = 3

# add an embedding layer for the categorical inputs
categorical_embedding_size = 10

# Define the input shape
cont_input_shape = x_train.shape[1:3]
# cat_input_shape = x_train_cat.shape[1:3]
output_shape = 1

early_stopper = EarlyStopping(monitor='val_loss',
                              mode='min',
                              min_delta=MIN_DELTA,
                              patience=PATIENCE)

## For every categorical variable we have it's own input
categorical_inputs = [Input(shape=(num_teams, ),
                      name='cat_' + str(i + 1)) for i in range(NUM_CAT_FEATURES)]
masked_categorical_inputs = [Masking(input_shape=(num_teams, ))(inp) for inp in categorical_inputs]

## Shared embedding layer
cat_embedding = Embedding(input_dim=np.max(x_train_cat) + 1,
                                output_dim=categorical_embedding_size,
                                input_length=len_ts_train)

## Repeat this for every categorical column
cat_embeddings = [cat_embedding(inp) for inp in masked_categorical_inputs]

time_series_input = Input(shape=cont_input_shape)
masked_ts_input = Masking(input_shape=cont_input_shape)(time_series_input)

## Concatenate the time series input and the embedding outputs
x = concatenate([masked_ts_input] + cat_embeddings, axis=-1)

In [69]:
x = LSTM(128,
    activation="relu",
    return_sequences=True,
    dropout=0.2)(x)

x = LSTM(64,
    activation="relu",
    return_sequences=True,
    dropout=0.2)(x)

x = LSTM(64,
    activation="relu",
    return_sequences=True,
    dropout=0.2)(x)

x = Dense(output_shape)(x)

model = Model(inputs=[time_series_input]+categorical_inputs, outputs=x)
# Compile the model
model.compile(optimizer='adam', loss='mse')

# Train the model with your pandas DataFrame
model.fit([x_train]+x_train_cat,
          y_train,
          epochs=50,
          batch_size=32,
          validation_data=([x_val]+x_val_cat, y_val),
          callbacks=[early_stopper])

# Predict the next season (84 games) for all 30 teams (a single time series) using the trained model
predictions = model.predict([x_val]+x_val_cat)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50


In [70]:
def inverse_transform(col, normalizer_dict=normalizers, data=df):
    return normalizer_dict[col].inverse_transform(
        data[col].values.reshape(-1,1))

In [71]:
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error

y_pred = (predictions[1:,:].reshape(81*30,))*df.loc[val_idx, "StadiumCapacity"]
y_true = df.loc[val_idx, "Attendance"] #y_val[1:,:].reshape(81*30,)
mean_absolute_error(y_true, y_pred), mean_absolute_percentage_error(y_true, y_pred), mean_squared_error(y_true, y_pred)

(26786.084402221986, 1.0000015545617773, 845598093.3573964)

In [32]:
def calculate_error(y_test, y_pred, x_test, w1=0.05):
    april_mask = x_test["Month"]==0.14285714
    mn_twins_mask = x_test["HomeTeam"]=="MIN"
    april_error = w1*(np.abs(y_test[(april_mask) & (mn_twins_mask)]-y_pred[(april_mask) & (mn_twins_mask)]).sum())\
                    +(1-w1)*(np.abs(y_test[(april_mask) & ~(mn_twins_mask)]-y_pred[(april_mask) & ~(mn_twins_mask)])).sum()
    season_error = w1*(np.abs(y_test[mn_twins_mask]-y_pred[mn_twins_mask]).sum())\
                        +(1-w1)*(np.abs(y_test[~mn_twins_mask]-y_pred[~mn_twins_mask]).sum())
    return april_error, season_error    

In [33]:
df.loc[val_idx, "Month"].unique()

array([0.14285714, 0.28571429, 0.42857143, 0.57142857, 0.71428571,
       0.85714286, 1.        ])

In [34]:
calculate_error(y_true, y_pred, df[val_idx])

(0.0, 53.95099459695629)

In [10]:
# import tensorflow as tf

# class FeedBack(tf.keras.Model):
#     def __init__(self, units, out_steps):
#         super().__init__()
#         self.out_steps = out_steps
#         self.units = units
#         self.lstm_cell = tf.keras.layers.LSTMCell(units)
#         # Also wrap the LSTMCell in an RNN to simplify the `warmup` method.
#         self.lstm_rnn = tf.keras.layers.RNN(self.lstm_cell, return_state=True)
#         self.dense = tf.keras.layers.Dense(output_shape)

# def warmup(self, inputs):
#   # inputs.shape => (batch, time, features)
#   # x.shape => (batch, lstm_units)
#   x, *state = self.lstm_rnn(inputs)

#   # predictions.shape => (batch, features)
#   prediction = self.dense(x)
#   return prediction, state

# feedback_model = FeedBack(units=32, out_steps=82)

# FeedBack.warmup = warmup

# def call(self, inputs, training=None):
#   # Use a TensorArray to capture dynamically unrolled outputs.
#   predictions = []
#   # Initialize the LSTM state.
#   prediction, state = self.warmup(inputs)

#   # Insert the first prediction.
#   predictions.append(prediction)

#   # Run the rest of the prediction steps.
#   for n in range(1, self.out_steps):
#     # Use the last prediction as input.
#     x = prediction
#     # Execute one lstm step.
#     x, state = self.lstm_cell(x, states=state,
#                               training=training)
#     # Convert the lstm output to a prediction.
#     prediction = self.dense(x)
#     # Add the prediction to the output.
#     predictions.append(prediction)

#   # predictions.shape => (time, batch, features)
#   predictions = tf.stack(predictions)
#   # predictions.shape => (batch, time, features)
#   predictions = tf.transpose(predictions, [1, 0, 2])
#   return predictions

# FeedBack.call = call

# history = compile_and_fit(feedback_model, multi_window)

# IPython.display.clear_output()

# multi_val_performance['AR LSTM'] = feedback_model.evaluate(multi_window.val)
# multi_performance['AR LSTM'] = feedback_model.evaluate(multi_window.test, verbose=0)
# multi_window.plot(feedback_model)