In [58]:
import pandas as pd
import yaml
import os

# Parse the project-level configuration stored one directory above this notebook.
with open("../config.yaml", "r") as file_object:
    config = yaml.safe_load(file_object)
assert config

# Input CSVs live in the configured data directory, resolved relative to the parent folder.
data_dir = os.path.join(os.path.pardir, config['data_dir'])

players_csv = os.path.join(data_dir, 'players.csv')
teams_csv = os.path.join(data_dir, 'teams.csv')
games_csv = os.path.join(data_dir, 'games.csv')

# Read the three tables in the same order as their paths were built.
players_df, teams_df, games_df = map(
    pd.read_csv, (players_csv, teams_csv, games_csv)
)

# Column overview (dtypes and non-null counts) of the games table.
games_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2234 entries, 0 to 2233
Data columns (total 29 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   SEASON_ID          2234 non-null   int64  
 1   TEAM_ID            2234 non-null   int64  
 2   TEAM_ABBREVIATION  2234 non-null   object 
 3   TEAM_NAME          2234 non-null   object 
 4   GAME_ID            2234 non-null   int64  
 5   GAME_DATE          2234 non-null   object 
 6   MATCHUP            2234 non-null   object 
 7   WL                 2234 non-null   object 
 8   MIN                2234 non-null   int64  
 9   FGM                2218 non-null   float64
 10  FGA                43 non-null     float64
 11  FG_PCT             43 non-null     float64
 12  FG3M               2 non-null      float64
 13  FG3A               2 non-null      float64
 14  FG3_PCT            0 non-null      float64
 15  FTM                2217 non-null   float64
 16  FTA                1966 

In [63]:
from pytorch_forecasting import Baseline, TemporalFusionTransformer, TimeSeriesDataSet
from pytorch_forecasting.data import GroupNormalizer
from pytorch_forecasting.metrics import MAE, SMAPE, PoissonLoss, QuantileLoss
from pytorch_forecasting.models.temporal_fusion_transformer.tuning import optimize_hyperparameters

# Feature preparation for the games table.
# Notes carried over from earlier experiments:
#   * time_idx was once counted per SEASON_ID only; it is now counted per (TEAM_ID, SEASON_ID).
#   * TEAM_ID was once cast to str and then to 'category'; it is now kept as plain str.
games_df = (
    games_df
    .assign(
        GAME_DATE=lambda frame: pd.to_datetime(frame['GAME_DATE']),
        WL=lambda frame: frame['WL'].astype('category'),
        TEAM_ID=lambda frame: frame['TEAM_ID'].astype(str),
    )
    # Chronological order within each team/season so the running count below follows game dates.
    .sort_values(['TEAM_ID', 'SEASON_ID', 'GAME_DATE'])
    # errors='ignore' keeps this step re-runnable when the columns are already gone or absent.
    .drop(
        columns=['TEAM_ABBREVIATION', 'TEAM_NAME', 'MATCHUP', 'VIDEO_AVAILABLE'],
        errors='ignore',
    )
    .assign(
        # 0-based position of each game inside its (team, season) series.
        time_idx=lambda frame: frame.groupby(['TEAM_ID', 'SEASON_ID']).cumcount(),
        day_of_week=lambda frame: frame['GAME_DATE'].dt.day_of_week,
    )
)

# TODO: add additional features
# TODO: entity embedding for: TEAM_ID

# Rows with time_idx below this value form the training split; later rows are held out.
training_cutoff = 60

# alternative is to group by season and encode team_id

max_prediction_length = 1  # decoder horizon: a single step ahead
max_encoder_length = 30    # longest history window handed to the encoder

# Training dataset: one series per (SEASON_ID, TEAM_ID) pair, with 'WL' as the target.
training = TimeSeriesDataSet(
    games_df[lambda x: x.time_idx < training_cutoff],
    time_idx='time_idx',
    target='WL',
    group_ids=['SEASON_ID', 'TEAM_ID'],
    # Accept histories as short as half the maximum encoder window.
    min_encoder_length=max_encoder_length // 2,
    max_encoder_length=max_encoder_length,
    min_prediction_length=1,
    max_prediction_length=max_prediction_length,
    # TEAM_ID is one of the group_ids, so it never changes within a series.
    # It is registered once, as a static categorical; it was previously also
    # listed under time_varying_known_categoricals, which declared the same
    # column in two conflicting roles.
    static_categoricals=['TEAM_ID'],
)