In [1]:
%load_ext autoreload
%autoreload 2


In [2]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from pathlib import Path
import seaborn as sns

In [3]:
import os
import sys

In [4]:
# changing directory

In [5]:
# Read the kernel's working directory in pure Python rather than shelling out to
# `pwd`, so the cell does not depend on a POSIX shell being available.
root = os.getcwd()

In [6]:
# When the kernel was started inside a `notebooks` directory, move one level up
# and make the `src/` directory importable from there.
# NOTE(review): `src/` is only added to sys.path in this branch; a kernel started
# directly in the parent directory never gets it — confirm that is intended.
if root.endswith('notebooks'):
    os.chdir('../')
    # Re-read the working directory in pure Python instead of the `pwd` shell command.
    root = os.getcwd()
    sys.path.append('src/')

In [7]:
os.getcwd()

'/home/med/projects/competitions/mlb-kaggle'

In [8]:
from dotenv import load_dotenv
load_dotenv()

True

In [9]:
RAW_PATH = Path(os.environ['RAW_PATH'])

In [10]:
PROCESSED_PATH = Path(os.environ['ROOT_DIR']) / 'processed'

In [11]:
os.listdir(RAW_PATH)

['seasons.csv',
 'teams.csv',
 'example_sample_submission.csv',
 'train_updated.csv',
 'train.csv',
 'mlb-player-digital-engagement-forecasting.zip',
 'awards.csv',
 'players.csv',
 'example_test.csv',
 'mlb',
 'train_updated.csv.zip']

In [12]:
os.listdir(PROCESSED_PATH)

['target.csv', 'context_raw_data.csv', 'raw_data.csv']

In [13]:
dates_cols = ['date', 'mlbDebutDate', 'DOB', 'rosterDate', 'pstatsDate']

In [14]:
df = pd.read_csv(PROCESSED_PATH / 'raw_data.csv',
                 parse_dates=dates_cols)

### players data

In [15]:
# not all players are in the private test set
# the players in the test set are saved in the players.csv file

In [16]:
_players = pd.read_csv(RAW_PATH / 'players.csv')

In [17]:
# Assign the filled column back instead of calling `fillna(..., inplace=True)` on
# the selected column: the chained in-place form is deprecated and leaves
# `_players` unchanged when pandas copy-on-write is active.
# The explicit bool cast keeps the column usable as a boolean mask in `query`.
_players['playerForTestSetAndFuturePreds'] = (
    _players['playerForTestSetAndFuturePreds'].fillna(False).astype(bool)
)

In [18]:
player_in_test = _players.query('playerForTestSetAndFuturePreds')['playerId']

In [19]:
# player_in_test holds only the players in the test set

### filter players

In [20]:
from data.util import filter_by_date, filter_by_id

In [21]:
# filter only test players
df = filter_by_id(df, player_in_test)

In [22]:
# if you want to run on a sample,
# change sample_size to the number of players in the sample
sample_size = None

In [23]:
if sample_size is not None and sample_size > 0:
    df['target_avg'] = df[[f'target{i}' for i in range(1, 5)]].mean(axis=1)
    playerList = df.groupby(['playerId'])['target_avg'].std().sort_values(ascending=False).reset_index()
    df = filter_by_id(df, playerList['playerId'].head(sample_size))
    df.drop('target_avg', axis=1, inplace=True)

In [24]:
df.playerId.nunique()

1187

In [25]:
# player stats

In [26]:
df.head()

Unnamed: 0,playerId,target1,target2,target3,target4,date,home,pstatsDate,battingOrder__ptvf,groundOuts__ptvf,...,year,inSeason,seasonPart,DOB,mlbDebutDate,birthCountry,weight,primaryPositionName,height,playerBMI
0,405395,0.151872,21.449416,0.112952,22.990196,2018-01-01,,NaT,,,...,2018,False,Offseason,1980-01-16,2001-04-02,Dominican Republic,106.59412,First Base,1.905,29.372661
1,405395,0.136406,10.978585,0.096403,15.669173,2018-01-02,,NaT,,,...,2018,False,Offseason,1980-01-16,2001-04-02,Dominican Republic,106.59412,First Base,1.905,29.372661
2,405395,0.06446,9.22619,0.101464,23.172829,2018-01-03,,NaT,,,...,2018,False,Offseason,1980-01-16,2001-04-02,Dominican Republic,106.59412,First Base,1.905,29.372661
3,405395,0.024217,10.170965,0.102448,35.359116,2018-01-04,,NaT,,,...,2018,False,Offseason,1980-01-16,2001-04-02,Dominican Republic,106.59412,First Base,1.905,29.372661
4,405395,0.157556,4.294307,0.046517,12.347789,2018-01-05,,NaT,,,...,2018,False,Offseason,1980-01-16,2001-04-02,Dominican Republic,106.59412,First Base,1.905,29.372661


In [27]:
list(df.columns)

['playerId',
 'target1',
 'target2',
 'target3',
 'target4',
 'date',
 'home',
 'pstatsDate',
 'battingOrder__ptvf',
 'groundOuts__ptvf',
 'runsScored__ptvf',
 'homeRuns__ptvf',
 'strikeOuts__ptvf',
 'baseOnBalls__ptvf',
 'intentionalWalks__ptvf',
 'hits__ptvf',
 'hitByPitch__ptvf',
 'stolenBases__ptvf',
 'groundIntoTriplePlay__ptvf',
 'plateAppearances__ptvf',
 'totalBases__ptvf',
 'rbi__ptvf',
 'catchersInterference__ptvf',
 'pickoffs__ptvf',
 'gamesPlayedPitching__ptvf',
 'gamesStartedPitching__ptvf',
 'completeGamesPitching__ptvf',
 'winsPitching__ptvf',
 'lossesPitching__ptvf',
 'groundOutsPitching__ptvf',
 'runsPitching__ptvf',
 'strikeOutsPitching__ptvf',
 'atBatsPitching__ptvf',
 'caughtStealingPitching__ptvf',
 'inningsPitched__ptvf',
 'saveOpportunities__ptvf',
 'battersFaced__ptvf',
 'outsPitching__ptvf',
 'balks__ptvf',
 'pickoffsPitching__ptvf',
 'gamesFinishedPitching__ptvf',
 'inheritedRunners__ptvf',
 'catchersInterferencePitching__ptvf',
 'sacBuntsPitching__ptvf',
 'sa

### preprocessing with pipelines

In [28]:
from typing import List, Optional


def forward_fill(df: pd.DataFrame, features: List[str],
                 on='playerId',
                 suffix: Optional[str] = None,
                 limit: Optional[int] = None):
    """Forward-fill missing values of ``features`` separately inside each ``on`` group.

    Parameters
    ----------
    df : input frame; it is not modified, a copy is returned.
    features : names of the columns to forward-fill.
    on : column (or columns) passed to ``groupby``; values never propagate across groups.
    suffix : when ``None`` the filled values overwrite ``features``. Otherwise the
        filled values are written to new columns named ``suffix + feature``
        (note: despite its name, the string is *prepended* to the feature name)
        and the original columns are left untouched.
    limit : maximum number of consecutive missing values to fill, forwarded to ``ffill``.

    Returns
    -------
    A copy of ``df`` holding the forward-filled columns.
    """
    ffilled_df = df.groupby(on)[features].ffill(limit=limit)
    output_features = (features if suffix is None else
                       [suffix + f for f in features])
    outputX = df.copy()
    # Write the columns back one at a time. Converting the whole filled frame to a
    # single NumPy array would upcast mixed dtypes (floats, strings, datetimes) to
    # `object` and lose e.g. the datetime dtype of date columns. `.array` keeps each
    # column's own dtype and is assigned positionally, so the row order is preserved
    # whatever the index looks like.
    for source_name, output_name in zip(features, output_features):
        outputX[output_name] = ffilled_df[source_name].array
    return outputX

In [29]:
def gen_hardcoded_features(df: pd.DataFrame):
    """Derive elapsed-time features from the date columns of ``df``.

    Reads the columns ``date``, ``pstatsDate``, ``DOB``, ``mlbDebutDate`` and
    ``rosterDate`` and adds, in this order:

    - ``pstatsTime``: days between ``date`` and ``pstatsDate``
    - ``playerAge``: years between ``date`` and ``DOB``
    - ``playerTSinceDebut``: years between ``date`` and ``mlbDebutDate``
    - ``playerDebutAge``: years between ``mlbDebutDate`` and ``DOB``
    - ``rostersTime``: days between ``date`` and ``rosterDate``

    A year is taken to be exactly 365 days. ``pstatsDate`` and ``rosterDate`` are
    passed through ``pd.to_datetime`` first, so they may arrive as strings/objects.

    The input frame is left untouched; the features are added to a copy, which is
    returned.
    """
    seconds_per_day = 60 * 60 * 24
    seconds_per_year = seconds_per_day * 365

    # Work on a copy so that calling the function never mutates the caller's frame.
    out = df.copy()

    def _elapsed_seconds(later, earlier):
        # Signed difference between two datetime series, in seconds.
        return (later - earlier).dt.total_seconds()

    out['pstatsTime'] = (_elapsed_seconds(out['date'], pd.to_datetime(out['pstatsDate']))
                         / seconds_per_day)
    out['playerAge'] = _elapsed_seconds(out['date'], out['DOB']) / seconds_per_year
    out['playerTSinceDebut'] = (_elapsed_seconds(out['date'], out['mlbDebutDate'])
                                / seconds_per_year)
    out['playerDebutAge'] = (_elapsed_seconds(out['mlbDebutDate'], out['DOB'])
                             / seconds_per_year)
    out['rostersTime'] = (_elapsed_seconds(out['date'], pd.to_datetime(out['rosterDate']))
                          / seconds_per_day)
#     out['AwardWinner'] = out['awardId'].isna().astype(np.float64)
    return out

In [30]:
def fillna(df: pd.DataFrame, fill_value=0):
    """Return ``df`` with every missing entry replaced by ``fill_value`` (``0`` by default)."""
    filled = df.fillna(value=fill_value)
    return filled

In [31]:
from pipeline.core import FunctionTransformer, Pipeline, PdColumnTransformer, PdFeatureUnion
from pipeline.continuous import PdScaleNorm, FilterContinuousFeatures, MedianFillNaN
from pipeline.stats import StatisticGen, LagGen, FeaturesTable
from pipeline.categories import Categorify
from pipeline.season import join_season_info
from pipeline.players import join_players_info

In [32]:
last_ffill_features = ['gameType',
 'isTie',
 'gamesInSeries',
 'playerTeamWins',
 'playerTeamLosses',
 'playerTeamWinPct',
 'playerTeamWinner',
 'playerTeamScore',
 'opponentTeamWins',
 'opponentTeamLosses',
 'opponentTeamWinPct',
 'opponentTeamScore',
 'teamScoreDiff',
 'opponentTeamId']
                          
                          
inplace_ffill_features = ['numberOfFollowers',
                          'teamFollowers',
                          'status',
                          'teamId',
                          'rosterDate',
                          'pstatsDate']

In [33]:
# transformer for filling nan values with past valid values
# transformer for generating some hardcoded features
inplace_fill_forward_tmf = FunctionTransformer(forward_fill,
                                               kw_args={'features': inplace_ffill_features})
last_fill_forward_tmf = FunctionTransformer(forward_fill,
                                            kw_args={'features': last_ffill_features,
                                                     'suffix': 'last_'})
join_players_tmf = FunctionTransformer(join_players_info, kw_args={'path_to_players': RAW_PATH / 'players.csv'})
harcoded_feat_tmf = FunctionTransformer(gen_hardcoded_features)

In [34]:
# creating pipeline
ffill_and_gen_harcoded_feat = Pipeline([('inplace_fill_forward', inplace_fill_forward_tmf),
                                      #  ('last_fill_forward', last_fill_forward_tmf),
                                       ('hardcoded_feat', harcoded_feat_tmf)])

In [35]:
# features to use for target preprocessing
target_cols = ['target1', 'target2', 'target3', 'target4']
index_cols = ['date', 'playerId']
pp_cols = target_cols + index_cols

In [36]:
# categories encoding
categories = ['playerId', 'seasonPart', 'year', 'primaryPositionName',
              'status', 'teamId',
              'opponentTeamId', 'home', 'seriesDescription', 'currentGameInSeries']

categories_tmf = PdColumnTransformer([(category, Categorify(add_nan=True), category)
                                       for category in categories])

In [37]:
# for the statistics, we will compute them once and then reuse them
stats_gen_tmf = StatisticGen(stats=['mean', 'median', 'std'], windows=[90], drop_index=False)
target_stats_tmf = PdColumnTransformer([('gen_stats', stats_gen_tmf, pp_cols)])
target_stats_table = target_stats_tmf.fit_transform(df)


In [38]:
target_stats_table.head()

Unnamed: 0,playerId,date,target1__mean__90d,target1__median__90d,target1__std__90d,target2__mean__90d,target2__median__90d,target2__std__90d,target3__mean__90d,target3__median__90d,target3__std__90d,target4__mean__90d,target4__median__90d,target4__std__90d
0,405395,2018-01-01,0.151872,0.151872,,21.449417,21.449417,,0.112952,0.112952,,22.990196,22.990196,
1,405395,2018-01-02,0.144139,0.144139,0.010936,16.214001,16.214001,7.403996,0.104677,0.104677,0.011702,19.329685,19.329685,5.176745
2,405395,2018-01-03,0.11758,0.136406,0.046648,13.88473,10.978585,6.609544,0.103606,0.101464,0.00848,20.610733,22.990196,4.28049
3,405395,2018-01-04,0.094239,0.100433,0.060248,12.956289,10.574775,5.707194,0.103317,0.101956,0.006948,24.297829,23.081512,8.160501
4,405395,2018-01-05,0.106902,0.136406,0.059365,11.223893,10.170965,6.279732,0.091957,0.101464,0.026104,21.90782,22.990196,8.860362


In [39]:
## merge computed statistics
reuse_computed_stats = Pipeline([('merge_stats', FeaturesTable(target_stats_table, lags=[60], clip_max=True)),
                                 ('fillnan', FunctionTransformer(fillna)),
                                 ('normalize', PdScaleNorm())
                                ])

In [40]:
# for continuous features, we will filter them and then fill nan values and normalize them

median_fillnan_columns = ['numberOfFollowers', 'teamFollowers',
                          'playerDebutAge', 'playerAge', 'playerTSinceDebut']
fillnan_median_tmf = MedianFillNaN(median_fillnan_columns)

# get the rest of the continuous features
continuous_feat_tmf = Pipeline([('get_cont_feat', FilterContinuousFeatures(ignore_features=categories +
                                                                           pp_cols +
                                                                           ['target_avg'])),
                                ('fillnan_median', fillnan_median_tmf),
                                ('fillnan', FunctionTransformer(fillna)),
                                ('normalize', PdScaleNorm())
                               ])

In [41]:
# putting all features generation steps together
all_features_tmf = PdFeatureUnion([('stats', reuse_computed_stats), 
                                   ('cont_feat', continuous_feat_tmf), 
                                  ('cats', categories_tmf),
                                  ])

In [42]:
# main pipeline with all steps 
pipeline = Pipeline([('core_features', ffill_and_gen_harcoded_feat),
                     ('features_generation', all_features_tmf)])

In [43]:
pipeline

Pipeline(steps=[('core_features',
                 Pipeline(steps=[('inplace_fill_forward',
                                  FunctionTransformer(func=<function forward_fill at 0x7f1ba1debca0>,
                                                      kw_args={'features': ['numberOfFollowers',
                                                                            'teamFollowers',
                                                                            'status',
                                                                            'teamId',
                                                                            'rosterDate',
                                                                            'pstatsDate']})),
                                 ('hardcoded_feat',
                                  FunctionTransformer(func=<function gen_hardcoded_features at 0x7f1b9c3f50d0>))])),
                ('features_...
                                                            

In [44]:
from data.split import SplitData

In [45]:
# split the data at 2021-04-01 with test_days=500 (presumably the maximum number of days kept after the split date for validation -- confirm in SplitData)
sp = SplitData("2021-04-01", test_days=500)

In [46]:
train_idx = sp.train_idx(df)
valid_idx = sp.valid_idx(df)

In [47]:
%%time
# train the pipeline only with the train dataset
pipeline.fit(sp.filter(df, train_idx))

CPU times: user 15.4 s, sys: 2.39 s, total: 17.8 s
Wall time: 17.8 s


Pipeline(steps=[('core_features',
                 Pipeline(steps=[('inplace_fill_forward',
                                  FunctionTransformer(func=<function forward_fill at 0x7f1ba1debca0>,
                                                      kw_args={'features': ['numberOfFollowers',
                                                                            'teamFollowers',
                                                                            'status',
                                                                            'teamId',
                                                                            'rosterDate',
                                                                            'pstatsDate']})),
                                 ('hardcoded_feat',
                                  FunctionTransformer(func=<function gen_hardcoded_features at 0x7f1b9c3f50d0>))])),
                ('features_...
                                                            

In [48]:
%%time
# transform all dataset
features = pipeline.transform(df)

CPU times: user 15.7 s, sys: 2.61 s, total: 18.4 s
Wall time: 18.4 s


In [49]:
pd.options.display.max_rows = len(features.columns)
features.T.head(len(features.columns))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1535968,1535969,1535970,1535971,1535972,1535973,1535974,1535975,1535976,1535977
target1__mean__90d__60lag,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.005075,0.005079,0.00511,0.00515,0.005157,0.005157,0.005158,0.005159,0.005159,0.00516
target1__median__90d__60lag,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000189,0.000232,0.000272,0.000279,0.00029,0.00029,0.00029,0.00029,0.00029,0.00029
target1__std__90d__60lag,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.025953,0.025952,0.025943,0.025932,0.02593,0.02593,0.025929,0.025929,0.025929,0.025929
target2__mean__90d__60lag,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.083716,0.083537,0.083403,0.082961,0.082719,0.082305,0.082099,0.081926,0.081724,0.081436
target2__median__90d__60lag,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.04477,0.04477,0.04477,0.04477,0.04477,0.04477,0.04477,0.04477,0.04477,0.04477
target2__std__90d__60lag,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.224969,0.225145,0.225285,0.225647,0.225866,0.226191,0.226384,0.226552,0.226749,0.227012
target3__mean__90d__60lag,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.006548,0.006559,0.006561,0.006564,0.006566,0.006567,0.006568,0.006569,0.00657,0.006233
target3__median__90d__60lag,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000459,0.000497,0.000497,0.000497,0.000497,0.000497,0.000497,0.000497,0.000497,0.000459
target3__std__90d__60lag,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.02443,0.024422,0.024419,0.024417,0.024415,0.024414,0.024414,0.024413,0.024412,0.024027
target4__mean__90d__60lag,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.017479,0.017467,0.017419,0.017352,0.01732,0.017233,0.017164,0.017056,0.017061,0.017044


In [50]:
# change the dtypes of the target (pytorch likes float32 and not float64)
df[target_cols] = df[target_cols].astype(np.float32)

In [51]:
train_idx.shape

(1535978,)

In [52]:
train_idx.shape

(1535978,)

In [53]:
train_data = sp.filter(features, train_idx)

raw_train_df = sp.filter(df, train_idx)

In [54]:
valid_data = sp.filter(features, valid_idx)
raw_valid_df = sp.filter(df, valid_idx)

In [55]:
train_data.shape, raw_train_df.shape

((1407782, 102), (1407782, 98))

In [56]:
valid_data.shape, raw_valid_df.shape

((128196, 102), (128196, 98))

In [57]:
# add the target to features ds
train_data[target_cols + ['date']] = raw_train_df[target_cols + ['date']]
valid_data[target_cols + ['date']] = raw_valid_df[target_cols + ['date']]

### modeling 

In [66]:
time_features = ['battingOrder', 'gamesPlayedBatting', 'flyOuts', 'groundOuts',
       'runsScored', 'doubles', 'triples', 'homeRuns', 'strikeOuts',
       'baseOnBalls', 'intentionalWalks', 'hits', 'hitByPitch', 'atBats',
       'caughtStealing', 'stolenBases', 'groundIntoDoublePlay',
       'groundIntoTriplePlay', 'plateAppearances', 'totalBases', 'rbi',
       'leftOnBase', 'sacBunts', 'sacFlies', 'catchersInterference',
       'pickoffs', 'gamesPlayedPitching', 'gamesStartedPitching',
       'completeGamesPitching', 'shutoutsPitching', 'winsPitching',
       'lossesPitching', 'flyOutsPitching', 'airOutsPitching',
       'groundOutsPitching', 'runsPitching', 'doublesPitching',
       'triplesPitching', 'homeRunsPitching', 'strikeOutsPitching',
       'baseOnBallsPitching', 'intentionalWalksPitching', 'hitsPitching',
       'hitByPitchPitching', 'atBatsPitching', 'caughtStealingPitching',
       'stolenBasesPitching', 'inningsPitched', 'saveOpportunities',
       'earnedRuns', 'battersFaced', 'outsPitching', 'pitchesThrown', 'balls',
       'strikes', 'hitBatsmen', 'balks', 'wildPitches', 'pickoffsPitching',
       'rbiPitching', 'gamesFinishedPitching', 'inheritedRunners',
       'inheritedRunnersScored', 'catchersInterferencePitching',
       'sacBuntsPitching', 'sacFliesPitching', 'saves', 'holds', 'blownSaves',
       'assists', 'putOuts', 'errors', 'chances', 'isTie',
       'gamesInSeries', 'playerTeamWins', 'playerTeamLosses',
       'playerTeamWinPct', 'playerTeamScore', 'opponentTeamWins',
       'opponentTeamLosses', 'opponentTeamWinPct', 'opponentTeamScore',
       'teamScoreDiff']

time_features = [f2 for f2 in features.columns
                 if any([f2.startswith(f) for f in time_features ])]

In [67]:
static_features = features.columns.drop(categories + time_features).to_list()

In [68]:
static_features

['target1__mean__90d__60lag',
 'target1__median__90d__60lag',
 'target1__std__90d__60lag',
 'target2__mean__90d__60lag',
 'target2__median__90d__60lag',
 'target2__std__90d__60lag',
 'target3__mean__90d__60lag',
 'target3__median__90d__60lag',
 'target3__std__90d__60lag',
 'target4__mean__90d__60lag',
 'target4__median__90d__60lag',
 'target4__std__90d__60lag',
 'SLG__ptvf',
 'SLG__date__maxNorm__ptvf',
 'totalGamesVsoppTeam',
 'playerTeamWintPctHist',
 'numberOfFollowers',
 'awardCount',
 'teamFollowers',
 'playerTeamWinpct',
 'playerTeamDivisionchamp',
 'playerTeamDivisionleader',
 'playerTeamLasttenwins',
 'playerTeamLasttenlosses',
 'playerTeamXwinlosspct',
 'playerTeamHomewinpct',
 'playerTeamAwaywinpct',
 'playerTeamHomewinpct__Date__Ranked',
 'playerTeamAwaywinpct__Date__Ranked',
 'playerTeamWinpct__Date__Ranked',
 'weight',
 'height',
 'playerBMI',
 'pstatsTime',
 'playerAge',
 'playerTSinceDebut',
 'playerDebutAge',
 'rostersTime']

In [69]:
categories

['playerId',
 'seasonPart',
 'year',
 'primaryPositionName',
 'status',
 'teamId',
 'opponentTeamId',
 'home',
 'seriesDescription',
 'currentGameInSeries']

In [70]:
from omegaconf import OmegaConf
hp = {'bptt': 1,
      'epochs': 10,
      'batch_size': 512,
      'max_emb_sz': 20,
      'hidden_dim': 16,
      'encoder_dim': 128,
      'emb_dropout': 0.2,
      'dropout': 0.2,
      'lr': 0.001,
      'wd': 3e-5}

real_data_config = {'hp': hp,
                    'seed': 2021,
         'static_features': static_features,
          'categories': categories,
          'time_features': time_features,
          'target_cols': target_cols}
real_data_config = OmegaConf.create(real_data_config) 

In [71]:
from train.lstm import run_lstm

  rank_zero_deprecation(


In [72]:
output = run_lstm(real_data_config, train_data, valid_data)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores

  | Name         | Type           | Params
------------------------------------------------
0 | emb          | EmbeddingLayer | 24.2 K
1 | net          | LSTM           | 4.6 K 
2 | output_layer | Sequential     | 13.6 K
------------------------------------------------
42.4 K    Trainable params
0         Non-trainable params
42.4 K    Total params
0.170     Total estimated model params size (MB)


LstmModel(
  (emb): EmbeddingLayer(
    (dropout): Dropout(p=0.2, inplace=False)
    (emb): ModuleDict(
      (playerId): Embedding(1188, 20)
      (seasonPart): Embedding(8, 2)
      (year): Embedding(5, 2)
      (primaryPositionName): Embedding(10, 3)
      (status): Embedding(15, 3)
      (teamId): Embedding(32, 5)
      (opponentTeamId): Embedding(34, 5)
      (home): Embedding(4, 1)
      (seriesDescription): Embedding(8, 2)
      (currentGameInSeries): Embedding(9, 2)
    )
  )
  (net): LSTM(54, 16, batch_first=True)
  (output_layer): Sequential(
    (0): Linear(in_features=99, out_features=128, bias=True)
    (1): ReLU()
    (2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): Dropout(p=0.2, inplace=False)
    (4): Linear(in_features=128, out_features=4, bias=True)
  )
)


Validation sanity check: 0it [00:00, ?it/s]

  value = torch.tensor(value, device=device, dtype=torch.float)


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

In [73]:
from train.core import predict_recursive, Ensemble
ensemble = Ensemble([output],
                    pipeline)

In [74]:
# %%time
# raw_train_fr = raw_train_df[df.columns].copy()

# recursive_prediction = []
# for date, val_date_df in raw_valid_df.groupby('date'):
#     val_date_df = val_date_df[df.columns].reset_index(drop=True)
#     _recurive_pred, raw_train_fr = predict_recursive(val_date_df,
#                                                      raw_train_fr,
#                                                      ensemble,
#                                                      n_days=90 + 60 + 10,
#                                                      target_cols=target_cols)
#     recursive_prediction.append(_recurive_pred)

# recursive_prediction = pd.concat(recursive_prediction, ignore_index=True)
# recursive_prediction.sort_values(['playerId', 'date'], inplace=True)
# recursive_prediction.reset_index(drop=True, inplace=True)
# recursive_prediction.rename(columns={f: f.replace('target', 'yhat')
#                                      for f in target_cols}, inplace=True)
# recursive_prediction[target_cols] = raw_valid_df[target_cols].to_numpy()

In [75]:
from evaluate.metrics import compute_metrics

In [77]:
# Build the prediction frame on the same index as `raw_valid_df`, so that copying
# the target/date/playerId columns below aligns row by row even if the filtered
# frame does not carry a fresh 0..n-1 index.
# NOTE(review): assumes `output.prediction` rows are in the same order as the rows
# of `raw_valid_df` -- confirm against run_lstm.
valid_test_prediction = pd.DataFrame(output.prediction,
                                     columns=['yhat1', 'yhat2', 'yhat3', 'yhat4'],
                                     index=raw_valid_df.index)

valid_test_prediction[pp_cols] = raw_valid_df[pp_cols]

In [78]:
is_test_pred = valid_test_prediction['date'] >= '2021-05-01'

valid_prediction, test_prediction = (valid_test_prediction.loc[~is_test_pred, :],
                                     valid_test_prediction.loc[is_test_pred, :])

In [79]:
# valid_rc_prediction, test_rc_prediction = (recursive_prediction.loc[~is_test_pred, :],
#                                            recursive_prediction.loc[is_test_pred, :])

In [80]:
# metrics over the whole hold-out period (validation + test parts together)
compute_metrics(valid_test_prediction)

{'mae_mean': 1.3350189,
 'mae_1': 1.0669042,
 'mae_2': 2.0582983,
 'mae_3': 0.85222656,
 'mae_4': 1.3626462}

In [81]:
# metrics on the validation part only: hold-out dates before 2021-05-01
compute_metrics(valid_prediction)

{'mae_mean': 1.5263585,
 'mae_1': 1.1786147,
 'mae_2': 2.3978095,
 'mae_3': 0.9377938,
 'mae_4': 1.5912162}

In [83]:
compute_metrics(test_prediction[test_prediction['date'].between('2021-05-01', '2021-06-01')])

{'mae_mean': 1.3665464,
 'mae_1': 1.0967832,
 'mae_2': 2.1868427,
 'mae_3': 0.883537,
 'mae_4': 1.2990229}

In [82]:
# metrics on the test part only: hold-out dates from 2021-05-01 onwards
compute_metrics(test_prediction)

{'mae_mean': 1.2614267,
 'mae_1': 1.0239385,
 'mae_2': 1.9277172,
 'mae_3': 0.81931615,
 'mae_4': 1.2747346}

from omegaconf import OmegaConf
hp = {'bptt': 14,
      'epochs': 10,
      'batch_size': 512,
      'max_emb_sz': 20,
      'hidden_dim': 16,
      'encoder_dim': 128,
      'emb_dropout': 0.2,
      'dropout': 0.2,
      'lr': 0.001,
      'wd': 3e-5}

{'mae_mean': 1.4912145,
 'mae_1': 1.1684885,
 'mae_2': 2.2605271,
 'mae_3': 0.9525842,
 'mae_4': 1.5832579}

In [None]:
# # metrics from 2021-03-01 up to 2021-04-30
# compute_metrics(recursive_prediction)

In [None]:
# # metrics from 2021-03-01 up to 2021-03-31
# compute_metrics(valid_rc_prediction)

In [None]:
# # metrics from 2021-04-01 up to 2021-04-30
# compute_metrics(test_rc_prediction)

In [None]:
def plot_pred(df, playerId, index=1):
    """Plot one target and its prediction over time for a single player.

    Parameters
    ----------
    df : frame with the columns ``playerId``, ``date``, ``target<index>`` and
        ``yhat<index>``.
    playerId : id of the player whose rows are plotted.
    index : number of the target to plot; selects ``target<index>`` / ``yhat<index>``.
    """
    pdf = df.query(f'playerId == {playerId}')

    fig, ax = plt.subplots(1, 1, figsize=(20, 5))

    tname = f'target{index}'
    yhat_name = f'yhat{index}'

    ax.plot('date', tname, '-o', data=pdf, label=tname)
    # Label the prediction curve with its own name; it used to reuse the target's
    # label, which made the two legend entries indistinguishable.
    ax.plot('date', yhat_name, '-o', data=pdf, label=yhat_name)

    ax.legend()
    

In [None]:
def feature_importance(features, model):
    """Tabulate the feature importances of one fitted model.

    Parameters
    ----------
    features : feature names, in the same order as ``model.feature_importances_``.
    model : object exposing a ``feature_importances_`` attribute.

    Returns
    -------
    DataFrame with the columns ``feature`` and ``importance``, sorted by
    decreasing importance.
    """
    fi = pd.DataFrame({'feature': features, 'importance': model.feature_importances_})
    return fi.sort_values(by='importance', ascending=False)


def compute_fi(features, models):
    """Compare feature importances across several per-target models.

    ``models[i]`` is reported under the column ``target{i + 1}``. Each column is
    rescaled so that it sums to 100, and a ``mean`` column averages the rescaled
    importances across targets.

    Returns
    -------
    DataFrame indexed by feature name with one column per target plus ``mean``,
    sorted by decreasing ``mean``.
    """
    per_target = [feature_importance(features, _model).assign(target=f'target{target + 1}')
                  for target, _model in enumerate(models)]
    fi = pd.concat(per_target)
    # Keyword arguments are required here: pandas 2.0 no longer accepts
    # index/columns/values positionally in DataFrame.pivot.
    fi = fi.pivot(index='feature', columns='target', values='importance')
    # Express every target's importances as a percentage of that target's total.
    fi = fi * 100 / fi.sum()
    fi['mean'] = fi.mean(axis=1)
    return fi.sort_values(by='mean', ascending=False)

In [None]:
for index in range(1, 5):
    plot_pred(valid_test_prediction, playerId=660271, index=index)

In [None]:
model = output.model

In [None]:
from data.dataset import PlayerDataset

In [None]:
_index = (raw_valid_df['playerId']== 660271)
inference_data = valid_data[_index].reset_index(drop=True)

In [None]:
from torch.utils.data import DataLoader
from copy import deepcopy

In [None]:
valid_ds = PlayerDataset.from_df(inference_data, 
                                      features=static_features,
                                      time_features=time_features,
                                      categories=categories,
                                      target=target_cols,
                                      bptt=hp['bptt'])
valid_dl = DataLoader(valid_ds, batch_size=512,
                     shuffle=False, num_workers=4)

In [None]:
batch = next(iter(valid_dl))
hidden_state = model.init_hidden_state(len(batch['features']))

In [None]:
timeft, (final_state, cell_state) = model.net(batch['timeft'],hidden_state)

In [None]:
timeft = timeft.detach().cpu().numpy()

In [None]:
no_past_batch = deepcopy(batch)

In [None]:
no_past_batch['timeft'] = no_past_batch['timeft'][:, [-1], :]

In [None]:
prediction = model(**batch)
no_past_prediction = model(**no_past_batch)

In [None]:
prediction.shape, no_past_prediction.shape

In [None]:
prediction = prediction.detach().cpu().numpy()
no_past_prediction = no_past_prediction.detach().cpu().numpy()

In [None]:
def plot_single_batch(batch_ft, raw_ft):
    """Placeholder that is not implemented yet: it accepts both arguments and does nothing."""
    pass

In [None]:
batch_index = -1

In [None]:
batch_ft = timeft[batch_index]

In [None]:
raw_ft = batch['timeft'][batch_index]

In [None]:
plt.figure(figsize=(20, 5))
for day in range(raw_ft.shape[0]):
    plt.plot(raw_ft[:, day])

In [None]:
plt.figure(figsize=(20, 5))
for day in range(batch_ft.shape[0]):
    plt.plot(batch_ft[:, day])

In [None]:
for i in range(4):
    plt.figure(figsize=(20, 5))
    plt.plot(batch['target'].detach().cpu().numpy()[:, i], '-o',
             label='target')
    plt.plot(prediction[:, i], '-o',
             label='with past')
    plt.plot(no_past_prediction[:, i],
             '-o',
             label='no past')
    plt.legend()

In [None]:
# is_playing = raw_valid_df.loc[:, time_features].drop('AwardWinner', axis=1).std(axis=1).notna()

In [None]:
# not_playing = raw_valid_df.loc[:, time_features].drop('AwardWinner', axis=1).isna().all(axis=1)

In [None]:
# compute_metrics(recursive_prediction.loc[not_playing])

In [None]:
# compute_metrics(recursive_prediction.loc[~not_playing])