# Overview
- LightGBM

In [9]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import mean_absolute_error
from datetime import timedelta
from tqdm.notebook import tqdm
import lightgbm as lgb
from typing import List, Union, Optional
import time
from contextlib import contextmanager
import sys
import logging
import category_encoders as ce
import plotly
from plotly import express as px
# import mlb

# Let pandas display up to 200 columns before truncating wide frames.
pd.options.display.max_columns = 200

## Config

In [2]:
class CFG:
    """Notebook-wide settings: data locations, key/target column names and LightGBM parameters."""

    # ---------------- global ----------------
    # Directory holding the competition tables read by MLBDataset.
    INPUT_DIR = "../input/mlb-player-digital-engagement-forecasting"
    # INPUT_DIR = "../input/mlb-unnested-dataset" # for kaggle kernel
    # Default join key used by merge_by_key.
    OBJECT_ID = ["playerId", "date"]
    # One model is trained per target column.
    TARGETS = ["target1", "target2", "target3", "target4"]

    # ---------------- model ----------------
    LGB_PARAMS = dict(
        objective="mae",
        boosting_type="gbdt",
        learning_rate=0.1,
        max_depth=6,
        # 80% of the leaves a full tree of depth 6 would have (evaluates to 51).
        num_leaves=int(2 ** 6 * 0.8),
        lambda_l1=0.1,
        lambda_l2=0.1,
        bagging_fraction=0.9,
        bagging_freq=3,
        feature_fraction=0.9,
        min_data_in_leaf=20,
        num_threads=8,
        verbosity=-1,
        num_iterations=10000,
        early_stopping_round=100,
    )
    # One set of models is trained per seed.
    SEEDS = [2434]
    MODEL_PATH = "../input/mlb-nb012-lgb-weights"

## Utils

In [3]:
def get_logger(out_file=None):
    """Reconfigure the root logger for INFO output on stdout and, optionally, a file.

    Args:
        out_file: path of a log file; when not None the records are also written there.

    Returns:
        The root logger, carrying only the handlers attached by this call.
    """
    root = logging.getLogger()
    fmt = logging.Formatter("[%(asctime)s] [%(levelname)s] [%(message)s]")
    # Drop whatever handlers were attached before so repeated calls do not duplicate output.
    root.handlers = []
    root.setLevel(logging.INFO)

    def _attach(handler):
        # Every sink shares the same format and the INFO threshold.
        handler.setFormatter(fmt)
        handler.setLevel(logging.INFO)
        root.addHandler(handler)

    # Console sink.
    _attach(logging.StreamHandler(sys.stdout))

    # Optional file sink.
    if out_file is not None:
        _attach(logging.FileHandler(out_file))

    root.info("logger set up")
    return root


@contextmanager
def timer(name: str, logger: Optional[logging.Logger] = None):
    """Context manager that reports when the wrapped block starts and how long it took.

    Args:
        name: label placed inside ``<...>`` in both messages.
        logger: when given, messages go to ``logger.info``; otherwise they are printed.

    The "done" message is only emitted when the wrapped block finishes without raising.
    """
    emit = print if logger is None else logger.info
    started_at = time.time()
    emit(f"<{name}> start")
    yield

    emit(f"<{name}> done in {time.time() - started_at:.2f} s")


def reduce_mem_usage(df, verbose=True):
    """Narrow the numeric columns of ``df`` in place where their value range allows it.

    For each column whose dtype is listed in ``numerics`` the min/max are compared
    against the limits of progressively wider "probe" dtypes. The column is stored
    one step wider than the first probe that fits (values inside the int8 range are
    stored as int16, values inside the float16 range as float32, ...), which keeps
    the headroom the original cascade had. A cast is only applied when the chosen
    dtype is strictly narrower than the current one, so the function never widens
    a column.

    Args:
        df: frame to shrink; it is modified in place.
        verbose: print the resulting size and the percentage saved.

    Returns:
        The same ``df`` object.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # (probe dtype whose range is tested, dtype used for storage when the probe fits)
    int_tiers = ((np.int8, np.int16), (np.int16, np.int32), (np.int32, np.int64))
    float_tiers = ((np.float16, np.float32), (np.float32, np.float64))

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            tiers, limits = int_tiers, np.iinfo
        else:
            tiers, limits = float_tiers, np.finfo

        target = None
        for probe, storage in tiers:
            bounds = limits(probe)
            # Strict comparisons: NaN min/max (empty or all-NaN column) never match.
            if c_min > bounds.min and c_max < bounds.max:
                target = np.dtype(storage)
                break

        # Only ever shrink; leaving the column alone beats widening it.
        if target is not None and target.itemsize < col_type.itemsize:
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a frame that reports zero bytes.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

## Loading dataset

In [4]:
class MLBDataset(object):
    """Reads every competition table found under ``input_path`` into attributes.

    NOTE(review): the tables are loaded with ``pd.read_pickle``; unpickling can
    execute arbitrary code, so only point this at trusted files.
    """

    def __init__(self, input_path):
        root = Path(input_path)
        self.input_path = root

        def load(file_name):
            # All pickled tables live directly under the input directory.
            return pd.read_pickle(root / file_name)

        # ---- train tables ----
        self.train_next = load("train_nextDayPlayerEngagement.pickle")
        self.train_rosters = load("train_rosters.pickle")
        self.train_ptf = load("train_playerTwitterFollowers.pickle")
        self.train_scores = load("train_playerBoxScores.pickle")
        self.train_games = load("train_games.pickle")
        self.train_standings = load("train_standings.pickle")
        self.train_tbs = load("train_teamBoxScores.pickle")
        self.train_ttf = load("train_teamTwitterFollowers.pickle")
        self.train_trans = load("train_transactions.pickle")
        self.train_awards = load("train_awards.pickle")
        self.train_events = load("train_events.pickle")

        # ---- additional tables ----
        self.players = load("players.pickle")
        self.awards = load("awards.pickle")
        self.seasons = load("seasons.pickle")
        self.teams = load("teams.pickle")

        # ---- test-time examples (CSV) ----
        self.example_test = pd.read_csv(root / "example_test.csv")
        self.sample_submission = pd.read_csv(root / "example_sample_submission.csv")

# Module-level dataset instance; the feature blocks below read their source tables from it.
mlb_train_ds = MLBDataset(CFG.INPUT_DIR)

## Feature blocks

In [150]:
def merge_by_key(left: Union[pd.DataFrame, pd.Series], right: pd.DataFrame, on=CFG.OBJECT_ID) -> pd.DataFrame:
    """Left-join ``right`` onto the key column(s) of ``left`` and keep only the joined columns.

    Args:
        left: frame whose ``on`` column(s) drive the join; a Series is used as-is.
        right: frame providing the key(s) plus the columns to attach.
        on: key column name or list of names (defaults to ``CFG.OBJECT_ID``).

    Returns:
        The merge result with the ``on`` column(s) dropped.
    """
    keys = left if isinstance(left, pd.Series) else left[on]
    joined = pd.merge(keys, right, on=on, how="left")
    return joined.drop(columns=on)


class BaseBlock(object):
    """Interface shared by the feature blocks.

    ``fit`` may learn state and returns the features for ``input_df``;
    ``transform`` returns the features using whatever state already exists.
    """

    def fit(self, input_df: pd.DataFrame, y=None) -> pd.DataFrame:
        """Default for stateless blocks: delegate straight to ``transform``."""
        return self.transform(input_df)

    def transform(self, input_df: pd.DataFrame) -> pd.DataFrame:
        """Build the features for ``input_df``.

        Raises:
            NotImplementedError: always; subclasses must override this method.
        """
        # Raise (not return) so a missing override fails loudly at the call site.
        raise NotImplementedError()


# class LagBlock(BaseBlock):
#     def __init__(self, column: str, periods: List[int]):
#         self.column = column
#         self.periods = periods

#     def fit(self, input_df: pd.DataFrame, y=None):
#         agg_list = [input_df.groupby(["playerId"], as_index=False)[self.column].transform(lambda x: x.shift(periods=period)).add_prefix(f"{period}_") for period in self.periods]
#         self.agg_df = pd.concat(agg_list, axis=1)

#         return self.transform(input_df)

#     def transform(self, input_df: pd.DataFrame):
#         return self.agg_df.add_prefix("Lag_")


def quantile25(x: pd.Series):
    """Return the first quartile (0.25 quantile) of ``x``; usable as a named aggregation function."""
    return x.quantile(0.25)


def quantile75(x: pd.Series):
    """Return the third quartile (0.75 quantile) of ``x``; usable as a named aggregation function."""
    return x.quantile(0.75)


class TargetAggregateBlock(BaseBlock):
    """Per-player summary statistics of every target column in ``mlb_train_ds.train_next``.

    The statistics are computed over all rows of ``train_next``; ``input_df`` only
    supplies the ``playerId`` values the result is joined onto.
    """

    def fit(self, input_df: pd.DataFrame, y=None):
        """Compute the per-player aggregates, cache them in ``self.agg_df`` and return the features."""
        stat_funcs = ["mean", "std", "max", "min", "median", quantile25, quantile75]

        per_target = []
        for target in CFG.TARGETS:
            by_player = mlb_train_ds.train_next.groupby(["playerId"])[target]
            # Columns are named "<target>_<statistic>".
            per_target.append(by_player.agg(stat_funcs).add_prefix(f"{target}_"))

        self.agg_df = pd.concat(per_target, axis=1).reset_index()

        return self.transform(input_df)

    def transform(self, input_df: pd.DataFrame):
        """Attach the cached per-player aggregates to ``input_df`` by ``playerId``."""
        return merge_by_key(input_df, self.agg_df, on="playerId")


class PlayersLabelEncodingBlock(BaseBlock):
    """Ordinal-encodes selected columns of ``mlb_train_ds.players`` and joins them by ``playerId``.

    Args:
        columns: names of the player-table columns to encode.

    Attributes set here: ``columns`` (requested columns plus "playerId" and its
    copy "playerid"), ``labeled_df`` (raw values until ``fit`` runs, encoded
    afterwards) and ``encoder``.
    """

    def __init__(self, columns: List[str]):
        self.columns = columns + ["playerId"]
        # Untouched source values; fit() always encodes from this frame so that
        # calling fit() more than once gives the same result every time.
        self._raw_df = mlb_train_ds.players[self.columns].copy()
        # "playerid" duplicates the key so the id survives merge_by_key, which drops
        # the "playerId" join column.
        self._raw_df["playerid"] = self._raw_df["playerId"]
        self.columns = self.columns + ["playerid"]
        self.labeled_df = self._raw_df.copy()
        self.encoder = None

    def fit(self, input_df: pd.DataFrame, y=None):
        """Fit the encoder on the raw player table, store the encoded table and return the features."""
        # "value" is the documented option (the former "values" is not) and matches
        # RostersLabelEncodingBlock.
        self.encoder = ce.OrdinalEncoder(handle_unknown="value", handle_missing="value")
        self.encoder.fit(self._raw_df[self.columns])

        encoded = self._raw_df.copy()
        encoded[self.columns] = self.encoder.transform(self._raw_df[self.columns])
        # NOTE(review): the join below relies on "playerId" coming out of the encoder
        # unchanged (presumably because it is numeric) — confirm.
        self.labeled_df = encoded

        return self.transform(input_df)

    def transform(self, input_df: pd.DataFrame):
        """Join the stored table onto ``input_df`` by ``playerId`` and prefix the columns."""
        return merge_by_key(input_df, self.labeled_df, on="playerId").add_prefix("Players_LE_")


class RostersLabelEncodingBlock(BaseBlock):
    """Ordinal-encodes selected columns of ``mlb_train_ds.train_rosters`` and joins them by ``CFG.OBJECT_ID``.

    Args:
        columns: roster columns to encode.
    """

    def __init__(self, columns: List[str]):
        self.encoder = None
        self.columns = columns

    def fit(self, input_df: pd.DataFrame, y=None):
        """Fit the ordinal encoder on the roster columns, then return the features."""
        encoder = ce.OrdinalEncoder(handle_unknown="value", handle_missing="value")
        self.encoder = encoder
        encoder.fit(mlb_train_ds.train_rosters[self.columns])

        return self.transform(input_df)

    def transform(self, input_df: pd.DataFrame):
        """Encode the current roster table with the fitted encoder and join it onto ``input_df``."""
        # The roster table is re-read on every call, so rows added to it later are picked up.
        roster = mlb_train_ds.train_rosters[CFG.OBJECT_ID + self.columns].copy()
        self.labeled_df = roster
        roster[self.columns] = self.encoder.transform(roster[self.columns])

        features = merge_by_key(input_df, self.labeled_df)
        return features.add_prefix("Rosters_LE_")

    
class PlayerBoxScoresCountBlock(BaseBlock):
    """Sums selected player box-score columns per ``CFG.OBJECT_ID`` key.

    Args:
        columns: columns of ``mlb_train_ds.train_scores`` to sum.
    """

    def __init__(self, columns: List[str]):
        self.columns = columns

    def transform(self, input_df: pd.DataFrame):
        """Aggregate the box scores, cache them in ``self.agg_df`` and join them onto ``input_df``."""
        summed = mlb_train_ds.train_scores.groupby(CFG.OBJECT_ID)[self.columns].sum()
        self.agg_df = summed.reset_index()

        features = merge_by_key(input_df, self.agg_df)
        return features.add_prefix("PlayerBoxScoresCount_")


class TeamBoxScoresCountBlock(BaseBlock):
    """Sums selected team box-score columns per (teamId, date) and attaches them via the roster.

    Args:
        columns: columns of ``mlb_train_ds.train_tbs`` to sum.
    """

    def __init__(self, columns: List[str]):
        self.columns = columns

    def transform(self, input_df: pd.DataFrame):
        """Look up each row's team in the roster table, then join the team totals for that date."""
        roster_keys = mlb_train_ds.train_rosters[["date", "playerId", "teamId"]]
        with_team = input_df.merge(roster_keys, on=["playerId", "date"], how="left")

        team_totals = mlb_train_ds.train_tbs.groupby(["teamId", "date"])[self.columns].sum()
        self.agg_df = team_totals.reset_index()

        features = merge_by_key(with_team, self.agg_df, on=["teamId", "date"])
        return features.add_prefix("TeamBoxScoresCount_")


class GamesCountBlock(BaseBlock):
    """Sums selected ``train_games`` columns over the games each player appears in per date.

    Args:
        columns: columns of ``mlb_train_ds.train_games`` to sum.
    """

    def __init__(self, columns: List[str]):
        self.columns = columns

    def transform(self, input_df: pd.DataFrame):
        """Link players to games through the box scores, aggregate per (date, playerId) and join."""
        appearances = mlb_train_ds.train_scores[["date", "playerId", "gamePk"]]
        games_df = appearances.merge(mlb_train_ds.train_games, on=["date", "gamePk"], how="left")

        per_player_day = games_df.groupby(["date", "playerId"])[self.columns].sum()
        self.agg_df = per_player_day.reset_index()

        features = merge_by_key(input_df, self.agg_df)
        return features.add_prefix("GamesCount_")


class StandingsMetaBlock(BaseBlock):
    """Attaches team standings columns to each player through the roster table.

    Args:
        columns: columns selected from the roster/standings join. The selection is
            merged on ``CFG.OBJECT_ID``, so it has to contain those key columns.
    """

    def __init__(self, columns: List[str]):
        self.columns = columns

    def transform(self, input_df: pd.DataFrame):
        """Join standings onto the roster by (date, teamId), then onto ``input_df`` by the object key."""
        roster_keys = mlb_train_ds.train_rosters[["date", "playerId", "teamId"]]
        self.standings_df = roster_keys.merge(mlb_train_ds.train_standings, on=["date", "teamId"], how="left")

        selected = self.standings_df[self.columns]
        features = merge_by_key(input_df, selected)
        return features.add_prefix("StandingsMeta_")


class PlayerTwitterFollowerAggregateBlock(BaseBlock):
    """Per-player summary statistics of ``numberOfFollowers`` from ``mlb_train_ds.train_ptf``."""

    def fit(self, input_df: pd.DataFrame, y=None):
        """Compute the follower statistics per player, cache them and return the features."""
        stat_funcs = ["mean", "std", "max", "min", "median", quantile25, quantile75]
        followers = mlb_train_ds.train_ptf.groupby(["playerId"])["numberOfFollowers"]
        # The result stays indexed by playerId; the join below matches on that index level.
        self.agg_df = followers.agg(stat_funcs)

        return self.transform(input_df)

    def transform(self, input_df: pd.DataFrame):
        """Join the cached statistics onto ``input_df`` by ``playerId`` and prefix the columns."""
        features = merge_by_key(input_df, self.agg_df, on="playerId")
        return features.add_prefix("PlayerTwitterFollowerAggregate_")


class PlayerStatsBlock(BaseBlock):
    """Season-to-date player statistics built from ``mlb_train_ds.train_scores``.

    ``fit`` sums the box-score counters per (year, playerId, date), accumulates
    them within each (year, playerId), derives batting and pitching rate
    statistics from the running totals and ranks the players within each date.
    """

    def fit(self, input_df: pd.DataFrame, y=None):
        """Build ``self.agg_df`` and return the features joined onto ``input_df``."""
        box = mlb_train_ds.train_scores.copy()
        box["year"] = pd.to_datetime(box["gameDate"]).dt.year

        # Counters summed per (year, playerId, date); list order fixes the column order.
        # Batting
        batting_counters = [
            "gamesPlayedBatting", "plateAppearances", "atBats", "hits", "doubles",
            "triples", "homeRuns", "rbi", "runsScored", "totalBases", "baseOnBalls",
            "hitByPitch", "intentionalWalks", "groundIntoDoublePlay",
            "groundIntoTriplePlay", "strikeOuts", "sacBunts", "sacFlies",
            "caughtStealing", "stolenBases", "leftOnBase", "catchersInterference",
            "pickoffs", "flyOuts", "groundOuts",
        ]
        # Pitching (and fielding)
        pitching_counters = [
            "gamesPlayedPitching", "gamesStartedPitching", "winsPitching",
            "lossesPitching", "completeGamesPitching", "shutoutsPitching", "saves",
            "saveOpportunities", "blownSaves", "holds", "inningsPitched",
            "runsPitching", "earnedRuns", "pitchesThrown", "hitsPitching",
            "homeRunsPitching", "strikeOutsPitching", "baseOnBallsPitching",
            "hitByPitchPitching", "intentionalWalksPitching", "balks", "wildPitches",
            "atBatsPitching", "battersFaced", "sacBuntsPitching", "sacFliesPitching",
            "inheritedRunners", "inheritedRunnersScored", "rbiPitching",
            "flyOutsPitching", "airOutsPitching", "doublesPitching",
            "triplesPitching", "caughtStealingPitching", "stolenBasesPitching",
            "outsPitching", "balls", "strikes", "hitBatsmen", "pickoffsPitching",
            "catchersInterferencePitching", "assists", "putOuts", "errors", "chances",
        ]
        # The misspelled output name is kept on purpose: it ends up in the feature names.
        output_names = {"gamesStartedPitching": "gamesStartedPicthing"}
        sum_specs = {
            output_names.get(src, src): (src, np.sum)
            for src in batting_counters + pitching_counters
        }

        stats = box.groupby(["year", "playerId", "date"], as_index=False).agg(
            gamePk=("gamePk", "nunique"),
            **sum_specs,
        )
        # ``stats`` and ``self.agg_df`` are the same object; columns are added in place.
        self.agg_df = stats

        # Quality start (>= 6 innings, <= 3 runs) and high quality start (>= 7 innings,
        # <= 2 runs), counted only on days with a game started.
        games_started = stats["gamesStartedPicthing"]
        stats["qs"] = ((stats["inningsPitched"] >= 6) & (stats["runsPitching"] <= 3)) * games_started
        stats["hqs"] = ((stats["inningsPitched"] >= 7) & (stats["runsPitching"] <= 2)) * games_started

        # Running totals within each (year, playerId) for every non-key column.
        key_columns = ["year", "date", "playerId"]
        for col in [c for c in stats.columns if c not in key_columns]:
            stats[col + "_cumsum"] = stats.groupby(["year", "playerId"], as_index=False)[col].cumsum()

        def total(name):
            # Running total of a counter.
            return stats[f"{name}_cumsum"]

        # ---- Batting ----
        hits, at_bats, walks = total("hits"), total("atBats"), total("baseOnBalls")
        hit_by_pitch, sac_flies = total("hitByPitch"), total("sacFlies")
        total_bases, home_runs, strike_outs = total("totalBases"), total("homeRuns"), total("strikeOuts")

        stats["battingAverage"] = hits / at_bats  # batting average
        stats["sluggingPercentage"] = total_bases / at_bats  # slugging percentage
        stats["onBasePercentage"] = (hits + walks + hit_by_pitch) / (at_bats + walks + hit_by_pitch + sac_flies)  # on-base percentage
        stats["ops"] = stats["sluggingPercentage"] + stats["onBasePercentage"]  # OPS
        stats["isop"] = stats["sluggingPercentage"] - stats["battingAverage"]  # IsoP
        stats["isod"] = stats["onBasePercentage"] - stats["battingAverage"]  # IsoD
        stats["rc"] = (hits + walks) * total_bases / (at_bats + walks)  # RC
        outs_made = at_bats - hits + total("caughtStealing") + total("sacBunts") + sac_flies + total("groundIntoDoublePlay")
        # RC27; an infinite value (zero outs) is replaced by 0.
        stats["rc27"] = (stats["rc"] / outs_made * 27).replace({np.inf: 0})
        stats["babip"] = (hits - home_runs) / (at_bats - strike_outs - home_runs + sac_flies)  # BABIP
        stats["bb_k"] = walks / strike_outs  # BB/K
        stats["k%"] = strike_outs / total("plateAppearances")  # K%

        # ---- Pitching ----
        innings, wins = total("inningsPitched"), total("winsPitching")
        strike_outs_p, walks_p = total("strikeOutsPitching"), total("baseOnBallsPitching")
        hits_p, home_runs_p = total("hitsPitching"), total("homeRunsPitching")
        at_bats_p, batters_faced = total("atBatsPitching"), total("battersFaced")
        starts = total("gamesStartedPicthing")

        stats["era"] = total("earnedRuns") * 9 / innings  # earned run average
        stats["wp"] = wins / (wins + total("lossesPitching"))  # winning percentage
        stats["k9"] = strike_outs_p * 9 / innings  # K/9 (strikeout rate)
        stats["hits_allowed_average"] = hits_p / at_bats_p  # opponent batting average
        stats["hp"] = total("holds") + wins  # HP
        stats["qs%"] = total("qs") / starts  # QS rate
        stats["hqs%"] = total("hqs") / starts  # HQS rate
        stats["k%_pitching"] = strike_outs_p / batters_faced  # K%
        stats["bb9"] = walks_p * 9 / innings  # BB/9
        stats["bb%"] = walks_p / batters_faced  # BB%
        stats["k_bb"] = strike_outs_p / (walks_p + total("hitByPitchPitching"))  # K/BB
        stats["hr9"] = home_runs_p * 9 / innings  # HR/9
        stats["babip_pitching"] = (hits_p - home_runs_p) / (at_bats_p - strike_outs_p - home_runs_p + total("sacFliesPitching"))  # BABIP
        stats["whip"] = (walks_p + hits_p) / innings  # WHIP

        # ---- Rankings within each date (rank 1 = largest value, ties share the lowest rank) ----
        # TODO: for the rate statistics, also take into account whether the player has
        # reached the qualifying number of plate appearances and whether he is a pitcher.
        def rank_within_date(frame, column):
            return frame.groupby(["date"])[column].rank(ascending=False, method="min")

        for name in ["homeRuns", "hits", "rbi", "doubles", "triples", "runsScored",
                     "totalBases", "baseOnBalls", "hitByPitch", "strikeOuts",
                     "stolenBases", "plateAppearances", "atBats"]:
            stats[f"{name}_rank"] = rank_within_date(stats, f"{name}_cumsum").astype(int)

        # Batting-average rank only covers rows with exactly one batting game that day;
        # all other rows are left as NaN by index alignment.
        batted_once = stats[stats["gamesPlayedBatting"] == 1.0]
        stats["battingAverage_rank"] = rank_within_date(batted_once, "battingAverage")
        for name in ["ops", "isop", "isod", "rc", "rc27", "babip", "bb_k", "k%"]:
            stats[f"{name}_rank"] = rank_within_date(stats, name)

        for name in ["winsPitching", "lossesPitching", "saves", "holds",
                     "completeGamesPitching", "shutoutsPitching", "inningsPitched",
                     "runsPitching", "earnedRuns"]:
            stats[f"{name}_rank"] = rank_within_date(stats, f"{name}_cumsum").astype(int)

        for name in ["era", "whip"]:
            stats[f"{name}_rank"] = rank_within_date(stats, name)

        # Qualifying plate appearances (not enabled yet):
        # teamBoxScores = mlb_train_ds.train_tbs.copy()
        # teamBoxScores["year"] = pd.to_datetime(teamBoxScores["gameDate"]).dt.year
        # teamBoxScores_agg_df = teamBoxScores.groupby(["year", "teamId", "date"], as_index=False)["gamePk"].nunique()
        # teamBoxScores_agg_df["gamePk_cumsum"] = teamBoxScores_agg_df.groupby(["teamId", "year"])["gamePk"].cumsum()
        # self.agg_df = self.agg_df.merge(teamBoxScores_agg_df, on=["date"])

        self.agg_df = reduce_mem_usage(stats)

        return self.transform(input_df)

    def transform(self, input_df: pd.DataFrame):
        """Join the cached statistics (without ``year``) onto ``input_df`` and prefix the columns."""
        features = merge_by_key(input_df, self.agg_df.drop(columns=["year"]))
        return features.add_prefix("PlayerStats_")

In [131]:
def create_train_feature(input_df: pd.DataFrame, y: np.ndarray, update_blocks: list, non_update_blocks: list) -> pd.DataFrame:
    """Fit every feature block on ``input_df`` and concatenate their outputs column-wise.

    Args:
        input_df: training rows the features are built for.
        y: targets, forwarded to each block's ``fit``.
        update_blocks: blocks that are re-fitted at prediction time.
        non_update_blocks: blocks that are only transformed at prediction time.

    Returns:
        One frame with the columns of all blocks (update blocks first); an empty
        frame when no blocks are given.

    Raises:
        AssertionError: when a block returns a different number of rows than ``input_df``.
        Exception: whatever a block's ``fit`` raises, re-raised unchanged.
    """
    feat_df = pd.DataFrame()

    for block in update_blocks + non_update_blocks:
        with timer(name=f"{str(block) + '_fit'}", logger=logger):
            try:
                out_feat_block = block.fit(input_df, y=y)
            except Exception:
                print(f"Error on {block} fit.")
                # Bare raise keeps the original traceback without making the
                # exception its own cause.
                raise

            assert len(out_feat_block) == len(input_df), block

        feat_df = pd.concat([feat_df, out_feat_block], axis=1)

    return feat_df


def create_test_feature(input_df: pd.DataFrame, update_blocks: list, non_update_blocks: list) -> pd.DataFrame:
    """Build prediction-time features: re-fit the update blocks, transform the others.

    Args:
        input_df: rows the features are built for.
        update_blocks: blocks whose ``fit`` is called again on ``input_df``.
        non_update_blocks: already fitted blocks; only ``transform`` is called.

    Returns:
        Columns of the update blocks followed by those of the non-update blocks
        (only the latter when ``update_blocks`` is empty).

    Raises:
        AssertionError: when a block returns a different number of rows than ``input_df``.
        Exception: whatever a block raises, re-raised unchanged.
    """
    is_update = bool(update_blocks)
    update_feat_df = pd.DataFrame()
    non_update_feat_df = pd.DataFrame()

    for block in update_blocks:
        with timer(name=f"{str(block) + '_update'}", logger=logger):
            try:
                out_feat_block = block.fit(input_df)
            except Exception:
                # This branch calls fit, so say so in the message.
                print(f"Error on {block} fit.")
                raise

            # Same row-count guard as the other branches; a mismatch would
            # silently misalign rows in the concat below.
            assert len(out_feat_block) == len(input_df), block

        update_feat_df = pd.concat([update_feat_df, out_feat_block], axis=1)

    for block in non_update_blocks:
        with timer(name=f"{str(block) + '_transform'}", logger=logger):
            try:
                out_feat_block = block.transform(input_df)
            except Exception:
                print(f"Error on {block} transform.")
                raise

            assert len(out_feat_block) == len(input_df), block

        non_update_feat_df = pd.concat([non_update_feat_df, out_feat_block], axis=1)

    feat_df = pd.concat([update_feat_df, non_update_feat_df], axis=1) if is_update else non_update_feat_df

    return feat_df

## CV

In [132]:
def get_timeseries_holdout(train_df: pd.DataFrame, valid_start_date: int) -> List[tuple]:
    """Split ``train_df`` into one train/validation fold at ``valid_start_date``.

    Args:
        train_df: frame with a ``date`` column comparable to ``valid_start_date``.
        valid_start_date: first date that belongs to the validation part.

    Returns:
        A one-element list ``[(train_idx, valid_idx)]`` of integer row positions.
        Positions (not index labels) are returned because the folds are used to
        index NumPy arrays; for a default ``RangeIndex`` both are the same.
    """
    dates = train_df["date"]
    train_idx = np.flatnonzero((dates < valid_start_date).values)
    valid_idx = np.flatnonzero((dates >= valid_start_date).values)

    return [(train_idx, valid_idx)]

## Model

In [133]:
class LightGBMTrainer:
    """Trains one LightGBM model per (seed, CV fold) and averages their predictions.

    Args:
        params: LightGBM parameters; never modified by this class.
        seeds: random seeds; each one is trained on every fold.
    """

    def __init__(self, params: dict, seeds: List[int]):
        self.params = params
        self.seeds = seeds
        self.models = []

    def fit(self, X_train: np.ndarray, y_train: np.ndarray, cv: List[tuple]):
        """Train the models and score the out-of-fold predictions.

        Args:
            X_train: feature matrix, indexed positionally by the fold indices.
            y_train: target vector aligned with ``X_train``.
            cv: iterable of ``(train_idx, valid_idx)`` pairs.

        Returns:
            ``(oof_score, models)``: the MAE of the seed-averaged validation
            predictions over all folds, and the list of every trained model.

        Raises:
            ValueError: when there are no seeds or no folds to train on.
        """
        folds = list(cv)  # materialise so every seed sees every fold
        if not folds or not self.seeds:
            raise ValueError("LightGBMTrainer.fit needs at least one seed and one CV fold")

        fold_preds = [[] for _ in folds]  # per fold: one prediction vector per seed
        fold_truth = [None] * len(folds)

        for seed in self.seeds:
            # Work on a copy: the params dict is shared with the caller (e.g. CFG).
            params = dict(self.params, seed=seed)

            for k, (train_idx, valid_idx) in enumerate(folds):
                X_train_fold = X_train[train_idx]
                X_valid_fold = X_train[valid_idx]

                y_train_fold = y_train[train_idx]
                y_valid_fold = y_train[valid_idx]

                train_set = lgb.Dataset(X_train_fold, y_train_fold)
                valid_set = lgb.Dataset(X_valid_fold, y_valid_fold, reference=train_set)

                # NOTE(review): newer LightGBM releases replace ``verbose_eval`` with
                # the ``log_evaluation`` callback — confirm against the installed version.
                model = lgb.train(
                    params=params,
                    train_set=train_set,
                    valid_sets=[train_set, valid_set],
                    verbose_eval=100,
                )

                y_oof = model.predict(X_valid_fold, num_iteration=model.best_iteration)
                fold_preds[k].append(y_oof)
                fold_truth[k] = y_valid_fold
                self.models.append(model)

        # Average over seeds within each fold, then score all folds together.
        oof_all = np.concatenate([np.mean(preds, axis=0) for preds in fold_preds])
        y_valid_all = np.concatenate([np.asarray(truth) for truth in fold_truth])
        oof_score = mean_absolute_error(y_valid_all, oof_all)

        return oof_score, self.models

    def predict(self, X_test: np.ndarray):
        """Return the element-wise mean of the predictions of all trained models."""
        y_pred = np.mean([model.predict(X_test) for model in self.models], axis=0)

        return y_pred


def run_lgb(X_train, targets, params, seeds, fold):
    '''Fit one LightGBMTrainer per target and report the hold-out scores.

    Args:
        X_train: feature matrix shared by all targets.
        targets: sequence of label arrays, one per target.
        params: LightGBM parameter dict handed to each trainer.
        seeds: seeds handed to each trainer.
        fold: list of ``(train_idx, valid_idx)`` splits.

    Returns:
        A list with, for each target, the list of models returned by
        ``LightGBMTrainer.fit``. The per-target scores and their mean are
        printed, not returned.
    '''
    scores_by_target = []
    models_by_target = []

    for target_no, y_target in enumerate(targets, start=1):
        print(f"Traning for Target{target_no}")
        score, fitted_models = LightGBMTrainer(params, seeds).fit(X_train, y_target, fold)
        scores_by_target.append(score)
        models_by_target.append(fitted_models)
        print(f"Local Target{target_no} OOF-MAE : {np.mean(score)}")
        print("-"*50)

    # mean of the per-target MAE values
    print(f"Local MCMAE : {np.mean(scores_by_target)}")

    return models_by_target

## Training

In [161]:
# set-up logger
logger = get_logger()

# create feature
# Two lists of feature blocks are handed to create_train_feature further down.
# `update_blocks` is left empty in this run; every active block lives in
# `non_update_blocks`. Commented-out entries are disabled blocks / columns.
update_blocks = []
non_update_blocks = [
    TargetAggregateBlock(),
    RostersLabelEncodingBlock(columns=[
        "teamId",
        "status"
        ]
    ),
    PlayersLabelEncodingBlock(columns=[
        "birthCountry",
        "primaryPositionCode",
    ]),
    # (disabled) player box score block
    # PlayerBoxScoresCountBlock(columns=[
    #     # 'date',
    #     'home',
    #     # 'gamePk',
    #     # 'gameDate',
    #     # 'gameTimeUTC',
    #     # 'teamId',
    #     # 'teamName',
    #     # 'playerId',
    #     # 'playerName',
    #     'jerseyNum',
    #     # 'positionCode',
    #     # 'positionName',
    #     # 'positionType',
    #     'battingOrder',
    #     'gamesPlayedBatting',
    #     'flyOuts',
    #     'groundOuts',
    #     'runsScored',
    #     'doubles',
    #     'triples',
    #     'homeRuns',
    #     'strikeOuts',
    #     'baseOnBalls',
    #     'intentionalWalks',
    #     'hits',
    #     'hitByPitch',
    #     'atBats',
    #     'caughtStealing',
    #     'stolenBases',
    #     'groundIntoDoublePlay',
    #     'groundIntoTriplePlay',
    #     'plateAppearances',
    #     'totalBases',
    #     'rbi',
    #     'leftOnBase',
    #     'sacBunts',
    #     'sacFlies',
    #     'catchersInterference',
    #     'pickoffs',
    #     'gamesPlayedPitching',
    #     'gamesStartedPitching',
    #     'completeGamesPitching',
    #     'shutoutsPitching',
    #     'winsPitching',
    #     'lossesPitching',
    #     'flyOutsPitching',
    #     'airOutsPitching',
    #     'groundOutsPitching',
    #     'runsPitching',
    #     'doublesPitching',
    #     'triplesPitching',
    #     'homeRunsPitching',
    #     'strikeOutsPitching',
    #     'baseOnBallsPitching',
    #     'intentionalWalksPitching',
    #     'hitsPitching',
    #     'hitByPitchPitching',
    #     'atBatsPitching',
    #     'caughtStealingPitching',
    #     'stolenBasesPitching',
    #     'inningsPitched',
    #     'saveOpportunities',
    #     'earnedRuns',
    #     'battersFaced',
    #     'outsPitching',
    #     'pitchesThrown',
    #     'balls',
    #     'strikes',
    #     'hitBatsmen',
    #     'balks',
    #     'wildPitches',
    #     'pickoffsPitching',
    #     'rbiPitching',
    #     'gamesFinishedPitching',
    #     'inheritedRunners',
    #     'inheritedRunnersScored',
    #     'catchersInterferencePitching',
    #     'sacBuntsPitching',
    #     'sacFliesPitching',
    #     'saves',
    #     'holds',
    #     'blownSaves',
    #     'assists',
    #     'putOuts',
    #     'errors',
    #     'chances'
    #     ]
    # ),
    # (disabled) team box score block
    # TeamBoxScoresCountBlock(columns=[
    #     # 'date',
    #     # 'home',
    #     # 'teamId',
    #     # 'gamePk',
    #     # 'gameDate',
    #     # 'gameTimeUTC',
    #     'flyOuts',
    #     'groundOuts',
    #     'runsScored',
    #     'doubles',
    #     'triples',
    #     'homeRuns',
    #     'strikeOuts',
    #     'baseOnBalls',
    #     'intentionalWalks',
    #     'hits',
    #     'hitByPitch',
    #     'atBats',
    #     'caughtStealing',
    #     'stolenBases',
    #     'groundIntoDoublePlay',
    #     'groundIntoTriplePlay',
    #     'plateAppearances',
    #     'totalBases',
    #     'rbi',
    #     'leftOnBase',
    #     'sacBunts',
    #     'sacFlies',
    #     'catchersInterference',
    #     'pickoffs',
    #     'airOutsPitching',
    #     'groundOutsPitching',
    #     'runsPitching',
    #     'doublesPitching',
    #     'triplesPitching',
    #     'homeRunsPitching',
    #     'strikeOutsPitching',
    #     'baseOnBallsPitching',
    #     'intentionalWalksPitching',
    #     'hitsPitching',
    #     'hitByPitchPitching',
    #     'atBatsPitching',
    #     'caughtStealingPitching',
    #     'stolenBasesPitching',
    #     'inningsPitched',
    #     'earnedRuns',
    #     'battersFaced',
    #     'outsPitching',
    #     'hitBatsmen',
    #     'balks',
    #     'wildPitches',
    #     'pickoffsPitching',
    #     'rbiPitching',
    #     'inheritedRunners',
    #     'inheritedRunnersScored',
    #     'catchersInterferencePitching',
    #     'sacBuntsPitching',
    #     'sacFliesPitching'
    #     ]
    # ),
    # games block: only the uncommented columns are passed
    GamesCountBlock(columns=[
        # 'date',
        # 'gamePk',
        # 'gameType',
        # 'season',
        # 'gameDate',
        # 'gameTimeUTC',
        # 'resumeDate',
        # 'resumedFrom',
        # 'codedGameState',
        # 'detailedGameState',
        'isTie',
        'gameNumber',
        'doubleHeader',
        # 'dayNight',
        'scheduledInnings',
        'gamesInSeries',
        # 'seriesDescription',
        'homeId',
        # 'homeName',
        # 'homeAbbrev',
        'homeWins',
        'homeLosses',
        'homeWinPct',
        'homeWinner',
        'homeScore',
        'awayId',
        # 'awayName',
        # 'awayAbbrev',
        'awayWins',
        'awayLosses',
        'awayWinPct',
        'awayWinner',
        'awayScore'
        ]
    ),
    # PlayerTwitterFollowerAggregateBlock(),
    PlayerStatsBlock(),
    # standings block: only the uncommented columns are passed
    StandingsMetaBlock(columns=[
        'playerId',
        'date',
        # 'season',
        # 'gameDate',
        'divisionId',
        # 'teamId',
        # 'teamName',
        # 'streakCode',
        'divisionRank',
        'leagueRank',
        'wildCardRank',
        # 'leagueGamesBack',
        # 'sportGamesBack',
        # 'divisionGamesBack',
        'wins',
        'losses',
        'pct',
        'runsAllowed',
        'runsScored',
        # 'divisionChamp',
        # 'divisionLeader',
        # 'wildCardLeader',
        # 'eliminationNumber',
        # 'wildCardEliminationNumber',
        'homeWins',
        'homeLosses',
        'awayWins',
        'awayLosses',
        'lastTenWins',
        'lastTenLosses',
        'extraInningWins',
        'extraInningLosses',
        'oneRunWins',
        'oneRunLosses',
        'dayWins',
        'dayLosses',
        'nightWins',
        'nightLosses',
        'grassWins',
        'grassLosses',
        'turfWins',
        'turfLosses',
        'divWins',
        'divLosses',
        'alWins',
        'alLosses',
        'nlWins',
        'nlLosses',
        'xWinLossPct'
        ]
    ),
]

# NOTE(review): only target1 is handed to create_train_feature as `y`, while
# models are trained for every entry of CFG.TARGETS below — confirm intended.
y = mlb_train_ds.train_next["target1"].values
# feature creation starts from the key columns only (CFG.OBJECT_ID)
input_df = mlb_train_ds.train_next[CFG.OBJECT_ID].copy()
train_feat_df = create_train_feature(input_df, y, update_blocks, non_update_blocks)
X_train = train_feat_df.values
# one label array per target, in CFG.TARGETS order
targets = [mlb_train_ds.train_next[col].values for col in CFG.TARGETS]

# split train/valid
# rows dated on/after 20210401 are held out for validation
fold = get_timeseries_holdout(input_df, valid_start_date=20210401)

# training
# `models` holds one list of fitted models per target
models = run_lgb(X_train, targets, CFG.LGB_PARAMS, CFG.SEEDS, fold)

[2021-07-11 18:11:07,678] [INFO] [logger set up]
[2021-07-11 18:11:07,788] [INFO] [<<__main__.TargetAggregateBlock object at 0x1c6ad03d0>_fit> start]
[2021-07-11 18:11:17,216] [INFO] [<<__main__.TargetAggregateBlock object at 0x1c6ad03d0>_fit> done in 9.43 s]
[2021-07-11 18:11:17,283] [INFO] [<<__main__.RostersLabelEncodingBlock object at 0x1c6ab78e0>_fit> start]
[2021-07-11 18:11:18,349] [INFO] [<<__main__.RostersLabelEncodingBlock object at 0x1c6ab78e0>_fit> done in 1.07 s]
[2021-07-11 18:11:18,536] [INFO] [<<__main__.PlayersLabelEncodingBlock object at 0x1c32b5b50>_fit> start]
[2021-07-11 18:11:18,810] [INFO] [<<__main__.PlayersLabelEncodingBlock object at 0x1c32b5b50>_fit> done in 0.27 s]
[2021-07-11 18:11:19,023] [INFO] [<<__main__.GamesCountBlock object at 0x1c6acf970>_fit> start]
[2021-07-11 18:11:20,221] [INFO] [<<__main__.GamesCountBlock object at 0x1c6acf970>_fit> done in 1.20 s]
[2021-07-11 18:11:20,518] [INFO] [<<__main__.PlayerStatsBlock object at 0x1c14405e0>_fit> start]



Found `num_iterations` in params. Will use it instead of argument


Found `early_stopping_round` in params. Will use it instead of argument



Training until validation scores don't improve for 100 rounds
[100]	training's l1: 0.511226	valid_1's l1: 0.70394
[200]	training's l1: 0.510107	valid_1's l1: 0.700028
[300]	training's l1: 0.510073	valid_1's l1: 0.699928
[400]	training's l1: 0.510019	valid_1's l1: 0.699807
Early stopping, best iteration is:
[354]	training's l1: 0.510036	valid_1's l1: 0.699786
Local Target1 OOF-MAE : 0.699786220645778
--------------------------------------------------
Traning for Target2



Found `num_iterations` in params. Will use it instead of argument


Found `early_stopping_round` in params. Will use it instead of argument



Training until validation scores don't improve for 100 rounds
[100]	training's l1: 1.70451	valid_1's l1: 1.47754
Early stopping, best iteration is:
[88]	training's l1: 1.70587	valid_1's l1: 1.47737
Local Target2 OOF-MAE : 1.4773672611327218
--------------------------------------------------
Traning for Target3



Found `num_iterations` in params. Will use it instead of argument


Found `early_stopping_round` in params. Will use it instead of argument



Training until validation scores don't improve for 100 rounds
[100]	training's l1: 0.669039	valid_1's l1: 0.554732
[200]	training's l1: 0.668823	valid_1's l1: 0.554662
Early stopping, best iteration is:
[132]	training's l1: 0.668852	valid_1's l1: 0.554506
Local Target3 OOF-MAE : 0.5545058889952914
--------------------------------------------------
Traning for Target4



Found `num_iterations` in params. Will use it instead of argument


Found `early_stopping_round` in params. Will use it instead of argument



Training until validation scores don't improve for 100 rounds
[100]	training's l1: 0.779351	valid_1's l1: 1.12433
[200]	training's l1: 0.776847	valid_1's l1: 1.12282
Early stopping, best iteration is:
[195]	training's l1: 0.776855	valid_1's l1: 1.12257
Local Target4 OOF-MAE : 1.1225708050183636
--------------------------------------------------
Local MCMAE : 0.9635575439480387


In [162]:
def visualize_feature_importance(models, feat_train_df) -> None:
    '''Visualize LightGBM feature importance (gain), one box plot per target.

    Args:
        models: sequence with one entry per target; each entry is a sequence
            of trained models exposing ``feature_importance(importance_type=...)``.
        feat_train_df: feature DataFrame whose columns name the model features,
            in the same order the models were trained on.

    For each target the 50 features with the highest mean gain across that
    target's models are shown, ordered by that mean. Figures are displayed
    with ``fig.show()``; nothing is returned. A target with no models is
    skipped.
    '''
    feature_names = feat_train_df.columns

    for target_i, target_models in enumerate(models):
        if len(target_models) == 0:
            # Nothing to aggregate or plot for this target.
            continue

        # Build every per-model frame first and concatenate once.
        per_model_frames = [
            pd.DataFrame({
                "feature_importance": model.feature_importance(importance_type="gain"),
                "feature": feature_names,
                "model_no": model_no,
            })
            for model_no, model in enumerate(target_models, start=1)
        ]
        feature_importance_df = pd.concat(per_model_frames, axis=0, ignore_index=True)

        # Top-50 features by mean gain over the models of this target.
        order = (
            feature_importance_df.groupby("feature")["feature_importance"]
            .mean()
            .sort_values(ascending=False)
            .index[:50]
        )

        fig = px.box(
            feature_importance_df[feature_importance_df["feature"].isin(order)],
            x="feature_importance",
            y="feature",
            category_orders={"feature": list(order)},
            width=1250,
            height=900,
            title=f"Target{target_i+1} Top 50 feature importance",
        )
        fig.update_yaxes(showgrid=True)
        fig.show()

In [163]:
visualize_feature_importance(models, train_feat_df)

## Inference

In [None]:
def update_dataset(input_df: pd.DataFrame, sample_prediction_df: pd.DataFrame):
    '''Update the tables held by ``mlb_train_ds`` with one day of test data.

    For each of rosters / playerBoxScores / teamBoxScores / games / standings
    the JSON string in the first row of ``input_df`` is parsed. When that cell
    is NaN a placeholder frame is built instead. Each frame gets the first
    row's "date", and only when that date is later than 20210430 it is
    concatenated onto the matching ``mlb_train_ds`` table (after passing
    through ``reduce_mem_usage``).
    '''
    def _has_payload(column):
        # NaN == NaN is False, so a cell equal to itself is not NaN
        # (pd.isna would be an alternative check).
        cell = input_df[column].iloc[0]
        return cell == cell

    def _nan_rows_per_player(reference):
        # One row per playerId of the sample prediction; every other column
        # of the reference table is filled with NaN.
        placeholder = sample_prediction_df[["playerId"]].copy()
        for name in reference.columns:
            if name != "playerId":
                placeholder[name] = np.nan
        return placeholder

    def _empty_like(reference):
        # Zero-row frame with the reference columns except "date".
        return pd.DataFrame(columns=reference.drop(columns="date", axis=1).columns.tolist())

    def _extend(table, new_rows):
        return reduce_mem_usage(pd.concat([table, new_rows], axis=0, ignore_index=True))

    today = input_df.iloc[0]["date"]
    append_today = today > 20210430

    # ---- rosters ----
    if _has_payload("rosters"):
        rosters_today = pd.read_json(input_df["rosters"].iloc[0])
    else:
        rosters_today = _nan_rows_per_player(mlb_train_ds.train_rosters)
    rosters_today["date"] = today
    if append_today:
        mlb_train_ds.train_rosters = _extend(mlb_train_ds.train_rosters, rosters_today)

    # ---- playerBoxScores ----
    if _has_payload("playerBoxScores"):
        scores_today = pd.read_json(input_df["playerBoxScores"].iloc[0])
    else:
        scores_today = _nan_rows_per_player(mlb_train_ds.train_scores)
    scores_today["date"] = today
    if append_today:
        mlb_train_ds.train_scores = _extend(mlb_train_ds.train_scores, scores_today)

    # ---- teamBoxScores ----
    if _has_payload("teamBoxScores"):
        tbs_today = pd.read_json(input_df["teamBoxScores"].iloc[0])
    else:
        # one placeholder row per known team
        tbs_today = _empty_like(mlb_train_ds.train_tbs)
        tbs_today["teamId"] = mlb_train_ds.train_tbs.teamId.unique()
    tbs_today["date"] = today
    if append_today:
        mlb_train_ds.train_tbs = _extend(mlb_train_ds.train_tbs, tbs_today)

    # ---- games ----
    if _has_payload("games"):
        games_today = pd.read_json(input_df["games"].iloc[0])
    else:
        games_today = _empty_like(mlb_train_ds.train_games)
    games_today["date"] = today
    if append_today:
        mlb_train_ds.train_games = _extend(mlb_train_ds.train_games, games_today)

    # ---- standings ----
    if _has_payload("standings"):
        standings_today = pd.read_json(input_df["standings"].iloc[0])
    else:
        standings_today = _empty_like(mlb_train_ds.train_standings)
    standings_today["date"] = today
    if append_today:
        mlb_train_ds.train_standings = _extend(mlb_train_ds.train_standings, standings_today)

In [None]:
# models[0][0].save_model("nb012_lgb_target1.txt")
# models[1][0].save_model("nb012_lgb_target2.txt")
# models[2][0].save_model("nb012_lgb_target3.txt")
# models[3][0].save_model("nb012_lgb_target4.txt")

# model1 = lgb.Booster(model_file=f"{CFG.MODEL_PATH}/nb012_lgb_target1.txt")
# model2 = lgb.Booster(model_file=f"{CFG.MODEL_PATH}/nb012_lgb_target2.txt")
# model3 = lgb.Booster(model_file=f"{CFG.MODEL_PATH}/nb012_lgb_target3.txt")
# model4 = lgb.Booster(model_file=f"{CFG.MODEL_PATH}/nb012_lgb_target4.txt")
# models = [model1, model2, model3, model4]

In [None]:
# env = mlb.make_env()
# iter_test = env.iter_test()

# for (test_df, sample_prediction_df) in iter_test:
#     sample_prediction_df = sample_prediction_df.reset_index(drop=True)
#     # create dataset
#     test_df = test_df.reset_index()
#     test_df = test_df.rename(columns={"index": "date"})
#     sample_prediction_df["date"] = test_df.iloc[0]["date"]
#     sample_prediction_df["playerId"] = sample_prediction_df["date_playerId"].map(lambda x: int(x.split("_")[1]))
#     update_dataset(test_df, sample_prediction_df)
#     # create features
#     test_feat_df = create_test_feature(sample_prediction_df, update_blocks, non_update_blocks)
#     X_test = test_feat_df.values
#     # prediction
#     for target, model in zip(CFG.TARGETS, models):
#         pred = model.predict(X_test)
#         sample_prediction_df[target] = np.clip(pred, 0, 100)

#     sample_prediction_df = sample_prediction_df.drop(columns=["playerId", "date"], axis=1)

#     env.predict(sample_prediction_df)

In [None]:
# Local inference check
# test_df = mlb_train_ds.example_test.query("date <= 20210426").copy()
# sample_prediction_df = mlb_train_ds.sample_submission.query("date <= 20210426").copy()
# sample_prediction_df["playerId"] = sample_prediction_df["date_playerId"].map(lambda x: int(x.split("_")[1]))