# Overview
- LightGBM

In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import mean_absolute_error
from datetime import timedelta
from tqdm.notebook import tqdm
import lightgbm as lgb
from typing import List, Union, Optional
import time
from contextlib import contextmanager
import sys
import logging
import category_encoders as ce
import plotly
from plotly import express as px
# import mlb

# Let pandas display up to 100 columns before eliding them in notebook output.
pd.options.display.max_columns = 100

## Config

In [2]:
class CFG:
    """Notebook-wide settings: data location, key/target columns and LightGBM parameters."""

    # ---------------- global ----------------
    INPUT_DIR = "../input/mlb-player-digital-engagement-forecasting"
    # INPUT_DIR = "../input/mlb-unnested-dataset" # for kaggle kernel
    OBJECT_ID = ["playerId", "date"]  # columns that identify one row (player, day)
    TARGETS = [f"target{i}" for i in range(1, 5)]

    # ---------------- model ----------------
    LGB_PARAMS = dict(
        objective="mae",
        boosting_type="gbdt",
        learning_rate=0.1,
        max_depth=6,
        # 80% of the leaves a full tree of depth 6 would have (evaluates to 51).
        num_leaves=int(2 ** 6 * 0.8),
        lambda_l1=0.1,
        lambda_l2=0.1,
        bagging_fraction=0.9,
        bagging_freq=3,
        feature_fraction=0.9,
        min_data_in_leaf=20,
        num_threads=8,
        verbosity=-1,
        num_iterations=10000,
        early_stopping_round=100,
    )
    SEEDS = [2434]

## Utils

In [3]:
def get_logger(out_file=None):
    """Configure the root logger for INFO output and return it.

    Any handlers already attached to the root logger are detached and closed
    first, so calling this function repeatedly (e.g. re-running a notebook
    cell) neither duplicates output nor leaks open log-file handles.

    Args:
        out_file: Optional path of a log file. When given, records are written
            there in addition to ``sys.stdout``.

    Returns:
        The configured root ``logging.Logger``.
    """
    logger = logging.getLogger()  # root logger
    formatter = logging.Formatter("[%(asctime)s] [%(levelname)s] [%(message)s]")

    # Start from a clean slate. Closing (not just dropping) the old handlers
    # releases the file descriptor held by a previous FileHandler.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()
    logger.setLevel(logging.INFO)

    # Console output.
    handler = logging.StreamHandler(sys.stdout)
    handler.setFormatter(formatter)
    handler.setLevel(logging.INFO)
    logger.addHandler(handler)

    # Optional file output.
    if out_file is not None:
        fh = logging.FileHandler(out_file)
        fh.setFormatter(formatter)
        fh.setLevel(logging.INFO)
        logger.addHandler(fh)

    logger.info("logger set up")
    return logger


@contextmanager
def timer(name: str, logger: Optional[logging.Logger] = None):
    """Context manager that reports when a named step starts and how long it took.

    Messages go to ``logger.info`` when a logger is supplied, otherwise to
    ``print``. The "done" message is emitted only if the body finishes
    without raising.

    Args:
        name: Label inserted into both messages.
        logger: Optional logger that receives the messages.
    """
    started_at = time.time()
    emit = print if logger is None else logger.info

    emit(f"<{name}> start")
    yield

    elapsed = time.time() - started_at
    emit(f"<{name}> done in {elapsed:.2f} s")


def reduce_mem_usage(df, verbose=True):
    """Re-cast numeric columns of ``df`` in place according to their value range.

    Only columns whose dtype is one of int16/32/64 or float16/32/64 are
    touched. The dtype chosen is one step wider than the range that was
    matched: values strictly inside the int8 range become int16, inside the
    int16 range become int32, anything larger becomes/stays int64; floats
    strictly inside the float16 range become float32, everything else float64.

    Args:
        df: DataFrame to modify (columns are reassigned on this object).
        verbose: When true, print the resulting memory footprint.

    Returns:
        The same ``df`` object.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # (range to test against, dtype to cast to) -- checked in order, first hit wins.
    int_ladder = [
        (np.int8, np.int16),
        (np.int16, np.int32),
        (np.int32, np.int64),
        (np.int64, np.int64),
    ]
    float_ladder = [
        (np.float16, np.float32),
        (np.float32, np.float64),
    ]

    mem_before = df.memory_usage().sum() / 1024**2
    for name in df.columns:
        dtype = df[name].dtypes
        if dtype not in numerics:
            continue

        lowest = df[name].min()
        highest = df[name].max()
        if str(dtype).startswith('int'):
            # No match (value sitting exactly on the int64 bounds) leaves the column alone.
            target = next(
                (dst for probe, dst in int_ladder
                 if lowest > np.iinfo(probe).min and highest < np.iinfo(probe).max),
                None,
            )
        else:
            # Floats that fit neither range (including NaN-only columns) fall back to float64.
            target = next(
                (dst for probe, dst in float_ladder
                 if lowest > np.finfo(probe).min and highest < np.finfo(probe).max),
                np.float64,
            )
        if target is not None:
            df[name] = df[name].astype(target)

    mem_after = df.memory_usage().sum() / 1024**2
    if verbose:
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(mem_after, 100 * (mem_before - mem_after) / mem_before))
    return df

## Loading dataset

In [4]:
class MLBDataset(object):
    """Container that loads every table found under ``input_path`` as an attribute.

    Pickled tables are read with ``pd.read_pickle`` and the two example CSV
    files with ``pd.read_csv``; all reads happen eagerly in ``__init__``.
    """

    # (attribute name, file name) for every pickled table, in load order.
    _PICKLED_TABLES = (
        # train
        ("train_next", "train_nextDayPlayerEngagement.pickle"),
        ("train_rosters", "train_rosters.pickle"),
        ("train_ptf", "train_playerTwitterFollowers.pickle"),
        ("train_scores", "train_playerBoxScores.pickle"),
        ("train_games", "train_games.pickle"),
        ("train_standings", "train_standings.pickle"),
        ("train_tbs", "train_teamBoxScores.pickle"),
        ("train_ttf", "train_teamTwitterFollowers.pickle"),
        ("train_trans", "train_transactions.pickle"),
        ("train_awards", "train_awards.pickle"),
        ("train_events", "train_events.pickle"),
        # additional
        ("players", "players.pickle"),
        ("awards", "awards.pickle"),
        ("seasons", "seasons.pickle"),
        ("teams", "teams.pickle"),
    )

    def __init__(self, input_path):
        self.input_path = Path(input_path)

        for attribute, file_name in self._PICKLED_TABLES:
            setattr(self, attribute, pd.read_pickle(self.input_path / file_name))

        # test
        self.example_test = pd.read_csv(self.input_path / "example_test.csv")
        self.sample_submission = pd.read_csv(self.input_path / "example_sample_submission.csv")

# Load every table once; the feature blocks and update_dataset() read and modify this shared instance.
mlb_train_ds = MLBDataset(CFG.INPUT_DIR)

## Feature blocks

In [12]:
def merge_by_key(left: Union[pd.DataFrame, pd.Series], right: pd.DataFrame, on=CFG.OBJECT_ID) -> pd.DataFrame:
    """Left-join ``right`` onto the key column(s) of ``left`` and return only the joined columns.

    Args:
        left: Frame whose ``on`` column(s) supply the keys, or a Series that
            already is the key column.
        right: Frame holding the values to attach; must be joinable on ``on``.
        on: Key column name or list of names.

    Returns:
        The merge result with the key column(s) dropped.
    """
    keys = left if isinstance(left, pd.Series) else left[on]
    joined = pd.merge(keys, right, on=on, how="left")
    return joined.drop(columns=on)


class BaseBlock(object):
    """Base class of all feature blocks.

    Subclasses must override ``transform``; ``fit`` may be overridden when
    state has to be learned before transforming.
    """

    def fit(self, input_df: pd.DataFrame, y=None) -> pd.DataFrame:
        """Default fit: no state to learn, so just delegate to ``transform``."""
        return self.transform(input_df)

    def transform(self, input_df: pd.DataFrame) -> pd.DataFrame:
        """Build the feature frame for ``input_df``.

        Raises:
            NotImplementedError: Always, unless overridden by a subclass.
        """
        # Raise (not return) so that a missing override fails loudly instead
        # of handing an exception instance to the caller as if it were data.
        raise NotImplementedError()


# class LagBlock(BaseBlock):
#     def __init__(self, column: str, periods: List[int]):
#         self.column = column
#         self.periods = periods

#     def fit(self, input_df: pd.DataFrame, y=None):
#         agg_list = [input_df.groupby(["playerId"], as_index=False)[self.column].transform(lambda x: x.shift(periods=period)).add_prefix(f"{period}_") for period in self.periods]
#         self.agg_df = pd.concat(agg_list, axis=1)

#         return self.transform(input_df)

#     def transform(self, input_df: pd.DataFrame):
#         return self.agg_df.add_prefix("Lag_")


def quantile25(x: pd.Series):
    """Return the 25th percentile of ``x`` (used as a named aggregation function)."""
    first_quartile = x.quantile(0.25)
    return first_quartile


def quantile75(x: pd.Series):
    """Return the 75th percentile of ``x`` (used as a named aggregation function)."""
    third_quartile = x.quantile(0.75)
    return third_quartile


class TargetAggregateBlock(BaseBlock):
    """Per-player summary statistics of every target column.

    The statistics are computed over all of ``mlb_train_ds.train_next``,
    independently of the frame passed to ``fit``.
    """

    def fit(self, input_df: pd.DataFrame, y=None):
        """Compute the per-player aggregates, store them, and return the features for ``input_df``."""
        statistics = ["mean", "std", "max", "min", "median", quantile25, quantile75]
        per_player = mlb_train_ds.train_next.groupby(["playerId"])

        frames = []
        for target in CFG.TARGETS:
            # Columns come out as "<target>_<statistic>".
            frames.append(per_player[target].agg(statistics).add_prefix(f"{target}_"))
        self.agg_df = pd.concat(frames, axis=1).reset_index()

        return self.transform(input_df)

    def transform(self, input_df: pd.DataFrame):
        """Attach the stored aggregates to each row of ``input_df`` via ``playerId``."""
        features = merge_by_key(input_df, self.agg_df, on="playerId")
        return features


class PlayersLabelEncodingBlock(BaseBlock):
    """Ordinal-encode selected columns of the players table and join them by ``playerId``.

    Args:
        columns: Columns of ``mlb_train_ds.players`` to encode.
    """

    def __init__(self, columns: List[str]):
        self.columns = columns + ["playerId"]
        self.labeled_df = mlb_train_ds.players[self.columns].copy()
        # Keep a lower-case duplicate of the id: merge_by_key drops the join
        # key ("playerId"), so this copy is what survives as a feature.
        self.labeled_df["playerid"] = self.labeled_df["playerId"]
        self.columns = self.columns + ["playerid"]
        self.encoder = None

    def fit(self, input_df: pd.DataFrame, y=None):
        """Fit the encoder on the players table, encode it in place, and return the features."""
        # "value" is the documented option name (the previous "values" was a
        # typo) and matches the setting used by RostersLabelEncodingBlock.
        self.encoder = ce.OrdinalEncoder(handle_unknown="value", handle_missing="value")
        self.encoder.fit(self.labeled_df[self.columns])
        self.labeled_df[self.columns] = self.encoder.transform(self.labeled_df[self.columns])

        return self.transform(input_df)

    def transform(self, input_df: pd.DataFrame):
        """Join the encoded player columns onto ``input_df`` with a ``Players_LE_`` prefix."""
        return merge_by_key(input_df, self.labeled_df, on="playerId").add_prefix("Players_LE_")


class RostersLabelEncodingBlock(BaseBlock):
    """Ordinal-encode selected roster columns and join them by (playerId, date).

    Args:
        columns: Columns of ``mlb_train_ds.train_rosters`` to encode.
    """

    def __init__(self, columns: List[str]):
        self.columns = columns
        self.encoder = None

    def fit(self, input_df: pd.DataFrame, y=None):
        """Fit the encoder on the current roster table, then return the features for ``input_df``."""
        self.encoder = ce.OrdinalEncoder(handle_unknown="value", handle_missing="value")
        self.encoder.fit(mlb_train_ds.train_rosters[self.columns])

        return self.transform(input_df)

    def transform(self, input_df: pd.DataFrame):
        """Encode the roster table as it is now and join it onto ``input_df``.

        The table is re-read from ``mlb_train_ds`` on every call, so rows
        appended after ``fit`` are picked up.
        """
        wanted = CFG.OBJECT_ID + self.columns
        self.labeled_df = mlb_train_ds.train_rosters[wanted].copy()
        encoded = self.encoder.transform(self.labeled_df[self.columns])
        self.labeled_df[self.columns] = encoded

        features = merge_by_key(input_df, self.labeled_df)
        return features.add_prefix("Rosters_LE_")

    
class PlayerBoxScoresCountBlock(BaseBlock):
    """Sum selected player box-score columns per (playerId, date).

    Args:
        columns: Columns of ``mlb_train_ds.train_scores`` to sum.
    """

    def __init__(self, columns: List[str]):
        self.columns = columns

    def transform(self, input_df: pd.DataFrame):
        """Recompute the per-player-day sums and join them onto ``input_df``."""
        grouped = mlb_train_ds.train_scores.groupby(CFG.OBJECT_ID)[self.columns]
        self.agg_df = grouped.sum().reset_index()

        features = merge_by_key(input_df, self.agg_df)
        return features.add_prefix("PlayerBoxScoresCount_")


class TeamBoxScoresCountBlock(BaseBlock):
    """Sum selected team box-score columns per (teamId, date) and map them to players.

    Args:
        columns: Columns of ``mlb_train_ds.train_tbs`` to sum.
    """

    def __init__(self, columns: List[str]):
        self.columns = columns

    def transform(self, input_df: pd.DataFrame):
        """Look up each player's team for the day in the rosters, then join that team's sums."""
        team_day = ["teamId", "date"]

        roster_keys = mlb_train_ds.train_rosters[["date", "playerId", "teamId"]]
        with_team = input_df.merge(roster_keys, on=["playerId", "date"], how="left")

        grouped = mlb_train_ds.train_tbs.groupby(team_day)[self.columns]
        self.agg_df = grouped.sum().reset_index()

        features = merge_by_key(with_team, self.agg_df, on=team_day)
        return features.add_prefix("TeamBoxScoresCount_")


class GamesCountBlock(BaseBlock):
    """Sum selected game-level columns over the games each player appears in on a date.

    Args:
        columns: Columns of ``mlb_train_ds.train_games`` to sum.
    """

    def __init__(self, columns: List[str]):
        self.columns = columns

    def transform(self, input_df: pd.DataFrame):
        """Link box-score rows to their games, sum per (date, playerId), and join onto ``input_df``."""
        appearances = mlb_train_ds.train_scores[["date", "playerId", "gamePk"]]
        games_df = appearances.merge(mlb_train_ds.train_games, on=["date", "gamePk"], how="left")

        grouped = games_df.groupby(["date", "playerId"])[self.columns]
        self.agg_df = grouped.sum().reset_index()

        features = merge_by_key(input_df, self.agg_df)
        return features.add_prefix("GamesCount_")


class PlayerTwitterFollowerAggregateBlock(BaseBlock):
    """Per-player summary statistics of ``numberOfFollowers``."""

    def fit(self, input_df: pd.DataFrame, y=None):
        """Aggregate the follower counts per player, store them, and return the features."""
        statistics = ["mean", "std", "max", "min", "median", quantile25, quantile75]
        followers = mlb_train_ds.train_ptf.groupby(["playerId"])["numberOfFollowers"]
        # playerId stays in the index here; the merge in transform() joins on it by name.
        self.agg_df = followers.agg(statistics)

        return self.transform(input_df)

    def transform(self, input_df: pd.DataFrame):
        """Join the stored aggregates onto ``input_df`` via ``playerId``."""
        features = merge_by_key(input_df, self.agg_df, on="playerId")
        return features.add_prefix("PlayerTwitterFollowerAggregate_")

In [13]:
def create_train_feature(input_df: pd.DataFrame, y: np.ndarray, update_blocks: list, non_update_blocks: list) -> pd.DataFrame:
    """Fit every feature block on ``input_df`` and concatenate their outputs column-wise.

    Blocks are processed in the order ``update_blocks + non_update_blocks``,
    which is also the column order of the result.

    Args:
        input_df: Key frame handed to each block's ``fit``.
        y: Target values forwarded to each block's ``fit``.
        update_blocks: Blocks that are re-fitted at inference time.
        non_update_blocks: Blocks that are only transformed at inference time.

    Returns:
        The combined feature frame (an empty frame when no blocks are given).

    Raises:
        AssertionError: If a block returns a different number of rows than ``input_df``.
        Exception: Whatever a block's ``fit`` raises is re-raised unchanged.
    """
    feature_parts = []

    for block in update_blocks + non_update_blocks:
        with timer(name=f"{str(block) + '_fit'}", logger=logger):
            try:
                out_feat_block = block.fit(input_df, y=y)
            except Exception:
                print(f"Error on {block} fit.")
                # Bare raise keeps the original traceback without making the
                # exception its own __cause__.
                raise

            assert len(out_feat_block) == len(input_df), block

        feature_parts.append(out_feat_block)

    # One concat at the end avoids re-copying the growing frame per block.
    if not feature_parts:
        return pd.DataFrame()
    return pd.concat(feature_parts, axis=1)


def create_test_feature(input_df: pd.DataFrame, update_blocks: list, non_update_blocks: list) -> pd.DataFrame:
    """Build inference features: re-fit the update blocks, transform the others.

    Columns are ordered update blocks first, then non-update blocks, which
    matches the column order produced by ``create_train_feature``.

    Args:
        input_df: Key frame handed to each block.
        update_blocks: Blocks whose ``fit`` is called again on ``input_df``.
        non_update_blocks: Blocks whose ``transform`` is called.

    Returns:
        The combined feature frame (an empty frame when no blocks are given).

    Raises:
        AssertionError: If a block returns a different number of rows than ``input_df``.
        Exception: Whatever a block raises is re-raised unchanged.
    """
    feature_parts = []

    for block in update_blocks:
        with timer(name=f"{str(block) + '_update'}", logger=logger):
            try:
                out_feat_block = block.fit(input_df)
            except Exception:
                # This branch calls fit(), so report it as an update failure.
                print(f"Error on {block} update.")
                raise

            # Same row-count guard as for the non-update blocks below.
            assert len(out_feat_block) == len(input_df), block

        feature_parts.append(out_feat_block)

    for block in non_update_blocks:
        with timer(name=f"{str(block) + '_transform'}", logger=logger):
            try:
                out_feat_block = block.transform(input_df)
            except Exception:
                print(f"Error on {block} transform.")
                raise

            assert len(out_feat_block) == len(input_df), block

        feature_parts.append(out_feat_block)

    # One concat at the end avoids re-copying the growing frame per block.
    if not feature_parts:
        return pd.DataFrame()
    return pd.concat(feature_parts, axis=1)

## CV

In [14]:
def get_timeseries_holdout(train_df: pd.DataFrame, valid_start_date: int) -> List[tuple]:
    """Split rows into a single train/validation fold by date.

    Args:
        train_df: Frame with a ``date`` column comparable to ``valid_start_date``.
        valid_start_date: First date that belongs to the validation part.

    Returns:
        A one-element list ``[(train_idx, valid_idx)]`` of integer arrays.
        The arrays hold row *positions*, so they can index NumPy arrays built
        from ``train_df`` even when its index is not 0..n-1. For a default
        RangeIndex the result equals the index labels.
    """
    dates = train_df["date"]
    # Two separate comparisons so rows with a missing date land in neither part.
    train_idx = np.flatnonzero((dates < valid_start_date).to_numpy())
    valid_idx = np.flatnonzero((dates >= valid_start_date).to_numpy())

    return [(train_idx, valid_idx)]

## Model

In [15]:
class LightGBMTrainer:
    """Train one LightGBM model per (seed, fold) and average their predictions.

    Args:
        params: LightGBM parameter dict. It is never modified; the seed is
            injected into a per-run copy.
        seeds: Random seeds; one full pass over ``cv`` is trained per seed.
    """

    def __init__(self, params: dict, seeds: List[int]):
        self.params = params
        self.seeds = seeds
        self.models = []

    def fit(self, X_train: np.ndarray, y_train: np.ndarray, cv: List[tuple]):
        """Train on every fold for every seed and compute the out-of-fold MAE.

        Args:
            X_train: Feature matrix.
            y_train: Target vector aligned with ``X_train``.
            cv: List of ``(train_idx, valid_idx)`` positional index arrays.

        Returns:
            ``(oof_score, models)``: the MAE between the targets and the
            out-of-fold predictions (averaged over seeds, evaluated on every
            row that was in some validation fold), and the list of all
            trained boosters.

        Raises:
            ValueError: If ``seeds`` or ``cv`` is empty.
        """
        if len(self.seeds) == 0 or len(cv) == 0:
            raise ValueError("at least one seed and one cv fold are required")

        # Accumulate predictions per row so that several folds with different
        # validation rows (and several seeds per row) are scored correctly.
        oof_sum = np.zeros(len(y_train), dtype=float)
        oof_count = np.zeros(len(y_train), dtype=int)

        for seed in self.seeds:
            # Copy instead of writing the seed into the caller's dict.
            params = {**self.params, "seed": seed}

            for train_idx, valid_idx in cv:
                X_train_fold = X_train[train_idx]
                X_valid_fold = X_train[valid_idx]

                y_train_fold = y_train[train_idx]
                y_valid_fold = y_train[valid_idx]

                train_set = lgb.Dataset(X_train_fold, y_train_fold)
                valid_set = lgb.Dataset(X_valid_fold, y_valid_fold, reference=train_set)

                # NOTE(review): `verbose_eval` was removed in LightGBM 4.0; use
                # callbacks=[lgb.log_evaluation(100)] when upgrading.
                model = lgb.train(
                    params=params,
                    train_set=train_set,
                    valid_sets=[train_set, valid_set],
                    verbose_eval=100,
                )

                y_oof = model.predict(X_valid_fold, num_iteration=model.best_iteration)
                oof_sum[valid_idx] += y_oof
                oof_count[valid_idx] += 1
                self.models.append(model)

        scored = oof_count > 0
        oof_pred = oof_sum[scored] / oof_count[scored]
        oof_score = mean_absolute_error(y_train[scored], oof_pred)

        return oof_score, self.models

    def predict(self, X_test: np.ndarray):
        """Return the mean prediction of all trained models for ``X_test``."""
        y_pred = np.mean([model.predict(X_test) for model in self.models], axis=0)

        return y_pred


def run_lgb(X_train, targets, params, seeds, fold):
    """Train a ``LightGBMTrainer`` for each target vector and print the scores.

    Args:
        X_train: Feature matrix shared by all targets.
        targets: Sequence of target vectors, one per target.
        params: LightGBM parameter dict passed to every trainer.
        seeds: Seeds passed to every trainer.
        fold: CV split list passed to every trainer's ``fit``.

    Returns:
        A list with, for each target, the list of models its trainer produced.
    """
    models_per_target = []
    scores_per_target = []

    for target_no, y_target in enumerate(targets, start=1):
        print(f"Traning for Target{target_no}")
        score, fitted_models = LightGBMTrainer(params, seeds).fit(X_train, y_target, fold)
        scores_per_target.append(score)
        models_per_target.append(fitted_models)
        print(f"Local Target{target_no} OOF-MAE : {np.mean(score)}")
        print("-"*50)

    # Mean of the per-target MAEs.
    print(f"Local MCMAE : {np.mean(scores_per_target)}")

    return models_per_target

## Training

In [16]:
# set-up logger
# (create_train_feature / create_test_feature read this module-level `logger`)
logger = get_logger()

# create feature
# No blocks are re-fitted at inference time in this configuration.
update_blocks = []
# Feature blocks that are fitted once here and only transformed at inference time.
# Commented-out names inside the column lists are columns deliberately left out.
non_update_blocks = [
    TargetAggregateBlock(),
    RostersLabelEncodingBlock(columns=[
        "teamId",
        "status"
        ]
    ),
    PlayersLabelEncodingBlock(columns=[
        "birthCountry",
        "primaryPositionCode",
    ]),
    PlayerBoxScoresCountBlock(columns=[
        # 'date',
        'home',
        # 'gamePk',
        # 'gameDate',
        # 'gameTimeUTC',
        # 'teamId',
        # 'teamName',
        # 'playerId',
        # 'playerName',
        'jerseyNum',
        # 'positionCode',
        # 'positionName',
        # 'positionType',
        'battingOrder',
        'gamesPlayedBatting',
        'flyOuts',
        'groundOuts',
        'runsScored',
        'doubles',
        'triples',
        'homeRuns',
        'strikeOuts',
        'baseOnBalls',
        'intentionalWalks',
        'hits',
        'hitByPitch',
        'atBats',
        'caughtStealing',
        'stolenBases',
        'groundIntoDoublePlay',
        'groundIntoTriplePlay',
        'plateAppearances',
        'totalBases',
        'rbi',
        'leftOnBase',
        'sacBunts',
        'sacFlies',
        'catchersInterference',
        'pickoffs',
        'gamesPlayedPitching',
        'gamesStartedPitching',
        'completeGamesPitching',
        'shutoutsPitching',
        'winsPitching',
        'lossesPitching',
        'flyOutsPitching',
        'airOutsPitching',
        'groundOutsPitching',
        'runsPitching',
        'doublesPitching',
        'triplesPitching',
        'homeRunsPitching',
        'strikeOutsPitching',
        'baseOnBallsPitching',
        'intentionalWalksPitching',
        'hitsPitching',
        'hitByPitchPitching',
        'atBatsPitching',
        'caughtStealingPitching',
        'stolenBasesPitching',
        'inningsPitched',
        'saveOpportunities',
        'earnedRuns',
        'battersFaced',
        'outsPitching',
        'pitchesThrown',
        'balls',
        'strikes',
        'hitBatsmen',
        'balks',
        'wildPitches',
        'pickoffsPitching',
        'rbiPitching',
        'gamesFinishedPitching',
        'inheritedRunners',
        'inheritedRunnersScored',
        'catchersInterferencePitching',
        'sacBuntsPitching',
        'sacFliesPitching',
        'saves',
        'holds',
        'blownSaves',
        'assists',
        'putOuts',
        'errors',
        'chances'
        ]
    ),
    TeamBoxScoresCountBlock(columns=[
        # 'date',
        # 'home',
        # 'teamId',
        # 'gamePk',
        # 'gameDate',
        # 'gameTimeUTC',
        'flyOuts',
        'groundOuts',
        'runsScored',
        'doubles',
        'triples',
        'homeRuns',
        'strikeOuts',
        'baseOnBalls',
        'intentionalWalks',
        'hits',
        'hitByPitch',
        'atBats',
        'caughtStealing',
        'stolenBases',
        'groundIntoDoublePlay',
        'groundIntoTriplePlay',
        'plateAppearances',
        'totalBases',
        'rbi',
        'leftOnBase',
        'sacBunts',
        'sacFlies',
        'catchersInterference',
        'pickoffs',
        'airOutsPitching',
        'groundOutsPitching',
        'runsPitching',
        'doublesPitching',
        'triplesPitching',
        'homeRunsPitching',
        'strikeOutsPitching',
        'baseOnBallsPitching',
        'intentionalWalksPitching',
        'hitsPitching',
        'hitByPitchPitching',
        'atBatsPitching',
        'caughtStealingPitching',
        'stolenBasesPitching',
        'inningsPitched',
        'earnedRuns',
        'battersFaced',
        'outsPitching',
        'hitBatsmen',
        'balks',
        'wildPitches',
        'pickoffsPitching',
        'rbiPitching',
        'inheritedRunners',
        'inheritedRunnersScored',
        'catchersInterferencePitching',
        'sacBuntsPitching',
        'sacFliesPitching'
        ]
    ),
    GamesCountBlock(columns=[
        # 'date',
        # 'gamePk',
        # 'gameType',
        # 'season',
        # 'gameDate',
        # 'gameTimeUTC',
        # 'resumeDate',
        # 'resumedFrom',
        # 'codedGameState',
        # 'detailedGameState',
        'isTie',
        'gameNumber',
        'doubleHeader',
        # 'dayNight',
        'scheduledInnings',
        'gamesInSeries',
        # 'seriesDescription',
        # 'homeId',
        # 'homeName',
        # 'homeAbbrev',
        'homeWins',
        'homeLosses',
        'homeWinPct',
        # 'homeWinner',
        'homeScore',
        # 'awayId',
        # 'awayName',
        # 'awayAbbrev',
        'awayWins',
        'awayLosses',
        'awayWinPct',
        # 'awayWinner',
        'awayScore'
        ]
    ),
    PlayerTwitterFollowerAggregateBlock(),
]

# `y` is forwarded to every block's fit(); none of the blocks defined above read it.
y = mlb_train_ds.train_next["target1"].values
# Only the key columns (playerId, date) are handed to the feature blocks.
input_df = mlb_train_ds.train_next[CFG.OBJECT_ID].copy()
train_feat_df = create_train_feature(input_df, y, update_blocks, non_update_blocks)
X_train = train_feat_df.values
# One target vector per entry of CFG.TARGETS, all trained on the same feature matrix.
targets = [mlb_train_ds.train_next[col].values for col in CFG.TARGETS]

# split train/valid
# Rows dated 20210401 or later form the validation part.
fold = get_timeseries_holdout(input_df, valid_start_date=20210401)

# training
# `models` is a list (one entry per target) of lists of trained boosters.
models = run_lgb(X_train, targets, CFG.LGB_PARAMS, CFG.SEEDS, fold)

[2021-07-07 23:26:26,809] [INFO] [logger set up]
[2021-07-07 23:26:26,914] [INFO] [<<__main__.TargetAggregateBlock object at 0x1c2238640>_fit> start]
[2021-07-07 23:26:36,203] [INFO] [<<__main__.TargetAggregateBlock object at 0x1c2238640>_fit> done in 9.29 s]
[2021-07-07 23:26:36,267] [INFO] [<<__main__.RostersLabelEncodingBlock object at 0x1c22384f0>_fit> start]
[2021-07-07 23:26:37,381] [INFO] [<<__main__.RostersLabelEncodingBlock object at 0x1c22384f0>_fit> done in 1.11 s]
[2021-07-07 23:26:37,598] [INFO] [<<__main__.PlayersLabelEncodingBlock object at 0x1c221f760>_fit> start]
[2021-07-07 23:26:37,865] [INFO] [<<__main__.PlayersLabelEncodingBlock object at 0x1c221f760>_fit> done in 0.27 s]
[2021-07-07 23:26:38,093] [INFO] [<<__main__.PlayerBoxScoresCountBlock object at 0x1c221fa60>_fit> start]
[2021-07-07 23:26:40,127] [INFO] [<<__main__.PlayerBoxScoresCountBlock object at 0x1c221fa60>_fit> done in 2.03 s]
[2021-07-07 23:26:40,698] [INFO] [<<__main__.TeamBoxScoresCountBlock object a



Training until validation scores don't improve for 100 rounds
[100]	training's l1: 0.51266	valid_1's l1: 0.700532
[200]	training's l1: 0.512409	valid_1's l1: 0.699725
[300]	training's l1: 0.512369	valid_1's l1: 0.699531
Early stopping, best iteration is:
[277]	training's l1: 0.512371	valid_1's l1: 0.699519
Local Target1 OOF-MAE : 0.6995188802843717
--------------------------------------------------
Traning for Target2




Training until validation scores don't improve for 100 rounds
[100]	training's l1: 1.7603	valid_1's l1: 1.51034
Early stopping, best iteration is:
[35]	training's l1: 1.77213	valid_1's l1: 1.5026
Local Target2 OOF-MAE : 1.5025952777897913
--------------------------------------------------
Traning for Target3




Training until validation scores don't improve for 100 rounds
[100]	training's l1: 0.668506	valid_1's l1: 0.55332
[200]	training's l1: 0.66835	valid_1's l1: 0.553266
Early stopping, best iteration is:
[153]	training's l1: 0.668388	valid_1's l1: 0.553239
Local Target3 OOF-MAE : 0.5532389298829665
--------------------------------------------------
Traning for Target4




Training until validation scores don't improve for 100 rounds
[100]	training's l1: 0.792593	valid_1's l1: 1.13368
[200]	training's l1: 0.791798	valid_1's l1: 1.1324
[300]	training's l1: 0.791756	valid_1's l1: 1.1324
Early stopping, best iteration is:
[222]	training's l1: 0.791783	valid_1's l1: 1.13233
Local Target4 OOF-MAE : 1.1323322614765632
--------------------------------------------------
Local MCMAE : 0.9719213373584232


In [17]:
def visualize_feature_importance(models, feat_train_df) -> None:
    """Show a box plot of the top-50 LightGBM gain importances for each target.

    Args:
        models: One list of trained boosters per target (as returned by ``run_lgb``).
        feat_train_df: Training feature frame; its columns supply the feature names.

    Returns:
        None. Each figure is displayed with ``fig.show()``.
    """
    for target_i, target_models in enumerate(models):
        # One row per (model, feature); built in a single concat.
        feature_importance_df = pd.concat(
            [
                pd.DataFrame({
                    'feature_importance': model.feature_importance(importance_type="gain"),
                    'feature': feat_train_df.columns,
                    'model_no': model_no,
                })
                for model_no, model in enumerate(target_models, start=1)
            ],
            axis=0, ignore_index=True)

        # Top 50 features by mean gain across this target's models.
        # NOTE: the name `order` is referenced from the query string below.
        order = feature_importance_df.groupby('feature')[['feature_importance']]\
            .mean()\
            .sort_values('feature_importance', ascending=False).index[:50]

        fig = px.box(
            feature_importance_df.query("feature in @order"),
            x="feature_importance",
            y="feature",
            category_orders={"feature": order},
            width=1250,
            height=900,
            title=f"Target{target_i+1} Top 50 feature importance",
        )
        fig.update_yaxes(showgrid=True)
        fig.show()

In [18]:
# Display one importance plot per target for the models trained above.
visualize_feature_importance(models, train_feat_df)

## Inference

In [None]:
def _read_daily_json(input_df: pd.DataFrame, column: str):
    """Parse the JSON string in the first row of ``column``; return None when that cell is missing."""
    import io  # local import so this block does not depend on the file's import section

    payload = input_df[column].iloc[0]
    if pd.isna(payload):
        return None
    # Wrap in StringIO: passing a literal JSON string to read_json is deprecated in recent pandas.
    return pd.read_json(io.StringIO(payload))


def _nan_rows_for_players(sample_prediction_df: pd.DataFrame, template: pd.DataFrame) -> pd.DataFrame:
    """Build one all-NaN row per player to predict, with the same columns as ``template``."""
    frame = sample_prediction_df[["playerId"]].copy()
    for col in template.columns:
        if col != "playerId":
            frame[col] = np.nan
    return frame


def update_dataset(input_df: pd.DataFrame, sample_prediction_df: pd.DataFrame, train_end_date: int = 20210430):
    """Append one day of test data to the tables held by ``mlb_train_ds``.

    The rosters, playerBoxScores, teamBoxScores and games cells of the first
    row of ``input_df`` are parsed from JSON. When a cell is missing, a
    placeholder frame is used instead (NaN rows per player for the two player
    tables, one row per known team for team box scores, an empty frame for
    games). Each daily frame is stamped with the row's date and, if that date
    is later than ``train_end_date``, concatenated onto the matching table and
    passed through ``reduce_mem_usage``.

    Args:
        input_df: Test frame whose first row carries ``date`` and the nested JSON columns.
        sample_prediction_df: Frame with a ``playerId`` column listing the players to predict.
        train_end_date: Dates up to and including this value are not appended.
            NOTE(review): presumably the last date already contained in the
            training tables -- confirm.

    Returns:
        None. ``mlb_train_ds`` is modified in place.
    """
    date = input_df.iloc[0]["date"]

    def _append(attribute: str, daily: pd.DataFrame) -> None:
        # Stamp the date, then grow the stored table only for days after the cutoff.
        daily["date"] = date
        if date > train_end_date:
            combined = pd.concat([getattr(mlb_train_ds, attribute), daily], axis=0, ignore_index=True)
            setattr(mlb_train_ds, attribute, reduce_mem_usage(combined))

    ####################
    # rosters #
    ####################
    test_rosters = _read_daily_json(input_df, "rosters")
    if test_rosters is None:
        test_rosters = _nan_rows_for_players(sample_prediction_df, mlb_train_ds.train_rosters)
    _append("train_rosters", test_rosters)

    ####################
    # playerBoxScores #
    ####################
    test_scores = _read_daily_json(input_df, "playerBoxScores")
    if test_scores is None:
        test_scores = _nan_rows_for_players(sample_prediction_df, mlb_train_ds.train_scores)
    _append("train_scores", test_scores)

    ####################
    # teamBoxScores #
    ####################
    test_teamBoxScores = _read_daily_json(input_df, "teamBoxScores")
    if test_teamBoxScores is None:
        cols = mlb_train_ds.train_tbs.drop(columns="date").columns.tolist()
        test_teamBoxScores = pd.DataFrame(columns=cols)
        # One (otherwise empty) row per team seen so far.
        test_teamBoxScores["teamId"] = mlb_train_ds.train_tbs.teamId.unique()
    _append("train_tbs", test_teamBoxScores)

    ####################
    # games #
    ####################
    test_games = _read_daily_json(input_df, "games")
    if test_games is None:
        cols = mlb_train_ds.train_games.drop(columns="date").columns.tolist()
        test_games = pd.DataFrame(columns=cols)
    _append("train_games", test_games)

In [None]:
# env = mlb.make_env()
# iter_test = env.iter_test()

# for (test_df, sample_prediction_df) in iter_test:
#     sample_prediction_df = sample_prediction_df.reset_index(drop=True)
#     # create dataset
#     test_df = test_df.reset_index()
#     test_df = test_df.rename(columns={"index": "date"})
#     sample_prediction_df["date"] = test_df.iloc[0]["date"]
#     sample_prediction_df["playerId"] = sample_prediction_df["date_playerId"].map(lambda x: int(x.split("_")[1]))
#     update_dataset(test_df, sample_prediction_df)
#     # create features
#     test_feat_df = create_test_feature(sample_prediction_df, update_blocks, non_update_blocks)
#     X_test = test_feat_df.values
#     # prediction
#     for target, model in zip(CFG.TARGETS, models):
#         pred = np.mean([model_.predict(X_test) for model_ in model], axis=0)
#         sample_prediction_df[target] = np.clip(pred, 0, 100)

#     sample_prediction_df = sample_prediction_df.drop(columns=["playerId", "date"], axis=1)

#     env.predict(sample_prediction_df)