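# Overview
- LightGBM baseline for the MLB Player Digital Engagement Forecasting data: one regressor per target (`target1`–`target4`), trained with several seeds and validated on a single date-based hold-out split.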

In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import mean_absolute_error
from datetime import timedelta
from tqdm.notebook import tqdm
import lightgbm as lgb
from typing import List, Union, Optional
import time
from contextlib import contextmanager
import sys
import logging
import category_encoders as ce
# import mlb

## Config

In [2]:
class CFG:
    """Notebook-wide configuration: input location, key/target columns and LightGBM settings."""
    ######################
    # global #
    ######################
    # Directory that MLBDataset reads the competition files from.
    INPUT_DIR = "../input/mlb-player-digital-engagement-forecasting"
    # INPUT_DIR = "../input/mlb-unnested-dataset" # for kaggle kernel
    # Default join key used by merge_by_key.
    OBJECT_ID = ["playerId", "date"]
    # Target columns; run_lgb is called with one label array per entry.
    TARGETS = ["target1", "target2", "target3", "target4"]
    ######################
    # model #
    ######################
    # Parameters handed to lgb.train for every target and every seed.
    LGB_PARAMS = {
        "objective": "regression",
        "metric": "mae",
        "boosting_type": "gbdt",
        "learning_rate": 0.01,
        "max_depth": 6,
        # int(64 * 0.8) == 51; presumably 80% of 2**max_depth (= 64) - confirm intent.
        "num_leaves": int(64 * 0.8),
        "lambda_l1": 0.1,
        "lambda_l2": 0.1,
        "bagging_fraction": 0.9,
        "bagging_freq": 3,
        "feature_fraction": 0.9,
        "min_data_in_leaf": 20,
        "num_threads": 8,
        "verbosity": -1,
        # Upper bound on boosting rounds; early stopping below ends training once the
        # validation metric has not improved for 100 rounds.
        "num_iterations": 10000,
        "early_stopping_round": 100,
    }
    # LightGBMTrainer.fit trains one model per seed (per fold).
    SEEDS = [2434, 42]

## Utils

In [3]:
def get_logger(out_file=None):
    """Configure and return the root logger.

    Every call resets the root logger so that re-running the cell does not
    stack duplicate handlers. Handlers installed earlier are removed *and
    closed*, so a log file opened by a previous call is released instead of
    leaking its file descriptor.

    Args:
        out_file: Optional path of a log file. When given, messages are written
            to that file in addition to stdout.

    Returns:
        The root ``logging.Logger`` with level INFO, one stdout handler and,
        if ``out_file`` is given, one file handler.
    """
    logger = logging.getLogger()  # root logger
    # Format applied to every emitted record.
    formatter = logging.Formatter("[%(asctime)s] [%(levelname)s] [%(message)s]")

    # Reset the handlers; iterate over a copy because removeHandler mutates the list.
    for old_handler in list(logger.handlers):
        logger.removeHandler(old_handler)
        old_handler.close()  # releases the underlying file for FileHandler
    logger.setLevel(logging.INFO)

    # Console output (stdout).
    handler = logging.StreamHandler(sys.stdout)
    handler.setFormatter(formatter)
    handler.setLevel(logging.INFO)
    logger.addHandler(handler)

    # Optional file output.
    if out_file is not None:
        fh = logging.FileHandler(out_file)
        fh.setFormatter(formatter)
        fh.setLevel(logging.INFO)
        logger.addHandler(fh)

    logger.info("logger set up")
    return logger


@contextmanager
def timer(name: str, logger: Optional[logging.Logger] = None):
    """Report the wall-clock duration of the enclosed ``with`` body.

    A ``<name> start`` message is emitted on entry and a
    ``<name> done in X.XX s`` message after the body finishes normally.

    Args:
        name: Label placed inside the angle brackets of both messages.
        logger: Destination for the messages via ``logger.info``; when ``None``
            the messages are printed instead.
    """
    started_at = time.time()
    # Pick the sink once; both messages go to the same place.
    emit = print if logger is None else logger.info

    emit(f"<{name}> start")
    yield
    emit(f"<{name}> done in {time.time() - started_at:.2f} s")


def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` in place to narrower dtypes.

    The value range of each numeric column is probed against a ladder of
    dtypes, and the column is cast to the dtype one step *wider* than the
    first one whose range contains it (int8 range -> int16, float16 range ->
    float32, ...), leaving headroom. A cast is only applied when it actually
    shrinks the column: previously an int16/int32/float16/float32 column could
    be widened (e.g. float32 -> float64), increasing memory use.

    Args:
        df: DataFrame to shrink. It is modified in place.
        verbose: When true, print the final size and the percentage saved.

    Returns:
        The same ``df`` object, for chaining.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # (dtype whose range is probed, dtype to cast to when the values fit strictly inside it)
    int_ladder = [(np.int8, np.int16), (np.int16, np.int32), (np.int32, np.int64)]
    float_ladder = [(np.float16, np.float32), (np.float32, np.float64)]

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            ladder, limits = int_ladder, np.iinfo
        else:
            ladder, limits = float_ladder, np.finfo

        target = None
        for probe, candidate in ladder:
            # Strict comparisons: boundary values fall through to the next step.
            if c_min > limits(probe).min and c_max < limits(probe).max:
                target = candidate
                break

        # Never widen a column: that would defeat the purpose of this function.
        if target is not None and np.dtype(target).itemsize < col_type.itemsize:
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the ratio in case the frame reports zero bytes.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

## Loading dataset

In [4]:
class MLBDataset(object):
    """Eagerly loads every competition table found under ``input_path``.

    Each table becomes an attribute on the instance: the daily training tables
    (``train_*``), the static lookup tables (``players``, ``awards``,
    ``seasons``, ``teams``) and the two example CSV files.

    NOTE: the ``.pickle`` files are read with ``pd.read_pickle``; unpickling
    can execute arbitrary code, so only point this at trusted data.
    """

    #################
    # train #
    #################
    # attribute name -> file stem of the pickled training table
    _TRAIN_TABLES = {
        "train_next": "train_nextDayPlayerEngagement",
        "train_rosters": "train_rosters",
        "train_ptf": "train_playerTwitterFollowers",
        "train_scores": "train_playerBoxScores",
        "train_games": "train_games",
        "train_standings": "train_standings",
        "train_tbs": "train_teamBoxScores",
        "train_ttf": "train_teamTwitterFollowers",
        "train_trans": "train_transactions",
        "train_awards": "train_awards",
        "train_events": "train_events",
    }
    #################
    # Additional #
    #################
    # attribute name == file stem for the static tables
    _STATIC_TABLES = ("players", "awards", "seasons", "teams")
    #################
    # test #
    #################
    # attribute name -> file stem of the example CSV files
    _CSV_TABLES = {
        "example_test": "example_test",
        "sample_submission": "example_sample_submission",
    }

    def __init__(self, input_path):
        self.input_path = Path(input_path)
        root = self.input_path

        for attr, stem in self._TRAIN_TABLES.items():
            setattr(self, attr, pd.read_pickle(root / f"{stem}.pickle"))

        for stem in self._STATIC_TABLES:
            setattr(self, stem, pd.read_pickle(root / f"{stem}.pickle"))

        for attr, stem in self._CSV_TABLES.items():
            setattr(self, attr, pd.read_csv(root / f"{stem}.csv"))

# Load every table once; the feature blocks and the training cell below read from this module-level `ds`.
ds = MLBDataset(CFG.INPUT_DIR)

## Feature blocks

In [5]:
def merge_by_key(left: Union[pd.DataFrame, pd.Series], right: pd.DataFrame, on=CFG.OBJECT_ID) -> pd.DataFrame:
    """Left-join ``right`` onto the key column(s) of ``left`` and return only the joined-in columns.

    Args:
        left: Frame whose ``on`` column(s) are used as the join keys, or a
            Series that is used as the key directly.
        right: Frame providing the columns to attach; must contain ``on``.
        on: Column name or list of column names to join on.

    Returns:
        The left-merge result with the ``on`` column(s) dropped.
    """
    # A DataFrame is narrowed to its key column(s); a Series is already the key.
    keys = left if isinstance(left, pd.Series) else left[on]
    joined = pd.merge(keys, right, on=on, how="left")
    return joined.drop(columns=on)


class BaseBlock(object):
    """Base class for feature blocks.

    A block turns an input frame into a frame of feature columns. Subclasses
    must override ``transform`` and may override ``fit`` when they need to
    learn state first.
    """

    def fit(self, input_df: pd.DataFrame, y=None) -> pd.DataFrame:
        """Learn any state from ``input_df`` and return its features.

        The default implementation is stateless and simply delegates to
        ``transform``; ``y`` is accepted for interface compatibility and ignored.
        """
        return self.transform(input_df)

    def transform(self, input_df: pd.DataFrame) -> pd.DataFrame:
        """Return the feature columns for ``input_df``.

        Raises:
            NotImplementedError: Always, unless overridden by a subclass.
        """
        # Raise (not return) so that a missing override fails loudly instead of
        # handing an exception instance back to the caller as if it were features.
        raise NotImplementedError()


class LabelEncodingBlock(BaseBlock):
    """Label-encodes the given columns into integer codes prefixed with ``LE_``.

    The encoder is an ``OrdinalEncoder``, matching both this block's name and
    output prefix and the sibling ``RostersLabelEncodingBlock``. (It was
    previously a ``CountEncoder``, which produces category frequencies rather
    than labels.)
    """

    def __init__(self, cols: List[str]):
        # Columns of the input frame to encode.
        self.cols = cols
        # Set by fit(); None until then.
        self.encoder = None

    def fit(self, input_df: pd.DataFrame, y=None):
        """Fit the encoder on ``input_df[self.cols]`` and return the encoded columns."""
        self.encoder = ce.OrdinalEncoder(handle_unknown="value", handle_missing="value")
        self.encoder.fit(input_df[self.cols])

        return self.transform(input_df)

    def transform(self, input_df: pd.DataFrame):
        """Encode ``input_df[self.cols]`` with the fitted encoder.

        Raises:
            RuntimeError: If called before ``fit``.
        """
        if self.encoder is None:
            raise RuntimeError("LabelEncodingBlock.transform called before fit")
        return self.encoder.transform(input_df[self.cols]).add_prefix("LE_").astype(int)


class LagBlock(BaseBlock):
    """Per-player lag features of one (or several) columns.

    For every requested period ``p`` the value of ``column`` is shifted by
    ``p`` rows within each ``playerId`` group; the result is exposed as
    ``Lag_{p}_{column}``.

    NOTE(review): the shift is by row position inside each group, so rows are
    presumably expected to be in chronological order per player - confirm
    against the caller.
    """

    def __init__(self, column: str, periods: List[int]):
        # Column name to lag (a list of names is also accepted).
        self.column = column
        # Row offsets to shift by.
        self.periods = periods

    def fit(self, input_df: pd.DataFrame, y=None):
        """Compute the lagged columns from ``input_df`` and cache them."""
        # Always select with a list so that the shift yields a DataFrame. With a
        # plain string the result was a Series, and Series.add_prefix renames the
        # *row labels*, which broke the row alignment of the concat below.
        columns = [self.column] if isinstance(self.column, str) else list(self.column)
        grouped = input_df.groupby(["playerId"])[columns]

        agg_list = [grouped.shift(periods=period).add_prefix(f"{period}_") for period in self.periods]
        self.agg_df = pd.concat(agg_list, axis=1)

        return self.transform(input_df)

    def transform(self, input_df: pd.DataFrame):
        """Return the lag features cached by ``fit``.

        ``input_df`` is not read: the frame computed at fit time is returned
        as is, so this is only meaningful for the frame that was fitted.
        """
        return self.agg_df.add_prefix("Lag_")


class TargetAggregateBlock(BaseBlock):
    """Per-player summary statistics of every target column.

    NOTE(review): the statistics are computed from the module-level
    ``ds.train_next`` (all rows), not from ``input_df``, so they also cover
    any rows later used for validation - confirm this is intended.
    """

    def fit(self, input_df: pd.DataFrame, y=None):
        """Build the per-player aggregate table and return the features for ``input_df``."""
        stat_names = ["mean", "std", "max", "min", "median"]
        per_player = ds.train_next.copy().groupby(["playerId"])

        # One block of "<target>_<stat>" columns per target, indexed by playerId.
        stat_frames = []
        for target in CFG.TARGETS:
            stat_frames.append(per_player[target].agg(stat_names).add_prefix(f"{target}_"))

        self.agg_df = pd.concat(stat_frames, axis=1).reset_index()

        return self.transform(input_df)

    def transform(self, input_df: pd.DataFrame):
        """Attach the cached per-player statistics to the rows of ``input_df``."""
        return merge_by_key(input_df, self.agg_df, on="playerId")


class RostersLabelEncodingBlock(BaseBlock):
    """Ordinal-encoded roster columns, joined onto the input by the object key.

    The roster table comes from the module-level ``ds.train_rosters``; the
    encoded columns are returned with an ``LE_`` prefix.
    """

    def __init__(self, columns: List[str]):
        # Roster columns to encode.
        self.columns = columns
        # Set by fit(); None until then.
        self.encoder = None

    def fit(self, input_df: pd.DataFrame, y=None):
        """Fit the encoder on the roster columns and cache the encoded roster table."""
        roster = ds.train_rosters[CFG.OBJECT_ID + self.columns].copy()

        encoder = ce.OrdinalEncoder(handle_unknown="value", handle_missing="value")
        encoder.fit(roster[self.columns])
        # Replace the raw values by their integer codes, keeping the key columns for the join.
        roster[self.columns] = encoder.transform(roster[self.columns])

        self.encoder = encoder
        self.labeled_df = roster

        return self.transform(input_df)

    def transform(self, input_df: pd.DataFrame):
        """Look up the encoded roster columns for each row of ``input_df``."""
        encoded = merge_by_key(input_df, self.labeled_df)
        return encoded.add_prefix("LE_")

In [6]:
def _run_blocks(input_df: pd.DataFrame, blocks: list, stage: str, run_block) -> pd.DataFrame:
    """Run ``run_block`` on every block and column-concatenate the results.

    Args:
        input_df: Frame the blocks are applied to; each block output must have
            the same number of rows.
        blocks: Feature blocks, applied in order.
        stage: ``"fit"`` or ``"transform"``; used in the timer name and the
            error message.
        run_block: Callable taking a block and returning its feature frame.

    Returns:
        All block outputs concatenated along the columns, or an empty frame
        when ``blocks`` is empty.
    """
    outputs = []

    for block in blocks:
        # Relies on the module-level `logger` created in the training cell.
        with timer(name=f"{block}_{stage}", logger=logger):
            try:
                out_feat_block = run_block(block)
            except Exception:
                print(f"Error on {block} {stage}.")
                # Bare raise keeps the original traceback without making the
                # exception its own cause.
                raise

            assert len(out_feat_block) == len(input_df), block

        outputs.append(out_feat_block)

    if not outputs:
        return pd.DataFrame()
    # Concatenate once instead of re-copying the growing frame on every block.
    return pd.concat(outputs, axis=1)


def create_feature(input_df: pd.DataFrame, y: np.ndarray, blocks: list) -> pd.DataFrame:
    """Fit every block on ``input_df`` (passing ``y``) and return the combined features."""
    return _run_blocks(input_df, blocks, "fit", lambda block: block.fit(input_df, y=y))


def transform_feature(input_df: pd.DataFrame, blocks: list) -> pd.DataFrame:
    """Apply already-fitted blocks to ``input_df`` and return the combined features."""
    return _run_blocks(input_df, blocks, "transform", lambda block: block.transform(input_df))

## CV

In [7]:
def get_simple_timeseries_split(train_df: pd.DataFrame, train_final_date: int) -> List[tuple]:
    """Single hold-out split by date.

    Rows with ``date < train_final_date`` form the training part, rows with
    ``date >= train_final_date`` the validation part.

    The returned arrays hold row *positions* (0-based), because the caller
    uses them to index plain NumPy arrays. For a frame with a default
    RangeIndex this equals the index labels returned previously; for any other
    index the labels would have selected the wrong rows.

    Args:
        train_df: Frame with a ``date`` column comparable to ``train_final_date``.
        train_final_date: First date that belongs to the validation part.

    Returns:
        A one-element list ``[(train_positions, valid_positions)]``.
    """
    dates = train_df["date"]
    train_idx = np.flatnonzero((dates < train_final_date).to_numpy())
    valid_idx = np.flatnonzero((dates >= train_final_date).to_numpy())

    return [(train_idx, valid_idx)]

## Model

In [8]:
class LightGBMTrainer:
    """Trains one LightGBM model per (seed, fold) and averages their predictions."""

    def __init__(self, params: dict, seeds: List[int]):
        # Base LightGBM parameters; never modified by this class.
        self.params = params
        # One model is trained per seed for every fold.
        self.seeds = seeds
        # Every model trained by fit(), in training order.
        self.models = []

    def fit(self, X_train: np.ndarray, y_train: np.ndarray, cv: List[tuple]):
        """Train the models and compute the out-of-fold MAE.

        Args:
            X_train: Feature matrix, indexed positionally by the fold arrays.
            y_train: Labels aligned with the rows of ``X_train``.
            cv: List of ``(train_idx, valid_idx)`` position arrays.

        Returns:
            ``(oof_score, models)``: the mean absolute error of the seed-averaged
            out-of-fold predictions over all validated rows, and the list of
            trained models.

        Raises:
            ValueError: If no row was validated (empty ``seeds`` or ``cv``).
        """
        # Out-of-fold predictions are accumulated per row so that folds of
        # different sizes and several seeds can be averaged correctly.
        oof_sum = np.zeros(len(y_train), dtype=float)
        oof_count = np.zeros(len(y_train), dtype=int)

        for seed in self.seeds:
            # Work on a copy: the caller's dict (e.g. CFG.LGB_PARAMS) is shared
            # between trainers and must not pick up a "seed" entry.
            params = {**self.params, "seed": seed}

            for train_idx, valid_idx in cv:
                X_train_fold = X_train[train_idx]
                X_valid_fold = X_train[valid_idx]

                y_train_fold = y_train[train_idx]
                y_valid_fold = y_train[valid_idx]

                train_set = lgb.Dataset(X_train_fold, y_train_fold)
                valid_set = lgb.Dataset(X_valid_fold, y_valid_fold, reference=train_set)

                # NOTE(review): recent LightGBM releases no longer accept
                # `verbose_eval` (replaced by the log_evaluation callback) -
                # check before upgrading the library.
                model = lgb.train(
                    params=params,
                    train_set=train_set,
                    valid_sets=[train_set, valid_set],
                    verbose_eval=100,
                )

                y_oof = model.predict(X_valid_fold, num_iteration=model.best_iteration)
                oof_sum[valid_idx] += y_oof
                oof_count[valid_idx] += 1
                self.models.append(model)

        validated = oof_count > 0
        if not validated.any():
            raise ValueError("fit needs at least one seed and one fold with validation rows")

        # Average over seeds, then score only rows that were actually validated.
        oof_pred = oof_sum[validated] / oof_count[validated]
        oof_score = mean_absolute_error(y_train[validated], oof_pred)

        return oof_score, self.models

    def predict(self, X_test: np.ndarray):
        """Return the mean prediction of all trained models for ``X_test``."""
        y_pred = np.mean([model.predict(X_test) for model in self.models], axis=0)

        return y_pred


def run_lgb(X_train, targets, params, seeds, fold):
    """Train one ``LightGBMTrainer`` per target on the same features and folds.

    Args:
        X_train: Feature matrix shared by all targets.
        targets: Iterable of label arrays, one per target.
        params: LightGBM parameters passed to every trainer.
        seeds: Seeds passed to every trainer.
        fold: Cross-validation folds passed to every trainer's ``fit``.

    Returns:
        ``(oof_all_score, all_targets_models)``: the out-of-fold score and the
        list of trained models for each target, in the order of ``targets``.
    """
    results = [LightGBMTrainer(params, seeds).fit(X_train, target, fold) for target in targets]

    oof_all_score = [score for score, _ in results]
    all_targets_models = [models for _, models in results]

    # Mean of the per-target MAEs.
    print(f"MCMAE : {np.mean(oof_all_score)}")

    return oof_all_score, all_targets_models

## Training

In [9]:
# Set up the root logger; create_feature / transform_feature read this module-level `logger`.
logger = get_logger()
# Feature blocks, applied in order and concatenated column-wise.
# NOTE(review): TargetAggregateBlock aggregates over all of ds.train_next, which
# includes the rows used for validation below - confirm this is acceptable.
blocks = [
    TargetAggregateBlock(),
    RostersLabelEncodingBlock(
        columns=["status"]
    ),
]

# `y` is forwarded to every block's fit(); neither block above reads it.
y = ds.train_next["target1"].values
feat_df = create_feature(ds.train_next, y, blocks)
# Hold-out split: rows with date < 20210401 train, rows with date >= 20210401 validate.
fold = get_simple_timeseries_split(ds.train_next, 20210401)

# One label array per target column, in CFG.TARGETS order.
targets = [ds.train_next[col].values for col in CFG.TARGETS]

# Train the per-target models on the shared feature matrix.
X_train = feat_df.values
oof_all_score, all_targets_models = run_lgb(X_train, targets, CFG.LGB_PARAMS, CFG.SEEDS, fold)


[2021-06-30 22:41:36,137] [INFO] [logger set up]
[2021-06-30 22:41:36,138] [INFO] [<<__main__.TargetAggregateBlock object at 0x124e051f0>_fit> start]
[2021-06-30 22:41:37,662] [INFO] [<<__main__.TargetAggregateBlock object at 0x124e051f0>_fit> done in 1.52 s]
[2021-06-30 22:41:37,708] [INFO] [<<__main__.RostersLabelEncodingBlock object at 0x124e055b0>_fit> start]
[2021-06-30 22:41:38,915] [INFO] [<<__main__.RostersLabelEncodingBlock object at 0x124e055b0>_fit> done in 1.21 s]




Training until validation scores don't improve for 100 rounds
[100]	training's l1: 0.848769	valid_1's l1: 1.03276
[200]	training's l1: 0.815121	valid_1's l1: 0.988445
[300]	training's l1: 0.803581	valid_1's l1: 0.986393
Early stopping, best iteration is:
[244]	training's l1: 0.808579	valid_1's l1: 0.98024
Training until validation scores don't improve for 100 rounds
[100]	training's l1: 0.849141	valid_1's l1: 1.03732
[200]	training's l1: 0.815387	valid_1's l1: 0.99554
[300]	training's l1: 0.803909	valid_1's l1: 1.00758
Early stopping, best iteration is:
[213]	training's l1: 0.8131	valid_1's l1: 0.994675




Training until validation scores don't improve for 100 rounds
[100]	training's l1: 2.29255	valid_1's l1: 2.31802
[200]	training's l1: 2.14793	valid_1's l1: 2.17484
[300]	training's l1: 2.10676	valid_1's l1: 2.13093
[400]	training's l1: 2.09246	valid_1's l1: 2.11715
[500]	training's l1: 2.08609	valid_1's l1: 2.11894
Early stopping, best iteration is:
[408]	training's l1: 2.09179	valid_1's l1: 2.11628
Training until validation scores don't improve for 100 rounds
[100]	training's l1: 2.292	valid_1's l1: 2.31608
[200]	training's l1: 2.14736	valid_1's l1: 2.17291
[300]	training's l1: 2.1072	valid_1's l1: 2.13023
[400]	training's l1: 2.0927	valid_1's l1: 2.11728
[500]	training's l1: 2.0861	valid_1's l1: 2.12893
Early stopping, best iteration is:
[414]	training's l1: 2.09133	valid_1's l1: 2.11589




Training until validation scores don't improve for 100 rounds
[100]	training's l1: 1.10508	valid_1's l1: 1.01369
[200]	training's l1: 1.06953	valid_1's l1: 0.998354
Early stopping, best iteration is:
[168]	training's l1: 1.07716	valid_1's l1: 0.995371
Training until validation scores don't improve for 100 rounds
[100]	training's l1: 1.10502	valid_1's l1: 1.01322
[200]	training's l1: 1.06982	valid_1's l1: 1.00613
Early stopping, best iteration is:
[156]	training's l1: 1.08091	valid_1's l1: 0.997106




Training until validation scores don't improve for 100 rounds


KeyboardInterrupt: 

## Inference

In [None]:
from io import StringIO


def _load_daily_table(raw_json, player_ids: pd.DataFrame, template_columns) -> pd.DataFrame:
    """Decode one nested JSON cell of the daily test frame.

    Args:
        raw_json: JSON string for the day, or NaN when the day has no data.
        player_ids: Frame with a ``playerId`` column; used to build the
            fallback table when ``raw_json`` is missing.
        template_columns: Column names the fallback table should have (taken
            from the matching training table).

    Returns:
        The decoded table, or - when ``raw_json`` is missing - one row per
        player with every other column set to NaN.
    """
    if pd.notna(raw_json):
        # Wrap in StringIO: passing a literal JSON string to read_json is deprecated.
        return pd.read_json(StringIO(raw_json))

    table = player_ids[["playerId"]].copy()
    for col in template_columns:
        if col == "playerId":
            continue
        table[col] = np.nan
    return table


# NOTE: this cell needs the Kaggle-only `mlb` module; `import mlb` is commented
# out in the import cell and must be enabled before running.
env = mlb.make_env()
iter_test = env.iter_test()

for (test_df, sample_prediction_df) in iter_test:
    sample_prediction_df = sample_prediction_df.reset_index(drop=True)
    # create dataset: "date_playerId" looks like "<date>_<playerId>"
    sample_prediction_df["playerId"] = sample_prediction_df["date_playerId"].map(lambda x: int(x.split("_")[1]))

    # Dealing with missing values: a day without data arrives as NaN instead of JSON.
    # (The fallback previously referenced an undefined `sample_prediction`.)
    test_rosters = _load_daily_table(
        test_df["rosters"].iloc[0], sample_prediction_df, ds.train_rosters.columns
    )
    test_scores = _load_daily_table(
        test_df["playerBoxScores"].iloc[0], sample_prediction_df, ds.train_scores.columns
    )
    # TODO: build features from the tables above, predict, and submit the
    # predictions for this day; none of that is implemented in this cell yet.

In [14]:
# Example test frame: one row per date; the other columns hold a nested JSON string or NaN.
ds.example_test

Unnamed: 0,date,games,rosters,playerBoxScores,teamBoxScores,transactions,standings,awards,events,playerTwitterFollowers,teamTwitterFollowers
0,20210426,"[{""gamePk"":634374,""gameType"":""R"",""season"":2021...","[{""playerId"":405395,""gameDate"":""2021-04-26"",""t...","[{""home"":1,""gamePk"":634377,""gameDate"":""2021-04...","[{""home"":1,""teamId"":139,""gamePk"":634343,""gameD...","[{""transactionId"":480386,""playerId"":543685,""pl...","[{""season"":2021,""gameDate"":""2021-04-26"",""divis...",,"[{""gamePk"":634433,""gameDate"":""2021-04-26"",""gam...",,
1,20210427,"[{""gamePk"":634318,""gameType"":""R"",""season"":2021...","[{""playerId"":443558,""gameDate"":""2021-04-27"",""t...","[{""home"":1,""gamePk"":634320,""gameDate"":""2021-04...","[{""home"":1,""teamId"":117,""gamePk"":634333,""gameD...","[{""transactionId"":480456,""playerId"":642162,""pl...","[{""season"":2021,""gameDate"":""2021-04-27"",""divis...",,"[{""gamePk"":634332,""gameDate"":""2021-04-27"",""gam...",,
2,20210428,"[{""gamePk"":634309,""gameType"":""R"",""season"":2021...","[{""playerId"":429722,""gameDate"":""2021-04-28"",""t...","[{""home"":1,""gamePk"":634310,""gameDate"":""2021-04...","[{""home"":0,""teamId"":111,""gamePk"":634310,""gameD...","[{""transactionId"":480728,""playerId"":545358,""pl...","[{""season"":2021,""gameDate"":""2021-04-28"",""divis...",,"[{""gamePk"":634317,""gameDate"":""2021-04-28"",""gam...",,
3,20210429,"[{""gamePk"":634330,""gameType"":""R"",""season"":2021...","[{""playerId"":405395,""gameDate"":""2021-04-29"",""t...","[{""home"":1,""gamePk"":634330,""gameDate"":""2021-04...","[{""home"":0,""teamId"":119,""gamePk"":634346,""gameD...","[{""transactionId"":480993,""playerId"":606965,""pl...","[{""season"":2021,""gameDate"":""2021-04-29"",""divis...",,"[{""gamePk"":634346,""gameDate"":""2021-04-29"",""gam...",,
4,20210430,"[{""gamePk"":634287,""gameType"":""R"",""season"":2021...","[{""playerId"":405395,""gameDate"":""2021-04-30"",""t...","[{""home"":1,""gamePk"":634305,""gameDate"":""2021-04...","[{""home"":1,""teamId"":135,""gamePk"":634303,""gameD...",,"[{""season"":2021,""gameDate"":""2021-04-30"",""divis...","[{""awardId"":""NLRRELMON"",""awardName"":""NL Reliev...","[{""gamePk"":634327,""gameDate"":""2021-04-30"",""gam...",,


In [27]:
# Decode the first day's nested "rosters" JSON into a table (same parsing as the inference loop).
pd.read_json(ds.example_test["rosters"].iloc[0])

Unnamed: 0,playerId,gameDate,teamId,statusCode,status
0,405395,2021-04-26,108,A,Active
1,408234,2021-04-26,116,A,Active
2,444482,2021-04-26,109,A,Active
3,445276,2021-04-26,119,A,Active
4,446334,2021-04-26,137,A,Active
...,...,...,...,...,...
1264,676477,2021-04-26,111,A,Active
1265,676755,2021-04-26,110,RM,Reassigned to Minors
1266,676831,2021-04-26,138,RM,Reassigned to Minors
1267,676845,2021-04-26,136,RM,Reassigned to Minors


In [21]:
# Example submission: one row per "date_playerId" with the four target columns.
ds.sample_submission

Unnamed: 0,date,date_playerId,target1,target2,target3,target4
0,20210426,20210427_656669,0,0,0,0
1,20210426,20210427_543475,0,0,0,0
2,20210426,20210427_592866,0,0,0,0
3,20210426,20210427_452678,0,0,0,0
4,20210426,20210427_570257,0,0,0,0
...,...,...,...,...,...,...
5930,20210430,20210501_596049,0,0,0,0
5931,20210430,20210501_642851,0,0,0,0
5932,20210430,20210501_596071,0,0,0,0
5933,20210430,20210501_664901,0,0,0,0
