# Overview
- LightGBM

In [37]:
import logging
import sys
import time
from contextlib import contextmanager
from datetime import timedelta
from pathlib import Path
from typing import List, Union, Optional

import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error
from tqdm.notebook import tqdm
# import mlb

## Config

In [16]:
class CFG:
    """Notebook-wide configuration constants."""

    # Directory handed to load_dataset(); expected to contain the pickled tables.
    INPUT_DIR = "../input/mlb-player-digital-engagement-forecasting"
    # INPUT_DIR = "../input/mlb-unnested-dataset" # for kaggle kernel
    # Column name used as the default join key by merge_by_key().
    OBJECT_ID = "playerId"

## Utils

In [38]:
def get_logger(out_file=None):
    """Reset and return the root logger, configured for INFO-level output.

    Any handlers already attached to the root logger are removed and closed,
    so calling this function repeatedly (e.g. re-running a notebook cell)
    neither duplicates log lines nor leaks open log files.

    Args:
        out_file: Optional path of a log file. When given, records are
            written to that file in addition to standard output.

    Returns:
        The configured root ``logging.Logger``.
    """
    logger = logging.getLogger()  # root logger
    formatter = logging.Formatter("[%(asctime)s] [%(levelname)s] [%(message)s]")

    # Detach and close previous handlers instead of just dropping the list,
    # otherwise an earlier FileHandler would keep its file descriptor open.
    for old_handler in list(logger.handlers):
        logger.removeHandler(old_handler)
        old_handler.close()
    logger.setLevel(logging.INFO)

    # Console output.
    handler = logging.StreamHandler(sys.stdout)
    handler.setFormatter(formatter)
    handler.setLevel(logging.INFO)
    logger.addHandler(handler)

    # Optional file output.
    if out_file is not None:
        fh = logging.FileHandler(out_file)
        fh.setFormatter(formatter)
        fh.setLevel(logging.INFO)
        logger.addHandler(fh)

    logger.info("logger set up")
    return logger


@contextmanager
def timer(name: str, logger: Optional[logging.Logger] = None):
    """Context manager that reports when a named step starts and finishes.

    A ``<name> start`` message is emitted on entry and a
    ``<name> done in X.XX s`` message after the body completes normally.

    Args:
        name: Label inserted into both messages.
        logger: Logger whose ``info`` method receives the messages; when
            ``None`` the messages are printed instead.
    """
    # Pick the output channel once; it is the same for both messages.
    emit = print if logger is None else logger.info

    started_at = time.time()
    emit(f"<{name}> start")
    yield

    elapsed = time.time() - started_at
    emit(f"<{name}> done in {elapsed:.2f} s")

In [39]:
# Module-level logger shared by the cells below; no log file is requested,
# so only the stdout handler is attached.
logger = get_logger()

[2021-06-24 23:34:35,664] [INFO] [logger set up]


## Load dataset

In [3]:
def load_dataset(input_path):
    """Read the four pickled tables stored under ``input_path``.

    Args:
        input_path: Directory containing the ``*.pickle`` files.

    Returns:
        A 4-tuple ``(train_next, train_rosters, train_scores, players)``
        holding the unpickled objects, in that order.
    """
    base_dir = Path(input_path)
    # NOTE(review): pd.read_pickle executes arbitrary pickle payloads; only
    # point this at files from a trusted source.
    pickle_names = (
        "train_nextDayPlayerEngagement.pickle",
        "train_rosters.pickle",
        "train_playerBoxScores.pickle",
        "players.pickle",
    )
    return tuple(pd.read_pickle(base_dir / file_name) for file_name in pickle_names)

In [4]:
# Load the four pickled tables from the configured input directory.
train_next, train_rosters, train_scores, players = load_dataset(CFG.INPUT_DIR)

## Feature engineering

In [49]:
def merge_by_key(left: Union[pd.DataFrame, pd.Series], right: pd.DataFrame, on=CFG.OBJECT_ID) -> pd.DataFrame:
    """Left-join ``right`` onto the key values of ``left`` and drop the key.

    Args:
        left: Either a DataFrame, from which only the ``on`` column is used,
            or a Series that is used as-is (presumably named ``on`` so the
            merge can find the key -- verify against callers).
        right: Table providing the columns to attach.
        on: Name of the join key.

    Returns:
        The merged frame with the ``on`` column removed.
    """
    key_values = left if isinstance(left, pd.Series) else left[on]
    merged = pd.merge(key_values, right, on=on, how="left")
    return merged.drop(columns=[on])


class BaseBlock(object):
    """Base class for feature blocks.

    Subclasses must override ``transform``; ``fit`` may be overridden when
    the block needs to learn state from the data first.
    """

    def fit(self, input_df: pd.DataFrame, y=None) -> pd.DataFrame:
        """Fit the block and return its features for ``input_df``.

        The default implementation is stateless and simply delegates to
        ``transform``; ``y`` is accepted for interface compatibility and is
        not used here.
        """
        return self.transform(input_df)

    def transform(self, input_df: pd.DataFrame) -> pd.DataFrame:
        """Return the block's features for ``input_df``.

        Raises:
            NotImplementedError: Always, unless overridden by a subclass.
        """
        # Raise (rather than return) the exception so that a subclass that
        # forgets to override this method fails loudly.
        raise NotImplementedError()


class LagBlock(BaseBlock):
    """Per-player lag features of a single column.

    For every requested period ``p`` the block produces one column named
    ``Lag_{p}_{column}`` containing the value of ``column`` found ``p`` rows
    earlier within the same ``playerId`` group.

    NOTE(review): the shift is positional, so this presumably expects the
    rows of each player to be in chronological order -- confirm with the
    caller.
    """

    def __init__(self, column: str, periods: List[int]):
        """
        Args:
            column: Name of the column to lag.
            periods: Row offsets to shift by, one output column per entry.
        """
        self.column = column
        self.periods = periods

    def fit(self, input_df: pd.DataFrame, y=None):
        """Compute the lagged columns from ``input_df`` and cache them.

        Args:
            input_df: Frame containing ``playerId`` and ``self.column``.
            y: Unused; accepted for interface compatibility.

        Returns:
            DataFrame indexed like ``input_df`` with one ``Lag_*`` column
            per period (no columns when ``periods`` is empty).
        """
        grouped = input_df.groupby("playerId")[self.column]
        # Name each shifted Series explicitly: Series.add_prefix would
        # relabel the row index instead of the column name.
        lagged = [
            grouped.shift(period).rename(f"{period}_{self.column}")
            for period in self.periods
        ]
        if lagged:
            self.agg_df = pd.concat(lagged, axis=1)
        else:
            # pd.concat rejects an empty list; keep the row index so that
            # the output still lines up with the input.
            self.agg_df = pd.DataFrame(index=input_df.index)

        return self.transform(input_df)

    def transform(self, input_df: pd.DataFrame):
        """Return the lag features cached by the last ``fit`` call.

        NOTE(review): ``input_df`` is not used; the result always reflects
        the frame that was passed to ``fit``.
        """
        return self.agg_df.add_prefix("Lag_")

In [52]:
def create_feature(input_df: pd.DataFrame, y: np.ndarray, blocks: list) -> pd.DataFrame:
    """Fit every feature block and concatenate their outputs column-wise.

    Args:
        input_df: Frame passed to each block's ``fit``.
        y: Target values forwarded to each block's ``fit`` as ``y``.
        blocks: Objects exposing ``fit(input_df, y=...)`` that return a
            frame with one row per row of ``input_df``.

    Returns:
        The block outputs joined along ``axis=1`` in the order of
        ``blocks``; an empty DataFrame when ``blocks`` is empty.

    Raises:
        AssertionError: If a block returns a different number of rows than
            ``input_df`` has.
        Exception: Whatever a block's ``fit`` raises is logged and re-raised
            unchanged.
    """
    block_outputs = []

    for block in blocks:
        with timer(name=f"{block}_fit", logger=logger):
            try:
                out_feat_block = block.fit(input_df, y=y)
            except Exception:
                logger.error("Error on %s fit.", block)
                # Bare raise keeps the original traceback and avoids making
                # the exception its own __cause__.
                raise

            assert len(out_feat_block) == len(input_df), block

        block_outputs.append(out_feat_block)

    if not block_outputs:
        return pd.DataFrame()
    # Concatenate once at the end instead of re-copying the growing frame on
    # every iteration.
    return pd.concat(block_outputs, axis=1)

In [53]:
# Feature blocks: one LagBlock per target column, each with its own list of
# lag periods so they can be tuned independently.
blocks = [
    LagBlock(column=target_col, periods=lag_periods)
    for target_col, lag_periods in {
        "target1": [1, 2],
        "target2": [1, 2],
        "target3": [1, 2],
        "target4": [1, 2],
    }.items()
]

# Target values forwarded to every block's fit().
y = train_next["target1"].values
feat_df = create_feature(train_next, y, blocks)

[2021-06-24 23:56:29,952] [INFO] [<<__main__.LagBlock object at 0x14a2ffaf0>_fit> start]
[2021-06-24 23:56:33,783] [INFO] [<<__main__.LagBlock object at 0x14a2ffaf0>_fit> done in 3.83 s]
[2021-06-24 23:56:33,795] [INFO] [<<__main__.LagBlock object at 0x14a2ff940>_fit> start]
[2021-06-24 23:56:37,321] [INFO] [<<__main__.LagBlock object at 0x14a2ff940>_fit> done in 3.53 s]
[2021-06-24 23:56:37,337] [INFO] [<<__main__.LagBlock object at 0x14a2ff550>_fit> start]
[2021-06-24 23:56:41,085] [INFO] [<<__main__.LagBlock object at 0x14a2ff550>_fit> done in 3.75 s]
[2021-06-24 23:56:41,119] [INFO] [<<__main__.LagBlock object at 0x14a2ff130>_fit> start]
[2021-06-24 23:56:44,649] [INFO] [<<__main__.LagBlock object at 0x14a2ff130>_fit> done in 3.53 s]


## CV