# Overview
- LightGBM

In [None]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import mean_absolute_error
from datetime import timedelta
from tqdm.notebook import tqdm
import lightgbm as lgb
from typing import List, Union, Optional
import time
from contextlib import contextmanager
import sys
import logging
import category_encoders as ce
# import mlb

## Config

In [2]:
class CFG:
    """Static settings for the notebook: input location, key columns, targets and LightGBM options."""

    # Folder the dataset files are read from.
    INPUT_DIR = "../input/mlb-player-digital-engagement-forecasting"
    # INPUT_DIR = "../input/mlb-unnested-dataset" # for kaggle kernel

    # Columns that together identify one row (one player on one date).
    OBJECT_ID = ["playerId", "date"]
    # Names of the columns to predict.
    TARGETS = ["target1", "target2", "target3", "target4"]

    # training
    ## LightGBM
    LGB_PARAMS = dict(
        objective="regression",
        metric="mae",
        boosting_type="gbdt",
        learning_rate=0.01,
        max_depth=6,
        num_leaves=int(64 * 0.8),  # evaluates to 51
        lambda_l1=0.001,
        lambda_l2=0.001,
        bagging_fraction=0.9,
        bagging_freq=3,
        feature_fraction=0.9,
        min_data_in_leaf=20,
        num_threads=8,
        verbosity=-1,
        num_iterations=10000,
        early_stopping_round=100,
    )
    # One model is trained per seed.
    SEEDS = [2434, 42]

NameError: name 'LGB_PARAMS' is not defined

## Utils

In [None]:
def get_logger(out_file=None):
    """Configure the root logger for INFO-level output and return it.

    Handlers already attached to the root logger are detached *and closed*
    before the new ones are installed, so calling this function again (for
    example by re-running a notebook cell) neither duplicates output nor
    leaves a previously opened log file open.

    Args:
        out_file: Optional path of a log file. When given, records are written
            to that file in addition to stdout.

    Returns:
        The root ``logging.Logger``.
    """
    logger = logging.getLogger()  # root logger
    # Shared record layout for every handler installed below.
    formatter = logging.Formatter("[%(asctime)s] [%(levelname)s] [%(message)s]")

    # Drop old handlers explicitly; closing them releases any open log file.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()
    logger.setLevel(logging.INFO)

    # Console output.
    handler = logging.StreamHandler(sys.stdout)
    handler.setFormatter(formatter)
    handler.setLevel(logging.INFO)
    logger.addHandler(handler)

    # Optional file output.
    if out_file is not None:
        fh = logging.FileHandler(out_file)
        fh.setFormatter(formatter)
        fh.setLevel(logging.INFO)
        logger.addHandler(fh)

    logger.info("logger set up")
    return logger


@contextmanager
def timer(name: str, logger: Optional[logging.Logger] = None):
    """Report the wall-clock duration of the enclosed ``with`` body.

    A "start" line is emitted on entry and a "done in ... s" line on normal
    exit. If the body raises, the exception propagates and no "done" line is
    emitted.

    Args:
        name: Label placed between angle brackets in both messages.
        logger: Where to send the messages (via ``logger.info``). When
            ``None`` the messages are printed instead.
    """
    # Pick the output channel once; both messages go through it.
    emit = print if logger is None else logger.info

    started_at = time.time()
    emit(f"<{name}> start")
    yield

    elapsed = time.time() - started_at
    emit(f"<{name}> done in {elapsed:.2f} s")


def reduce_mem_usage(df, verbose=True):
    """Narrow the numeric columns of ``df`` in place to save memory.

    For each column whose dtype is listed in ``numerics`` a target dtype is
    chosen from the column's min/max:

    * integers: values inside the int8 range -> int16, inside int16 -> int32,
      inside int32 -> int64 (one step wider than the tightest fit);
    * floats: values inside the float16 range -> float32, otherwise float64.

    The cast is applied only when the target is narrower than the column's
    current dtype, so a column is never widened and memory never grows.

    Args:
        df: Frame to shrink. Its columns are reassigned in place.
        verbose: When true, print the final size and the percentage saved.

    Returns:
        The same ``df`` object.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # (range to test against, dtype to use when the values fit strictly inside it)
    int_ladder = (
        (np.int8, np.int16),
        (np.int16, np.int32),
        (np.int32, np.int64),
    )
    float16_info = np.finfo(np.float16)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == 'int':
            target = None
            for bound, candidate in int_ladder:
                info = np.iinfo(bound)
                if c_min > info.min and c_max < info.max:
                    target = candidate
                    break
        elif c_min > float16_info.min and c_max < float16_info.max:
            target = np.float32
        else:
            # Also reached for all-NaN columns, where both comparisons are False.
            target = np.float64

        # Only ever shrink: skip casts that would keep or increase the width.
        if target is not None and np.dtype(target).itemsize < col_type.itemsize:
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Avoid a 0/0 (nan) percentage when the frame reports zero bytes.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [None]:
# Module-level logger used by the feature-building helpers below.
# No out_file is passed, so records go to stdout only.
logger = get_logger()

## Load dataset

In [None]:
def load_dataset(input_path):
    """Read the seven dataset files found directly under ``input_path``.

    Args:
        input_path: Directory containing the ``*.pickle`` and ``*.csv`` files
            named below.

    Returns:
        A 7-tuple of DataFrames in this order: next-day engagement, rosters,
        player box scores, games, players, example test, sample submission.
    """
    root = Path(input_path)

    # NOTE: unpickling executes code embedded in the file; only load pickles
    # from a trusted source.
    pickle_names = (
        "train_nextDayPlayerEngagement.pickle",
        "train_rosters.pickle",
        "train_playerBoxScores.pickle",
        "train_games.pickle",
        "players.pickle",
    )
    csv_names = (
        "example_test.csv",
        "example_sample_submission.csv",
    )

    frames = [pd.read_pickle(root / file_name) for file_name in pickle_names]
    frames.extend(pd.read_csv(root / file_name) for file_name in csv_names)
    return tuple(frames)

In [None]:
# Load every table from CFG.INPUT_DIR into module-level DataFrames.
train_next, train_rosters, train_scores, train_games, players, example_test, sample_submission = load_dataset(CFG.INPUT_DIR)

## Feature engineering

In [None]:
def merge_by_key(left: Union[pd.DataFrame, pd.Series], right: pd.DataFrame, on=None) -> pd.DataFrame:
    """Left-join ``right`` onto the key column(s) of ``left`` and return only the joined columns.

    Args:
        left: Frame (or named Series) providing the key values. For a frame,
            only the key column(s) take part in the merge.
        right: Frame holding the key column(s) plus the columns to attach.
        on: A single column name or a list of column names to join on.
            ``None`` (the default) means ``CFG.OBJECT_ID``, looked up at call
            time.

    Returns:
        The result of the left merge with the key column(s) removed. The merge
        produces a fresh default index; row count matches ``left`` as long as
        the keys in ``right`` are unique.
    """
    if on is None:
        on = CFG.OBJECT_ID
    # Normalise to a flat list so both the merge and the drop accept it;
    # the previous ``drop(columns=[on])`` failed whenever ``on`` was a list.
    keys = [on] if isinstance(on, str) else list(on)

    if not isinstance(left, pd.Series):
        left = left[keys]
    return pd.merge(left, right, on=keys, how="left").drop(columns=keys)


class BaseBlock(object):
    """Base class for feature blocks.

    A block turns an input frame into a frame of features with the same
    number of rows. Subclasses must override ``transform`` and may override
    ``fit`` when they need to learn state from the training data.
    """

    def fit(self, input_df: pd.DataFrame, y=None) -> pd.DataFrame:
        """Learn any state from ``input_df`` and return its features.

        The default implementation learns nothing and delegates to
        ``transform``. ``y`` is accepted for interface symmetry and ignored.
        """
        return self.transform(input_df)

    def transform(self, input_df: pd.DataFrame) -> pd.DataFrame:
        """Return the features for ``input_df``.

        Raises:
            NotImplementedError: Always, until a subclass overrides it.
        """
        # Must be raised, not returned: returning the exception instance
        # would hand callers a non-frame object instead of failing here.
        raise NotImplementedError()


class LabelEncodingBlock(BaseBlock):
    """Encode categorical columns as integer labels, output columns prefixed ``LE_``.

    The encoder is ``category_encoders.OrdinalEncoder``. The block previously
    instantiated ``CountEncoder``, which produces occurrence counts rather
    than labels and so contradicted both the class name and the ``LE_``
    prefix.
    """

    def __init__(self, cols: List[str]):
        """
        Args:
            cols: Names of the columns to encode.
        """
        self.cols = cols
        self.encoder = None  # created in fit()

    def fit(self, input_df: pd.DataFrame, y=None):
        """Fit the encoder on ``input_df[self.cols]`` and return the encoded columns."""
        # "value" handling keeps unseen and missing categories encodable
        # instead of raising at transform time.
        self.encoder = ce.OrdinalEncoder(handle_unknown="value", handle_missing="value")
        self.encoder.fit(input_df[self.cols])

        return self.transform(input_df)

    def transform(self, input_df: pd.DataFrame):
        """Return ``input_df[self.cols]`` encoded as ints; ``fit`` must have been called first."""
        return self.encoder.transform(input_df[self.cols]).add_prefix("LE_").astype(int)


class LagBlock(BaseBlock):
    """Per-player lagged copies of one column.

    For every requested period ``p`` the output holds a column named
    ``Lag_{p}_{column}`` containing ``column`` shifted by ``p`` rows within
    each ``playerId`` group. Rows are shifted in their existing order, so the
    input is presumably expected to be sorted by date within each player --
    TODO confirm against the caller.

    The block is stateless: ``transform`` derives the lags from the frame it
    is given, so its output always has one row per input row. The earlier
    version cached the training-time frame in ``fit`` and returned it from
    ``transform`` regardless of the input. It also called ``add_prefix`` on a
    Series, which renames row labels rather than the column and therefore
    misaligned the per-period results when they were concatenated.
    """

    def __init__(self, column: str, periods: List[int]):
        """
        Args:
            column: Name of the column to lag.
            periods: Shift sizes (in rows) to generate, one output column each.
        """
        self.column = column
        self.periods = periods

    def fit(self, input_df: pd.DataFrame, y=None):
        """Return the lag features of ``input_df``; nothing is learned."""
        return self.transform(input_df)

    def transform(self, input_df: pd.DataFrame):
        """Return a frame indexed like ``input_df`` with one lag column per period."""
        if not self.periods:
            # pd.concat rejects an empty list; keep the row count intact.
            return pd.DataFrame(index=input_df.index)

        per_player = input_df.groupby("playerId")[self.column]
        lagged = [
            per_player.shift(period).rename(f"Lag_{period}_{self.column}")
            for period in self.periods
        ]
        return pd.concat(lagged, axis=1)


class TargetAggregateBlock(BaseBlock):
    """Per-player summary statistics of every target column.

    ``fit`` computes mean/std/max/min/median of each column in ``CFG.TARGETS``
    grouped by ``playerId``; ``transform`` attaches those statistics to any
    frame that has a ``playerId`` column.

    NOTE(review): the statistics use every row passed to ``fit``. If that
    frame also contains the rows later used for validation, the validation
    targets leak into the features -- fit on the training portion only when
    an unbiased validation score matters.
    """

    def __init__(self):
        self.agg_df = None  # per-player statistics, set by fit()

    def fit(self, input_df: pd.DataFrame, y=None):
        """Compute the per-player statistics from ``input_df`` and return its features.

        ``input_df`` must contain ``playerId`` and every column named in
        ``CFG.TARGETS``. ``y`` is ignored. The statistics are taken from the
        argument rather than from the module-level ``train_next`` so the
        block honours the data it is actually given.
        """
        per_player = input_df.groupby(["playerId"])

        dfs = [
            per_player[col].agg(["mean", "std", "max", "min", "median"]).add_prefix(f"{col}_")
            for col in CFG.TARGETS
        ]
        # reset_index turns playerId back into a column so it can be merged on.
        self.agg_df = pd.concat(dfs, axis=1).reset_index()

        return self.transform(input_df)

    def transform(self, input_df: pd.DataFrame):
        """Return the stored statistics aligned row-by-row with ``input_df``.

        Players absent from the fitted data get NaN in every column.
        """
        output_df = merge_by_key(input_df, self.agg_df, on="playerId")

        return output_df

In [None]:
def create_feature(input_df: pd.DataFrame, y: np.ndarray, blocks: list) -> pd.DataFrame:
    """Fit every block on ``input_df`` and return their outputs side by side.

    Args:
        input_df: Training frame handed to each block's ``fit``.
        y: Target values forwarded to each block's ``fit`` as ``y``.
        blocks: Feature blocks, fitted in the given order.

    Returns:
        The column-wise concatenation of all block outputs, or an empty frame
        when ``blocks`` is empty.

    Raises:
        AssertionError: If a block returns a different number of rows than
            ``input_df`` has.
        Exception: Whatever a block's ``fit`` raises, re-raised unchanged.
    """
    outputs = []

    for block in blocks:
        with timer(name=f"{str(block) + '_fit'}", logger=logger):
            try:
                out_feat_block = block.fit(input_df, y=y)
            except Exception:
                print(f"Error on {block} fit.")
                # Bare raise keeps the original traceback; ``raise e from e``
                # made the exception its own cause.
                raise

            assert len(out_feat_block) == len(input_df), block

        outputs.append(out_feat_block)

    # Concatenate once instead of re-copying the growing frame per block.
    if not outputs:
        return pd.DataFrame()
    return pd.concat(outputs, axis=1)


def transform_feature(input_df: pd.DataFrame, blocks: list) -> pd.DataFrame:
    """Apply already-fitted blocks to ``input_df`` and return their outputs side by side.

    Args:
        input_df: Frame handed to each block's ``transform``.
        blocks: Feature blocks, applied in the given order.

    Returns:
        The column-wise concatenation of all block outputs, or an empty frame
        when ``blocks`` is empty.

    Raises:
        AssertionError: If a block returns a different number of rows than
            ``input_df`` has.
        Exception: Whatever a block's ``transform`` raises, re-raised unchanged.
    """
    outputs = []

    for block in blocks:
        with timer(name=f"{str(block) + '_transform'}", logger=logger):
            try:
                out_feat_block = block.transform(input_df)
            except Exception:
                print(f"Error on {block} transform.")
                # Bare raise keeps the original traceback; ``raise e from e``
                # made the exception its own cause.
                raise

            assert len(out_feat_block) == len(input_df), block

        outputs.append(out_feat_block)

    # Concatenate once instead of re-copying the growing frame per block.
    if not outputs:
        return pd.DataFrame()
    return pd.concat(outputs, axis=1)

In [None]:
# Feature blocks to build; the same fitted instances are reused at inference.
blocks = [
    TargetAggregateBlock(),
]

# y is forwarded to each block's fit(); only target1 is passed here.
y = train_next["target1"].values
# NOTE(review): blocks are fitted on the full train_next, before the
# train/validation split made in the CV cell, so validation rows contribute
# to the fitted statistics.
feat_df = create_feature(train_next, y, blocks)

## CV

In [None]:
def get_simple_timeseries_split(train_df: pd.DataFrame, train_final_date: int) -> List[tuple]:
    """Build a single hold-out fold by cutting ``train_df`` at a date.

    Args:
        train_df: Frame with a ``date`` column comparable to
            ``train_final_date``.
        train_final_date: First date of the validation period. Rows with
            ``date`` strictly before it are used for training, rows on or
            after it for validation.

    Returns:
        A one-element list ``[(train_idx, valid_idx)]`` of integer arrays.
        The arrays hold row *positions* (0-based), so they can index NumPy
        arrays built from ``train_df`` whatever its index labels are. For a
        default ``RangeIndex`` positions and labels coincide.
    """
    dates = train_df["date"]
    # Rows whose date is missing satisfy neither comparison and are left out of both sides.
    train_idx = np.flatnonzero((dates < train_final_date).to_numpy())
    valid_idx = np.flatnonzero((dates >= train_final_date).to_numpy())

    return [(train_idx, valid_idx)]

## Training

In [None]:
class LightGBMTrainer:
    """Train one LightGBM model per (seed, fold) pair and average their predictions."""

    def __init__(self, params: dict, seeds: List[int]):
        """
        Args:
            params: LightGBM parameters. The dict is never modified; the seed
                is injected into a per-run copy.
            seeds: Random seeds; every fold is trained once per seed.
        """
        self.params = params
        self.seeds = seeds
        self.models = []

    def fit(self, X_train: np.ndarray, y_train: np.ndarray, cv: List[tuple]):
        """Train on every fold for every seed and score the out-of-fold predictions.

        Args:
            X_train: Feature matrix, one row per sample.
            y_train: Target vector aligned with ``X_train``.
            cv: ``(train_idx, valid_idx)`` pairs of row positions.

        Returns:
            ``(oof_score, models)`` where ``oof_score`` is the mean absolute
            error over every row that appeared in at least one validation
            fold (its predictions averaged across the runs that validated it)
            and ``models`` is the list of trained boosters.

        Raises:
            ValueError: If no row was ever used for validation (empty
                ``seeds`` or ``cv``).
        """
        # Start clean so a second fit() does not mix models from different data.
        self.models = []

        y_train = np.asarray(y_train)
        # Accumulate predictions per row so folds of different sizes are
        # handled correctly and each row is scored against its own label.
        oof_sum = np.zeros(len(y_train), dtype=float)
        oof_count = np.zeros(len(y_train), dtype=int)

        for seed in self.seeds:
            # Copy instead of writing into self.params: the dict is shared
            # with the caller (e.g. a class-level config).
            params = {**self.params, "seed": seed}

            for train_idx, valid_idx in cv:
                X_train_fold = X_train[train_idx]
                X_valid_fold = X_train[valid_idx]

                y_train_fold = y_train[train_idx]
                y_valid_fold = y_train[valid_idx]

                train_set = lgb.Dataset(X_train_fold, y_train_fold)
                valid_set = lgb.Dataset(X_valid_fold, y_valid_fold, reference=train_set)

                # NOTE(review): ``verbose_eval`` is no longer accepted by
                # lgb.train in LightGBM 4.x; when upgrading, pass
                # callbacks=[lgb.log_evaluation(100)] instead -- confirm
                # against the installed version.
                model = lgb.train(
                    params=params,
                    train_set=train_set,
                    valid_sets=[train_set, valid_set],
                    verbose_eval=100,
                )

                y_oof = model.predict(X_valid_fold, num_iteration=model.best_iteration)
                oof_sum[valid_idx] += y_oof
                oof_count[valid_idx] += 1
                self.models.append(model)

        scored = oof_count > 0
        if not scored.any():
            raise ValueError("No validation rows: `seeds` and `cv` must both be non-empty.")

        oof_pred = oof_sum[scored] / oof_count[scored]
        oof_score = mean_absolute_error(y_train[scored], oof_pred)

        return oof_score, self.models

    def predict(self, X_test: np.ndarray):
        """Return the element-wise mean of every trained model's prediction for ``X_test``."""
        y_pred = np.mean([model.predict(X_test) for model in self.models], axis=0)

        return y_pred


def run_lgb(X_train, targets, params, seeds, fold):
    """Train a separate ``LightGBMTrainer`` for each target vector.

    Args:
        X_train: Feature matrix shared by all targets.
        targets: Iterable of target vectors, one per quantity to predict.
        params: LightGBM parameters passed to every trainer.
        seeds: Seeds passed to every trainer.
        fold: CV folds passed to every trainer's ``fit``.

    Returns:
        ``(oof_all_score, all_targets_models)``: two lists in the order of
        ``targets`` holding, respectively, the score and the model list
        returned by each trainer's ``fit``.
    """
    # Each fit() yields a (score, models) pair; train in target order.
    fit_results = [
        LightGBMTrainer(params, seeds).fit(X_train, target, fold)
        for target in targets
    ]

    oof_all_score = [score for score, _ in fit_results]
    all_targets_models = [models for _, models in fit_results]
    return oof_all_score, all_targets_models

In [None]:
# Split train/valid: rows dated before 20210401 train, the rest validate.
fold = get_simple_timeseries_split(train_next, 20210401)

# One target vector per entry of CFG.TARGETS, in that order.
targets = [train_next[col].values for col in CFG.TARGETS]

# Train one set of models per target on the shared feature matrix.
# all_targets_models[i] holds the models for CFG.TARGETS[i].
X_train = feat_df.values
oof_all_score, all_targets_models = run_lgb(X_train, targets, CFG.LGB_PARAMS, CFG.SEEDS, fold)

## Inference

In [None]:
# for (id, row) in example_test.iterrows():
#     break

# df = row.to_frame().T
# nested_df_names = df.drop(columns="date", axis=1).columns.values.tolist()

# df_ = df[["date", "games"]]
# df_ = df_[~pd.isna(df_["games"]).reset_index(drop=True)]

# df_collection = []
# for date_index, date_row in df_.iterrows():
#     daily_df = pd.read_json(date_row["games"])
#     daily_df["date"] = date_row["date"]
#     df_collection.append(daily_df)

# unnested_table = pd.concat(df_collection, ignore_index=True).set_index("date").reset_index()
# unnested_table = reduce_mem_usage(unnested_table)

In [None]:
# The `mlb` import at the top of the file is commented out, so bring the
# module into scope here; without it `mlb.make_env()` raises NameError.
# NOTE(review): the module is presumably only importable inside the Kaggle
# competition kernel -- confirm.
import mlb

env = mlb.make_env()
iter_test = env.iter_test()

for (test_df, sample_prediction_df) in iter_test:
    # Build test features. "date_playerId" is split on "_" and the second
    # part is parsed as the integer playerId that the blocks merge on.
    sample_prediction_df["playerId"] = sample_prediction_df["date_playerId"].map(lambda x: int(x.split("_")[1]))
    X_test = transform_feature(sample_prediction_df, blocks).values

    # Predict each target with the mean of its models, limited to [0, 100].
    for target, models in zip(CFG.TARGETS, all_targets_models):
        pred = np.mean([model.predict(X_test) for model in models], axis=0)
        sample_prediction_df[target] = np.clip(pred, 0, 100)

    # Remove the helper column again before handing the frame back.
    del sample_prediction_df["playerId"]

    env.predict(sample_prediction_df)