<a href="https://colab.research.google.com/github/applejxd/colaboratory/blob/master/LightGBM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# LightGBM で回帰タスクをするサンプル
- [ボストンの住宅価格に関する回帰タスク](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_boston.html)（注: `load_boston` は scikit-learn 1.2 で削除されたため、実行には 1.2 未満のバージョンが必要）
- [説明変数をDataFrame形式で用意](https://scikit-learn.org/stable/datasets/index.html#boston-dataset)

## 前準備

説明変数を DataFrame で用意

In [None]:
import pandas as pd
from sklearn.datasets import load_boston

# Load the dataset bundle and wrap its feature matrix in a DataFrame,
# using the feature names shipped with the dataset as column labels.
data = load_boston()
train_x: pd.DataFrame = pd.DataFrame(data=data.data, columns=data.feature_names)

# Show the number of rows, then preview the first records.
print(train_x.shape[0])
display(train_x.head())

506


Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


目的変数も DataFrame で用意

In [None]:
# Wrap the target values in a single-column DataFrame labelled "MEDV".
train_y: pd.DataFrame = pd.DataFrame({"MEDV": data.target})
display(train_y.head())

Unnamed: 0,MEDV
0,24.0
1,21.6
2,34.7
3,33.4
4,36.2


学習を行ってモデルを戻す関数を定義

In [None]:
from typing import Dict, Optional
# !pip install optuna
# import optuna.integration.lightgbm as lgb
import lightgbm as lgb

def Fit(tr_x: pd.DataFrame, tr_y: pd.Series,
        va_x: pd.DataFrame, va_y: pd.Series,
        params: Optional[Dict] = None) -> lgb.Booster:
    """
    Train a LightGBM booster, evaluating on both the training and validation data.

    :param tr_x: training features
    :param tr_y: training target
    :param va_x: validation features
    :param va_y: validation target
    :param params: parameters forwarded to ``lgb.train``; when ``None`` or empty,
        a regression objective with the L1 metric and a fixed seed is used
    :return: trained model
    """
    # Convert the inputs into LightGBM's dataset format
    lgb_train: lgb.Dataset = lgb.Dataset(tr_x, tr_y)
    lgb_eval: lgb.Dataset = lgb.Dataset(va_x, va_y)

    # Hyperparameters: fall back to the defaults when the caller supplied none.
    # ``None`` is the default (instead of a shared mutable ``{}``).
    if not params:
        params = {'objective': 'regression', 'metrics': 'l1', 'seed': 71}

    # Run the training.
    # NOTE(review): `verbose_eval` is believed to have been removed from
    # `lgb.train` in LightGBM 4.0 -- confirm against the installed version.
    booster = lgb.train(params, lgb_train,
                        valid_names=['train', 'valid'],
                        valid_sets=[lgb_train, lgb_eval],
                        verbose_eval=False)
    return booster

クロスバリデーションの準備

In [None]:
from sklearn.model_selection import KFold
# 4-fold splitter with shuffling; the integer random_state fixes the shuffle.
kf = KFold(
    n_splits=4,
    shuffle=True,
    random_state=71,
)

## 学習の実施

最初の fold で学習し、特徴量の重要度を表示

In [None]:
# Take the first fold and split features/target into training and validation parts
tr_idx, va_idx = next(iter(kf.split(train_x)))
tr_x, tr_y = train_x.iloc[tr_idx], train_y.iloc[tr_idx]
va_x, va_y = train_x.iloc[va_idx], train_y.iloc[va_idx]

# Train with Fit's default parameters
model: lgb.Booster = Fit(tr_x, tr_y, va_x, va_y)

# Feature importance, one row per feature column
importance = pd.DataFrame({"importance": model.feature_importance()},
                          index=tr_x.columns.values)
display(importance)

Unnamed: 0,importance
CRIM,145
ZN,5
INDUS,55
CHAS,6
NOX,128
RM,205
AGE,149
DIS,222
RAD,28
TAX,67


バリデーションの実施

In [None]:
import numpy as np
from sklearn.metrics import mean_absolute_error

# Predict on the validation fold and report the mean absolute error
va_pred: np.ndarray = model.predict(va_x)
score: float = mean_absolute_error(y_true=va_y, y_pred=va_pred)
print(f'MAE: {score:.4f}')

MAE: 2.2260


## クロスバリデーションとハイパーパラメータの調整

クロスバリデーションを行う関数を定義

In [None]:
from hyperopt import STATUS_OK

def score(params):
    """
    Objective function for hyperopt: mean cross-validated MAE of one parameter set.

    The sampled parameters are merged onto the base regression settings and
    passed to ``Fit`` for every fold of ``kf``.

    :param params: hyperparameters sampled by hyperopt from the search space
    :return: dict holding the mean MAE under 'loss' and ``STATUS_OK`` under 'status'
    """
    # Base settings (same values Fit falls back to) overlaid with the sampled ones.
    lgb_params = {'objective': 'regression', 'metrics': 'l1', 'seed': 71}
    lgb_params.update(params)

    # hp.quniform yields floats (e.g. 120.0); these LightGBM parameters are
    # integer-valued, so cast them before training.
    for key in ('num_leaves', 'max_depth', 'min_data_in_leaf'):
        if key in lgb_params:
            lgb_params[key] = int(lgb_params[key])

    scores = []
    for tr_idx, va_idx in kf.split(train_x):
        # Split into training and validation data for this fold
        tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx]
        tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]

        # Train with the sampled hyperparameters (previously they were ignored)
        model = Fit(tr_x, tr_y, va_x, va_y, lgb_params)

        # Validate
        va_pred = model.predict(va_x)
        scores.append(mean_absolute_error(va_y, va_pred))

    # Record the sampled parameters together with their mean score
    score_ave = np.mean(scores)
    history.append((params, score_ave))

    return {'loss': score_ave, 'status': STATUS_OK}

探索するパラメータの空間を指定

cf. [Laurae++: PARAMETERS](https://sites.google.com/view/lauraepp/parameters)

In [None]:
from hyperopt import hp

# Search space handed to hyperopt: quantised uniform draws for the tree-shape
# parameters, continuous uniform draws for the sampling ratios and learning rate.
# NOTE(review): LightGBM's `subsample` presumably only takes effect when a
# bagging frequency is also set -- confirm against the LightGBM parameter docs.
space = {
    "num_leaves": hp.quniform("num_leaves", 50, 200, 10),
    "max_depth": hp.quniform("max_depth", 3, 10, 1),
    "min_data_in_leaf": hp.quniform("min_data_in_leaf", 5, 25, 2),
    "colsample_bytree": hp.uniform("colsample_bytree", 0.5, 1.0),
    "learning_rate": hp.uniform("learning_rate", 0.03, 0.2),
    "subsample": hp.uniform("subsample", 0.5, 1.0),
}

hyperoptによるパラメータ探索の実行

In [None]:
from hyperopt import fmin, tpe, Trials

# Start from an empty log (filled by score) and a fresh hyperopt trial store
history = []
trials = Trials()

# Run 200 evaluations of score over space with the TPE algorithm
fmin(fn=score, space=space, algo=tpe.suggest, max_evals=200, trials=trials)

100%|██████████| 200/200 [01:33<00:00,  2.15it/s, best loss: 2.2817945860126856]


{'colsample_bytree': 0.8665393305457109,
 'learning_rate': 0.04691483835368401,
 'max_depth': 8.0,
 'min_data_in_leaf': 10.0,
 'num_leaves': 120.0,
 'subsample': 0.9570367904211257}

記録した情報からスコアを出力

In [None]:
# Order the recorded (params, score) pairs by score, lowest first
history = sorted(history, key=lambda entry: entry[1])
# The first entry is the one with the lowest (best) score
best = history[0]
print(f'params: {best[0]}')
print(f'score: {best[1]:.4f}')

params: {'colsample_bytree': 0.8665393305457109, 'learning_rate': 0.04691483835368401, 'max_depth': 8.0, 'min_data_in_leaf': 10.0, 'num_leaves': 120.0, 'subsample': 0.9570367904211257}
score: 2.2818
