In [1]:
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [2]:
# Load the California housing dataset that ships with scikit-learn.
data = fetch_california_housing()

# Put features and target into a single frame; the target is stored as `MedHouseVal`.
df = (
    pd.DataFrame(data.data, columns=data.feature_names)
    .assign(MedHouseVal=data.target)
)

# `info()` prints the schema; the trailing expression renders the first rows.
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   MedInc       20640 non-null  float64
 1   HouseAge     20640 non-null  float64
 2   AveRooms     20640 non-null  float64
 3   AveBedrms    20640 non-null  float64
 4   Population   20640 non-null  float64
 5   AveOccup     20640 non-null  float64
 6   Latitude     20640 non-null  float64
 7   Longitude    20640 non-null  float64
 8   MedHouseVal  20640 non-null  float64
dtypes: float64(9)
memory usage: 1.4 MB


Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [3]:
# Name of the regression target column (added to `df` via `.assign`).
target_col = 'MedHouseVal'
# Every column provided by the dataset itself is used as a model input.
feature_cols = data.feature_names

In [4]:
# Hold out 20% of the rows as the final test set; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    df[feature_cols],
    df[target_col],
    test_size=0.2,
    random_state=0,
)

# Sanity check: show the shape of each of the four pieces.
tuple(part.shape for part in (x_train, x_test, y_train, y_test))

((16512, 8), (4128, 8), (16512,), (4128,))

In [5]:
# Carve a validation set (20% of the remaining rows) out of the training portion.
# The validation set drives early stopping below; the test set stays untouched.
#
# This cell rebinds x_train / y_train, so executing it a second time would silently
# split the already-reduced training set again. The guard is a no-op on a fresh
# top-to-bottom run and turns that hidden-state mistake into an explicit error.
assert len(x_train) + len(x_test) == len(df), (
    'x_train is no longer the full train portion; '
    're-run the train/test split cell before this one'
)

x_train, x_valid, y_train, y_valid = train_test_split(
    x_train, y_train, test_size=0.2, random_state=0
)

# Sanity check: show the shape of each of the four pieces.
x_train.shape, x_valid.shape, y_train.shape, y_valid.shape

((13209, 8), (3303, 8), (13209,), (3303,))

## Training API

train (https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.train.html#lightgbm-train) を使う

In [6]:
# The training API expects its inputs wrapped in lgb.Dataset objects.
# When a pd.DataFrame is passed, the feature names are filled in automatically.

train = lgb.Dataset(data=x_train, label=y_train)
# The validation Dataset points at the training Dataset through `reference`.
valid = lgb.Dataset(data=x_valid, label=y_valid, reference=train)

In [15]:
# Values given in `params` take precedence over the later arguments of lgb.train.
#
# The parameters are documented at
# https://lightgbm.readthedocs.io/en/latest/Parameters.html#core-parameters
# Many of the parameters shown in online articles are simply the default values.
#
# Setting `objective` to 'lambdarank' makes learning-to-rank possible as well.

params = dict(
    objective='regression',  # regression is already the default
    metric='rmse',
    num_iterations=1000,  # can also be specified as an argument of train
    seed=0,
)

In [16]:
# Once callbacks are specified, the default verbosity no longer prints the rmse progression.
# log_evaluation prints it at a chosen period (every 10 iterations here).
# early_stopping ends training once the validation score has not improved for 50 rounds.

model = lgb.train(
    params=params,
    train_set=train,
    valid_sets=[valid],
    valid_names=['valid'],
    callbacks=[
        lgb.log_evaluation(period=10),
        lgb.early_stopping(stopping_rounds=50),
    ],
)

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1837
[LightGBM] [Info] Number of data points in the train set: 13209, number of used features: 8
[LightGBM] [Info] Start training from score 2.065707
Training until validation scores don't improve for 50 rounds
[10]	valid's rmse: 0.719651
[20]	valid's rmse: 0.592514
[30]	valid's rmse: 0.534944
[40]	valid's rmse: 0.511013
[50]	valid's rmse: 0.498221
[60]	valid's rmse: 0.491197
[70]	valid's rmse: 0.484716
[80]	valid's rmse: 0.48053
[90]	valid's rmse: 0.477339
[100]	valid's rmse: 0.475572
[110]	valid's rmse: 0.474097
[120]	valid's rmse: 0.472471
[130]	valid's rmse: 0.471329
[140]	valid's rmse: 0.470435
[150]	valid's rmse: 0.468505
[160]	valid's rmse: 0.466826
[170]	valid's rmse: 0.465828
[180]	valid's rmse: 0.465149
[190]	valid's rmse: 0.463973
[200]	valid's rmse: 0.463681
[210]	valid's rmse: 0.463186
[220]	valid's rmse: 0.462563
[230]	valid's rmse: 0.462049
[240]	valid's rmse: 0.46124
[250]	valid's rms



In [17]:
# Score the training-API model on the held-out test set.
y_pred = model.predict(data=x_test)
# RMSE = square root of the mean squared error, the same metric that was monitored on `valid`.
np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred))

0.44061853689115754

## scikit-learn API

回帰の例 https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMRegressor.html#lightgbm.LGBMRegressor

fitのパラメータはここ
https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMRegressor.html#lightgbm.LGBMRegressor.fit

In [14]:
# Same setup through the scikit-learn wrapper: up to 1000 trees, seed 0,
# the validation set as eval_set, and early stopping after 50 rounds without improvement.
# The log shows that `eval_metric='rmse'` is reported alongside the default l2 metric.
model2 = lgb.LGBMRegressor(
    n_estimators=1000,
    random_state=0,
).fit(
    X=x_train,
    y=y_train,
    eval_set=[(x_valid, y_valid)],
    eval_names=['valid'],
    eval_metric='rmse',
    callbacks=[
        lgb.log_evaluation(period=10),
        lgb.early_stopping(stopping_rounds=50),
    ],
)

Training until validation scores don't improve for 50 rounds
[10]	valid's rmse: 0.719651	valid's l2: 0.517897
[20]	valid's rmse: 0.592514	valid's l2: 0.351073
[30]	valid's rmse: 0.534944	valid's l2: 0.286165
[40]	valid's rmse: 0.511013	valid's l2: 0.261134
[50]	valid's rmse: 0.498221	valid's l2: 0.248224
[60]	valid's rmse: 0.491197	valid's l2: 0.241274
[70]	valid's rmse: 0.484716	valid's l2: 0.23495
[80]	valid's rmse: 0.48053	valid's l2: 0.230909
[90]	valid's rmse: 0.477339	valid's l2: 0.227852
[100]	valid's rmse: 0.475572	valid's l2: 0.226169
[110]	valid's rmse: 0.474097	valid's l2: 0.224768
[120]	valid's rmse: 0.472471	valid's l2: 0.223229
[130]	valid's rmse: 0.471329	valid's l2: 0.222151
[140]	valid's rmse: 0.470435	valid's l2: 0.22131
[150]	valid's rmse: 0.468505	valid's l2: 0.219497
[160]	valid's rmse: 0.466826	valid's l2: 0.217926
[170]	valid's rmse: 0.465828	valid's l2: 0.216995
[180]	valid's rmse: 0.465149	valid's l2: 0.216364
[190]	valid's rmse: 0.463973	valid's l2: 0.215271
[

In [11]:
# Score the scikit-learn-API model on the same held-out test set for comparison.
y_pred = model2.predict(X=x_test)
# RMSE = square root of the mean squared error.
np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred))

0.44061853689115754