<a href="https://colab.research.google.com/github/bmreiniger/datascience.stackexchange/blob/master/SO79361226.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split

# Synthetic regression problem: 100 rows of 5 uniform features, with a target
# driven by the first two columns plus a little uniform noise.
np.random.seed(42)
X = np.random.rand(100, 5)
y = 4 * X[:, 0] - 2 * X[:, 1] + np.random.rand(100) * 0.1

# Hold out 20% of the rows for testing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit a deliberately tiny (two-tree) LightGBM regressor.
model = lgb.LGBMRegressor(objective='regression', n_estimators=2, learning_rate=0.1, random_state=42)
model.fit(X_train, y_train)

# Reference predictions straight from the fitted model.
reg_y_hat = model.predict(X_test)

# Baseline guess: the mean of the training targets.
init_pred = np.mean(y_train)

# Leaf index reached by every training row in every tree.
train_leaf_indices = model.predict(X_train, pred_leaf=True)

# One empty bucket per (tree, leaf) pair that occurs in the training data.
leaf_samples = {}
for tree_no in range(model.n_estimators):
    for leaf_id in np.unique(train_leaf_indices[:, tree_no]):
        leaf_samples[(tree_no, leaf_id)] = []

# Drop each training target into the bucket of every leaf its row lands in.
for target, leaves in zip(y_train, train_leaf_indices):
    for tree_no, leaf_id in enumerate(leaves):
        leaf_samples[(tree_no, leaf_id)].append(target)

# Average target per (tree, leaf) bucket.
leaf_agg = {leaf_key: np.mean(targets) for leaf_key, targets in leaf_samples.items()}

# Manual prediction: start from the baseline and, tree by tree, add the
# learning-rate-scaled offset of the leaf's mean target from that baseline.
# NOTE: the printed result below is nonzero, i.e. this scheme does NOT
# reproduce model.predict (presumably because trees after the first are not
# fit to the raw targets -- verify against the LightGBM docs).
test_leaf_indices = model.predict(X_test, pred_leaf=True)
preds = []
for leaves in test_leaf_indices:
    estimate = init_pred
    for tree_no, leaf_id in enumerate(leaves):
        shrunk_offset = model.learning_rate * (leaf_agg[(tree_no, leaf_id)] - init_pred)
        estimate = estimate + shrunk_offset
    preds.append(estimate)
self_y_hat = np.array(preds)

# Total absolute gap between the model's predictions and the manual ones.
print('Difference between reg_y_hat and self_y_hat:', np.sum(np.abs(reg_y_hat - self_y_hat)))

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000334 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 140
[LightGBM] [Info] Number of data points in the train set: 80, number of used features: 5
[LightGBM] [Info] Start training from score 1.061487
Difference between reg_y_hat and self_y_hat: 0.17329262573332693




In [5]:
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split

# Generate some random regression data (same seed and recipe as the first cell)
np.random.seed(42)
X = np.random.rand(100, 5)
y = 4 * X[:, 0] - 2 * X[:, 1] + np.random.rand(100) * 0.1

# Split the data: 80% train / 20% test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the LGBMRegressor, this time with 10 trees instead of 2
model = lgb.LGBMRegressor(objective='regression', n_estimators=10, learning_rate=0.1, random_state=42)
model.fit(X_train, y_train)

# Regular predict: the reference values a manual reconstruction must reproduce
reg_y_hat = model.predict(X_test)

# Mean of y_train.
# NOTE(review): the later cells start their sums from 0 rather than from this value.
init_pred = np.mean(y_train)

# Set-up for the manual reconstruction carried out in later cells:
# `preds` is the (initially empty) list a later cell appends to, and
# `test_leaf_indices` holds, for each test row, the leaf index reached in each tree.
preds = []
test_leaf_indices = model.predict(X_test, pred_leaf=True)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000035 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 140
[LightGBM] [Info] Number of data points in the train set: 80, number of used features: 5
[LightGBM] [Info] Start training from score 1.061487




In [6]:
# Expect (n_test_rows, n_trees): one leaf index per test row per tree.
test_leaf_indices.shape

(20, 10)

In [8]:
# Leaf index reached by the first test row in each tree.
test_leaf_indices[0, :]

array([1, 2, 2, 2, 1, 0, 1, 0, 1, 1], dtype=int32)

In [28]:
# Sum the raw leaf outputs that test row 3 reaches across all trees, printing
# (tree index, leaf index, leaf output) along the way.
# The running total starts at 0 rather than init_pred: the first tree's leaf
# output already carries the baseline, as the printed values show.
row_leaves = test_leaf_indices[3, :]
val = 0
for tree_idx in range(len(row_leaves)):
    leaf_idx = row_leaves[tree_idx]
    leaf_val = model.booster_.get_leaf_output(tree_idx, leaf_idx)
    print(tree_idx, leaf_idx, leaf_val)
    val = val + leaf_val
val

0 0 0.9294197992534574
1 0 -0.1465100972602765
2 0 -0.1318590881302953
3 0 -0.11867317873984576
4 0 -0.08845862099957286
5 2 -0.09883328131758251
6 0 -0.07400195739711776
7 2 -0.08406647059779901
8 0 -0.08110933775897139
9 0 -0.061811353465600405


0.044096413586395905

In [14]:
# Reference predictions from model.predict, for comparison with the manual leaf-output sums.
reg_y_hat

array([1.52392002, 1.56749652, 0.84895338, 0.04409641, 1.56749652,
       0.4562928 , 0.22088663, 0.04409641, 1.56749652, 0.4562928 ,
       0.04409641, 2.06567294, 0.51469892, 2.06567294, 1.56749652,
       1.66122078, 0.04409641, 2.06567294, 0.85290376, 0.04409641])

In [29]:
# Rebuild every test prediction as the plain sum of the raw leaf outputs of the
# leaves each row lands in (one leaf per tree).
#
# `preds` is re-created here instead of reusing the list initialised in an
# earlier cell: appending to that shared list made this cell non-idempotent
# (a second run would leave twice as many entries as there are test rows and
# the comparison against reg_y_hat would no longer line up).
preds = []
for row_indices in test_leaf_indices:
    # Start from 0, not the mean target: the first tree's leaf output already
    # includes the baseline score.
    row_pred = 0
    # Explicit left-to-right accumulation (rather than sum()) keeps the
    # floating-point addition order fixed.
    for tree_index, leaf_index in enumerate(row_indices):
        row_pred += model.booster_.get_leaf_output(tree_index, leaf_index)
    preds.append(row_pred)
self_y_hat = np.array(preds)

# Verify the results
print('Difference between reg_y_hat and self_y_hat:', np.abs(reg_y_hat - self_y_hat).sum())

Difference between reg_y_hat and self_y_hat: 0.0
