forked from TeamHG-Memex/eli5
-
Notifications
You must be signed in to change notification settings - Fork 2
/
lightgbm.py
274 lines (230 loc) · 9.49 KB
/
lightgbm.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from collections import defaultdict
from typing import DefaultDict
import numpy as np # type: ignore
import lightgbm # type: ignore
from eli5._feature_weights import get_top_features
from eli5.explain import explain_weights, explain_prediction
from eli5._feature_importances import get_feature_importance_explanation
from eli5.sklearn.utils import handle_vec, get_X, get_X0, add_intercept, predict_proba
from eli5.utils import mask
from eli5._decision_path import get_decision_path_explanation
# Description text attached to explanations produced by
# explain_weights_lightgbm (passed through to the Explanation object).
DESCRIPTION_LIGHTGBM = """
LightGBM feature importances; values are numbers 0 <= x <= 1;
all values sum to 1.
"""
@explain_weights.register(lightgbm.LGBMClassifier)
@explain_weights.register(lightgbm.LGBMRegressor)
def explain_weights_lightgbm(lgb,
                             vec=None,
                             top=20,
                             target_names=None,  # ignored
                             targets=None,  # ignored
                             feature_names=None,
                             feature_re=None,
                             feature_filter=None,
                             importance_type='gain',
                             ):
    """
    Return an explanation of an LightGBM estimator (via scikit-learn wrapper
    LGBMClassifier or LGBMRegressor) as feature importances.

    See :func:`eli5.explain_weights` for description of
    ``top``, ``feature_names``,
    ``feature_re`` and ``feature_filter`` parameters.

    ``target_names`` and ``targets`` parameters are ignored.

    Parameters
    ----------
    importance_type : str, optional
        A way to get feature importance. Possible values are:

        - 'gain' - the average gain of the feature when it is used in trees
          (default)
        - 'split' - the number of times a feature is used to split the data
          across all trees
        - 'weight' - the same as 'split', for compatibility with xgboost
    """
    # Normalized per-feature importances from the underlying booster.
    importances = _get_lgb_feature_importances(lgb, importance_type)
    booster_feature_names = lgb.booster_.feature_name()
    is_regression = isinstance(lgb, lightgbm.LGBMRegressor)
    return get_feature_importance_explanation(
        lgb, vec, importances,
        feature_names=feature_names,
        estimator_feature_names=booster_feature_names,
        feature_filter=feature_filter,
        feature_re=feature_re,
        top=top,
        description=DESCRIPTION_LIGHTGBM,
        num_features=importances.shape[-1],
        is_regression=is_regression,
    )
@explain_prediction.register(lightgbm.LGBMClassifier)
@explain_prediction.register(lightgbm.LGBMRegressor)
def explain_prediction_lightgbm(
        lgb, doc,
        vec=None,
        top=None,
        top_targets=None,
        target_names=None,
        targets=None,
        feature_names=None,
        feature_re=None,
        feature_filter=None,
        vectorized=False,
        ):
    """ Return an explanation of LightGBM prediction (via scikit-learn wrapper
    LGBMClassifier or LGBMRegressor) as feature weights.

    See :func:`eli5.explain_prediction` for description of
    ``top``, ``top_targets``, ``target_names``, ``targets``,
    ``feature_names``, ``feature_re`` and ``feature_filter`` parameters.

    ``vec`` is a vectorizer instance used to transform
    raw features to the input of the estimator ``lgb``
    (e.g. a fitted CountVectorizer instance); you can pass it
    instead of ``feature_names``.

    ``vectorized`` is a flag which tells eli5 if ``doc`` should be
    passed through ``vec`` or not. By default it is False, meaning that
    if ``vec`` is not None, ``vec.transform([doc])`` is passed to the
    estimator. Set it to True if you're passing ``vec``,
    but ``doc`` is already vectorized.

    Method for determining feature importances follows an idea from
    http://blog.datadive.net/interpreting-random-forests/.
    Feature weights are calculated by following decision paths in trees
    of an ensemble.
    Each leaf has an output score, and expected scores can also be assigned
    to parent nodes.
    Contribution of one feature on the decision path is how much expected score
    changes from parent to child.
    Weights of all features sum to the output score of the estimator.
    """
    vec, feature_names = handle_vec(lgb, doc, vec, vectorized, feature_names)
    if feature_names.bias_name is None:
        # LightGBM estimators do not have an intercept, but here we interpret
        # them as having an intercept
        feature_names.bias_name = '<BIAS>'
    X = get_X(doc, vec, vectorized=vectorized)
    proba = predict_proba(lgb, X)
    # One {feature_id: weight} dict per target, derived from decision paths.
    weight_dicts = _get_prediction_feature_weights(lgb, X, _lgb_n_targets(lgb))
    # Dense feature vector for the (single) document, with the bias appended.
    x = get_X0(add_intercept(X))

    flt_feature_names, flt_indices = feature_names.handle_filter(
        feature_filter, feature_re, x)
    is_regression = isinstance(lgb, lightgbm.LGBMRegressor)
    # Binary classification (2 classes) is not treated as multiclass here.
    is_multiclass = _lgb_n_targets(lgb) > 2
    names = lgb.classes_ if not is_regression else ['y']

    def get_score_feature_weights(_label_id):
        # Densify the weight dict for this target, placing the accumulated
        # bias (None key) at the bias index.
        _weights = _target_feature_weights(
            weight_dicts[_label_id],
            num_features=len(feature_names),
            bias_idx=feature_names.bias_idx,
        )
        _score = _get_score(weight_dicts[_label_id])
        _x = x
        if flt_indices is not None:
            # Apply the same feature filter to both the document vector
            # and the weights so indices stay aligned.
            _x = mask(_x, flt_indices)
            _weights = mask(_weights, flt_indices)
        return _score, get_top_features(flt_feature_names, _weights, top, _x)

    return get_decision_path_explanation(
        lgb, doc, vec,
        vectorized=vectorized,
        original_display_names=names,
        target_names=target_names,
        targets=targets,
        top_targets=top_targets,
        is_regression=is_regression,
        is_multiclass=is_multiclass,
        proba=proba,
        get_score_feature_weights=get_score_feature_weights,
    )
def _lgb_n_targets(lgb):
    """Number of model outputs: ``n_classes_`` for classifiers, 1 otherwise."""
    if isinstance(lgb, lightgbm.LGBMClassifier):
        return lgb.n_classes_
    return 1
def _get_lgb_feature_importances(lgb, importance_type):
aliases = {'weight': 'split'}
coef = lgb.booster_.feature_importance(
importance_type=aliases.get(importance_type, importance_type)
)
norm = coef.sum()
return coef / norm if norm else coef
def _compute_node_values(tree_info):
""" Add node_value key with an expected value for non-leaf nodes """
def walk(tree):
if 'leaf_value' in tree:
return tree['leaf_value'], tree['leaf_count']
left_value, left_count = walk(tree['left_child'])
right_value, right_count = walk(tree['right_child'])
count = left_count + right_count
tree['node_value'] = (left_value * left_count +
right_value * right_count) / count
return tree['node_value'], count
for tree in tree_info:
walk(tree['tree_structure'])
def _get_decision_path(leaf_index, split_index, leaf_id):
path, split_features = [], []
parent_id, leaf = leaf_index[leaf_id]
path.append(leaf['leaf_value'])
while True:
if parent_id == -1:
break
parent_id, node = split_index[parent_id]
path.append(node['node_value'])
split_features.append(node['split_feature'])
path.reverse()
changes = _changes(path)
bias, path = changes[0], list(zip(reversed(split_features), changes[1:]))
return bias, path
def _changes(path):
"""
>>> _changes([2, 3, 0, 5])
[2, 1, -3, 5]
>>> _changes([2])
[2]
"""
res = [path[0]]
res += [p - p_prev for p, p_prev in zip(path[1:], path)]
return res
def _get_leaf_split_indices(tree_structure):
leaf_index = {} # leaf id => (parent_id, leaf)
split_index = {} # split id => (parent_id, subtree)
def walk(tree, parent_id=-1):
if 'leaf_index' in tree:
leaf_index[tree['leaf_index']] = tree['leaf_parent'], tree
else:
split_index[tree['split_index']] = parent_id, tree
walk(tree['left_child'], tree['split_index'])
walk(tree['right_child'], tree['split_index'])
walk(tree_structure)
return leaf_index, split_index
def _get_prediction_feature_weights(lgb, X, n_targets):
    """
    Return a list of {feat_id: value} dicts with feature weights,
    following ideas from http://blog.datadive.net/interpreting-random-forests/
    """
    # Binary classification is modelled with a single output column.
    if n_targets == 2:
        n_targets = 1
    dump = lgb.booster_.dump_model()
    tree_info = dump['tree_info']
    # Attach 'node_value' to every internal node before path tracing.
    _compute_node_values(tree_info)
    # pred_leaf=True yields the id of the leaf reached in every tree;
    # reshape so column `target` holds the trees belonging to that target.
    # NOTE(review): the reshape assumes X holds a single document — confirm
    # against callers (explain_prediction_lightgbm passes one doc).
    pred_leafs = lgb.booster_.predict(X, pred_leaf=True).reshape(-1, n_targets)
    tree_info = np.array(tree_info).reshape(-1, n_targets)
    assert pred_leafs.shape == tree_info.shape

    res = []
    for target in range(n_targets):
        feature_weights = defaultdict(float)  # type: DefaultDict[str, float]
        for info, leaf_id in zip(tree_info[:, target], pred_leafs[:, target]):
            leaf_index, split_index = _get_leaf_split_indices(
                info['tree_structure']
            )
            bias, path = _get_decision_path(leaf_index, split_index, leaf_id)
            # The None key accumulates each tree's root expected value (bias).
            feature_weights[None] += bias
            for feat, value in path:
                feature_weights[feat] += value
        res.append(dict(feature_weights))
    return res
def _target_feature_weights(feature_weights_dict, num_features, bias_idx):
feature_weights = np.zeros(num_features)
for k, v in feature_weights_dict.items():
if k is None:
feature_weights[bias_idx] = v
else:
feature_weights[k] = v
return feature_weights
def _get_score(feature_weights_dict):
return sum(feature_weights_dict.values())