In [1]:
import sklearn
import sklearn.datasets

import numpy as np
import lime
import lime.lime_tabular
np.random.seed(1)

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# 1. magic for inline plot
# 2. magic to print version
# 3. magic so that the notebook will reload external python modules
%matplotlib inline
%load_ext watermark
%load_ext autoreload 
%autoreload 2

from graphviz import Source
from sklearn.datasets import load_iris
from sklearn.tree import export_graphviz
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from lime.lime_tabular import LimeTabularExplainer

%watermark -a 'Ethen' -d -t -v -p numpy,pandas,matplotlib,sklearn

Ethen 2017-07-09 15:51:03 

CPython 3.5.2
IPython 6.1.0

numpy 1.13.1
pandas 0.19.2
matplotlib 2.0.0
sklearn 0.18.1


# eli5 experiment

In [3]:
from xgboost import XGBClassifier, XGBRegressor
from sklearn.metrics import r2_score

# NOTE(review): the Boston assignments to X / y / feature_names below are
# overwritten by the iris block right after them, so only `boston` itself
# survives this cell -- presumably kept to toggle between the regression and
# the classification experiment; confirm.
# NOTE(review): `load_boston` is reportedly no longer shipped by newer
# scikit-learn releases -- verify against the installed version.
from sklearn.datasets import load_boston
boston = load_boston()
X = boston.data
y = boston.target
feature_names = boston.feature_names

# the dataset actually used by the rest of the cell
iris = load_iris()
X = iris.data
y = iris.target
feature_names = iris.feature_names

# hold out 20% of the rows; random_state pins the split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.2, random_state = 14)

# a single boosting round keeps the dumped trees small enough to read by hand
model_xgb = XGBClassifier(n_estimators = 1)
model_xgb.fit(X_train, y_train)
accuracy_score(y_test, model_xgb.predict(X_test))

# regression variant of the experiment, currently disabled
# model_xgb = XGBRegressor(n_estimators = 30)
# model_xgb.fit(X_train, y_train)
# r2_score(y_test, model_xgb.predict(X_test))

0.96666666666666667

In [4]:
def xgb_feature_imp(model_xgb, feature_names = None, importance_type = 'gain'):
    """
    Normalized feature importance of a fitted XGBoost scikit-learn style model.

    Parameters
    ----------
    model_xgb : object exposing ``get_booster()``
        XGBClassifier / XGBRegressor like wrapper around the underlying booster.

    feature_names : sequence of str, default None
        Display names, given in the same order as ``booster.feature_names``.
        When None, the booster's own feature names are used.

    importance_type : str, default 'gain'
        Forwarded to ``booster.get_score``.

    Returns
    -------
    df_feature_imp : pd.DataFrame
        Columns ``feature`` and ``weight``, sorted by feature name. The weights
        sum up to 1, or are all 0 when the booster reports no score at all.
    """
    # XGBClassifier is a scikit-learn like
    # wrapper around the actual underlying model
    booster = model_xgb.get_booster()

    # features that were never used in a split are absent from the score
    # dictionary, hence the default of 0
    score = booster.get_score(importance_type = importance_type)
    all_features_score = np.array(
        [score.get(f, 0.) for f in booster.feature_names], dtype = np.float32)

    # normalize so the importance sums up to 1; when no feature has a score
    # dividing would give 0 / 0 = NaN for every feature, so report zeros
    total_score = np.sum(all_features_score)
    if total_score != 0:
        normed_score = all_features_score / total_score
    else:
        normed_score = np.zeros_like(all_features_score)

    # construct a dataframe with the feature name mapping with the score
    if feature_names is None:
        feature_names = booster.feature_names

    feature_imp = {'weight': normed_score, 'feature': feature_names}
    df_feature_imp = pd.DataFrame(feature_imp)[['feature', 'weight']]
    df_feature_imp = (df_feature_imp
                      .sort_values('feature')
                      .reset_index(drop = True))
    return df_feature_imp

In [5]:
df_feature_imp = xgb_feature_imp(model_xgb, feature_names = feature_names)
df_feature_imp

Unnamed: 0,feature,weight
0,petal length (cm),0.442693
1,petal width (cm),0.557307
2,sepal length (cm),0.0
3,sepal width (cm),0.0


In [6]:
from xgboost import Booster, XGBRegressor, XGBClassifier

def _check_booster_args(xgb):
    if isinstance(xgb, Booster):
        booster = xgb
    else:
        booster = xgb.get_booster()
        regression = isinstance(xgb, XGBRegressor)

    return booster, regression


booster, regression = _check_booster_args(model_xgb)
regression

False

In [7]:
# n_targets is 1 unless the model is an XGBClassifier
# whose number of classes is something other than 2
n_targets = (
    model_xgb.n_classes_
    if isinstance(model_xgb, XGBClassifier) and model_xgb.n_classes_ != 2
    else 1)

n_targets

3

In [8]:
# a single placeholder name for regression,
# the fitted class labels otherwise
names = model_xgb.classes_ if not regression else ['y']

names

array([0, 1, 2])

In [9]:
# NOTE: from here on `X` no longer holds the full feature matrix, it is
# rebound to the first training row, reshaped to 2d (1 row)
X = np.atleast_2d(X_train[0])

# if regression:
# proba = model_xgb.predict_proba(X)[0]
# proba

In [10]:
tree_dumps = booster.get_dump(with_stats = True)
tree_dumps[0]

'0:[f2<2.3] yes=1,no=2,missing=1,gain=54.04,cover=53.3333\n\t1:leaf=0.141176,cover=16\n\t2:leaf=-0.0730435,cover=37.3333\n'

In [11]:
import re

def _parse_dump_line(line):
    # match the branch pattern, e.g.
    # \t1:[f2<2.3] yes=3,no=4,missing=3,gain=34.829,cover=35.5556'
    # may have 0 or more than 1 '\t' at the beginning, which is 
    # used for indicating depth of branch when printed
    branch_match = re.match(
        '^(\t*)(\d+):\[(.+)<(.+)\] '
        'yes=(\d+),no=(\d+),missing=(\d+),'
        'gain=(.+),cover=(.+)$', line)
    
    if branch_match is not None:
        matched = branch_match.groups()
        n_tabs = matched[0]
        depth = len(n_tabs)
        branch_info = {'depth': depth,
                       'node_id': int(matched[1]),
                       'split': matched[2],
                       'split_condition': float(matched[3]),
                       'yes': int(matched[4]),
                       'no': int(matched[5]),
                       'missing': int(matched[6]),
                       'gain': float(matched[7]),
                       'cover': float(matched[8])}
        return depth, branch_info

    # if it's not a branch, then it has to be a leaf node
    # match the leaf pattern, e.g.
    # \t1:leaf=0.141176,cover=16
    leaf_match = re.match('^(\t*)(\d+):leaf=(.+),cover=(.+)$', line)
    n_tabs, node_id, value, cover = leaf_match.groups()
    depth = len(n_tabs)
    leaf_info = {'node_id': int(node_id),
                 'leaf': float(value),
                 'cover': float(cover)}
    return depth, leaf_info


def _parse_tree_dump(text_dump):
    """ Parse text tree dump (one item of a list returned by Booster.get_dump())
    into json format that will be used by next XGBoost release.
    """
    result = None
    stack = []  # type: List[Dict]
    for line in text_dump.split('\n'):
        if line:
            depth, node = _parse_dump_line(line)
            if depth == 0:
                assert not stack
                result = node
                stack.append(node)
            elif depth > len(stack):
                raise ValueError('Unexpected dump structure')
            else:
                if depth < len(stack):
                    stack = stack[:depth]
                stack[-1].setdefault('children', []).append(node)
                stack.append(node)
    return result

text_dump = tree_dumps[0]
result = _parse_tree_dump(text_dump)
result

{'children': [{'cover': 16.0, 'leaf': 0.141176, 'node_id': 1},
  {'cover': 37.3333, 'leaf': -0.0730435, 'node_id': 2}],
 'cover': 53.3333,
 'depth': 0,
 'gain': 54.04,
 'missing': 1,
 'no': 2,
 'node_id': 0,
 'split': 'f2',
 'split_condition': 2.3,
 'yes': 1}

In [12]:
# useful
from xgboost import DMatrix

# XGBClassifier does not have pred_leaf argument as of now, so use booster
dmatrix = DMatrix(X, missing = model_xgb.missing)
leaf_ids = booster.predict(dmatrix, pred_leaf = True)[0]
xgb_feature_names = {f: i for i, f in enumerate(booster.feature_names)}

In [13]:
h = booster.get_dump(with_stats = True, dump_format = 'json')
len(h)

3

In [14]:
import json

tree_id = 1
t = booster.get_dump(with_stats = True, dump_format = 'json')[tree_id]
result2 = json.loads(t)
result2

{'children': [{'children': [{'cover': 16, 'leaf': -0.0705882, 'nodeid': 3},
    {'children': [{'cover': 16.8889, 'leaf': 0.141615, 'nodeid': 7},
      {'cover': 2.66667, 'leaf': -3.25116e-09, 'nodeid': 8}],
     'cover': 19.5556,
     'depth': 2,
     'gain': 4.65416,
     'missing': 7,
     'no': 8,
     'nodeid': 4,
     'split': 'f2',
     'split_condition': 4.95,
     'yes': 7}],
   'cover': 35.5556,
   'depth': 1,
   'gain': 34.829,
   'missing': 3,
   'no': 4,
   'nodeid': 1,
   'split': 'f2',
   'split_condition': 2.3,
   'yes': 3},
  {'children': [{'cover': 1.33333, 'leaf': -2.55448e-09, 'nodeid': 5},
    {'cover': 16.4444, 'leaf': -0.0707006, 'nodeid': 6}],
   'cover': 17.7778,
   'depth': 1,
   'gain': 0.619154,
   'missing': 5,
   'no': 6,
   'nodeid': 2,
   'split': 'f2',
   'split_condition': 4.85,
   'yes': 5}],
 'cover': 53.3333,
 'depth': 0,
 'gain': 12.9454,
 'missing': 1,
 'no': 2,
 'nodeid': 0,
 'split': 'f3',
 'split_condition': 1.75,
 'yes': 1}

In [15]:
tree_id = 0

# leaf_ids : the leaf id of that example for each tree
leaf_id = leaf_ids[tree_id]
print(leaf_id)

# parse the tree dump into json format
t = booster.get_dump(with_stats = True, dump_format = 'json')[tree_id]
result1 = json.loads(t)
result1

1


{'children': [{'cover': 16, 'leaf': 0.141176, 'nodeid': 1},
  {'cover': 37.3333, 'leaf': -0.0730435, 'nodeid': 2}],
 'cover': 53.3333,
 'depth': 0,
 'gain': 54.04,
 'missing': 1,
 'no': 2,
 'nodeid': 0,
 'split': 'f2',
 'split_condition': 2.3,
 'yes': 1}

In [16]:
# node undefined = -2
xgb_feature_names

{'f0': 0, 'f1': 1, 'f2': 2, 'f3': 3}

In [17]:
def _get_tree_paths(result):
    if 'leaf' not in result:
        node_id = result['nodeid']
        left_child, right_child = result['children']
        left_paths = _get_tree_paths(left_child)
        right_paths = _get_tree_paths(right_child)

        for path in left_paths:
            path.append(node_id)

        for path in right_paths:
            path.append(node_id)

        paths = left_paths + right_paths
    else:
        node_id = result['nodeid']
        paths = [[node_id]]

    return paths

In [18]:
paths = _get_tree_paths(result1)
paths

[[1, 0], [2, 0]]

In [19]:
# map every leaf id to its root -> leaf path; beware that each list inside
# `paths` is reversed in place, so re-running this cell without recomputing
# `paths` flips the paths back
leaf_to_path = {}
for path in paths:
    path.reverse()
    node = path[-1]
    leaf_to_path[node] = path

# largest leaf id + 1 (0 when there are no paths),
# used below as the length of the per-node `feature` array
node_count = max([-1, *leaf_to_path]) + 1

print(node_count)
leaf_to_path

3


{1: [0, 1], 2: [0, 2]}

In [20]:
# obtain the feature, value, where index i holds the feature
# and value for node i 

def _get_feature(result, feature, xgb_feature_names):
    node_id = result['nodeid']
    if 'leaf' not in result:
        feature[node_id] = xgb_feature_names[result['split']]
        left_child, right_child = result['children']
        _get_feature(left_child, feature, xgb_feature_names)
        _get_feature(right_child, feature, xgb_feature_names)
    else:
        feature[node_id] = -2

In [21]:
feature = np.zeros(node_count, dtype = np.int32)
_get_feature(result1, feature, xgb_feature_names)
feature

array([ 2, -2, -2], dtype=int32)

In [22]:
X

array([[ 5.1,  3.4,  1.5,  0.2]])

In [23]:
model_xgb.predict_proba(X)

array([[ 0.38218513,  0.30924702,  0.30856785]], dtype=float32)

In [24]:
result1

{'children': [{'cover': 16, 'leaf': 0.141176, 'nodeid': 1},
  {'cover': 37.3333, 'leaf': -0.0730435, 'nodeid': 2}],
 'cover': 53.3333,
 'depth': 0,
 'gain': 54.04,
 'missing': 1,
 'no': 2,
 'nodeid': 0,
 'split': 'f2',
 'split_condition': 2.3,
 'yes': 1}

In [None]:
def sum1(node, prefix = 0):
    """
    Sum of the numbers spelled out by every root to leaf path, where the
    number of a path is accumulated as ``prefix * 10 + node.value`` at each
    node along the way. A missing node (None) contributes 0.
    """
    if node is None:
        return 0

    number = node.value + 10 * prefix
    is_leaf = node.left is None and node.right is None
    if is_leaf:
        return number

    return sum1(node.left, number) + sum1(node.right, number)

In [None]:
def sum1(node, prefix = 0):
    """
    Sum of the numbers spelled out by every root to leaf path.

    Parameters
    ----------
    node : object with ``value``, ``left`` and ``right`` attributes, or None
        Root of the (sub)tree; None contributes 0.

    prefix : int, default 0
        Number accumulated from the ancestors of ``node``.

    Returns
    -------
    total : int
        Sum over all leaves beneath ``node`` of the number built
        along the path leading to that leaf.
    """
    if node is None:
        return 0

    # number spelled out so far, this assignment must stay active:
    # both the recursive calls and the leaf return depend on it
    p = prefix * 10 + node.value

    if node.left is not None or node.right is not None:
        return sum1(node.left, p) + sum1(node.right, p)

    return p

In [25]:
def _get_parent(result):
    if 'leaf' not in result:
        left_child, right_child = result['children']
        _get_parent(left_child)
        _get_parent(right_child)
    else:
        result['leaf'] = _parent_value(result['children'])
    #return 
    
def _parent_value(children):
    """
    Value of the parent node: a weighted sum of child values.
    """
    covers = np.array([child['cover'] for child in children])
    covers /= np.sum(covers)
    leafs = np.array([child['leaf'] for child in children])
    return np.sum(leafs * covers)

In [26]:
_parent_value(result1['children'])

-0.0087776098338186448

In [27]:
# NOTE(review): `hi` is not defined, this cell raises a NameError --
# presumably a deliberate marker to halt "Run All" at this point; confirm
hi

NameError: name 'hi' is not defined

In [None]:
_get_parent(result1)

In [None]:
_parent_value(result1['children'])

In [None]:
_indexed_leafs(result1)

In [None]:
def _indexed_leafs(parent):
    """ Return a leaf nodeid -> node dictionary with
    "parent" and "leaf" (average child "leaf" value) added to all nodes.
    """
    if not parent.get('children'):
        return {parent['nodeid']: parent}
    indexed = {}
    for child in parent['children']:
        child['parent'] = parent
        if 'leaf' in child:
            indexed[child['nodeid']] = child
        else:
            indexed.update(_indexed_leafs(child))
    parent['leaf'] = _parent_value(parent['children'])
    return indexed

In [None]:
def _indexed_leafs(parent):
    """ Return a leaf nodeid -> node dictionary with
    "parent" and "leaf" (average child "leaf" value) added to all nodes.
    """
    if not parent.get('children'):
        return {parent['nodeid']: parent}
    indexed = {}
    for child in parent['children']:
        child['parent'] = parent
        if 'leaf' in child:
            indexed[child['nodeid']] = child
        else:
            indexed.update(_indexed_leafs(child))
    parent['leaf'] = _parent_value(parent['children'])
    return indexed

def _parent_value(children):
    """ Value of the parent node: a weighted sum of child values.
    """
    covers = np.array([child['cover'] for child in children])
    covers /= np.sum(covers)
    leafs = np.array([child['leaf'] for child in children])
    return np.sum(leafs * covers)

tree_id = 0

# leaf_ids : the leaf id of that example for each tree
leaf_id = leaf_ids[tree_id]
print(leaf_id)

# parse the tree dump into json format
t = booster.get_dump(with_stats = True, dump_format = 'json')[tree_id]
result = json.loads(t)
# NOTE: this bare expression is not the last statement of the cell,
# so it is not displayed
result


# annotate the freshly parsed tree in place and index its leaves by node id
indexed = _indexed_leafs(result)
indexed 

In [None]:
result

In [None]:
leaf = indexed[leaf_id]
leaf

In [None]:
# the score starts out as the value of the leaf the example ended up in
score = 0.0 + leaf['leaf']

# follow the 'parent' references from the leaf up to the root,
# then flip the list so that it reads root -> leaf
path = [leaf]
while 'parent' in path[-1]:
    path += [path[-1]['parent']]

path = path[::-1]
path

In [None]:
leaf_id = leaf_ids[0]

In [None]:
# match the branch pattern, e.g.
# \t1:[f2<2.3] yes=3,no=4,missing=3,gain=34.829,cover=35.5556'
# may have 0 or more than 1 '\t' at the beginning, which is 
# used for indicating depth of branch when printed
import re

# first line of the second tree's text dump; splitting once is enough,
# `line` is kept as a second name for the same string
tree_dump = tree_dumps[1].split('\n')[0]
line = tree_dump

# raw strings, since '\d' and '\[' are not valid
# escape sequences inside a regular string literal
branch_match = re.match(
    r'^(\t*)(\d+):\[(.+)<(.+)\] '
    r'yes=(\d+),no=(\d+),missing=(\d+),'
    r'gain=(.+),cover=(.+)$', line)
matched = branch_match.groups()
matched

In [None]:
def _target_feature_weights(leaf_ids, tree_dumps, feature_names, xgb_feature_names):
    """
    Accumulate, over all trees, the score of one example and how much
    every feature contributed to it.

    Parameters
    ----------
    leaf_ids : iterable
        Id of the leaf the example ends up in, one per tree
        (paired with ``tree_dumps`` through zip).

    tree_dumps : iterable of str
        Text dump of every tree, parsed with ``_parse_tree_dump``.

    feature_names : object supporting ``len()`` with a ``bias_idx`` attribute
        NOTE(review): a plain list such as ``iris.feature_names`` has no
        ``bias_idx`` -- presumably an eli5-style feature names object
        is expected here; confirm.

    xgb_feature_names : dict
        Maps the 'split' name of a node to an index into the weights.

    Returns
    -------
    score : float
        Sum of the selected leaf's 'leaf' value over all trees.

    feature_weights : 1d ndarray
        Array of length ``len(feature_names)`` holding the
        accumulated contributions.
    """
    feature_weights = np.zeros(len(feature_names))
    # All trees in XGBoost give equal contribution to the prediction:
    # it is equal to sum of "leaf" values in leafs
    # before applying loss-specific function
    # (e.g. logistic for "binary:logistic" loss).
    score = 0
    for text_dump, leaf_id in zip(tree_dumps, leaf_ids):
        # NOTE(review): the `_parse_dump_line` defined in this notebook stores
        # the id under 'node_id', whereas `_indexed_leafs` reads 'nodeid' --
        # this lookup looks like it fails with a KeyError as is; confirm
        # which key is intended
        leaf = _indexed_leafs(_parse_tree_dump(text_dump))[leaf_id]
        score += leaf['leaf']
        # walk the 'parent' references up to the root,
        # then flip the path to root -> leaf order
        path = [leaf]
        while 'parent' in path[-1]:
            path.append(path[-1]['parent'])
        path.reverse()
        # Check how each split changes "leaf" value
        for node, child in zip(path, path[1:]):
            idx = xgb_feature_names[node['split']]
            feature_weights[idx] += child['leaf'] - node['leaf']
        # Root "leaf" value is interpreted as bias
        feature_weights[feature_names.bias_idx] += path[0]['leaf']
    return score, feature_weights

In [None]:
from xgboost import DMatrix

output_margin=False
ntree_limit=0

# each record indicating the predicted leaf index of each sample in each tree
leaf_preds = booster.predict(dmatrix,
    output_margin=output_margin,
    ntree_limit=ntree_limit,
    pred_leaf=True)[0]

leaf_preds

In [None]:
contrib_preds = booster.predict(dmatrix,
    output_margin=output_margin,
    ntree_limit=ntree_limit,
    pred_contribs=True)[0]

contrib_preds

In [None]:
tree_dumps = booster.get_dump(with_stats=True)
tree_dumps[0]

In [None]:
leaf_preds

In [None]:
temp = contrib_preds[0]
temp[0::n_targets]

In [None]:
temp[0::n_targets].sum()

In [None]:
temp[1::n_targets].sum()

In [None]:
temp[2::n_targets].sum()

In [None]:
from eli5 import explain_weights
explain_weights(model_xgb)

In [None]:
#.dump_model('temp.txt')
booster = model_xgb.get_booster()
original_feature_names = booster.feature_names

features_names = iris.feature_names
booster.feature_names = features_names
features_names

In [None]:
xgdump = booster.get_dump()
xgdump[0]
print(booster.get_dump()[0])
# recover original feature names
booster.feature_names = original_feature_names

## Conditional Feature Contribution

In [None]:
# NOTE(review): `hi` is not defined anywhere in the visible notebook, so this
# cell fails with a NameError -- presumably a deliberate stop marker; confirm
hi

In [None]:
# two rows of the Boston data, picked by position
# NOTE(review): `model_tree_reg` is not defined in the visible part of the
# notebook, this cell only works with leftover kernel state; confirm
instances = boston.data[[300, 309]]
print("Instance 0 prediction:", model_tree_reg.predict([instances[0]]))
print("Instance 1 prediction:", model_tree_reg.predict([instances[1]]))

In [None]:
bias

In [None]:
leaf_to_path

In [None]:
# contributions = []
# for row, leaf in enumerate(leaves):
#     path = leaf_to_path[leaf]

# leaf = leaves[0]
path = leaf_to_path[leaf]
path

In [None]:
# NOTE(review): `model_tree` and `values` are not defined in the visible part
# of the notebook; also note that this rebinds `feature`, which earlier held
# the per-node feature array filled by `_get_feature`; confirm
feature = list(model_tree.tree_.feature)

# features met so far along the path, and the accumulated contribution
# keyed by the sorted tuple of those features
path_features = set()
path_features_dict = {}

for depth in range(len(path) - 1):
    # feature looked up for the current node of the path
    path_feature = feature[path[depth]]
    path_features.add(path_feature)
    # change in value when stepping to the next node of the path
    contrib = values[path[depth + 1]] - values[path[depth]]
    
    # add the change to whatever was already recorded for
    # the joint set of features met up to this depth
    joint_features = tuple(sorted(path_features))
    contrib += path_features_dict.get(joint_features, 0)
    path_features_dict[joint_features] = contrib

In [None]:
prediction

In [None]:
bias

In [None]:
bias - 2.8953765912305585 - 4.9119953416149027

In [None]:
path_features_dict

In [None]:
# one dictionary of joint feature contributions per entry of `leaves`
# NOTE(review): `leaves` and `values` are not defined in the visible part of
# the notebook; `feature` and `leaf_to_path` are whatever earlier cells left
# behind; confirm
contributions = []
for leaf in leaves:
    # path recorded for this leaf, fresh accumulators per leaf
    path = leaf_to_path[leaf]
    path_features = set()
    path_features_dict = {}

    for depth in range(len(path) - 1):
        path_feature = feature[path[depth]]
        path_features.add(path_feature)
        # change in value when stepping to the next node of the path
        contrib = values[path[depth + 1]] - values[path[depth]]

        # keyed by the sorted tuple of all features met up to this depth
        joint_features = tuple(sorted(path_features))
        contrib += path_features_dict.get(joint_features, 0)
        path_features_dict[joint_features] = contrib
    
    contributions.append(path_features_dict)

In [None]:
contributions

In [None]:
# NOTE(review): `hi` is not defined anywhere in the visible notebook, so this
# cell fails with a NameError -- presumably a deliberate stop marker; confirm
hi

In [None]:
# NOTE(review): `leaves`, `values` and `line_shape` are not defined in the
# visible part of the notebook; confirm where they come from
contributions = []

for leaf in leaves:
    # for each leaf, check which is the path
    # that it took to get to the leaf
    # NOTE(review): when no path ends in `leaf` the loop finishes without
    # a break and `path` silently stays bound to the last path; confirm
    # that every leaf is guaranteed to have a matching path
    for path in paths:
        if leaf == path[-1]:
            break
    
    # compute the contribution of each feature 
    # for a given observation
    contribs = np.zeros(line_shape)
    for depth in range(len(path) - 1):
        # change in value when stepping to the next node of the path,
        # added to the slot of the feature looked up for the current node
        contrib = values[path[depth + 1]] - values[path[depth]]
        feature_idx = feature[path[depth]]
        contribs[feature_idx] += contrib
    
    contributions.append(contribs)
    
contributions

In [None]:
_predict_tree(model, X, joint_contribution=joint_contribution)

In [None]:
from tree_explainer import TreeExplainer

# NOTE(review): `model_tree` and `sns` are not defined / imported in the
# visible part of the notebook (presumably seaborn imported as sns); confirm
tree_explain = TreeExplainer(model_tree, iris.feature_names)
best_idx, prediction, df_explained = tree_explain.explain(X_train[0])

# style the contribution weight
# https://pandas.pydata.org/pandas-docs/stable/style.html#Builtin-Styles
# http://seaborn.pydata.org/tutorial/color_palettes.html#custom-diverging-palettes-with-diverging-palette
cmap = sns.diverging_palette(10, 133, s = 85, l = 60, n = 4, as_cmap = True)
# NOTE: from here on `df_explained` is the styled object
# returned by `.style.background_gradient`
df_explained = df_explained.style.background_gradient(cmap = cmap, subset = 'contrib')

print('predicted class: ', best_idx)
print('prediction: ', prediction)
df_explained

In [None]:
rf = RandomForestClassifier(n_estimators = 500)
rf.fit(X_train, y_train)
accuracy_score(y_test, rf.predict(X_test))

In [None]:
explainer = LimeTabularExplainer(X_train, feature_names = iris.feature_names, 
                                 class_names = iris.target_names, discretize_continuous = True)

i = np.random.randint(0, X_test.shape[0])
exp = explainer.explain_instance(X_test[i], rf.predict_proba, num_features=2, top_labels=1)

In [None]:
exp.available_labels()

In [None]:
exp.show_in_notebook(show_table = True, show_all = False)

# Reference

- [Blog: Interpreting random forests](http://blog.datadive.net/interpreting-random-forests/)
- [Blog: Random forest interpretation with scikit-learn](http://blog.datadive.net/random-forest-interpretation-with-scikit-learn/)
- [Blog: Random forest interpretation – conditional feature contributions](http://blog.datadive.net/random-forest-interpretation-conditional-feature-contributions/)