Skip to content

Commit

Permalink
Adding Feature Value column to SHAP table. (#1064)
Browse files Browse the repository at this point in the history
* Adding Feature Value column to SHAP table.

* Editing release_notes for PR 1064.

* Moving updated prediction explanation tests from pipeline_tests to model_understanding_tests.

* Using string formatting to round and making some minor tweaks to tests.

* Adding comment explaining mocked return value of pipeline._transform
  • Loading branch information
freddyaboulton committed Aug 19, 2020
1 parent 88a5f1b commit 2ae172a
Show file tree
Hide file tree
Showing 10 changed files with 194 additions and 918 deletions.
1 change: 1 addition & 0 deletions docs/source/release_notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ Release Notes
* Added guide on installing with conda :pr:`1041`
* Standardized error when calling transform/predict before fit for pipelines :pr:`1048`
* Added `percent_better_than_baseline` to Automl search rankings and full rankings table :pr:`1050`
* Added "Feature Value" column to prediction explanation reports :pr:`1064`
* Fixes
* Updated TextFeaturizer component to no longer require an internet connection to run :pr:`1022`
* Fixed non-deterministic element of TextFeaturizer transformations :pr:`1022`
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,14 +47,13 @@ def _compute_shap_values(pipeline, features, training_data=None):
if estimator.model_family == ModelFamily.BASELINE:
raise ValueError("You passed in a baseline pipeline. These are simple enough that SHAP values are not needed.")

pipeline_features = pipeline._transform(features)
feature_names = pipeline_features.columns
feature_names = features.columns

# This is to make sure all dtypes are numeric - SHAP algorithms will complain otherwise.
# Sklearn components do this under-the-hood so we're not changing the data the model was trained on.
# Catboost can naturally handle string-encoded categorical features so we don't need to convert to numeric.
if estimator.model_family != ModelFamily.CATBOOST:
pipeline_features = check_array(pipeline_features.values)
features = check_array(features.values)

if estimator.model_family.is_tree_estimator():
# Because of this issue: https://github.com/slundberg/shap/issues/1215
Expand All @@ -68,7 +67,7 @@ def _compute_shap_values(pipeline, features, training_data=None):
explainer = shap.TreeExplainer(estimator._component_obj, feature_perturbation="tree_path_dependent")
if ws:
logger.debug(f"_compute_shap_values TreeExplainer: {ws[0].message}")
shap_values = explainer.shap_values(pipeline_features, check_additivity=False)
shap_values = explainer.shap_values(features, check_additivity=False)
# shap only outputs values for positive class for Catboost binary estimators.
# this modifies the output to match the output format of other binary estimators.
# Ok to fill values of negative class with zeros since the negative class will get dropped
Expand All @@ -94,7 +93,7 @@ def _compute_shap_values(pipeline, features, training_data=None):
decision_function = estimator._component_obj.predict_proba
with warnings.catch_warnings(record=True) as ws:
explainer = shap.KernelExplainer(decision_function, sampled_training_data_features, link_function)
shap_values = explainer.shap_values(pipeline_features)
shap_values = explainer.shap_values(features)
if ws:
logger.debug(f"_compute_shap_values KernelExplainer: {ws[0].message}")

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from evalml.problem_types import ProblemTypes


def _make_rows(shap_values, normalized_values, top_k, include_shap_values=False):
def _make_rows(shap_values, normalized_values, pipeline_features, top_k, include_shap_values=False):
"""Makes the rows (one row for each feature) for the SHAP table.
Arguments:
Expand All @@ -35,15 +35,20 @@ def _make_rows(shap_values, normalized_values, top_k, include_shap_values=False)
for value, feature_name in features_to_display:
symbol = "+" if value >= 0 else "-"
display_text = symbol * min(int(abs(value) // 0.2) + 1, 5)
row = [feature_name, display_text]
feature_value = pipeline_features[feature_name].iloc[0]
if pd.api.types.is_number(feature_value) and not pd.api.types.is_bool(feature_value):
feature_value = "{:.2f}".format(feature_value)
else:
feature_value = str(feature_value)
row = [feature_name, feature_value, display_text]
if include_shap_values:
row.append(round(shap_values[feature_name][0], 2))
row.append("{:.2f}".format(shap_values[feature_name][0]))
rows.append(row)

return rows


def _make_table(shap_values, normalized_values, top_k, include_shap_values=False):
def _make_table(shap_values, normalized_values, pipeline_features, top_k, include_shap_values=False):
"""Make a table displaying the SHAP values for a prediction.
Arguments:
Expand All @@ -56,20 +61,21 @@ def _make_table(shap_values, normalized_values, top_k, include_shap_values=False
Returns:
str
"""
dtypes = ["t", "t", "f"] if include_shap_values else ["t", "t"]
alignment = ["c", "c", "c"] if include_shap_values else ["c", "c"]
n_cols = 4 if include_shap_values else 3
dtypes = ["t"] * n_cols
alignment = ["c"] * n_cols

table = Texttable()
table.set_deco(Texttable.HEADER)
table.set_cols_dtype(dtypes)
table.set_cols_align(alignment)

header = ["Feature Name", "Contribution to Prediction"]
header = ["Feature Name", "Feature Value", "Contribution to Prediction"]
if include_shap_values:
header.append("SHAP Value")

rows = [header]
rows += _make_rows(shap_values, normalized_values, top_k, include_shap_values)
rows += _make_rows(shap_values, normalized_values, pipeline_features, top_k, include_shap_values)
table.add_rows(rows)
return table.draw()

Expand All @@ -78,24 +84,24 @@ class _TableMaker(abc.ABC):
"""Makes a SHAP table for a regression, binary, or multiclass classification problem."""

@abc.abstractmethod
def __call__(self, shap_values, normalized_values, top_k, include_shap_values=False):
def __call__(self, shap_values, normalized_values, pipeline_features, top_k, include_shap_values=False):
"""Creates a table given shap values."""


class _SHAPRegressionTableMaker(_TableMaker):
"""Makes a SHAP table explaining a prediction for regression problems."""

def __call__(self, shap_values, normalized_values, top_k, include_shap_values=False):
return _make_table(shap_values, normalized_values, top_k, include_shap_values)
def __call__(self, shap_values, normalized_values, pipeline_features, top_k, include_shap_values=False):
return _make_table(shap_values, normalized_values, pipeline_features, top_k, include_shap_values)


class _SHAPBinaryTableMaker(_TableMaker):
"""Makes a SHAP table explaining a prediction for a binary classification problem."""

def __call__(self, shap_values, normalized_values, top_k, include_shap_values=False):
def __call__(self, shap_values, normalized_values, pipeline_features, top_k, include_shap_values=False):
# The SHAP algorithm will return a two-element list for binary problems.
# By convention, we display the explanation for the dominant class.
return _make_table(shap_values[1], normalized_values[1], top_k, include_shap_values)
return _make_table(shap_values[1], normalized_values[1], pipeline_features, top_k, include_shap_values)


class _SHAPMultiClassTableMaker(_TableMaker):
Expand All @@ -104,11 +110,11 @@ class _SHAPMultiClassTableMaker(_TableMaker):
def __init__(self, class_names):
self.class_names = class_names

def __call__(self, shap_values, normalized_values, top_k, include_shap_values=False):
def __call__(self, shap_values, normalized_values, pipeline_features, top_k, include_shap_values=False):
strings = []
for class_name, class_values, normalized_class_values in zip(self.class_names, shap_values, normalized_values):
strings.append(f"Class: {class_name}\n")
table = _make_table(class_values, normalized_class_values, top_k, include_shap_values)
table = _make_table(class_values, normalized_class_values, pipeline_features, top_k, include_shap_values)
strings += table.splitlines()
strings.append("\n")
return "\n".join(strings)
Expand All @@ -132,8 +138,9 @@ def _make_single_prediction_shap_table(pipeline, input_features, top_k=3, traini
"""
if not (isinstance(input_features, pd.DataFrame) and input_features.shape[0] == 1):
raise ValueError("features must be stored in a dataframe of one row.")
pipeline_features = pipeline._transform(input_features)

shap_values = _compute_shap_values(pipeline, input_features, training_data)
shap_values = _compute_shap_values(pipeline, pipeline_features, training_data)
normalized_shap_values = _normalize_shap_values(shap_values)

if pipeline.problem_type == ProblemTypes.REGRESSION:
Expand All @@ -143,7 +150,7 @@ def _make_single_prediction_shap_table(pipeline, input_features, top_k=3, traini
else:
table_maker = _SHAPMultiClassTableMaker(pipeline._classes)

return table_maker(shap_values, normalized_shap_values, top_k, include_shap_values)
return table_maker(shap_values, normalized_shap_values, pipeline_features, top_k, include_shap_values)


class _ReportSectionMaker:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ def calculate_shap_for_test(training_data, y, pipeline_class, n_points_to_explai
pipeline = pipeline_class({}, random_state=0)
points_to_explain = training_data[:n_points_to_explain]
pipeline.fit(training_data, y)
return _compute_shap_values(pipeline, points_to_explain, training_data)
return _compute_shap_values(pipeline, pd.DataFrame(points_to_explain), training_data)


interpretable_estimators = [e for e in _all_estimators_used_in_search() if e.model_family not in {ModelFamily.XGBOOST, ModelFamily.BASELINE}]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,42 +31,42 @@ def test_explain_prediction_value_error(test_features):
explain_prediction(None, input_features=test_features, training_data=None)


explain_prediction_answer = """Feature Name Contribution to Prediction
=========================================
d ++++
a +++
c --
b ----""".splitlines()
explain_prediction_answer = """Feature Name Feature Value Contribution to Prediction
=========================================================
d 40.00 ++++
a 10.00 +++
c 30.00 --
b 20.00 ----""".splitlines()


explain_prediction_multiclass_answer = """Class: class_0
Feature Name Contribution to Prediction
=========================================
a +
b +
c -
d -
Feature Name Feature Value Contribution to Prediction
=========================================================
a 10.00 +
b 20.00 +
c 30.00 -
d 40.00 -
Class: class_1
Feature Name Contribution to Prediction
=========================================
a +++
b ++
c -
d --
Feature Name Feature Value Contribution to Prediction
=========================================================
a 10.00 +++
b 20.00 ++
c 30.00 -
d 40.00 --
Class: class_2
Feature Name Contribution to Prediction
=========================================
a +
b +
c ---
d ---
Feature Name Feature Value Contribution to Prediction
=========================================================
a 10.00 +
b 20.00 +
c 30.00 ---
d 40.00 ---
""".splitlines()


Expand Down Expand Up @@ -96,6 +96,8 @@ def test_explain_prediction(mock_normalize_shap_values,
pipeline = MagicMock()
pipeline.problem_type = problem_type
pipeline._classes = ["class_0", "class_1", "class_2"]
# By the time we call transform, we are looking at only one row of the input data.
pipeline._transform.return_value = pd.DataFrame({"a": [10], "b": [20], "c": [30], "d": [40]})
features = pd.DataFrame({"a": [1], "b": [2]})
table = explain_prediction(pipeline, features, top_k=2).splitlines()

Expand Down
Loading

0 comments on commit 2ae172a

Please sign in to comment.