Find rows near the decision boundary (#2908)
* initial commit with code

* update release note

* add raises docs

* remove space

* update docs

* address comments

* remove link
bchen1116 committed Oct 19, 2021
1 parent 2767e33 commit 6f8d37a
Showing 5 changed files with 341 additions and 0 deletions.
1 change: 1 addition & 0 deletions docs/source/api_index.rst
@@ -119,6 +119,7 @@ Pipeline Utils

evalml.pipelines.utils.make_pipeline
evalml.pipelines.utils.generate_pipeline_code
evalml.pipelines.utils.rows_of_interest



1 change: 1 addition & 0 deletions docs/source/release_notes.rst
@@ -2,6 +2,7 @@ Release Notes
-------------
**Future Releases**
    * Enhancements
        * Added ``rows_of_interest`` to pipeline utils :pr:`2908`
        * Added support for woodwork version ``0.8.2`` :pr:`2909`
        * Enhanced the ``DateTimeFeaturizer`` to handle ``NaNs`` in date features :pr:`2909`
    * Fixes
43 changes: 43 additions & 0 deletions docs/source/user_guide/pipelines.ipynb
@@ -480,6 +480,49 @@
"exec(code)\n",
"pipeline.fit(X, y)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Grabbing rows near the decision boundary\n",
"For binary classification problems, you can also look at the rows closest to the decision boundary by using `rows_of_interest`. This method returns the indices of interest, which can then be used to obtain the subset of the data that falls closest to the decision boundary. This can help with further analysis of the model, and can give you better understanding of what rows the model could be having trouble with."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from evalml.demos import load_breast_cancer\n",
"from evalml.pipelines import BinaryClassificationPipeline\n",
"from evalml.pipelines.utils import rows_of_interest\n",
"\n",
"X, y = load_breast_cancer()\n",
"pipeline = BinaryClassificationPipeline(component_graph=[\"Imputer\", \"Standard Scaler\", \"Logistic Regression Classifier\"])\n",
"pipeline.fit(X, y)\n",
"indices = rows_of_interest(pipeline, X, y, types='all')\n",
"X.iloc[indices]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"You can see what the probabilities are for these rows to determine how close they are to the 0.5 default threshold. X is used here for brevity."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"pred_proba = pipeline.predict_proba(X)\n",
"pos_value_proba = pred_proba.iloc[:, -1]\n",
"pos_value_proba.iloc[indices]"
]
}
],
"metadata": {
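The notebook cells above demonstrate `types='all'`. As a rough sketch (not part of the committed notebook; it reuses the same breast cancer demo pipeline shown in the cell above), the `types` and `epsilon` arguments can narrow the result, for example to misclassified rows that sit within 0.05 of the threshold:

from evalml.demos import load_breast_cancer
from evalml.pipelines import BinaryClassificationPipeline
from evalml.pipelines.utils import rows_of_interest

X, y = load_breast_cancer()
pipeline = BinaryClassificationPipeline(
    component_graph=["Imputer", "Standard Scaler", "Logistic Regression Classifier"]
)
pipeline.fit(X, y)

# 'incorrect' keeps only rows the pipeline mislabels, so y is required here.
# epsilon=0.05 keeps only rows whose positive-class probability falls within
# 0.05 of the (default 0.5) threshold.
indices = rows_of_interest(pipeline, X, y, types="incorrect", epsilon=0.05)
X.iloc[indices]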
81 changes: 81 additions & 0 deletions evalml/pipelines/utils.py
@@ -449,3 +449,84 @@ def make_timeseries_baseline_pipeline(problem_type, gap, forecast_horizon):
},
)
return baseline


def rows_of_interest(
    pipeline, X, y=None, threshold=None, epsilon=0.1, sort_values=True, types="all"
):
    """Get the row indices of the data that are closest to the threshold. Works only for binary classification problems and pipelines.

    Args:
        pipeline (PipelineBase): The fitted binary pipeline.
        X (ww.DataTable, pd.DataFrame): The input features to predict on.
        y (ww.DataColumn, pd.Series, None): The input target data, if available. Defaults to None.
        threshold (float): The threshold value of interest to separate positive and negative predictions. If None, uses the pipeline threshold if set, else 0.5. Defaults to None.
        epsilon (float): The maximum difference between the probability and the threshold for a row to be considered of interest. For instance, epsilon=0.1 and threshold=0.5 would mean
            we consider all rows in [0.4, 0.6] to be of interest. Defaults to 0.1.
        sort_values (bool): Whether to return the indices sorted by the distance from the threshold, such that the first values are closer to the threshold and the later values are further. Defaults to True.
        types (str): The type of rows to keep and return. Can be one of ['incorrect', 'correct', 'true_positive', 'true_negative', 'all']. Defaults to 'all'.

            'incorrect' - return only the rows where the predictions are incorrect. This means that, given the threshold and target y, keep only the rows which are labeled wrong.
            'correct' - return only the rows where the predictions are correct. This means that, given the threshold and target y, keep only the rows which are correctly labeled.
            'true_positive' - return only the rows which are positive, as given by the targets.
            'true_negative' - return only the rows which are negative, as given by the targets.
            'all' - return all rows. This is the only option available when there is no target data provided.

    Returns:
        The indices corresponding to the rows of interest.

    Raises:
        ValueError: If pipeline is not a fitted Binary Classification pipeline.
        ValueError: If types is invalid or y is not provided when types is not 'all'.
        ValueError: If the threshold is provided and falls outside [0, 1].
    """
    valid_types = ["incorrect", "correct", "true_positive", "true_negative", "all"]
    if types not in valid_types:
        raise ValueError(
            "Invalid arg for 'types'! Must be one of {}".format(valid_types)
        )

    if types != "all" and y is None:
        raise ValueError("Need an input y in order to use types {}".format(types))

    if (
        not isinstance(pipeline, BinaryClassificationPipeline)
        or not pipeline._is_fitted
    ):
        raise ValueError(
            "Pipeline provided must be a fitted Binary Classification pipeline!"
        )

    if threshold is not None and (threshold < 0 or threshold > 1):
        raise ValueError(
            "Provided threshold {} must be between [0, 1]".format(threshold)
        )

    if threshold is None:
        threshold = pipeline.threshold or 0.5

    # get predicted proba
    pred_proba = pipeline.predict_proba(X)
    pos_value_proba = pred_proba.iloc[:, -1]
    preds = pos_value_proba >= threshold
    preds_value_proba = abs(pos_value_proba - threshold)

    # placeholder for y if it isn't supplied
    y_current = y if y is not None else preds

    # logic for breaking apart the different categories
    mask = y_current
    if types in ["correct", "incorrect"]:
        mask = preds == y
    mask = mask.astype(bool)

    if types in ["correct", "true_positive"]:
        preds_value_proba = preds_value_proba[mask.values]
    elif types in ["incorrect", "true_negative"]:
        preds_value_proba = preds_value_proba[~mask.values]

    if sort_values:
        preds_value_proba = preds_value_proba.sort_values(kind="stable")

    preds_value_proba = preds_value_proba[preds_value_proba <= epsilon]
    return preds_value_proba.index.tolist()
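A brief usage sketch (not part of this diff) of the threshold fallback above: with threshold=None the function uses pipeline.threshold when one has been set and 0.5 otherwise, while an explicit value overrides both. The pipeline and data below reuse the breast cancer demo setup from the notebook in this commit.

from evalml.demos import load_breast_cancer
from evalml.pipelines import BinaryClassificationPipeline
from evalml.pipelines.utils import rows_of_interest

X, y = load_breast_cancer()
pipeline = BinaryClassificationPipeline(
    component_graph=["Imputer", "Standard Scaler", "Logistic Regression Classifier"]
)
pipeline.fit(X, y)

# threshold=None falls back to pipeline.threshold if set, else 0.5
near_default = rows_of_interest(pipeline, X, y, epsilon=0.1)

# an explicit threshold overrides the fallback; look near 0.7 instead
near_high = rows_of_interest(pipeline, X, y, threshold=0.7, epsilon=0.1)

print(len(near_default), len(near_high))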
215 changes: 215 additions & 0 deletions evalml/tests/pipeline_tests/test_pipeline_utils.py
@@ -1,3 +1,5 @@
from unittest.mock import patch

import numpy as np
import pandas as pd
import pytest
@@ -38,6 +40,7 @@
get_estimators,
is_classification,
make_pipeline,
rows_of_interest,
)
from evalml.problem_types import ProblemTypes, is_regression, is_time_series

@@ -591,3 +594,215 @@ def __init__(self, random_arg=False, random_seed=0):
)
pipeline = generate_pipeline_code(mock_pipeline_with_custom_components)
assert pipeline == expected_code


def test_rows_of_interest_errors(X_y_binary):
pipeline = BinaryClassificationPipeline(
component_graph=["Logistic Regression Classifier"]
)
pipeline_mc = MulticlassClassificationPipeline(
component_graph=["Logistic Regression Classifier"]
)
X, y = X_y_binary

with pytest.raises(ValueError, match="Invalid arg for"):
rows_of_interest(pipeline, X, y, types="ball")

with pytest.raises(ValueError, match="Need an input y in order to"):
rows_of_interest(pipeline, X, types="correct")

with pytest.raises(ValueError, match="Pipeline provided must be a fitted"):
rows_of_interest(pipeline, X, y, types="all")

with pytest.raises(ValueError, match="Pipeline provided must be a fitted"):
rows_of_interest(pipeline_mc, X, y, types="all")

with pytest.raises(ValueError, match="Pipeline provided must be a fitted"):
rows_of_interest(pipeline_mc, X, y, types="all")

pipeline._is_fitted = True
with pytest.raises(ValueError, match="Provided threshold 1.1 must be between"):
rows_of_interest(pipeline, X, y, threshold=1.1)

with pytest.raises(ValueError, match="Provided threshold -0.1 must be between"):
rows_of_interest(pipeline, X, y, threshold=-0.1)


@patch("evalml.pipelines.BinaryClassificationPipeline.predict_proba")
@patch("evalml.pipelines.BinaryClassificationPipeline.fit")
@pytest.mark.parametrize("threshold", [0.3, None, 0.7])
@pytest.mark.parametrize("y", [pd.Series([i % 2 for i in range(100)]), None])
def test_rows_of_interest_threshold(mock_fit, mock_pred_proba, threshold, y):
pipeline = BinaryClassificationPipeline(
component_graph=["Imputer", "Standard Scaler", "Logistic Regression Classifier"]
)
X = pd.DataFrame([i for i in range(100)])
pipeline._is_fitted = True

vals = [0.2] * 25 + [0.5] * 50 + [0.8] * 25
predicted_proba_values = pd.DataFrame({0: [1 - v for v in vals], 1: vals})
mock_pred_proba.return_value = predicted_proba_values
vals = rows_of_interest(
pipeline, X, y, threshold=threshold, epsilon=0.5, sort_values=True
)
if threshold == 0.3:
assert vals == list(range(100))
elif threshold == 0.7:
assert vals == list(range(75, 100)) + list(range(25, 75)) + list(range(25))
else:
assert vals == list(range(25, 75)) + list(range(25)) + list(range(75, 100))

pipeline._threshold = 0.9
vals = rows_of_interest(
pipeline, X, y, threshold=None, epsilon=0.5, sort_values=True
)
assert vals == list(range(75, 100)) + list(range(25, 75))


@patch("evalml.pipelines.BinaryClassificationPipeline.predict_proba")
@patch("evalml.pipelines.BinaryClassificationPipeline.fit")
@pytest.mark.parametrize(
"types,expected_val",
[
("incorrect", list(range(75, 100))),
("correct", list(range(75))),
("true_positive", list(range(25, 75))),
("true_negative", list(range(25)) + list(range(75, 100))),
("all", list(range(100))),
],
)
def test_rows_of_interest_types(mock_fit, mock_pred_proba, types, expected_val):
pipeline = BinaryClassificationPipeline(
component_graph=["Logistic Regression Classifier"]
)
X = pd.DataFrame([i for i in range(100)])
y = pd.Series([0] * 25 + [1] * 50 + [0] * 25)
pipeline._is_fitted = True

vals = [0.2] * 25 + [0.5] * 50 + [0.8] * 25
predicted_proba_values = pd.DataFrame({0: [1 - v for v in vals], 1: vals})
mock_pred_proba.return_value = predicted_proba_values
vals = rows_of_interest(pipeline, X, y, types=types, epsilon=0.5, sort_values=False)
assert vals == expected_val


@patch("evalml.pipelines.BinaryClassificationPipeline.predict_proba")
@patch("evalml.pipelines.BinaryClassificationPipeline.fit")
@pytest.mark.parametrize("epsilon,expected_len", [(0.01, 50), (0.3, 75), (0.5, 100)])
def test_rows_of_interest_epsilon(mock_fit, mock_pred_proba, epsilon, expected_len):
pipeline = BinaryClassificationPipeline(
component_graph=["Logistic Regression Classifier"]
)
X = pd.DataFrame([i for i in range(100)])
y = pd.Series([0] * 25 + [1] * 50 + [0] * 25)
pipeline._is_fitted = True

vals = [0.2] * 25 + [0.5] * 50 + [0.85] * 25
predicted_proba_values = pd.DataFrame({0: [1 - v for v in vals], 1: vals})
mock_pred_proba.return_value = predicted_proba_values
vals = rows_of_interest(pipeline, X, y, epsilon=epsilon)
assert len(vals) == expected_len

if epsilon == 0.01:
vals = [0.2] * 25 + [0.65] * 50 + [0.85] * 25
predicted_proba_values = pd.DataFrame({0: [1 - v for v in vals], 1: vals})
mock_pred_proba.return_value = predicted_proba_values
vals = rows_of_interest(pipeline, X, y, epsilon=epsilon)
assert len(vals) == 0


@patch("evalml.pipelines.BinaryClassificationPipeline.predict_proba")
@patch("evalml.pipelines.BinaryClassificationPipeline.fit")
@pytest.mark.parametrize(
"sorts,expected_val",
[
(True, list(range(75, 100)) + list(range(25, 75)) + list(range(25))),
(False, list(range(100))),
],
)
def test_rows_of_interest_sorted(mock_fit, mock_pred_proba, sorts, expected_val):
pipeline = BinaryClassificationPipeline(
component_graph=["Logistic Regression Classifier"]
)
X = pd.DataFrame([i for i in range(100)])
y = pd.Series([0] * 25 + [1] * 50 + [0] * 25)
pipeline._is_fitted = True

vals = [0.2] * 25 + [0.5] * 50 + [0.8] * 25
predicted_proba_values = pd.DataFrame({0: [1 - v for v in vals], 1: vals})
mock_pred_proba.return_value = predicted_proba_values
vals = rows_of_interest(
pipeline, X, y, threshold=0.9, epsilon=0.9, sort_values=sorts
)
assert vals == expected_val


@patch("evalml.pipelines.BinaryClassificationPipeline.predict_proba")
@patch("evalml.pipelines.BinaryClassificationPipeline.fit")
def test_rows_of_interest_index(mock_fit, mock_pred_proba):
pipeline = BinaryClassificationPipeline(
component_graph=["Logistic Regression Classifier"]
)
X = pd.DataFrame(
[i for i in range(100)], index=["index_{}".format(i) for i in range(100)]
)
pipeline._is_fitted = True

vals = [0.2] * 25 + [0.5] * 50 + [0.8] * 25
predicted_proba_values = pd.DataFrame({0: [1 - v for v in vals], 1: vals})
mock_pred_proba.return_value = predicted_proba_values
vals = rows_of_interest(pipeline, X, epsilon=0.5)
assert vals == list(range(25, 75)) + list(range(25)) + list(range(75, 100))


@patch("evalml.pipelines.BinaryClassificationPipeline.predict_proba")
@patch("evalml.pipelines.BinaryClassificationPipeline.fit")
@pytest.mark.parametrize(
"types,sorts,epsilon,expected_vals",
[
("correct", True, 0.01, list(range(25, 75))),
("true_negative", True, 0.3, list(range(25))),
("all", False, 0.3, list(range(75))),
],
)
def test_rows_of_interest(
mock_fit, mock_pred_proba, types, sorts, epsilon, expected_vals
):
pipeline = BinaryClassificationPipeline(
component_graph=["Logistic Regression Classifier"]
)
X = pd.DataFrame([i for i in range(100)])
y = pd.Series([0] * 25 + [1] * 50 + [0] * 25)
pipeline._is_fitted = True

vals = [0.2] * 25 + [0.5] * 50 + [0.85] * 25
predicted_proba_values = pd.DataFrame({0: [1 - v for v in vals], 1: vals})
mock_pred_proba.return_value = predicted_proba_values
vals = rows_of_interest(
pipeline, X, y, types=types, sort_values=sorts, epsilon=epsilon
)
assert vals == expected_vals

if types == "all":
vals = rows_of_interest(
pipeline, X, types=types, sort_values=sorts, epsilon=epsilon
)
assert vals == expected_vals


@patch("evalml.pipelines.BinaryClassificationPipeline.predict_proba")
@patch("evalml.pipelines.BinaryClassificationPipeline.fit")
def test_rows_of_interest_empty(mock_fit, mock_pred_proba):
pipeline = BinaryClassificationPipeline(
component_graph=["Logistic Regression Classifier"]
)
X = pd.DataFrame([i for i in range(100)])
y = pd.Series([0] * 25 + [1] * 50 + [0] * 25)
pipeline._is_fitted = True

vals = [1] * 25 + [0] * 50 + [1] * 25
predicted_proba_values = pd.DataFrame({0: [1 - v for v in vals], 1: vals})
mock_pred_proba.return_value = predicted_proba_values
vals = rows_of_interest(pipeline, X, y, epsilon=0.5, types="correct")
assert len(vals) == 0
