Find rows near the decision boundary (#2908)
* initial commit with code

* update release note

* add raises docs

* remove space

* update docs

* address comments

* remove link
bchen1116 committed Oct 19, 2021
1 parent 2767e33 commit 6f8d37a
Showing 5 changed files with 341 additions and 0 deletions.
1 change: 1 addition & 0 deletions docs/source/api_index.rst
@@ -119,6 +119,7 @@ Pipeline Utils

evalml.pipelines.utils.make_pipeline
evalml.pipelines.utils.generate_pipeline_code
evalml.pipelines.utils.rows_of_interest



1 change: 1 addition & 0 deletions docs/source/release_notes.rst
@@ -2,6 +2,7 @@ Release Notes
-------------
**Future Releases**
    * Enhancements
        * Added ``rows_of_interest`` to pipeline utils :pr:`2908`
        * Added support for woodwork version ``0.8.2`` :pr:`2909`
        * Enhanced the ``DateTimeFeaturizer`` to handle ``NaNs`` in date features :pr:`2909`
    * Fixes
43 changes: 43 additions & 0 deletions docs/source/user_guide/pipelines.ipynb
@@ -480,6 +480,49 @@
"exec(code)\n",
"pipeline.fit(X, y)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Grabbing rows near the decision boundary\n",
"For binary classification problems, you can also look at the rows closest to the decision boundary by using `rows_of_interest`. This method returns the indices of interest, which can then be used to obtain the subset of the data that falls closest to the decision boundary. This can help with further analysis of the model, and can give you better understanding of what rows the model could be having trouble with."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from evalml.demos import load_breast_cancer\n",
"from evalml.pipelines import BinaryClassificationPipeline\n",
"from evalml.pipelines.utils import rows_of_interest\n",
"\n",
"X, y = load_breast_cancer()\n",
"pipeline = BinaryClassificationPipeline(component_graph=[\"Imputer\", \"Standard Scaler\", \"Logistic Regression Classifier\"])\n",
"pipeline.fit(X, y)\n",
"indices = rows_of_interest(pipeline, X, y, types='all')\n",
"X.iloc[indices]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"You can see what the probabilities are for these rows to determine how close they are to the 0.5 default threshold. X is used here for brevity."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"pred_proba = pipeline.predict_proba(X)\n",
"pos_value_proba = pred_proba.iloc[:, -1]\n",
"pos_value_proba.iloc[indices]"
]
}
],
"metadata": {
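The notebook cells above demonstrate `types='all'`. As a rough sketch (not part of the committed notebook; it reuses the same breast cancer demo pipeline shown in the cell above), the `types` and `epsilon` arguments can narrow the result, for example to misclassified rows that sit within 0.05 of the threshold:

from evalml.demos import load_breast_cancer
from evalml.pipelines import BinaryClassificationPipeline
from evalml.pipelines.utils import rows_of_interest

X, y = load_breast_cancer()
pipeline = BinaryClassificationPipeline(
    component_graph=["Imputer", "Standard Scaler", "Logistic Regression Classifier"]
)
pipeline.fit(X, y)

# 'incorrect' keeps only rows the pipeline mislabels, so y is required here.
# epsilon=0.05 keeps only rows whose positive-class probability falls within
# 0.05 of the (default 0.5) threshold.
indices = rows_of_interest(pipeline, X, y, types="incorrect", epsilon=0.05)
X.iloc[indices]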
81 changes: 81 additions & 0 deletions evalml/pipelines/utils.py
@@ -449,3 +449,84 @@ def make_timeseries_baseline_pipeline(problem_type, gap, forecast_horizon):
},
)
return baseline


def rows_of_interest(
    pipeline, X, y=None, threshold=None, epsilon=0.1, sort_values=True, types="all"
):
    """Get the row indices of the data that are closest to the threshold. Works only for binary classification problems and pipelines.

    Args:
        pipeline (PipelineBase): The fitted binary pipeline.
        X (ww.DataTable, pd.DataFrame): The input features to predict on.
        y (ww.DataColumn, pd.Series, None): The input target data, if available. Defaults to None.
        threshold (float): The threshold value of interest to separate positive and negative predictions. If None, uses the pipeline threshold if set, else 0.5. Defaults to None.
        epsilon (float): The maximum difference between the probability and the threshold for a row to be considered of interest. For instance, epsilon=0.1 and threshold=0.5 would mean
            we consider all rows in [0.4, 0.6] to be of interest. Defaults to 0.1.
        sort_values (bool): Whether to return the indices sorted by the distance from the threshold, such that the first values are closer to the threshold and the later values are further. Defaults to True.
        types (str): The type of rows to keep and return. Can be one of ['incorrect', 'correct', 'true_positive', 'true_negative', 'all']. Defaults to 'all'.

            'incorrect' - return only the rows where the predictions are incorrect. This means that, given the threshold and target y, keep only the rows which are labeled wrong.
            'correct' - return only the rows where the predictions are correct. This means that, given the threshold and target y, keep only the rows which are correctly labeled.
            'true_positive' - return only the rows which are positive, as given by the targets.
            'true_negative' - return only the rows which are negative, as given by the targets.
            'all' - return all rows. This is the only option available when there is no target data provided.

    Returns:
        The indices corresponding to the rows of interest.

    Raises:
        ValueError: If pipeline is not a fitted Binary Classification pipeline.
        ValueError: If types is invalid or y is not provided when types is not 'all'.
        ValueError: If the threshold is provided and falls outside [0, 1].
    """
    valid_types = ["incorrect", "correct", "true_positive", "true_negative", "all"]
    if types not in valid_types:
        raise ValueError(
            "Invalid arg for 'types'! Must be one of {}".format(valid_types)
        )

    if types != "all" and y is None:
        raise ValueError("Need an input y in order to use types {}".format(types))

    if (
        not isinstance(pipeline, BinaryClassificationPipeline)
        or not pipeline._is_fitted
    ):
        raise ValueError(
            "Pipeline provided must be a fitted Binary Classification pipeline!"
        )

    if threshold is not None and (threshold < 0 or threshold > 1):
        raise ValueError(
            "Provided threshold {} must be between [0, 1]".format(threshold)
        )

    if threshold is None:
        threshold = pipeline.threshold or 0.5

    # get predicted proba
    pred_proba = pipeline.predict_proba(X)
    pos_value_proba = pred_proba.iloc[:, -1]
    preds = pos_value_proba >= threshold
    preds_value_proba = abs(pos_value_proba - threshold)

    # placeholder for y if it isn't supplied
    y_current = y if y is not None else preds

    # logic for breaking apart the different categories
    mask = y_current
    if types in ["correct", "incorrect"]:
        mask = preds == y
    mask = mask.astype(bool)

    if types in ["correct", "true_positive"]:
        preds_value_proba = preds_value_proba[mask.values]
    elif types in ["incorrect", "true_negative"]:
        preds_value_proba = preds_value_proba[~mask.values]

    if sort_values:
        preds_value_proba = preds_value_proba.sort_values(kind="stable")

    preds_value_proba = preds_value_proba[preds_value_proba <= epsilon]
    return preds_value_proba.index.tolist()
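A brief usage sketch (not part of this diff) of the threshold fallback above: with threshold=None the function uses pipeline.threshold when one has been set and 0.5 otherwise, while an explicit value overrides both. The pipeline and data below reuse the breast cancer demo setup from the notebook in this commit.

from evalml.demos import load_breast_cancer
from evalml.pipelines import BinaryClassificationPipeline
from evalml.pipelines.utils import rows_of_interest

X, y = load_breast_cancer()
pipeline = BinaryClassificationPipeline(
    component_graph=["Imputer", "Standard Scaler", "Logistic Regression Classifier"]
)
pipeline.fit(X, y)

# threshold=None falls back to pipeline.threshold if set, else 0.5
near_default = rows_of_interest(pipeline, X, y, epsilon=0.1)

# an explicit threshold overrides the fallback; look near 0.7 instead
near_high = rows_of_interest(pipeline, X, y, threshold=0.7, epsilon=0.1)

print(len(near_default), len(near_high))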
215 changes: 215 additions & 0 deletions evalml/tests/pipeline_tests/test_pipeline_utils.py
@@ -1,3 +1,5 @@
from unittest.mock import patch

import numpy as np
import pandas as pd
import pytest
@@ -38,6 +40,7 @@
get_estimators,
is_classification,
make_pipeline,
rows_of_interest,
)
from evalml.problem_types import ProblemTypes, is_regression, is_time_series

@@ -591,3 +594,215 @@ def __init__(self, random_arg=False, random_seed=0):
)
pipeline = generate_pipeline_code(mock_pipeline_with_custom_components)
assert pipeline == expected_code


def test_rows_of_interest_errors(X_y_binary):
pipeline = BinaryClassificationPipeline(
component_graph=["Logistic Regression Classifier"]
)
pipeline_mc = MulticlassClassificationPipeline(
component_graph=["Logistic Regression Classifier"]
)
X, y = X_y_binary

with pytest.raises(ValueError, match="Invalid arg for"):
rows_of_interest(pipeline, X, y, types="ball")

with pytest.raises(ValueError, match="Need an input y in order to"):
rows_of_interest(pipeline, X, types="correct")

with pytest.raises(ValueError, match="Pipeline provided must be a fitted"):
rows_of_interest(pipeline, X, y, types="all")

with pytest.raises(ValueError, match="Pipeline provided must be a fitted"):
rows_of_interest(pipeline_mc, X, y, types="all")

with pytest.raises(ValueError, match="Pipeline provided must be a fitted"):
rows_of_interest(pipeline_mc, X, y, types="all")

pipeline._is_fitted = True
with pytest.raises(ValueError, match="Provided threshold 1.1 must be between"):
rows_of_interest(pipeline, X, y, threshold=1.1)

with pytest.raises(ValueError, match="Provided threshold -0.1 must be between"):
rows_of_interest(pipeline, X, y, threshold=-0.1)


@patch("evalml.pipelines.BinaryClassificationPipeline.predict_proba")
@patch("evalml.pipelines.BinaryClassificationPipeline.fit")
@pytest.mark.parametrize("threshold", [0.3, None, 0.7])
@pytest.mark.parametrize("y", [pd.Series([i % 2 for i in range(100)]), None])
def test_rows_of_interest_threshold(mock_fit, mock_pred_proba, threshold, y):
pipeline = BinaryClassificationPipeline(
component_graph=["Imputer", "Standard Scaler", "Logistic Regression Classifier"]
)
X = pd.DataFrame([i for i in range(100)])
pipeline._is_fitted = True

vals = [0.2] * 25 + [0.5] * 50 + [0.8] * 25
predicted_proba_values = pd.DataFrame({0: [1 - v for v in vals], 1: vals})
mock_pred_proba.return_value = predicted_proba_values
vals = rows_of_interest(
pipeline, X, y, threshold=threshold, epsilon=0.5, sort_values=True
)
if threshold == 0.3:
assert vals == list(range(100))
elif threshold == 0.7:
assert vals == list(range(75, 100)) + list(range(25, 75)) + list(range(25))
else:
assert vals == list(range(25, 75)) + list(range(25)) + list(range(75, 100))

pipeline._threshold = 0.9
vals = rows_of_interest(
pipeline, X, y, threshold=None, epsilon=0.5, sort_values=True
)
assert vals == list(range(75, 100)) + list(range(25, 75))


@patch("evalml.pipelines.BinaryClassificationPipeline.predict_proba")
@patch("evalml.pipelines.BinaryClassificationPipeline.fit")
@pytest.mark.parametrize(
"types,expected_val",
[
("incorrect", list(range(75, 100))),
("correct", list(range(75))),
("true_positive", list(range(25, 75))),
("true_negative", list(range(25)) + list(range(75, 100))),
("all", list(range(100))),
],
)
def test_rows_of_interest_types(mock_fit, mock_pred_proba, types, expected_val):
pipeline = BinaryClassificationPipeline(
component_graph=["Logistic Regression Classifier"]
)
X = pd.DataFrame([i for i in range(100)])
y = pd.Series([0] * 25 + [1] * 50 + [0] * 25)
pipeline._is_fitted = True

vals = [0.2] * 25 + [0.5] * 50 + [0.8] * 25
predicted_proba_values = pd.DataFrame({0: [1 - v for v in vals], 1: vals})
mock_pred_proba.return_value = predicted_proba_values
vals = rows_of_interest(pipeline, X, y, types=types, epsilon=0.5, sort_values=False)
assert vals == expected_val


@patch("evalml.pipelines.BinaryClassificationPipeline.predict_proba")
@patch("evalml.pipelines.BinaryClassificationPipeline.fit")
@pytest.mark.parametrize("epsilon,expected_len", [(0.01, 50), (0.3, 75), (0.5, 100)])
def test_rows_of_interest_epsilon(mock_fit, mock_pred_proba, epsilon, expected_len):
pipeline = BinaryClassificationPipeline(
component_graph=["Logistic Regression Classifier"]
)
X = pd.DataFrame([i for i in range(100)])
y = pd.Series([0] * 25 + [1] * 50 + [0] * 25)
pipeline._is_fitted = True

vals = [0.2] * 25 + [0.5] * 50 + [0.85] * 25
predicted_proba_values = pd.DataFrame({0: [1 - v for v in vals], 1: vals})
mock_pred_proba.return_value = predicted_proba_values
vals = rows_of_interest(pipeline, X, y, epsilon=epsilon)
assert len(vals) == expected_len

if epsilon == 0.01:
vals = [0.2] * 25 + [0.65] * 50 + [0.85] * 25
predicted_proba_values = pd.DataFrame({0: [1 - v for v in vals], 1: vals})
mock_pred_proba.return_value = predicted_proba_values
vals = rows_of_interest(pipeline, X, y, epsilon=epsilon)
assert len(vals) == 0


@patch("evalml.pipelines.BinaryClassificationPipeline.predict_proba")
@patch("evalml.pipelines.BinaryClassificationPipeline.fit")
@pytest.mark.parametrize(
"sorts,expected_val",
[
(True, list(range(75, 100)) + list(range(25, 75)) + list(range(25))),
(False, list(range(100))),
],
)
def test_rows_of_interest_sorted(mock_fit, mock_pred_proba, sorts, expected_val):
pipeline = BinaryClassificationPipeline(
component_graph=["Logistic Regression Classifier"]
)
X = pd.DataFrame([i for i in range(100)])
y = pd.Series([0] * 25 + [1] * 50 + [0] * 25)
pipeline._is_fitted = True

vals = [0.2] * 25 + [0.5] * 50 + [0.8] * 25
predicted_proba_values = pd.DataFrame({0: [1 - v for v in vals], 1: vals})
mock_pred_proba.return_value = predicted_proba_values
vals = rows_of_interest(
pipeline, X, y, threshold=0.9, epsilon=0.9, sort_values=sorts
)
assert vals == expected_val


@patch("evalml.pipelines.BinaryClassificationPipeline.predict_proba")
@patch("evalml.pipelines.BinaryClassificationPipeline.fit")
def test_rows_of_interest_index(mock_fit, mock_pred_proba):
pipeline = BinaryClassificationPipeline(
component_graph=["Logistic Regression Classifier"]
)
X = pd.DataFrame(
[i for i in range(100)], index=["index_{}".format(i) for i in range(100)]
)
pipeline._is_fitted = True

vals = [0.2] * 25 + [0.5] * 50 + [0.8] * 25
predicted_proba_values = pd.DataFrame({0: [1 - v for v in vals], 1: vals})
mock_pred_proba.return_value = predicted_proba_values
vals = rows_of_interest(pipeline, X, epsilon=0.5)
assert vals == list(range(25, 75)) + list(range(25)) + list(range(75, 100))


@patch("evalml.pipelines.BinaryClassificationPipeline.predict_proba")
@patch("evalml.pipelines.BinaryClassificationPipeline.fit")
@pytest.mark.parametrize(
"types,sorts,epsilon,expected_vals",
[
("correct", True, 0.01, list(range(25, 75))),
("true_negative", True, 0.3, list(range(25))),
("all", False, 0.3, list(range(75))),
],
)
def test_rows_of_interest(
mock_fit, mock_pred_proba, types, sorts, epsilon, expected_vals
):
pipeline = BinaryClassificationPipeline(
component_graph=["Logistic Regression Classifier"]
)
X = pd.DataFrame([i for i in range(100)])
y = pd.Series([0] * 25 + [1] * 50 + [0] * 25)
pipeline._is_fitted = True

vals = [0.2] * 25 + [0.5] * 50 + [0.85] * 25
predicted_proba_values = pd.DataFrame({0: [1 - v for v in vals], 1: vals})
mock_pred_proba.return_value = predicted_proba_values
vals = rows_of_interest(
pipeline, X, y, types=types, sort_values=sorts, epsilon=epsilon
)
assert vals == expected_vals

if types == "all":
vals = rows_of_interest(
pipeline, X, types=types, sort_values=sorts, epsilon=epsilon
)
assert vals == expected_vals


@patch("evalml.pipelines.BinaryClassificationPipeline.predict_proba")
@patch("evalml.pipelines.BinaryClassificationPipeline.fit")
def test_rows_of_interest_empty(mock_fit, mock_pred_proba):
pipeline = BinaryClassificationPipeline(
component_graph=["Logistic Regression Classifier"]
)
X = pd.DataFrame([i for i in range(100)])
y = pd.Series([0] * 25 + [1] * 50 + [0] * 25)
pipeline._is_fitted = True

vals = [1] * 25 + [0] * 50 + [1] * 25
predicted_proba_values = pd.DataFrame({0: [1 - v for v in vals], 1: vals})
mock_pred_proba.return_value = predicted_proba_values
vals = rows_of_interest(pipeline, X, y, epsilon=0.5, types="correct")
assert len(vals) == 0
