
Update AutoML to use objective decision function during scoring for custom objectives #1934

Merged: 27 commits into main from 1868_objective on Mar 17, 2021 (diff below shows changes from 22 of the 27 commits).

Commits:
bf4b7a7  init (angela97lin, Mar 5, 2021)
7c751d3  Merge branch 'main' into 1868_objective (angela97lin, Mar 5, 2021)
5ba8267  Merge branch 'main' into 1868_objective (angela97lin, Mar 6, 2021)
6656538  Merge branch 'main' into 1868_objective (angela97lin, Mar 8, 2021)
ef32f72  Merge branch 'main' into 1868_objective (angela97lin, Mar 9, 2021)
1ba4628  wip (angela97lin, Mar 9, 2021)
edaf8a0  Merge branch 'main' into 1868_objective (angela97lin, Mar 9, 2021)
31667fc  this is kinda getting there (angela97lin, Mar 9, 2021)
386c192  Merge branch '1868_objective' of github.com:alteryx/evalml into 1868_… (angela97lin, Mar 9, 2021)
fcb4afb  release notes (angela97lin, Mar 9, 2021)
692532f  clean up (angela97lin, Mar 10, 2021)
3425491  add test for coverage (angela97lin, Mar 10, 2021)
f3096a0  Merge branch 'main' into 1868_objective (angela97lin, Mar 14, 2021)
0ac490f  Merge branch 'main' into 1868_objective (angela97lin, Mar 15, 2021)
0bb3f88  Merge branch 'main' into 1868_objective (angela97lin, Mar 16, 2021)
6107697  Merge branch 'main' into 1868_objective (angela97lin, Mar 16, 2021)
e549258  cleanup, add time series (angela97lin, Mar 17, 2021)
11d70d4  Merge branch '1868_objective' of github.com:alteryx/evalml into 1868_… (angela97lin, Mar 17, 2021)
70df88d  fix tests for time series mocks (angela97lin, Mar 17, 2021)
cbcc457  trying subclassing (angela97lin, Mar 17, 2021)
d4eaa6c  testing mixin (angela97lin, Mar 17, 2021)
aa54e25  fix tests and cleanup (angela97lin, Mar 17, 2021)
6786b33  separate out time series test (angela97lin, Mar 17, 2021)
0187e34  clean up _score_all_objectives (angela97lin, Mar 17, 2021)
15656dc  remove commented out code (angela97lin, Mar 17, 2021)
a191d96  moved optimize_threshold to mixin (angela97lin, Mar 17, 2021)
cf4a911  clean up test for multiclass (angela97lin, Mar 17, 2021)
1 change: 1 addition & 0 deletions docs/source/release_notes.rst
@@ -8,6 +8,7 @@ Release Notes
         * Updated ``make_pipeline`` to not add ``Imputer`` if input data does not have numeric or categorical columns :pr:`1967`
         * Added recommended actions for the output of data check's ``validate`` method :pr:`1968`
     * Fixes
+        * Updated binary classification pipelines to use objective decision function during scoring of custom objectives :pr:`1934`
     * Changes
         * Removed ``data_checks`` parameter, ``data_check_results`` and data checks logic from ``AutoMLSearch`` :pr:`1935`
     * Documentation Changes
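
To make the release note concrete: with this change, a custom objective's decision_function drives how probabilities become class predictions during scoring. The sketch below is illustrative only, assuming evalml's custom-objective interface from around this release (class attribute names and signatures may differ by version); the objective itself is hypothetical.

import pandas as pd

from evalml.objectives import BinaryClassificationObjective


class LenientFlagger(BinaryClassificationObjective):
    """Hypothetical custom objective with its own decision logic."""
    name = "Lenient Flagger"
    greater_is_better = True
    score_needs_proba = False
    perfect_score = 1.0

    def objective_function(self, y_true, y_predicted, X=None):
        # Illustrative score: plain accuracy of the flagged predictions.
        y_true = pd.Series(y_true).reset_index(drop=True)
        y_predicted = pd.Series(y_predicted).reset_index(drop=True)
        return (y_true == y_predicted).mean()

    def decision_function(self, ypred_proba, threshold=0.5, X=None):
        # Custom decision: flag borderline rows by relaxing the cutoff.
        # Before this PR, scoring ignored this method and compared probabilities
        # against the threshold directly; after it, scoring calls this method.
        return ypred_proba > max(threshold - 0.1, 0.0)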
22 changes: 7 additions & 15 deletions evalml/pipelines/binary_classification_pipeline.py
@@ -1,23 +1,17 @@
+from .binary_classification_pipeline_mixin import (
+    BinaryClassificationPipelineMixin
+)
+
 from evalml.objectives import get_objective
 from evalml.pipelines.classification_pipeline import ClassificationPipeline
 from evalml.problem_types import ProblemTypes
 from evalml.utils import infer_feature_types
 
 
-class BinaryClassificationPipeline(ClassificationPipeline):
+class BinaryClassificationPipeline(BinaryClassificationPipelineMixin, ClassificationPipeline):
     """Pipeline subclass for all binary classification pipelines."""
-    _threshold = None
     problem_type = ProblemTypes.BINARY
 
Review comment from angela97lin (Contributor Author) on the removed lines below: Moved all of this to BinaryClassificationPipelineMixin

-    @property
-    def threshold(self):
-        """Threshold used to make a prediction. Defaults to None."""
-        return self._threshold
-
-    @threshold.setter
-    def threshold(self, value):
-        self._threshold = value
-
     def _predict(self, X, objective=None):
         """Make predictions using selected features.
 
@@ -37,10 +31,8 @@ def _predict(self, X, objective=None):
         if self.threshold is None:
             return self._component_graph.predict(X)
         ypred_proba = self.predict_proba(X).to_dataframe()
-        ypred_proba = ypred_proba.iloc[:, 1]
-        if objective is None:
-            return infer_feature_types(ypred_proba > self.threshold)
-        return infer_feature_types(objective.decision_function(ypred_proba, threshold=self.threshold, X=X))
+        predictions = self._predict_with_objective(X, ypred_proba, objective)
+        return infer_feature_types(predictions)
 
     def predict_proba(self, X):
         """Make probability estimates for labels. Assumes that the column at index 1 represents the positive label case.
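
With the mixin wired in, _predict now routes thresholded predictions through the objective. A hedged usage sketch follows; pipeline construction, fitting, and data are elided, and FraudCost's amount_col value and the variable names are made up for illustration:

from evalml.objectives import FraudCost

fraud_cost = FraudCost(amount_col="amount")  # "amount" is a hypothetical column name

pipeline.fit(X_train, y_train)
pipeline.threshold = 0.7

# No objective: the mixin's _predict_with_objective returns ypred_proba > 0.7.
default_preds = pipeline.predict(X_test)

# With an objective: predictions come from FraudCost.decision_function, which
# can weigh per-row transaction amounts instead of comparing raw probabilities
# against the threshold.
fraud_preds = pipeline.predict(X_test, objective=fraud_cost)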
55 changes: 55 additions & 0 deletions evalml/pipelines/binary_classification_pipeline_mixin.py
@@ -0,0 +1,55 @@
import sys
import traceback
from collections import OrderedDict

from evalml.exceptions import PipelineScoreError


class BinaryClassificationPipelineMixin():
    _threshold = None

    @property
    def threshold(self):
        """Threshold used to make a prediction. Defaults to None."""
        return self._threshold

    @threshold.setter
    def threshold(self, value):
        self._threshold = value

    def _predict_with_objective(self, X, ypred_proba, objective):
        ypred_proba = ypred_proba.iloc[:, 1]
        if objective is None:
            return ypred_proba > self.threshold
        return objective.decision_function(ypred_proba, threshold=self.threshold, X=X)

    def _compute_predictions(self, X, y, objectives, time_series=False):
        """Compute predictions/probabilities based on objectives."""
        y_predicted = None
        y_predicted_proba = None
        if any(o.score_needs_proba for o in objectives) or self.threshold is not None:
            y_predicted_proba = self.predict_proba(X, y) if time_series else self.predict_proba(X)
        if any(not o.score_needs_proba for o in objectives) and self.threshold is None:
            y_predicted = self._predict(X, y, pad=True) if time_series else self._predict(X)
        return y_predicted, y_predicted_proba

    def _score_all_objectives(self, X, y, y_pred, y_pred_proba, objectives):
        scored_successfully = OrderedDict()
        exceptions = OrderedDict()
        for objective in objectives:
            try:
                if not objective.is_defined_for_problem_type(self.problem_type):
                    raise ValueError(f'Invalid objective {objective.name} specified for problem type {self.problem_type}')
                y_pred_to_use = y_pred
                if self.threshold is not None and not objective.score_needs_proba:
                    y_pred_to_use = self._predict_with_objective(X, y_pred_proba, objective)
                score = self._score(X, y, y_pred_proba if objective.score_needs_proba else y_pred_to_use, objective)
                scored_successfully.update({objective.name: score})
            except Exception as e:
                tb = traceback.format_tb(sys.exc_info()[2])
                exceptions[objective.name] = (e, tb)
        if exceptions:
            # If any objective failed, raise a PipelineScoreError
            raise PipelineScoreError(exceptions, scored_successfully)
        # No objectives failed, return the scores
        return scored_successfully
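
For context, how these mixin pieces cooperate during score on a thresholded pipeline: _compute_predictions computes probabilities once, then _score_all_objectives derives per-objective class predictions. A hedged walkthrough, assuming an already-fitted binary pipeline and evalml's built-in objective names (exact behavior may vary by version):

from evalml.objectives import FraudCost

pipeline.threshold = 0.7
fraud_cost = FraudCost(amount_col=0)  # mirrors the test further down: column index 0 holds amounts

# 'auc' has score_needs_proba=True, so it is scored on y_pred_proba directly.
# 'precision' has score_needs_proba=False; with a threshold set, it is scored on
# its own decision_function output (the base class default thresholds the
# probabilities). fraud_cost overrides decision_function, so its custom logic
# produces the predictions it is scored on.
scores = pipeline.score(X_test, y_test, objectives=["precision", "auc", fraud_cost])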
16 changes: 6 additions & 10 deletions evalml/pipelines/time_series_classification_pipelines.py
@@ -1,5 +1,10 @@
 
 import pandas as pd
 
+from .binary_classification_pipeline_mixin import (
+    BinaryClassificationPipelineMixin
+)
+
 from evalml.objectives import get_objective
 from evalml.pipelines.classification_pipeline import ClassificationPipeline
 from evalml.pipelines.pipeline_meta import TimeSeriesPipelineBaseMeta
@@ -169,17 +174,8 @@ def score(self, X, y, objectives):
                           objectives=objectives)
 
 
-class TimeSeriesBinaryClassificationPipeline(TimeSeriesClassificationPipeline, metaclass=TimeSeriesPipelineBaseMeta):
+class TimeSeriesBinaryClassificationPipeline(BinaryClassificationPipelineMixin, TimeSeriesClassificationPipeline, metaclass=TimeSeriesPipelineBaseMeta):
     problem_type = ProblemTypes.TIME_SERIES_BINARY
-    _threshold = None
-
-    @property
-    def threshold(self):
-        return self._threshold
-
-    @threshold.setter
-    def threshold(self, value):
-        self._threshold = value
 
     def _predict(self, X, y, objective=None, pad=False):
         features = self.compute_estimator_features(X, y)
@@ -4,6 +4,9 @@
 import pytest
 import woodwork as ww
 
+from evalml.exceptions import PipelineScoreError
+from evalml.objectives import FraudCost
+
 
 @patch('evalml.pipelines.ClassificationPipeline._decode_targets', return_value=[0, 1])
 @patch('evalml.objectives.BinaryClassificationObjective.decision_function', return_value=pd.Series([1, 0]))
@@ -66,3 +69,33 @@ def test_binary_predict_pipeline_objective_mismatch(mock_transform, X_y_binary,
     with pytest.raises(ValueError, match="You can only use a binary classification objective to make predictions for a binary classification pipeline."):
         binary_pipeline.predict(X, "precision micro")
     mock_transform.assert_called()
+
+
+@pytest.mark.parametrize("is_time_series", [True, False])
+@patch('evalml.objectives.FraudCost.decision_function')
+def test_binary_predict_pipeline_use_objective(mock_decision_function, is_time_series,
+                                               X_y_binary, logistic_regression_binary_pipeline_class, time_series_binary_classification_pipeline_class):
+    X, y = X_y_binary
+    binary_pipeline = None
+    if is_time_series:
+        binary_pipeline = time_series_binary_classification_pipeline_class(parameters={"Logistic Regression Classifier": {"n_jobs": 1},
+                                                                                       "pipeline": {"gap": 0, "max_delay": 0}})
+        mock_decision_function.return_value = pd.Series([0] * 98)
+    else:
+        binary_pipeline = logistic_regression_binary_pipeline_class(parameters={"Logistic Regression Classifier": {"n_jobs": 1}})
+        mock_decision_function.return_value = pd.Series([0] * 100)
+
+    binary_pipeline.threshold = 0.7
+    binary_pipeline.fit(X, y)
+    fraud_cost = FraudCost(amount_col=0)
+    binary_pipeline.score(X, y, ['precision', 'auc', fraud_cost])
+    mock_decision_function.assert_called()
+
+
+def test_binary_predict_pipeline_score_error(X_y_binary, logistic_regression_binary_pipeline_class):
+    X, y = X_y_binary
+    binary_pipeline = logistic_regression_binary_pipeline_class(parameters={"Logistic Regression Classifier": {"n_jobs": 1}})
+    binary_pipeline.fit(X, y)
+    with pytest.raises(PipelineScoreError, match='Invalid objective MCC Multiclass specified for problem type binary'):
+        binary_pipeline.score(X, y, ['MCC Multiclass'])
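
The last test exercises the error path in _score_all_objectives: per-objective failures are collected and re-raised together as a PipelineScoreError. A sketch of handling it, assuming the constructor arguments (exceptions, scored_successfully) are exposed as attributes of the same names:

from evalml.exceptions import PipelineScoreError

try:
    scores = binary_pipeline.score(X, y, ['MCC Multiclass', 'AUC'])
except PipelineScoreError as e:
    print(e.exceptions.keys())       # objectives that failed, mapped to (exception, traceback)
    print(e.scored_successfully)     # objectives that scored before the raise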
14 changes: 11 additions & 3 deletions evalml/tests/pipeline_tests/test_time_series_baseline_pipeline.py
@@ -62,9 +62,11 @@ def test_time_series_baseline_predict_proba(pipeline_class, gap, X_none):
 @pytest.mark.parametrize("only_use_y", [True, False])
 @pytest.mark.parametrize("gap,max_delay", [(0, 0), (1, 0), (0, 2), (1, 1), (1, 2), (2, 2), (7, 3), (2, 4)])
 @patch("evalml.pipelines.RegressionPipeline._score_all_objectives")
-@patch("evalml.pipelines.ClassificationPipeline._score_all_objectives")
+@patch("evalml.pipelines.TimeSeriesClassificationPipeline._score_all_objectives")
+@patch("evalml.pipelines.TimeSeriesBinaryClassificationPipeline._score_all_objectives")
 @patch("evalml.pipelines.ClassificationPipeline._encode_targets", side_effect=lambda y: y)
-def test_time_series_baseline_score_offset(mock_encode, mock_classification_score, mock_regression_score, gap, max_delay,
+def test_time_series_baseline_score_offset(mock_encode, mock_binary_classification_score, mock_multiclass_classification_score,
+                                           mock_regression_score, gap, max_delay,
                                            only_use_y, pipeline_class, ts_data):
     X, y = ts_data
 
@@ -73,7 +75,13 @@
     expected_target = expected_target[1:]
     clf = pipeline_class(parameters={"pipeline": {"gap": gap, "max_delay": max_delay},
                                      "Time Series Baseline Estimator": {"gap": gap, "max_delay": max_delay}})
-    mock_score = mock_regression_score if pipeline_class == TimeSeriesBaselineRegressionPipeline else mock_classification_score
+    mock_score = None
+    if pipeline_class == TimeSeriesBaselineRegressionPipeline:
+        mock_score = mock_regression_score
+    elif pipeline_class == TimeSeriesBaselineBinaryPipeline:
+        mock_score = mock_binary_classification_score
+    else:
+        mock_score = mock_multiclass_classification_score
     if only_use_y:
         clf.fit(None, y)
         clf.score(X=None, y=y, objectives=['MCC Binary'])
6 changes: 4 additions & 2 deletions evalml/tests/pipeline_tests/test_time_series_pipeline.py
@@ -159,12 +159,14 @@ def mock_predict(df, y=None):
 @patch("evalml.pipelines.components.LogisticRegressionClassifier.predict")
 @patch("evalml.pipelines.TimeSeriesClassificationPipeline._encode_targets", side_effect=lambda y: y)
 @patch("evalml.pipelines.PipelineBase._score_all_objectives")
-def test_score_drops_nans(mock_score, mock_encode_targets,
+@patch("evalml.pipelines.TimeSeriesBinaryClassificationPipeline._score_all_objectives")
+def test_score_drops_nans(mock_binary_score, mock_score, mock_encode_targets,
                           mock_classifier_predict, mock_classifier_fit,
                           mock_regressor_predict, mock_regressor_fit,
                           pipeline_class,
                           estimator_name, gap, max_delay, include_delayed_features, only_use_y, ts_data):
 
+    if pipeline_class == TimeSeriesBinaryClassificationPipeline:
+        mock_score = mock_binary_score
     if only_use_y and (not include_delayed_features or (max_delay == 0 and gap == 0)):
         pytest.skip("This would result in an empty feature dataframe.")