Percent better than baseline (#1050)
* Adding infrastructure to be able to compute percent difference between scores.

* Adding percent_better_than_baseline_column to AutoML search.

* Fixing broken automl tests and updating release notes for PR 1050.

* Adding is_percentage and perfect_score to CostBenefitMatrix.

* Removing is_percentage from ObjectiveBase.

* Renaming some test variables and making minor tweaks to AutoML search related to computing % better than baseline.

* Updating docstring in calculate_percent_difference.
freddyaboulton committed Aug 18, 2020
1 parent e00fa4b commit 88a5f1b
Showing 10 changed files with 171 additions and 8 deletions.
1 change: 1 addition & 0 deletions docs/source/release_notes.rst
@@ -9,6 +9,7 @@ Release Notes
* Added new LSA component for text featurization :pr:`1022`
* Added guide on installing with conda :pr:`1041`
* Standardized error when calling transform/predict before fit for pipelines :pr:`1048`
* Added `percent_better_than_baseline` to AutoML search rankings and full rankings table :pr:`1050`
* Fixes
* Updated TextFeaturizer component to no longer require an internet connection to run :pr:`1022`
* Fixed non-deterministic element of TextFeaturizer transformations :pr:`1022`
5 changes: 4 additions & 1 deletion docs/source/user_guide/objectives.ipynb
@@ -66,7 +66,9 @@
"\n",
"* `score_needs_proba`: Only for classification objectives. `True` if the objective is intended to function with predicted probabilities as opposed to predicted values (example: cross entropy for classifiers).\n",
"\n",
"* `decision_function`: Only for binary classification objectives. This function takes predicted probabilities that were output from the model and a binary classification threshold, and returns predicted values. "
"* `decision_function`: Only for binary classification objectives. This function takes predicted probabilities that were output from the model and a binary classification threshold, and returns predicted values.\n",
"\n",
"* `perfect_score`: The score achieved by a perfect model on this objective."
]
},
{
@@ -93,6 +95,7 @@
" name = \"Fraud Cost\"\n",
" greater_is_better = False\n",
" score_needs_proba = False\n",
" perfect_score = 0.0\n",
"\n",
" def __init__(self, retry_percentage=.5, interchange_fee=.02,\n",
" fraud_payout_percentage=1.0, amount_col='amount'):\n",
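Each built-in objective now declares its perfect score as a class attribute. A quick illustration, not part of this diff, assuming the objective classes exported from evalml.objectives in this release:

from evalml.objectives import CostBenefitMatrix, FraudCost, LeadScoring

# perfect_score is a class attribute, so no instantiation is needed to inspect it.
print(FraudCost.perfect_score)          # 0.0 -- lower is better; a perfect model incurs no fraud cost
print(LeadScoring.perfect_score)        # inf -- higher is better, unbounded above
print(CostBenefitMatrix.perfect_score)  # inf -- higher is better, unbounded above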
7 changes: 6 additions & 1 deletion evalml/automl/automl_search.py
@@ -197,6 +197,7 @@ def __init__(self,
self.allowed_model_families = allowed_model_families
self._automl_algorithm = None
self._start = None
self._baseline_cv_score = None

self._validate_problem_type()

@@ -528,6 +529,7 @@ def _add_baseline_pipelines(self, X, y):
self._start)

baseline_results = self._compute_cv_scores(baseline, X, y)
self._baseline_cv_score = baseline_results["cv_score_mean"]
self._add_result(trained_pipeline=baseline,
parameters=baseline.parameters,
training_time=baseline_results['training_time'],
@@ -614,6 +616,7 @@ def _compute_cv_scores(self, pipeline, X, y):

def _add_result(self, trained_pipeline, parameters, training_time, cv_data, cv_scores):
cv_score = cv_scores.mean()
percent_better = self.objective.calculate_percent_difference(cv_score, self._baseline_cv_score)
# calculate high_variance_cv
# if the coefficient of variance is greater than .2
with warnings.catch_warnings():
@@ -634,6 +637,7 @@ def _add_result(self, trained_pipeline, parameters, training_time, cv_data, cv_scores):
"high_variance_cv": high_variance_cv,
"training_time": training_time,
"cv_data": cv_data,
"percent_better_than_baseline": percent_better
}
self._results['search_order'].append(pipeline_id)

@@ -780,7 +784,8 @@ def full_rankings(self):
if self.objective.greater_is_better:
ascending = False

full_rankings_cols = ["id", "pipeline_name", "score", "high_variance_cv", "parameters"]
full_rankings_cols = ["id", "pipeline_name", "score", "percent_better_than_baseline",
"high_variance_cv", "parameters"]
if not self.has_searched:
return pd.DataFrame(columns=full_rankings_cols)

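For context, a minimal sketch, not part of this diff, of how the new column surfaces to users. It assumes the AutoMLSearch signature of this release (max_pipelines, and search taking X and y directly):

import evalml
from evalml import AutoMLSearch

X, y = evalml.demos.load_breast_cancer()
automl = AutoMLSearch(problem_type="binary", max_pipelines=3)
automl.search(X, y)

# rankings and full_rankings now report how far each pipeline's mean CV score is
# from the baseline pipeline's mean CV score, expressed as a percentage.
print(automl.full_rankings[["pipeline_name", "score", "percent_better_than_baseline"]])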
1 change: 1 addition & 0 deletions evalml/objectives/cost_benefit_matrix.py
@@ -11,6 +11,7 @@ class CostBenefitMatrix(BinaryClassificationObjective):
name = "Cost Benefit Matrix"
greater_is_better = True
score_needs_proba = False
perfect_score = np.inf

def __init__(self, true_positive_cost, true_negative_cost, false_positive_cost, false_negative_cost):
"""Create instance of CostBenefitMatrix.
1 change: 1 addition & 0 deletions evalml/objectives/fraud_cost.py
@@ -8,6 +8,7 @@ class FraudCost(BinaryClassificationObjective):
name = "Fraud Cost"
greater_is_better = False
score_needs_proba = False
perfect_score = 0.0

def __init__(self, retry_percentage=.5, interchange_fee=.02,
fraud_payout_percentage=1.0, amount_col='amount'):
3 changes: 3 additions & 0 deletions evalml/objectives/lead_scoring.py
@@ -1,3 +1,5 @@
import math

import pandas as pd

from .binary_classification_objective import BinaryClassificationObjective
@@ -8,6 +10,7 @@ class LeadScoring(BinaryClassificationObjective):
name = "Lead Scoring"
greater_is_better = True
score_needs_proba = False
perfect_score = math.inf

def __init__(self, true_positives=1, false_positives=-1):
"""Create instance.
30 changes: 30 additions & 0 deletions evalml/objectives/objective_base.py
@@ -26,6 +26,12 @@ def score_needs_proba(cls):
"""Returns a boolean determining if the score() method needs probability estimates. This should be true for objectives which work with predicted probabilities, like log loss or AUC, and false for objectives which compare predicted class labels to the actual labels, like F1 or correlation.
"""

@property
@classmethod
@abstractmethod
def perfect_score(cls):
"""Returns the score obtained by evaluating this objective on a perfect model."""

@classmethod
@abstractmethod
def objective_function(cls, y_true, y_predicted, X=None):
@@ -89,3 +95,27 @@ def validate_inputs(self, y_true, y_predicted):
raise ValueError("y_predicted contains NaN or infinity")
if self.score_needs_proba and np.any([(y_predicted < 0) | (y_predicted > 1)]):
raise ValueError("y_predicted contains probability estimates not within [0, 1]")

@classmethod
def calculate_percent_difference(cls, score, baseline_score):
"""Calculate the percent difference between scores.
Arguments:
score (float): A score. Output of the score method of this objective.
baseline_score (float): A score. Output of the score method of this objective. In practice,
this is the score achieved on this objective with a baseline estimator.
Returns:
float: The percent difference between the scores. This will be the difference normalized by the
baseline score.
"""

if pd.isna(score) or pd.isna(baseline_score):
return np.nan

if baseline_score == 0:
return np.nan

difference = (baseline_score - score)
change = difference / baseline_score
return 100 * (-1) ** (cls.greater_is_better) * change
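A worked example of the sign convention, not part of this diff: the result is positive whenever the pipeline improves on the baseline, regardless of whether the objective is minimized or maximized. It assumes the LogLossBinary and AUC objective classes exported by evalml.objectives:

from evalml.objectives import AUC, LogLossBinary

# Log loss: lower is better. Improving from a baseline of 0.5 to 0.1 gives
# (0.5 - 0.1) / 0.5 = 0.8, reported as 80% better than baseline.
print(LogLossBinary.calculate_percent_difference(score=0.1, baseline_score=0.5))  # 80.0

# AUC: higher is better. Improving from a baseline of 0.5 to 0.75 gives
# (0.75 - 0.5) / 0.5 = 0.5, reported as 50% better than baseline.
print(AUC.calculate_percent_difference(score=0.75, baseline_score=0.5))  # 50.0

# A NaN score or a zero baseline score yields NaN, since the difference cannot be normalized.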
