From 88a5f1b4eb46fe481ae997057fd12bb76136a3d6 Mon Sep 17 00:00:00 2001 From: Freddy Boulton <41651716+freddyaboulton@users.noreply.github.com> Date: Tue, 18 Aug 2020 10:12:25 -0400 Subject: [PATCH] Percent better than baseline (#1050) * Adding infrastructure to be able to compute percent difference between scores. * Adding percent_better_than_baseline_column to AutoML search. * Fixing broken automl tests and updating release notes for PR 1050. * Adding is_percentage and perfect_score to CostBenefitMatrix. * Removing is_percentage from ObjectiveBase. * Rewording some test variable names and minor tweaks to AutoML Search related to computing % better than baseline. * Updating docstring in calculate_percent_difference. --- docs/source/release_notes.rst | 1 + docs/source/user_guide/objectives.ipynb | 5 +- evalml/automl/automl_search.py | 7 +- evalml/objectives/cost_benefit_matrix.py | 1 + evalml/objectives/fraud_cost.py | 1 + evalml/objectives/lead_scoring.py | 3 + evalml/objectives/objective_base.py | 30 ++++++++ evalml/objectives/standard_metrics.py | 33 +++++++++ evalml/tests/automl_tests/test_automl.py | 69 +++++++++++++++++-- .../objective_tests/test_standard_metrics.py | 29 ++++++++ 10 files changed, 171 insertions(+), 8 deletions(-) diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst index 726396d42f..6adc80ff62 100644 --- a/docs/source/release_notes.rst +++ b/docs/source/release_notes.rst @@ -9,6 +9,7 @@ Release Notes * Added new LSA component for text featurization :pr:`1022` * Added guide on installing with conda :pr:`1041` * Standardized error when calling transform/predict before fit for pipelines :pr:`1048` + * Added `percent_better_than_baseline` to Automl search rankings and full rankings table :pr:`1050` * Fixes * Updated TextFeaturizer component to no longer require an internet connection to run :pr:`1022` * Fixed non-deterministic element of TextFeaturizer transformations :pr:`1022` diff --git a/docs/source/user_guide/objectives.ipynb b/docs/source/user_guide/objectives.ipynb index 048a4c0b24..5cc3936b9c 100644 --- a/docs/source/user_guide/objectives.ipynb +++ b/docs/source/user_guide/objectives.ipynb @@ -66,7 +66,9 @@ "\n", "* `score_needs_proba`: Only for classification objectives. `True` if the objective is intended to function with predicted probabilities as opposed to predicted values (example: cross entropy for classifiers).\n", "\n", - "* `decision_function`: Only for binary classification objectives. This function takes predicted probabilities that were output from the model and a binary classification threshold, and returns predicted values. " + "* `decision_function`: Only for binary classification objectives. This function takes predicted probabilities that were output from the model and a binary classification threshold, and returns predicted values.\n", + "\n", + "* `perfect_score`: The score achieved by a perfect model on this objective." 
] }, { @@ -93,6 +95,7 @@ " name = \"Fraud Cost\"\n", " greater_is_better = False\n", " score_needs_proba = False\n", + " perfect_score = 0.0\n", "\n", " def __init__(self, retry_percentage=.5, interchange_fee=.02,\n", " fraud_payout_percentage=1.0, amount_col='amount'):\n", diff --git a/evalml/automl/automl_search.py b/evalml/automl/automl_search.py index d6ebcc6abc..bd6c5f75f8 100644 --- a/evalml/automl/automl_search.py +++ b/evalml/automl/automl_search.py @@ -197,6 +197,7 @@ def __init__(self, self.allowed_model_families = allowed_model_families self._automl_algorithm = None self._start = None + self._baseline_cv_score = None self._validate_problem_type() @@ -528,6 +529,7 @@ def _add_baseline_pipelines(self, X, y): self._start) baseline_results = self._compute_cv_scores(baseline, X, y) + self._baseline_cv_score = baseline_results["cv_score_mean"] self._add_result(trained_pipeline=baseline, parameters=baseline.parameters, training_time=baseline_results['training_time'], @@ -614,6 +616,7 @@ def _compute_cv_scores(self, pipeline, X, y): def _add_result(self, trained_pipeline, parameters, training_time, cv_data, cv_scores): cv_score = cv_scores.mean() + percent_better = self.objective.calculate_percent_difference(cv_score, self._baseline_cv_score) # calculate high_variance_cv # if the coefficient of variance is greater than .2 with warnings.catch_warnings(): @@ -634,6 +637,7 @@ def _add_result(self, trained_pipeline, parameters, training_time, cv_data, cv_s "high_variance_cv": high_variance_cv, "training_time": training_time, "cv_data": cv_data, + "percent_better_than_baseline": percent_better } self._results['search_order'].append(pipeline_id) @@ -780,7 +784,8 @@ def full_rankings(self): if self.objective.greater_is_better: ascending = False - full_rankings_cols = ["id", "pipeline_name", "score", "high_variance_cv", "parameters"] + full_rankings_cols = ["id", "pipeline_name", "score", "percent_better_than_baseline", + "high_variance_cv", "parameters"] if not self.has_searched: return pd.DataFrame(columns=full_rankings_cols) diff --git a/evalml/objectives/cost_benefit_matrix.py b/evalml/objectives/cost_benefit_matrix.py index 3e48c0c1b4..676350f726 100644 --- a/evalml/objectives/cost_benefit_matrix.py +++ b/evalml/objectives/cost_benefit_matrix.py @@ -11,6 +11,7 @@ class CostBenefitMatrix(BinaryClassificationObjective): name = "Cost Benefit Matrix" greater_is_better = True score_needs_proba = False + perfect_score = np.inf def __init__(self, true_positive_cost, true_negative_cost, false_positive_cost, false_negative_cost): """Create instance of CostBenefitMatrix. 
diff --git a/evalml/objectives/fraud_cost.py b/evalml/objectives/fraud_cost.py index d094e2e14d..d24a470991 100644 --- a/evalml/objectives/fraud_cost.py +++ b/evalml/objectives/fraud_cost.py @@ -8,6 +8,7 @@ class FraudCost(BinaryClassificationObjective): name = "Fraud Cost" greater_is_better = False score_needs_proba = False + perfect_score = 0.0 def __init__(self, retry_percentage=.5, interchange_fee=.02, fraud_payout_percentage=1.0, amount_col='amount'): diff --git a/evalml/objectives/lead_scoring.py b/evalml/objectives/lead_scoring.py index 7dced4d64d..3354ad5b9f 100644 --- a/evalml/objectives/lead_scoring.py +++ b/evalml/objectives/lead_scoring.py @@ -1,3 +1,5 @@ +import math + import pandas as pd from .binary_classification_objective import BinaryClassificationObjective @@ -8,6 +10,7 @@ class LeadScoring(BinaryClassificationObjective): name = "Lead Scoring" greater_is_better = True score_needs_proba = False + perfect_score = math.inf def __init__(self, true_positives=1, false_positives=-1): """Create instance. diff --git a/evalml/objectives/objective_base.py b/evalml/objectives/objective_base.py index 4db33014b2..9887ceb80f 100644 --- a/evalml/objectives/objective_base.py +++ b/evalml/objectives/objective_base.py @@ -26,6 +26,12 @@ def score_needs_proba(cls): """Returns a boolean determining if the score() method needs probability estimates. This should be true for objectives which work with predicted probabilities, like log loss or AUC, and false for objectives which compare predicted class labels to the actual labels, like F1 or correlation. """ + @property + @classmethod + @abstractmethod + def perfect_score(cls): + """Returns the score obtained by evaluating this objective on a perfect model.""" + @classmethod @abstractmethod def objective_function(cls, y_true, y_predicted, X=None): @@ -89,3 +95,27 @@ def validate_inputs(self, y_true, y_predicted): raise ValueError("y_predicted contains NaN or infinity") if self.score_needs_proba and np.any([(y_predicted < 0) | (y_predicted > 1)]): raise ValueError("y_predicted contains probability estimates not within [0, 1]") + + @classmethod + def calculate_percent_difference(cls, score, baseline_score): + """Calculate the percent difference between scores. + + Arguments: + score (float): A score. Output of the score method of this objective. + baseline_score (float): A score. Output of the score method of this objective. In practice, + this is the score achieved on this objective with a baseline estimator. + + Returns: + float: The percent difference between the scores. This will be the difference normalized by the + baseline score. 
+ """ + + if pd.isna(score) or pd.isna(baseline_score): + return np.nan + + if baseline_score == 0: + return np.nan + + difference = (baseline_score - score) + change = difference / baseline_score + return 100 * (-1) ** (cls.greater_is_better) * change diff --git a/evalml/objectives/standard_metrics.py b/evalml/objectives/standard_metrics.py index 01125bae4e..c1a6e8867f 100644 --- a/evalml/objectives/standard_metrics.py +++ b/evalml/objectives/standard_metrics.py @@ -16,6 +16,7 @@ class AccuracyBinary(BinaryClassificationObjective): name = "Accuracy Binary" greater_is_better = True score_needs_proba = False + perfect_score = 1.0 def objective_function(self, y_true, y_predicted, X=None): return metrics.accuracy_score(y_true, y_predicted) @@ -26,6 +27,7 @@ class AccuracyMulticlass(MulticlassClassificationObjective): name = "Accuracy Multiclass" greater_is_better = True score_needs_proba = False + perfect_score = 1.0 def objective_function(self, y_true, y_predicted, X=None): return metrics.accuracy_score(y_true, y_predicted) @@ -36,6 +38,7 @@ class BalancedAccuracyBinary(BinaryClassificationObjective): name = "Balanced Accuracy Binary" greater_is_better = True score_needs_proba = False + perfect_score = 1.0 def objective_function(self, y_true, y_predicted, X=None): return metrics.balanced_accuracy_score(y_true, y_predicted) @@ -46,6 +49,7 @@ class BalancedAccuracyMulticlass(MulticlassClassificationObjective): name = "Balanced Accuracy Multiclass" greater_is_better = True score_needs_proba = False + perfect_score = 1.0 def objective_function(self, y_true, y_predicted, X=None): return metrics.balanced_accuracy_score(y_true, y_predicted) @@ -56,6 +60,7 @@ class F1(BinaryClassificationObjective): name = "F1" greater_is_better = True score_needs_proba = False + perfect_score = 1.0 def objective_function(self, y_true, y_predicted, X=None): return metrics.f1_score(y_true, y_predicted, zero_division=0.0) @@ -66,6 +71,7 @@ class F1Micro(MulticlassClassificationObjective): name = "F1 Micro" greater_is_better = True score_needs_proba = False + perfect_score = 1.0 def objective_function(self, y_true, y_predicted, X=None): return metrics.f1_score(y_true, y_predicted, average='micro', zero_division=0.0) @@ -76,6 +82,7 @@ class F1Macro(MulticlassClassificationObjective): name = "F1 Macro" greater_is_better = True score_needs_proba = False + perfect_score = 1.0 def objective_function(self, y_true, y_predicted, X=None): return metrics.f1_score(y_true, y_predicted, average='macro', zero_division=0.0) @@ -86,6 +93,7 @@ class F1Weighted(MulticlassClassificationObjective): name = "F1 Weighted" greater_is_better = True score_needs_proba = False + perfect_score = 1.0 def objective_function(self, y_true, y_predicted, X=None): return metrics.f1_score(y_true, y_predicted, average='weighted', zero_division=0.0) @@ -96,6 +104,7 @@ class Precision(BinaryClassificationObjective): name = "Precision" greater_is_better = True score_needs_proba = False + perfect_score = 1.0 def objective_function(self, y_true, y_predicted, X=None): return metrics.precision_score(y_true, y_predicted, zero_division=0.0) @@ -106,6 +115,7 @@ class PrecisionMicro(MulticlassClassificationObjective): name = "Precision Micro" greater_is_better = True score_needs_proba = False + perfect_score = 1.0 def objective_function(self, y_true, y_predicted, X=None): return metrics.precision_score(y_true, y_predicted, average='micro', zero_division=0.0) @@ -116,6 +126,7 @@ class PrecisionMacro(MulticlassClassificationObjective): name = "Precision Macro" 
greater_is_better = True score_needs_proba = False + perfect_score = 1.0 def objective_function(self, y_true, y_predicted, X=None): return metrics.precision_score(y_true, y_predicted, average='macro', zero_division=0.0) @@ -126,6 +137,7 @@ class PrecisionWeighted(MulticlassClassificationObjective): name = "Precision Weighted" greater_is_better = True score_needs_proba = False + perfect_score = 1.0 def objective_function(self, y_true, y_predicted, X=None): return metrics.precision_score(y_true, y_predicted, average='weighted', zero_division=0.0) @@ -136,6 +148,7 @@ class Recall(BinaryClassificationObjective): name = "Recall" greater_is_better = True score_needs_proba = False + perfect_score = 1.0 def objective_function(self, y_true, y_predicted, X=None): return metrics.recall_score(y_true, y_predicted, zero_division=0.0) @@ -146,6 +159,7 @@ class RecallMicro(MulticlassClassificationObjective): name = "Recall Micro" greater_is_better = True score_needs_proba = False + perfect_score = 1.0 def objective_function(self, y_true, y_predicted, X=None): return metrics.recall_score(y_true, y_predicted, average='micro', zero_division=0.0) @@ -156,6 +170,7 @@ class RecallMacro(MulticlassClassificationObjective): name = "Recall Macro" greater_is_better = True score_needs_proba = False + perfect_score = 1.0 def objective_function(self, y_true, y_predicted, X=None): return metrics.recall_score(y_true, y_predicted, average='macro', zero_division=0.0) @@ -166,6 +181,7 @@ class RecallWeighted(MulticlassClassificationObjective): name = "Recall Weighted" greater_is_better = True score_needs_proba = False + perfect_score = 1.0 def objective_function(self, y_true, y_predicted, X=None): return metrics.recall_score(y_true, y_predicted, average='weighted', zero_division=0.0) @@ -176,6 +192,7 @@ class AUC(BinaryClassificationObjective): name = "AUC" greater_is_better = True score_needs_proba = True + perfect_score = 1.0 def objective_function(self, y_true, y_predicted, X=None): return metrics.roc_auc_score(y_true, y_predicted) @@ -186,6 +203,7 @@ class AUCMicro(MulticlassClassificationObjective): name = "AUC Micro" greater_is_better = True score_needs_proba = True + perfect_score = 1.0 def objective_function(self, y_true, y_predicted, X=None): y_true, y_predicted = _handle_predictions(y_true, y_predicted) @@ -197,6 +215,7 @@ class AUCMacro(MulticlassClassificationObjective): name = "AUC Macro" greater_is_better = True score_needs_proba = True + perfect_score = 1.0 def objective_function(self, y_true, y_predicted, X=None): y_true, y_predicted = _handle_predictions(y_true, y_predicted) @@ -208,6 +227,7 @@ class AUCWeighted(MulticlassClassificationObjective): name = "AUC Weighted" greater_is_better = True score_needs_proba = True + perfect_score = 1.0 def objective_function(self, y_true, y_predicted, X=None): y_true, y_predicted = _handle_predictions(y_true, y_predicted) @@ -219,6 +239,7 @@ class LogLossBinary(BinaryClassificationObjective): name = "Log Loss Binary" greater_is_better = False score_needs_proba = True + perfect_score = 0.0 def objective_function(self, y_true, y_predicted, X=None): return metrics.log_loss(y_true, y_predicted) @@ -229,6 +250,7 @@ class LogLossMulticlass(MulticlassClassificationObjective): name = "Log Loss Multiclass" greater_is_better = False score_needs_proba = True + perfect_score = 0.0 def objective_function(self, y_true, y_predicted, X=None): return metrics.log_loss(y_true, y_predicted) @@ -239,6 +261,7 @@ class MCCBinary(BinaryClassificationObjective): name = "MCC Binary" 
greater_is_better = True score_needs_proba = False + perfect_score = 1.0 def objective_function(self, y_true, y_predicted, X=None): with warnings.catch_warnings(): @@ -252,6 +275,7 @@ class MCCMulticlass(MulticlassClassificationObjective): name = "MCC Multiclass" greater_is_better = True score_needs_proba = False + perfect_score = 1.0 def objective_function(self, y_true, y_predicted, X=None): with warnings.catch_warnings(): @@ -265,6 +289,7 @@ class RootMeanSquaredError(RegressionObjective): name = "Root Mean Squared Error" greater_is_better = False score_needs_proba = False + perfect_score = 0.0 def objective_function(self, y_true, y_predicted, X=None): return metrics.mean_squared_error(y_true, y_predicted, squared=False) @@ -278,6 +303,7 @@ class RootMeanSquaredLogError(RegressionObjective): name = "Root Mean Squared Log Error" greater_is_better = False score_needs_proba = False + perfect_score = 0.0 def objective_function(self, y_true, y_predicted, X=None): return np.sqrt(metrics.mean_squared_log_error(y_true, y_predicted)) @@ -291,6 +317,7 @@ class MeanSquaredLogError(RegressionObjective): name = "Mean Squared Log Error" greater_is_better = False score_needs_proba = False + perfect_score = 0.0 def objective_function(self, y_true, y_predicted, X=None): return metrics.mean_squared_log_error(y_true, y_predicted) @@ -301,6 +328,7 @@ class R2(RegressionObjective): name = "R2" greater_is_better = True score_needs_proba = False + perfect_score = 1 def objective_function(self, y_true, y_predicted, X=None): return metrics.r2_score(y_true, y_predicted) @@ -311,6 +339,7 @@ class MAE(RegressionObjective): name = "MAE" greater_is_better = False score_needs_proba = False + perfect_score = 0.0 def objective_function(self, y_true, y_predicted, X=None): return metrics.mean_absolute_error(y_true, y_predicted) @@ -321,6 +350,7 @@ class MSE(RegressionObjective): name = "MSE" greater_is_better = False score_needs_proba = False + perfect_score = 0.0 def objective_function(self, y_true, y_predicted, X=None): return metrics.mean_squared_error(y_true, y_predicted) @@ -331,6 +361,7 @@ class MedianAE(RegressionObjective): name = "MedianAE" greater_is_better = False score_needs_proba = False + perfect_score = 0.0 def objective_function(self, y_true, y_predicted, X=None): return metrics.median_absolute_error(y_true, y_predicted) @@ -341,6 +372,7 @@ class MaxError(RegressionObjective): name = "MaxError" greater_is_better = False score_needs_proba = False + perfect_score = 0.0 def objective_function(self, y_true, y_predicted, X=None): return metrics.max_error(y_true, y_predicted) @@ -351,6 +383,7 @@ class ExpVariance(RegressionObjective): name = "ExpVariance" greater_is_better = True score_needs_proba = False + perfect_score = 1.0 def objective_function(self, y_true, y_predicted, X=None): return metrics.explained_variance_score(y_true, y_predicted) diff --git a/evalml/tests/automl_tests/test_automl.py b/evalml/tests/automl_tests/test_automl.py index e15b56597c..cb9581869b 100644 --- a/evalml/tests/automl_tests/test_automl.py +++ b/evalml/tests/automl_tests/test_automl.py @@ -1,4 +1,5 @@ import os +from itertools import product from unittest.mock import MagicMock, patch import cloudpickle @@ -19,6 +20,7 @@ from evalml.exceptions import AutoMLSearchException, PipelineNotFoundError from evalml.model_family import ModelFamily from evalml.objectives import FraudCost +from evalml.objectives.utils import OPTIONS from evalml.pipelines import ( BinaryClassificationPipeline, MulticlassClassificationPipeline, @@ -53,7 +55,8 @@ 
def test_search_results(X_y_regression, X_y_binary, X_y_multi, automl_type): assert automl.results['search_order'] == [0, 1] assert len(automl.results['pipeline_results']) == 2 for pipeline_id, results in automl.results['pipeline_results'].items(): - assert results.keys() == {'id', 'pipeline_name', 'pipeline_class', 'pipeline_summary', 'parameters', 'score', 'high_variance_cv', 'training_time', 'cv_data'} + assert results.keys() == {'id', 'pipeline_name', 'pipeline_class', 'pipeline_summary', 'parameters', 'score', 'high_variance_cv', 'training_time', + 'cv_data', 'percent_better_than_baseline'} assert results['id'] == pipeline_id assert isinstance(results['pipeline_name'], str) assert issubclass(results['pipeline_class'], expected_pipeline_class) @@ -75,11 +78,11 @@ def test_search_results(X_y_regression, X_y_binary, X_y_multi, automl_type): assert isinstance(automl.rankings, pd.DataFrame) assert isinstance(automl.full_rankings, pd.DataFrame) assert np.all(automl.rankings.dtypes == pd.Series( - [np.dtype('int64'), np.dtype('O'), np.dtype('float64'), np.dtype('bool'), np.dtype('O')], - index=['id', 'pipeline_name', 'score', 'high_variance_cv', 'parameters'])) + [np.dtype('int64'), np.dtype('O'), np.dtype('float64'), np.dtype('float64'), np.dtype('bool'), np.dtype('O')], + index=['id', 'pipeline_name', 'score', 'percent_better_than_baseline', 'high_variance_cv', 'parameters'])) assert np.all(automl.full_rankings.dtypes == pd.Series( - [np.dtype('int64'), np.dtype('O'), np.dtype('float64'), np.dtype('bool'), np.dtype('O')], - index=['id', 'pipeline_name', 'score', 'high_variance_cv', 'parameters'])) + [np.dtype('int64'), np.dtype('O'), np.dtype('float64'), np.dtype('float64'), np.dtype('bool'), np.dtype('O')], + index=['id', 'pipeline_name', 'score', 'percent_better_than_baseline', 'high_variance_cv', 'parameters'])) @pytest.mark.parametrize("automl_type", [ProblemTypes.BINARY, ProblemTypes.MULTICLASS, ProblemTypes.REGRESSION]) @@ -727,7 +730,8 @@ def test_no_search(): assert isinstance(automl.rankings, pd.DataFrame) assert isinstance(automl.full_rankings, pd.DataFrame) - df_columns = ["id", "pipeline_name", "score", "high_variance_cv", "parameters"] + df_columns = ["id", "pipeline_name", "score", "percent_better_than_baseline", + "high_variance_cv", "parameters"] assert (automl.rankings.columns == df_columns).all() assert (automl.full_rankings.columns == df_columns).all() @@ -954,3 +958,56 @@ def test_error_during_train_test_split(mock_fit, mock_score, mock_train_test_spl automl.search(X, y) for pipeline in automl.results['pipeline_results'].values(): assert np.isnan(pipeline['score']) + + +@pytest.mark.parametrize("objective_tuple,pipeline_scores,baseline_score", + product(OPTIONS.items(), + [(0.3, 0.4), (np.nan, 0.4), (0.3, np.nan), (np.nan, np.nan)], + [0.1, np.nan])) +def test_percent_better_than_baseline_in_rankings(objective_tuple, pipeline_scores, baseline_score, + dummy_binary_pipeline_class, dummy_multiclass_pipeline_class, + dummy_regression_pipeline_class, + X_y_binary): + + # Ok to only use binary labels since score and fit methods are mocked + X, y = X_y_binary + + name, objective = objective_tuple + + pipeline_class = {ProblemTypes.BINARY: dummy_binary_pipeline_class, + ProblemTypes.MULTICLASS: dummy_multiclass_pipeline_class, + ProblemTypes.REGRESSION: dummy_regression_pipeline_class}[objective.problem_type] + baseline_pipeline_class = {ProblemTypes.BINARY: "evalml.pipelines.ModeBaselineBinaryPipeline", + ProblemTypes.MULTICLASS: 
"evalml.pipelines.ModeBaselineMulticlassPipeline", + ProblemTypes.REGRESSION: "evalml.pipelines.MeanBaselineRegressionPipeline", + }[objective.problem_type] + + class DummyPipeline(pipeline_class): + problem_type = objective.problem_type + + def fit(self, *args, **kwargs): + """Mocking fit""" + + class Pipeline1(DummyPipeline): + name = "Pipeline1" + + class Pipeline2(DummyPipeline): + name = "Pipeline2" + + mock_score_1 = MagicMock(return_value={objective.name: pipeline_scores[0]}) + mock_score_2 = MagicMock(return_value={objective.name: pipeline_scores[1]}) + Pipeline1.score = mock_score_1 + Pipeline2.score = mock_score_2 + + automl = AutoMLSearch(problem_type=objective.problem_type, max_pipelines=3, + allowed_pipelines=[Pipeline1, Pipeline2], objective=name) + + with patch(baseline_pipeline_class + ".score", return_value={objective.name: baseline_score}): + automl.search(X, y, data_checks=None) + scores = dict(zip(automl.rankings.pipeline_name, automl.rankings.percent_better_than_baseline)) + baseline_name = next(name for name in automl.rankings.pipeline_name if name not in {"Pipeline1", "Pipeline2"}) + answers = {"Pipeline1": round(objective.calculate_percent_difference(pipeline_scores[0], baseline_score), 2), + "Pipeline2": round(objective.calculate_percent_difference(pipeline_scores[1], baseline_score), 2), + baseline_name: round(objective.calculate_percent_difference(baseline_score, baseline_score), 2)} + for name in answers: + np.testing.assert_almost_equal(scores[name], answers[name], decimal=3) diff --git a/evalml/tests/objective_tests/test_standard_metrics.py b/evalml/tests/objective_tests/test_standard_metrics.py index b220e53b05..b55933f3b6 100644 --- a/evalml/tests/objective_tests/test_standard_metrics.py +++ b/evalml/tests/objective_tests/test_standard_metrics.py @@ -1,4 +1,7 @@ +from itertools import product + import numpy as np +import pandas as pd import pytest from sklearn.metrics import matthews_corrcoef as sk_matthews_corrcoef @@ -16,6 +19,7 @@ MCCBinary, MCCMulticlass, MeanSquaredLogError, + ObjectiveBase, Precision, PrecisionMacro, PrecisionMicro, @@ -28,9 +32,12 @@ RootMeanSquaredLogError ) from evalml.objectives.utils import OPTIONS +from evalml.utils.gen_utils import _get_subclasses EPS = 1e-5 +all_objectives = _get_subclasses(ObjectiveBase) + def test_input_contains_nan(): y_predicted = np.array([np.nan, 0, 0]) @@ -403,3 +410,25 @@ def test_mcc_catches_warnings(): MCCBinary().objective_function(y_true, y_predicted) MCCMulticlass().objective_function(y_true, y_predicted) assert len(record) == 0 + + +@pytest.mark.parametrize("objective_class", all_objectives) +def test_calculate_percent_difference(objective_class): + score = 5 + reference_score = 10 + + change = ((-1) ** (not objective_class.greater_is_better) * (score - reference_score)) / reference_score + answer = 100 * change + + assert objective_class.calculate_percent_difference(score, reference_score) == answer + assert objective_class.perfect_score is not None + + +@pytest.mark.parametrize("objective_class,nan_value", product(all_objectives, [None, np.nan])) +def test_calculate_percent_difference_with_nan(objective_class, nan_value): + + assert pd.isna(objective_class.calculate_percent_difference(nan_value, 2)) + assert pd.isna(objective_class.calculate_percent_difference(-1, nan_value)) + assert pd.isna(objective_class.calculate_percent_difference(nan_value, nan_value)) + + assert pd.isna(objective_class.calculate_percent_difference(2, 0))