Change score output to return one dictionary (#429)
angela97lin committed Mar 4, 2020
1 parent 5fdbdd7 commit 1ca6769
Showing 12 changed files with 61 additions and 45 deletions.
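As a quick orientation before the file-by-file diff: score() previously returned the first objective's value plus a separate dictionary of the remaining scores, and after this commit it returns a single ordered dictionary keyed by objective name. Below is a minimal sketch of the new calling pattern, reusing the constructor arguments that the tests in this commit use; X, y, and the resulting numbers are placeholders, not output from this change.

import numpy as np
from evalml.objectives import Precision, Recall
from evalml.pipelines import LogisticRegressionBinaryPipeline

X = np.random.random((100, 5))        # placeholder feature matrix
y = np.random.randint(2, size=100)    # placeholder binary labels

clf = LogisticRegressionBinaryPipeline(penalty='l2', C=1.0, impute_strategy='mean',
                                       number_features=X.shape[1], random_state=0)
clf.fit(X, y)

# Before this commit: a (score, other_scores) tuple
# score, other_scores = clf.score(X, y, objectives=[Precision(), Recall()])

# After this commit: one ordered dictionary keyed by objective name
scores = clf.score(X, y, objectives=[Precision(), Recall()])
precision = scores[Precision().name]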
2 changes: 2 additions & 0 deletions docs/source/changelog.rst
@@ -20,6 +20,7 @@ Changelog
* Downloaded demo and test datasets so unit tests can run offline :pr:`408`
* Remove `_needs_fitting` attribute from Components :pr:`398`
* Changed plot.feature_importance to show only non-zero feature importances by default, added optional parameter to show all :pr:`413`
* Changed the output of `score` to return one dictionary :pr:`429`
* Documentation Changes
* Update release.md with instructions to release to internal license key :pr:`354`
* Testing Changes
@@ -37,6 +38,7 @@ Changelog
* Pipelines will now no longer take an objective parameter during instantiation, and will no longer have an objective attribute.
* ``fit()`` and ``predict()`` now use an optional ``objective`` parameter, which is only used in binary classification pipelines to fit for a specific objective.
* ``score()`` will now use a required ``objectives`` parameter that is used to determine all the objectives to score on. This differs from the previous behavior, where the pipeline's objective was scored on regardless.
* ``score()`` will now return one dictionary of all objective scores.
**v0.6.0 Dec. 16, 2019**
* Enhancements
* Added ability to create a plot of feature importances :pr:`133`
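The breaking-change entries above correspond, roughly, to the calling convention sketched here; it continues from the X, y, and import lines in the snippet near the top of this page, and Precision is used as the fit/predict objective purely for illustration.

# Pipelines no longer take an objective at instantiation
clf = LogisticRegressionBinaryPipeline(penalty='l2', C=1.0, impute_strategy='mean',
                                       number_features=X.shape[1], random_state=0)

# fit() and predict() accept an optional objective, used only by binary classification pipelines
clf.fit(X, y, Precision())
y_pred = clf.predict(X, Precision())

# score() requires an objectives list and returns one ordered dictionary of all requested scores
scores = clf.score(X, y, objectives=[Precision(), Recall()])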
7 changes: 4 additions & 3 deletions evalml/automl/auto_base.py
@@ -259,18 +259,19 @@ def _do_iteration(self, X, y, pbar, raise_errors):
objectives_to_score = [self.objective] + self.additional_objectives
try:
pipeline.fit(X_train, y_train, self.objective)
score, other_scores = pipeline.score(X_test, y_test, objectives=objectives_to_score)
scores = pipeline.score(X_test, y_test, objectives=objectives_to_score)
score = scores[self.objective.name]
plot_data.append(pipeline.get_plot_data(X_test, y_test, self.plot_metrics))
except Exception as e:
if raise_errors:
raise e
if pbar:
pbar.write(str(e))
score = np.nan
other_scores = OrderedDict(zip([n.name for n in self.additional_objectives], [np.nan] * len(self.additional_objectives)))
scores = OrderedDict(zip([n.name for n in self.additional_objectives], [np.nan] * len(self.additional_objectives)))
ordered_scores = OrderedDict()
ordered_scores.update({self.objective.name: score})
ordered_scores.update(other_scores)
ordered_scores.update(scores)
ordered_scores.update({"# Training": len(y_train)})
ordered_scores.update({"# Testing": len(y_test)})
cv_data.append({"all_objective_scores": ordered_scores, "score": score})
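For reference, the OrderedDict assembled in _do_iteration above ends up with the primary objective first, then every score returned by pipeline.score(), then the fold sizes. A small runnable sketch of that shape; the objective names and numeric values are illustrative only.

from collections import OrderedDict

ordered_scores = OrderedDict()
ordered_scores.update({"Precision": 0.97})                    # primary objective (np.nan if the fold errored)
ordered_scores.update({"Precision": 0.97, "Recall": 0.95})    # full scores dict from pipeline.score()
ordered_scores.update({"# Training": 75})
ordered_scores.update({"# Testing": 25})
cv_data = [{"all_objective_scores": ordered_scores, "score": 0.97}]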
20 changes: 7 additions & 13 deletions evalml/pipelines/binary_classification_pipeline.py
@@ -85,7 +85,7 @@ def score(self, X, y, objectives):
objectives (list): list of objectives to score
Returns:
float, dict: score, ordered dictionary of other objective scores
dict: ordered dictionary of objective scores
"""
if not isinstance(X, pd.DataFrame):
X = pd.DataFrame(X)
@@ -97,7 +97,7 @@ def score(self, X, y, objectives):
y_predicted = None
y_predicted_proba = None

scores = []
scores = OrderedDict()
for objective in objectives:
if objective.score_needs_proba:
if y_predicted_proba is None:
@@ -110,15 +110,10 @@ def score(self, X, y, objectives):
y_predictions = y_predicted

if objective.uses_extra_columns:
scores.append(objective.score(y_predictions, y, X))
scores.update({objective.name: objective.score(y_predictions, y, X)})
else:
scores.append(objective.score(y_predictions, y))
if not objectives:
return scores[0], {}

other_scores = OrderedDict(zip([n.name for n in objectives[1:]], scores[1:]))

return scores[0], other_scores
scores.update({objective.name: objective.score(y_predictions, y)})
return scores

def get_plot_data(self, X, y, plot_metrics):
"""Generates plotting data for the pipeline for each specified plot metric
@@ -138,7 +133,7 @@ def get_plot_data(self, X, y, plot_metrics):
y = pd.Series(y)
y_predicted = None
y_predicted_proba = None
scores = []
scores = OrderedDict()
for plot_metric in plot_metrics:
if plot_metric.score_needs_proba:
if y_predicted_proba is None:
@@ -149,6 +144,5 @@ def get_plot_data(self, X, y, plot_metrics):
if y_predicted is None:
y_predicted = self.predict(X)
y_predictions = y_predicted
scores.append(plot_metric.score(y_predictions, y))
scores = OrderedDict(zip([n.name for n in plot_metrics], scores))
scores.update({plot_metric.name: plot_metric.score(y_predictions, y)})
return scores
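get_plot_data gets the same treatment as score(): results are keyed by each plot metric's name instead of being collected into a positional list. A rough sketch of the new return shape, reusing the fitted clf from the first snippet; Precision stands in for a real plot metric only for illustration, since any object exposing name, score, and score_needs_proba takes the same path through the loop above.

plot_data = clf.get_plot_data(X, y, plot_metrics=[Precision()])
value = plot_data[Precision().name]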
19 changes: 7 additions & 12 deletions evalml/pipelines/pipeline_base.py
@@ -178,7 +178,7 @@ def score(self, X, y, objectives):
objectives (list): Non-empty list of objectives to score on
Returns:
float, dict: score, ordered dictionary of other objective scores
dict: ordered dictionary of objective scores
"""
if not isinstance(X, pd.DataFrame):
X = pd.DataFrame(X)
@@ -190,7 +190,7 @@ def score(self, X, y, objectives):
y_predicted = None
y_predicted_proba = None

scores = []
scores = OrderedDict()
for objective in objectives:
if objective.score_needs_proba:
if y_predicted_proba is None:
@@ -202,15 +202,11 @@ def score(self, X, y, objectives):
y_predictions = y_predicted

if objective.uses_extra_columns:
scores.append(objective.score(y_predictions, y, X))
scores.update({objective.name: objective.score(y_predictions, y, X)})
else:
scores.append(objective.score(y_predictions, y))
if not objectives:
return scores[0], {}
scores.update({objective.name: objective.score(y_predictions, y)})

other_scores = OrderedDict(zip([n.name for n in objectives[1:]], scores[1:]))

return scores[0], other_scores
return scores

def get_plot_data(self, X, y, plot_metrics):
"""Generates plotting data for the pipeline for each specified plot metric
@@ -230,7 +226,7 @@ def get_plot_data(self, X, y, plot_metrics):
y = pd.Series(y)
y_predicted = None
y_predicted_proba = None
scores = []
scores = OrderedDict()
for plot_metric in plot_metrics:
if plot_metric.score_needs_proba:
if y_predicted_proba is None:
@@ -240,8 +236,7 @@ def get_plot_data(self, X, y, plot_metrics):
if y_predicted is None:
y_predicted = self.predict(X)
y_predictions = y_predicted
scores.append(plot_metric.score(y_predictions, y))
scores = OrderedDict(zip([n.name for n in plot_metrics], scores))
scores.update({plot_metric.name: plot_metric.score(y_predictions, y)})
return scores

@property
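The test updates that follow all make the same mechanical change: index the returned dictionary by the objective's name instead of unpacking a tuple or taking element [0]. A hedged sketch using the multiclass pipeline those tests exercise; the data is a placeholder, and PrecisionMicro is assumed to be importable from evalml.objectives, as the tests' usage suggests.

import numpy as np
from evalml.objectives import PrecisionMicro
from evalml.pipelines import LogisticRegressionMulticlassPipeline

X = np.random.random((120, 5))        # placeholder multiclass data
y = np.random.randint(3, size=120)    # three classes, as in the tests below

objective = PrecisionMicro()
clf = LogisticRegressionMulticlassPipeline(penalty='l2', C=1.0, impute_strategy='mean',
                                           number_features=X.shape[1], random_state=0)
clf.fit(X, y)
clf_scores = clf.score(X, y, [objective])
precision_micro = clf_scores[objective.name]    # formerly clf_score[0]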
@@ -33,11 +33,11 @@ def test_catboost_multi(X_y_multi):
clf = CatBoostMulticlassClassificationPipeline(impute_strategy='mean', n_estimators=1000, bootstrap_type='Bayesian',
number_features=X.shape[1], eta=0.03, max_depth=6, random_state=0)
clf.fit(X, y)
clf_score = clf.score(X, y, [objective])
clf_scores = clf.score(X, y, [objective])
y_pred = clf.predict(X)

assert((y_pred == sk_pipeline.predict(X)).all())
assert (sk_score == clf_score[0])
assert (sk_score == clf_scores[objective.name])
assert len(np.unique(y_pred)) == 3
assert len(clf.feature_importances) == len(X[0])
assert not clf.feature_importances.isnull().all().all()
@@ -43,10 +43,10 @@ def test_lor_multi(X_y_multi):
objective = PrecisionMicro()
clf = LogisticRegressionMulticlassPipeline(penalty='l2', C=1.0, impute_strategy='mean', number_features=len(X[0]), random_state=0)
clf.fit(X, y)
clf_score = clf.score(X, y, [objective])
clf_scores = clf.score(X, y, [objective])
y_pred = clf.predict(X)
assert((y_pred == sk_pipeline.predict(X)).all())
assert (sk_score == clf_score[0])
assert (sk_score == clf_scores[objective.name])
assert len(np.unique(y_pred)) == 3
assert len(clf.feature_importances) == len(X[0])
assert not clf.feature_importances.isnull().all().all()
@@ -46,11 +46,11 @@ def test_rf_multi(X_y_multi):
objective = PrecisionMicro()
clf = RFMulticlassClassificationPipeline(n_estimators=10, max_depth=3, impute_strategy='mean', percent_features=1.0, number_features=len(X[0]))
clf.fit(X, y)
clf_score = clf.score(X, y, [objective])
clf_scores = clf.score(X, y, [objective])
y_pred = clf.predict(X)

assert((y_pred == sk_pipeline.predict(X)).all())
assert (sk_score == clf_score[0])
assert (sk_score == clf_scores[objective.name])
assert len(np.unique(y_pred)) == 3
assert len(clf.feature_importances) == len(X[0])
assert not clf.feature_importances.isnull().all().all()
@@ -47,11 +47,11 @@ def test_xg_multi(X_y_multi):
objective = PrecisionMicro()
clf = XGBoostMulticlassPipeline(eta=0.1, min_child_weight=1, max_depth=3, impute_strategy='mean', percent_features=1.0, number_features=len(X[0]), n_estimators=10)
clf.fit(X, y)
clf_score = clf.score(X, y, [objective])
clf_scores = clf.score(X, y, [objective])
y_pred = clf.predict(X)

assert((y_pred == sk_pipeline.predict(X)).all())
assert (sk_score == clf_score[0])
assert (sk_score == clf_scores[objective.name])
assert len(np.unique(y_pred)) == 3
assert len(clf.feature_importances) == len(X[0])
assert not clf.feature_importances.isnull().all().all()
@@ -29,11 +29,16 @@ def test_catboost_regression(X_y_reg):
clf = CatBoostRegressionPipeline(n_estimators=1000, eta=0.03, number_features=X.shape[1],
bootstrap_type='Bayesian', max_depth=6, impute_strategy='mean', random_state=0)
clf.fit(X, y)
clf_score = clf.score(X, y, [objective])
clf_scores = clf.score(X, y, [objective])
y_pred = clf.predict(X)

np.testing.assert_almost_equal(y_pred, sk_pipeline.predict(X), decimal=5)
np.testing.assert_almost_equal(sk_score, clf_score[0], decimal=5)
np.testing.assert_almost_equal(sk_score, clf_scores[objective.name], decimal=5)

# testing objective parameter passed in does not change results
clf.fit(X, y, objective)
y_pred_with_objective = clf.predict(X, objective)
np.testing.assert_almost_equal(y_pred, y_pred_with_objective, decimal=5)

# testing objective parameter passed in does not change results
clf.fit(X, y, objective)
@@ -40,11 +40,11 @@ def test_linear_regression(X_y_categorical_regression):
random_state=0,
n_jobs=-1)
clf.fit(X, y)
clf_score = clf.score(X, y, [objective])
clf_scores = clf.score(X, y, [objective])
y_pred = clf.predict(X)

np.testing.assert_almost_equal(y_pred, sk_pipeline.predict(X), decimal=5)
np.testing.assert_almost_equal(sk_score, clf_score[0], decimal=5)
np.testing.assert_almost_equal(sk_score, clf_scores[objective.name], decimal=5)
assert not clf.feature_importances.isnull().all().all()

# testing objective parameter passed in does not change results
@@ -43,10 +43,15 @@ def test_rf_regression(X_y_categorical_regression):
clf = RFRegressionPipeline(n_estimators=10, max_depth=3, impute_strategy='mean', percent_features=1.0, number_features=X.shape[1])

clf.fit(X, y)
clf_score = clf.score(X, y, [objective])
clf_scores = clf.score(X, y, [objective])
y_pred = clf.predict(X)
np.testing.assert_almost_equal(y_pred, sk_pipeline.predict(X), decimal=5)
np.testing.assert_almost_equal(sk_score, clf_score[0], decimal=5)
np.testing.assert_almost_equal(sk_score, clf_scores[objective.name], decimal=5)

# testing objective parameter passed in does not change results
clf.fit(X, y, objective)
y_pred_with_objective = clf.predict(X, objective)
np.testing.assert_almost_equal(y_pred, y_pred_with_objective, decimal=5)

# testing objective parameter passed in does not change results
clf.fit(X, y, objective)
20 changes: 17 additions & 3 deletions evalml/tests/pipeline_tests/test_pipelines.py
@@ -1,9 +1,10 @@
import os

import numpy as np
import pytest

from evalml.model_types import ModelTypes
from evalml.objectives import FraudCost, Precision
from evalml.objectives import FraudCost, Precision, Recall
from evalml.pipelines import (
LogisticRegressionBinaryPipeline,
LogisticRegressionMulticlassPipeline,
@@ -181,8 +182,21 @@ def test_score_with_empty_list_of_objectives(X_y):
X, y = X_y
clf = LogisticRegressionBinaryPipeline(penalty='l2', C=1.0, impute_strategy='mean', number_features=len(X[0]), random_state=0)
clf.fit(X, y)
with pytest.raises(IndexError):
clf.score(X, y, [])
scores = clf.score(X, y, [])
assert len(scores.values()) == 0


def test_score_with_list_of_multiple_objectives(X_y):
X, y = X_y
clf = LogisticRegressionBinaryPipeline(penalty='l2', C=1.0, impute_strategy='mean', number_features=len(X[0]), random_state=0)
clf.fit(X, y)
recall_name = Recall().name
precision_name = Precision().name
objective_names = [recall_name, precision_name]
scores = clf.score(X, y, objective_names)
assert len(scores.values()) == 2
assert all(name in scores.keys() for name in objective_names)
assert not any(np.isnan(val) for val in scores.values())


def test_n_jobs(X_y):
