API for refitting pipelines on entire training data (#876)
* Add API for refitting pipelines on entire training data. Update get_pipelines to return an untrained pipeline copy instead of a CV-trained copy.

Refactor and improve AutoSearchBase init methods

Train a cloned copy of the pipeline, not the pipeline itself. Save the binary classification threshold in cv data

* Docs changes

* Changelog

* Update tests for fraud detection / lead scoring

* Add test coverage for max_time input case

* Update one broken test, add invalid get coverage

* Update a bunch of tests

* Add a search test, update others

* Add one more line to cover get_pipelines

* Fixed a bug in which baseline parameters are recorded

* Always set binary_classification_threshold in cv_data, default None. Pass random_state to new pipeline

* Check for binary_classification_threshold in cv_data

* Update changelog

* Add test coverage of describe_pipeline

* Expand invalid test for codecov

* Remove time for cross-platform matching

* Codecov

* Lint
dsherry committed Jun 30, 2020
1 parent cf6ad68 commit 704fe03
Showing 15 changed files with 227 additions and 68 deletions.
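
For reference, the end-to-end workflow these changes move the docs toward looks roughly like the sketch below. It assumes a binary classification problem and an in-memory feature matrix X and target y (illustrative, not part of this commit); the key point is that pipelines returned by automl are now untrained and must be refit before use.

from evalml import AutoMLSearch

# Run automl search; CV-trained copies are only used internally for scoring.
automl = AutoMLSearch(problem_type='binary', max_pipelines=5)
automl.search(X, y)

# best_pipeline / get_pipeline now return an untrained pipeline instance,
# so refit it on the entire training data before inspecting or predicting.
best_pipeline = automl.best_pipeline
best_pipeline.fit(X, y)
best_pipeline.feature_importance
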
1 change: 1 addition & 0 deletions docs/source/automl/overfitting_protection.ipynb
@@ -77,6 +77,7 @@
"\n",
"automl.search(X, y)\n",
"best_pipeline = automl.best_pipeline\n",
"best_pipeline.fit(X, y)\n",
"best_pipeline.feature_importance"
]
},
1 change: 1 addition & 0 deletions docs/source/automl/search_results.ipynb
@@ -111,6 +111,7 @@
"outputs": [],
"source": [
"pipeline = automl.get_pipeline(1)\n",
"pipeline.fit(X, y)\n",
"pipeline.feature_importance"
]
},
3 changes: 3 additions & 0 deletions docs/source/changelog.rst
@@ -23,6 +23,8 @@ Changelog
        * Added `default_parameters` class property to components and pipelines :pr:`879`
        * Added better support for disabling data checks in automl search :pr:`892`
        * Added ability to save and load AutoML objects to file :pr:`888`
        * Updated `AutoSearchBase.get_pipelines` to return an untrained pipeline instance :pr:`876`
        * Saved learned binary classification thresholds in automl results cv data dict :pr:`876`
    * Fixes
        * Fixed bug where SimpleImputer cannot handle dropped columns :pr:`846`
        * Fixed bug where PerColumnImputer cannot handle dropped columns :pr:`855`
@@ -53,6 +55,7 @@ Changelog
        * Pipelines' and classifiers' `feature_importances` is renamed `feature_importance`, `graph_feature_importances` is renamed `graph_feature_importance` :pr:`883`
        * Passing data_checks=None to automl search will now perform no data checks, as opposed to running the default checks :pr:`892`
        * Pipelines to search for in AutoML are now determined automatically, rather than using the statically-defined pipeline classes :pr:`870`
        * Updated `AutoSearchBase.get_pipelines` to return an untrained pipeline instance, instead of one which happened to be trained on the final cross-validation fold :pr:`876`


**v0.10.0 May 29, 2020**
1 change: 1 addition & 0 deletions docs/source/index.ipynb
@@ -160,6 +160,7 @@
"outputs": [],
"source": [
"pipeline = automl.best_pipeline\n",
"pipeline.fit(X_train, y_train)\n",
"pipeline.score(X_holdout, y_holdout, [\"f1\"])"
]
},
4 changes: 2 additions & 2 deletions evalml/automl/automl_algorithm/iterative_algorithm.py
@@ -1,5 +1,5 @@
import inspect
import operator
from operator import itemgetter

from .automl_algorithm import AutoMLAlgorithm, AutoMLAlgorithmException

@@ -46,7 +46,7 @@ def next_batch(self):
        if self._batch_number == 1:
            if len(self._first_batch_results) == 0:
                raise AutoMLAlgorithmException('No results were reported from the first batch')
            self._first_batch_results = sorted(self._first_batch_results, key=operator.itemgetter(0))
            self._first_batch_results = sorted(self._first_batch_results, key=itemgetter(0))

        next_batch = []
        if self._batch_number == 0:
56 changes: 33 additions & 23 deletions evalml/automl/automl_search.py
@@ -20,8 +20,10 @@
from evalml.automl.data_splitters import TrainingValidationSplit
from evalml.data_checks import DataChecks, DefaultDataChecks, EmptyDataChecks
from evalml.data_checks.data_check_message_type import DataCheckMessageType
from evalml.exceptions import PipelineNotFoundError
from evalml.objectives import get_objective, get_objectives
from evalml.pipelines import (
    BinaryClassificationPipeline,
    MeanBaselineRegressionPipeline,
    ModeBaselineBinaryPipeline,
    ModeBaselineMulticlassPipeline
@@ -170,7 +172,6 @@ def __init__(self,
            'pipeline_results': {},
            'search_order': []
        }
        self.trained_pipelines = {}
        self.random_state = get_random_state(random_state)
        self.n_jobs = n_jobs

@@ -470,7 +471,7 @@ def _add_baseline_pipelines(self, X, y, pbar, raise_errors=True):

        baseline_results = self._compute_cv_scores(baseline, X, y, raise_errors=raise_errors, pbar=pbar)
        self._add_result(trained_pipeline=baseline,
                         parameters=strategy_dict,
                         parameters=baseline.parameters,
                         training_time=baseline_results['training_time'],
                         cv_data=baseline_results['cv_data'],
                         cv_scores=baseline_results['cv_scores'])
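
This one-line fix matters beyond display: get_pipeline (further down in this diff) rebuilds pipelines from the recorded parameters, so the results dict must store something the pipeline class can actually consume. A rough sketch of the round trip, with an illustrative parameters value:

# What _add_result now records for the baseline ...
parameters = baseline.parameters  # e.g. {'Baseline Classifier': {'strategy': 'random_weighted'}}
# ... and how get_pipeline later rebuilds an untrained instance from it:
pipeline = pipeline_class(parameters, random_state=0)
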
@@ -500,17 +501,18 @@ def _compute_cv_scores(self, pipeline, X, y, raise_errors=True, pbar=None):

                if self.optimize_thresholds and self.objective.problem_type == ProblemTypes.BINARY and self.objective.can_optimize_threshold:
                    X_train, X_threshold_tuning, y_train, y_threshold_tuning = train_test_split(X_train, y_train, test_size=0.2, random_state=self.random_state)
                pipeline.fit(X_train, y_train)
                cv_pipeline = pipeline.clone()
                cv_pipeline.fit(X_train, y_train)
                if self.objective.problem_type == ProblemTypes.BINARY:
                    pipeline.threshold = 0.5
                    cv_pipeline.threshold = 0.5
                    if self.optimize_thresholds and self.objective.can_optimize_threshold:
                        y_predict_proba = pipeline.predict_proba(X_threshold_tuning)
                        y_predict_proba = cv_pipeline.predict_proba(X_threshold_tuning)
                        if isinstance(y_predict_proba, pd.DataFrame):
                            y_predict_proba = y_predict_proba.iloc[:, 1]
                        else:
                            y_predict_proba = y_predict_proba[:, 1]
                        pipeline.threshold = self.objective.optimize_threshold(y_predict_proba, y_threshold_tuning, X=X_threshold_tuning)
                scores = pipeline.score(X_test, y_test, objectives=objectives_to_score)
                        cv_pipeline.threshold = self.objective.optimize_threshold(y_predict_proba, y_threshold_tuning, X=X_threshold_tuning)
                scores = cv_pipeline.score(X_test, y_test, objectives=objectives_to_score)
                score = scores[self.objective.name]
            except Exception as e:
                logger.error("Exception during automl search: {}".format(str(e)))
@@ -525,7 +527,11 @@ def _compute_cv_scores(self, pipeline, X, y, raise_errors=True, pbar=None):
            ordered_scores.update(scores)
            ordered_scores.update({"# Training": len(y_train)})
            ordered_scores.update({"# Testing": len(y_test)})
            cv_data.append({"all_objective_scores": ordered_scores, "score": score})

            evaluation_entry = {"all_objective_scores": ordered_scores, "score": score, 'binary_classification_threshold': None}
            if isinstance(cv_pipeline, BinaryClassificationPipeline) and cv_pipeline.threshold is not None:
                evaluation_entry['binary_classification_threshold'] = cv_pipeline.threshold
            cv_data.append(evaluation_entry)

        training_time = time.time() - start
        cv_scores = pd.Series([fold['score'] for fold in cv_data])
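
With this change, every cv_data entry carries a binary_classification_threshold key, which stays None for non-binary problems. A minimal sketch of reading it back out after a binary search (the pipeline ID 0 is illustrative):

# Each fold's entry holds all objective scores, the primary objective's
# score, and the threshold learned on that fold (None for non-binary).
for fold in automl.results['pipeline_results'][0]['cv_data']:
    print(fold['score'], fold['binary_classification_threshold'])
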
@@ -552,16 +558,13 @@ def _add_result(self, trained_pipeline, parameters, training_time, cv_data, cv_scores):
"score": cv_score,
"high_variance_cv": high_variance_cv,
"training_time": training_time,
"cv_data": cv_data
"cv_data": cv_data,
}

self.results['search_order'].append(pipeline_id)

if self.add_result_callback:
self.add_result_callback(self.results['pipeline_results'][pipeline_id], trained_pipeline)

self._save_pipeline(pipeline_id, trained_pipeline)

    def _evaluate(self, pipeline, X, y, raise_errors=True, pbar=None):
        parameters = pipeline.parameters
        evaluation_results = self._compute_cv_scores(pipeline, X, y, raise_errors=raise_errors, pbar=pbar)
@@ -576,22 +579,25 @@ def _evaluate(self, pipeline, X, y, raise_errors=True, pbar=None):
        logger.debug('Adding results complete')
        return evaluation_results

    def _save_pipeline(self, pipeline_id, trained_pipeline):
        self.trained_pipelines[pipeline_id] = trained_pipeline

    def get_pipeline(self, pipeline_id):
        """Retrieves trained pipeline
    def get_pipeline(self, pipeline_id, random_state=0):
        """Given the ID of a pipeline training result, returns an untrained instance of the specified pipeline
        initialized with the parameters used to train that pipeline during automl search.
        Arguments:
            pipeline_id (int): pipeline to retrieve
            random_state (int, np.random.RandomState): The random seed/state. Defaults to 0.
        Returns:
            Pipeline: pipeline associated with id
            PipelineBase: untrained pipeline instance associated with the provided ID
        """
        if pipeline_id not in self.trained_pipelines:
            raise RuntimeError("Pipeline not found")

        return self.trained_pipelines[pipeline_id]
        pipeline_results = self.results['pipeline_results'].get(pipeline_id)
        if pipeline_results is None:
            raise PipelineNotFoundError("Pipeline not found in automl results")
        pipeline_class = pipeline_results.get('pipeline_class')
        parameters = pipeline_results.get('parameters')
        if pipeline_class is None or parameters is None:
            raise PipelineNotFoundError("Pipeline class or parameters not found in automl results")
        return pipeline_class(parameters, random_state=random_state)

    def describe_pipeline(self, pipeline_id, return_dict=False):
        """Describe a pipeline
@@ -689,7 +695,11 @@ def full_rankings(self):

    @property
    def best_pipeline(self):
        """Returns the best model found"""
        """Returns an untrained instance of the best pipeline and parameters found during automl search.
        Returns:
            PipelineBase: untrained pipeline instance associated with the best automl search result.
        """
        best = self.rankings.iloc[0]
        return self.get_pipeline(best["id"])

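
A short usage sketch of the reworked get_pipeline, assuming a completed binary search (the pipeline ID, random_state value, objective, and holdout split names are illustrative):

from evalml.exceptions import PipelineNotFoundError

try:
    # Rebuilds an untrained pipeline from the stored pipeline_class and
    # parameters; random_state seeds the new instance (defaults to 0).
    pipeline = automl.get_pipeline(1, random_state=42)
except PipelineNotFoundError:
    # Raised when the ID is missing from the results, or when the stored
    # entry lacks a pipeline class or parameters.
    raise

# Refit on the entire training data, then score on a holdout set.
pipeline.fit(X_train, y_train)
pipeline.score(X_holdout, y_holdout, ["f1"])
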
2 changes: 1 addition & 1 deletion evalml/exceptions/__init__.py
@@ -1,2 +1,2 @@
# flake8:noqa
from .exceptions import MethodPropertyNotFoundError, ObjectiveNotFoundError, IllFormattedClassNameError, MissingComponentError
from .exceptions import *
7 changes: 6 additions & 1 deletion evalml/exceptions/exceptions.py
@@ -3,6 +3,11 @@ class MethodPropertyNotFoundError(Exception):
    pass


class PipelineNotFoundError(Exception):
"""An exception raised when a particular pipeline is not found in automl search results"""
pass


class ObjectiveNotFoundError(Exception):
"""Exception to raise when specified objective does not exist."""
pass
@@ -14,5 +19,5 @@ class IllFormattedClassNameError(Exception):


class MissingComponentError(Exception):
"""An exception thrown when a component is not found in all_components()"""
"""An exception raised when a component is not found in all_components()"""
pass
126 changes: 123 additions & 3 deletions evalml/tests/automl_tests/test_automl.py
@@ -14,40 +14,105 @@
    DataChecks,
    DataCheckWarning
)
from evalml.exceptions import PipelineNotFoundError
from evalml.model_family import ModelFamily
from evalml.objectives import FraudCost
from evalml.pipelines import BinaryClassificationPipeline
from evalml.pipelines import (
    BinaryClassificationPipeline,
    MulticlassClassificationPipeline,
    RegressionPipeline
)
from evalml.pipelines.utils import get_estimators, get_pipelines, make_pipeline
from evalml.problem_types import ProblemTypes
from evalml.tuners import NoParamsException, RandomSearchTuner


def test_pipeline_limits(caplog, X_y):
@pytest.mark.parametrize("automl_type", [ProblemTypes.REGRESSION, ProblemTypes.BINARY, ProblemTypes.MULTICLASS])
def test_search_results(X_y_reg, X_y, X_y_multi, automl_type):
    expected_cv_data_keys = {'all_objective_scores', 'score', 'binary_classification_threshold'}
    automl = AutoMLSearch(problem_type=automl_type, max_pipelines=2)
    if automl_type == ProblemTypes.REGRESSION:
        expected_pipeline_class = RegressionPipeline
        X, y = X_y_reg
    elif automl_type == ProblemTypes.BINARY:
        expected_pipeline_class = BinaryClassificationPipeline
        X, y = X_y
    elif automl_type == ProblemTypes.MULTICLASS:
        expected_pipeline_class = MulticlassClassificationPipeline
        X, y = X_y_multi

    automl.search(X, y)
    assert automl.results.keys() == {'pipeline_results', 'search_order'}
    assert automl.results['search_order'] == [0, 1]
    assert len(automl.results['pipeline_results']) == 2
    for pipeline_id, results in automl.results['pipeline_results'].items():
        assert results.keys() == {'id', 'pipeline_name', 'pipeline_class', 'pipeline_summary', 'parameters', 'score', 'high_variance_cv', 'training_time', 'cv_data'}
        assert results['id'] == pipeline_id
        assert isinstance(results['pipeline_name'], str)
        assert issubclass(results['pipeline_class'], expected_pipeline_class)
        assert isinstance(results['pipeline_summary'], str)
        assert isinstance(results['parameters'], dict)
        assert isinstance(results['score'], float)
        assert isinstance(results['high_variance_cv'], np.bool_)
        assert isinstance(results['cv_data'], list)
        for cv_result in results['cv_data']:
            assert cv_result.keys() == expected_cv_data_keys
            if automl_type == ProblemTypes.BINARY:
                assert isinstance(cv_result['binary_classification_threshold'], float)
            else:
                assert cv_result['binary_classification_threshold'] is None
        assert automl.get_pipeline(pipeline_id).parameters == results['parameters']
    assert isinstance(automl.rankings, pd.DataFrame)
    assert isinstance(automl.full_rankings, pd.DataFrame)
    assert np.all(automl.rankings.dtypes == pd.Series(
        [np.dtype('int64'), np.dtype('O'), np.dtype('float64'), np.dtype('bool'), np.dtype('O')],
        index=['id', 'pipeline_name', 'score', 'high_variance_cv', 'parameters']))
    assert np.all(automl.full_rankings.dtypes == pd.Series(
        [np.dtype('int64'), np.dtype('O'), np.dtype('float64'), np.dtype('bool'), np.dtype('O')],
        index=['id', 'pipeline_name', 'score', 'high_variance_cv', 'parameters']))


@patch('evalml.pipelines.BinaryClassificationPipeline.score')
@patch('evalml.pipelines.BinaryClassificationPipeline.fit')
def test_pipeline_limits(mock_fit, mock_score, caplog, X_y):
    X, y = X_y
    mock_score.return_value = {'Log Loss Binary': 1.0}

    automl = AutoMLSearch(problem_type='binary', max_pipelines=1)
    automl.search(X, y)
    out = caplog.text
    assert "Searching up to 1 pipelines. " in out
    assert len(automl.results['pipeline_results']) == 1

    caplog.clear()
    automl = AutoMLSearch(problem_type='binary', max_time=1)
    automl.search(X, y)
    out = caplog.text
    assert "Will stop searching for new pipelines after 1 seconds" in out
    assert len(automl.results['pipeline_results']) >= 1

    caplog.clear()
    automl = AutoMLSearch(problem_type='multiclass', max_time=1e-16)
    automl.search(X, y)
    out = caplog.text
    assert "Will stop searching for new pipelines after 0 seconds" in out
    # search will always run at least one pipeline
    assert len(automl.results['pipeline_results']) >= 1

    caplog.clear()
    automl = AutoMLSearch(problem_type='binary', max_time=1, max_pipelines=5)
    automl.search(X, y)
    out = caplog.text
    assert "Searching up to 5 pipelines. " in out
    assert "Will stop searching for new pipelines after 1 seconds" in out
    assert len(automl.results['pipeline_results']) <= 5

    caplog.clear()
    automl = AutoMLSearch(problem_type='binary')
    automl.search(X, y)
    out = caplog.text
    assert "Using default limit of max_pipelines=5." in out
    assert len(automl.results['pipeline_results']) <= 5


def test_search_order(X_y):
@@ -434,7 +499,8 @@ def test_automl_serialization(X_y, tmpdir):
    automl.save(path)
    loaded_automl = automl.load(path)
    for i in range(num_max_pipelines):
        assert automl.get_pipeline(i).score(X, y, ['precision']) == loaded_automl.get_pipeline(i).score(X, y, ['precision'])
        assert automl.get_pipeline(i).__class__ == loaded_automl.get_pipeline(i).__class__
        assert automl.get_pipeline(i).parameters == loaded_automl.get_pipeline(i).parameters
    assert automl.results == loaded_automl.results
    pd.testing.assert_frame_equal(automl.rankings, loaded_automl.rankings)

@@ -626,3 +692,57 @@ class CoolBinaryClassificationPipeline(dummy_binary_pipeline_class):
    automl.add_to_rankings(test_pipeline_trained, X, y)

    assert list(automl.rankings['score'].values).count(0.1234) == 2


@patch('evalml.pipelines.BinaryClassificationPipeline.score')
@patch('evalml.pipelines.BinaryClassificationPipeline.fit')
def test_get_pipeline_invalid(mock_fit, mock_score, X_y):
    X, y = X_y
    mock_score.return_value = {'Log Loss Binary': 1.0}

    automl = AutoMLSearch(problem_type='binary')
    with pytest.raises(PipelineNotFoundError, match="Pipeline not found in automl results"):
        automl.get_pipeline(1000)

    automl = AutoMLSearch(problem_type='binary', max_pipelines=1)
    automl.search(X, y)
    assert automl.get_pipeline(0).name == 'Mode Baseline Binary Classification Pipeline'
    automl.results['pipeline_results'][0].pop('pipeline_class')
    with pytest.raises(PipelineNotFoundError, match="Pipeline class or parameters not found in automl results"):
        automl.get_pipeline(0)

    automl = AutoMLSearch(problem_type='binary', max_pipelines=1)
    automl.search(X, y)
    assert automl.get_pipeline(0).name == 'Mode Baseline Binary Classification Pipeline'
    automl.results['pipeline_results'][0].pop('parameters')
    with pytest.raises(PipelineNotFoundError, match="Pipeline class or parameters not found in automl results"):
        automl.get_pipeline(0)


@patch('evalml.pipelines.BinaryClassificationPipeline.score')
@patch('evalml.pipelines.BinaryClassificationPipeline.fit')
def test_describe_pipeline(mock_fit, mock_score, caplog, X_y):
    X, y = X_y
    mock_score.return_value = {'Log Loss Binary': 1.0}

    automl = AutoMLSearch(problem_type='binary', max_pipelines=1)
    automl.search(X, y)
    out = caplog.text
    assert "Searching up to 1 pipelines. " in out

    assert len(automl.results['pipeline_results']) == 1
    caplog.clear()
    automl.describe_pipeline(0)
    out = caplog.text
    assert "Mode Baseline Binary Classification Pipeline" in out
    assert "Problem Type: Binary Classification" in out
    assert "Model Family: Baseline" in out
    assert "* strategy : random_weighted" in out
    assert "Total training time (including CV): " in out
    assert """Log Loss Binary # Training # Testing
0 1.000 66.000 34.000
1 1.000 67.000 33.000
2 1.000 67.000 33.000
mean 1.000 - -
std 0.000 - -
coef of var 0.000 - -""" in out

