API for refitting pipelines on entire training data (#876)
* Add API for refitting pipelines on entire training data. Update get_pipelines to return an untrained pipeline copy instead of a CV-trained copy.

Refactor and improve AutoSearchBase init methods

Train a cloned copy of the pipeline, not the pipeline itself. Save the binary classification threshold in cv data

* Docs changes

* Changelog

* Update tests for fraud detection / lead scoring

* Add test coverage for max_time input case

* Update one broken test, add invalid get coverage

* Update a bunch of tests

* Add a search test, update others

* Add one more line to cover get_pipelines

* Fixed a bug in which baseline parameters are recorded

* Always set binary_classification_threshold in cv_data, default None. Pass random_state to new pipeline

* Check for binary_classification_threshold in cv_data

* Update changelog

* Add test coverage of describe_pipeline

* Expand invalid test for codecov

* Remove time for cross-platform matching

* Codecov

* Lint
dsherry committed Jun 30, 2020
1 parent cf6ad68 commit 704fe03
Showing 15 changed files with 227 additions and 68 deletions.
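
For reference, the end-to-end workflow these changes move the docs toward looks roughly like the sketch below. It assumes a binary classification problem and an in-memory feature matrix X and target y (illustrative, not part of this commit); the key point is that pipelines returned by automl are now untrained and must be refit before use.

from evalml import AutoMLSearch

# Run automl search; CV-trained copies are only used internally for scoring.
automl = AutoMLSearch(problem_type='binary', max_pipelines=5)
automl.search(X, y)

# best_pipeline / get_pipeline now return an untrained pipeline instance,
# so refit it on the entire training data before inspecting or predicting.
best_pipeline = automl.best_pipeline
best_pipeline.fit(X, y)
best_pipeline.feature_importance
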
1 change: 1 addition & 0 deletions docs/source/automl/overfitting_protection.ipynb
@@ -77,6 +77,7 @@
"\n",
"automl.search(X, y)\n",
"best_pipeline = automl.best_pipeline\n",
"best_pipeline.fit(X, y)\n",
"best_pipeline.feature_importance"
]
},
1 change: 1 addition & 0 deletions docs/source/automl/search_results.ipynb
@@ -111,6 +111,7 @@
"outputs": [],
"source": [
"pipeline = automl.get_pipeline(1)\n",
"pipeline.fit(X, y)\n",
"pipeline.feature_importance"
]
},
3 changes: 3 additions & 0 deletions docs/source/changelog.rst
@@ -23,6 +23,8 @@ Changelog
        * Added `default_parameters` class property to components and pipelines :pr:`879`
        * Added better support for disabling data checks in automl search :pr:`892`
        * Added ability to save and load AutoML objects to file :pr:`888`
        * Updated `AutoSearchBase.get_pipelines` to return an untrained pipeline instance :pr:`876`
        * Saved learned binary classification thresholds in automl results cv data dict :pr:`876`
    * Fixes
        * Fixed bug where SimpleImputer cannot handle dropped columns :pr:`846`
        * Fixed bug where PerColumnImputer cannot handle dropped columns :pr:`855`
@@ -53,6 +55,7 @@ Changelog
        * Pipelines' and classifiers' `feature_importances` is renamed `feature_importance`, `graph_feature_importances` is renamed `graph_feature_importance` :pr:`883`
        * Passing data_checks=None to automl search will now perform no data checks, as opposed to running the default checks :pr:`892`
        * Pipelines to search for in AutoML are now determined automatically, rather than using the statically-defined pipeline classes :pr:`870`
        * Updated `AutoSearchBase.get_pipelines` to return an untrained pipeline instance, instead of one which happened to be trained on the final cross-validation fold :pr:`876`


**v0.10.0 May 29, 2020**
1 change: 1 addition & 0 deletions docs/source/index.ipynb
@@ -160,6 +160,7 @@
"outputs": [],
"source": [
"pipeline = automl.best_pipeline\n",
"pipeline.fit(X_train, y_train)\n",
"pipeline.score(X_holdout, y_holdout, [\"f1\"])"
]
},
4 changes: 2 additions & 2 deletions evalml/automl/automl_algorithm/iterative_algorithm.py
@@ -1,5 +1,5 @@
import inspect
import operator
from operator import itemgetter

from .automl_algorithm import AutoMLAlgorithm, AutoMLAlgorithmException

@@ -46,7 +46,7 @@ def next_batch(self):
        if self._batch_number == 1:
            if len(self._first_batch_results) == 0:
                raise AutoMLAlgorithmException('No results were reported from the first batch')
            self._first_batch_results = sorted(self._first_batch_results, key=operator.itemgetter(0))
            self._first_batch_results = sorted(self._first_batch_results, key=itemgetter(0))

        next_batch = []
        if self._batch_number == 0:
56 changes: 33 additions & 23 deletions evalml/automl/automl_search.py
@@ -20,8 +20,10 @@
from evalml.automl.data_splitters import TrainingValidationSplit
from evalml.data_checks import DataChecks, DefaultDataChecks, EmptyDataChecks
from evalml.data_checks.data_check_message_type import DataCheckMessageType
from evalml.exceptions import PipelineNotFoundError
from evalml.objectives import get_objective, get_objectives
from evalml.pipelines import (
    BinaryClassificationPipeline,
    MeanBaselineRegressionPipeline,
    ModeBaselineBinaryPipeline,
    ModeBaselineMulticlassPipeline
@@ -170,7 +172,6 @@ def __init__(self,
            'pipeline_results': {},
            'search_order': []
        }
        self.trained_pipelines = {}
        self.random_state = get_random_state(random_state)
        self.n_jobs = n_jobs

@@ -470,7 +471,7 @@ def _add_baseline_pipelines(self, X, y, pbar, raise_errors=True):

        baseline_results = self._compute_cv_scores(baseline, X, y, raise_errors=raise_errors, pbar=pbar)
        self._add_result(trained_pipeline=baseline,
                         parameters=strategy_dict,
                         parameters=baseline.parameters,
                         training_time=baseline_results['training_time'],
                         cv_data=baseline_results['cv_data'],
                         cv_scores=baseline_results['cv_scores'])
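
This one-line fix matters beyond display: get_pipeline (further down in this diff) rebuilds pipelines from the recorded parameters, so the results dict must store something the pipeline class can actually consume. A rough sketch of the round trip, with an illustrative parameters value:

# What _add_result now records for the baseline ...
parameters = baseline.parameters  # e.g. {'Baseline Classifier': {'strategy': 'random_weighted'}}
# ... and how get_pipeline later rebuilds an untrained instance from it:
pipeline = pipeline_class(parameters, random_state=0)
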
@@ -500,17 +501,18 @@ def _compute_cv_scores(self, pipeline, X, y, raise_errors=True, pbar=None):

                if self.optimize_thresholds and self.objective.problem_type == ProblemTypes.BINARY and self.objective.can_optimize_threshold:
                    X_train, X_threshold_tuning, y_train, y_threshold_tuning = train_test_split(X_train, y_train, test_size=0.2, random_state=self.random_state)
                pipeline.fit(X_train, y_train)
                cv_pipeline = pipeline.clone()
                cv_pipeline.fit(X_train, y_train)
                if self.objective.problem_type == ProblemTypes.BINARY:
                    pipeline.threshold = 0.5
                    cv_pipeline.threshold = 0.5
                    if self.optimize_thresholds and self.objective.can_optimize_threshold:
                        y_predict_proba = pipeline.predict_proba(X_threshold_tuning)
                        y_predict_proba = cv_pipeline.predict_proba(X_threshold_tuning)
                        if isinstance(y_predict_proba, pd.DataFrame):
                            y_predict_proba = y_predict_proba.iloc[:, 1]
                        else:
                            y_predict_proba = y_predict_proba[:, 1]
                        pipeline.threshold = self.objective.optimize_threshold(y_predict_proba, y_threshold_tuning, X=X_threshold_tuning)
                scores = pipeline.score(X_test, y_test, objectives=objectives_to_score)
                        cv_pipeline.threshold = self.objective.optimize_threshold(y_predict_proba, y_threshold_tuning, X=X_threshold_tuning)
                scores = cv_pipeline.score(X_test, y_test, objectives=objectives_to_score)
                score = scores[self.objective.name]
            except Exception as e:
                logger.error("Exception during automl search: {}".format(str(e)))
@@ -525,7 +527,11 @@ def _compute_cv_scores(self, pipeline, X, y, raise_errors=True, pbar=None):
            ordered_scores.update(scores)
            ordered_scores.update({"# Training": len(y_train)})
            ordered_scores.update({"# Testing": len(y_test)})
            cv_data.append({"all_objective_scores": ordered_scores, "score": score})

            evaluation_entry = {"all_objective_scores": ordered_scores, "score": score, 'binary_classification_threshold': None}
            if isinstance(cv_pipeline, BinaryClassificationPipeline) and cv_pipeline.threshold is not None:
                evaluation_entry['binary_classification_threshold'] = cv_pipeline.threshold
            cv_data.append(evaluation_entry)

        training_time = time.time() - start
        cv_scores = pd.Series([fold['score'] for fold in cv_data])
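
With this change, every cv_data entry carries a binary_classification_threshold key, which stays None for non-binary problems. A minimal sketch of reading it back out after a binary search (the pipeline ID 0 is illustrative):

# Each fold's entry holds all objective scores, the primary objective's
# score, and the threshold learned on that fold (None for non-binary).
for fold in automl.results['pipeline_results'][0]['cv_data']:
    print(fold['score'], fold['binary_classification_threshold'])
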
@@ -552,16 +558,13 @@ def _add_result(self, trained_pipeline, parameters, training_time, cv_data, cv_scores):
"score": cv_score,
"high_variance_cv": high_variance_cv,
"training_time": training_time,
"cv_data": cv_data
"cv_data": cv_data,
}

self.results['search_order'].append(pipeline_id)

if self.add_result_callback:
self.add_result_callback(self.results['pipeline_results'][pipeline_id], trained_pipeline)

self._save_pipeline(pipeline_id, trained_pipeline)

    def _evaluate(self, pipeline, X, y, raise_errors=True, pbar=None):
        parameters = pipeline.parameters
        evaluation_results = self._compute_cv_scores(pipeline, X, y, raise_errors=raise_errors, pbar=pbar)
@@ -576,22 +579,25 @@ def _evaluate(self, pipeline, X, y, raise_errors=True, pbar=None):
        logger.debug('Adding results complete')
        return evaluation_results

    def _save_pipeline(self, pipeline_id, trained_pipeline):
        self.trained_pipelines[pipeline_id] = trained_pipeline

    def get_pipeline(self, pipeline_id):
        """Retrieves trained pipeline
    def get_pipeline(self, pipeline_id, random_state=0):
        """Given the ID of a pipeline training result, returns an untrained instance of the specified pipeline
        initialized with the parameters used to train that pipeline during automl search.
        Arguments:
            pipeline_id (int): pipeline to retrieve
            random_state (int, np.random.RandomState): The random seed/state. Defaults to 0.
        Returns:
            Pipeline: pipeline associated with id
            PipelineBase: untrained pipeline instance associated with the provided ID
        """
        if pipeline_id not in self.trained_pipelines:
            raise RuntimeError("Pipeline not found")

        return self.trained_pipelines[pipeline_id]
        pipeline_results = self.results['pipeline_results'].get(pipeline_id)
        if pipeline_results is None:
            raise PipelineNotFoundError("Pipeline not found in automl results")
        pipeline_class = pipeline_results.get('pipeline_class')
        parameters = pipeline_results.get('parameters')
        if pipeline_class is None or parameters is None:
            raise PipelineNotFoundError("Pipeline class or parameters not found in automl results")
        return pipeline_class(parameters, random_state=random_state)

    def describe_pipeline(self, pipeline_id, return_dict=False):
        """Describe a pipeline
@@ -689,7 +695,11 @@ def full_rankings(self):

    @property
    def best_pipeline(self):
        """Returns the best model found"""
        """Returns an untrained instance of the best pipeline and parameters found during automl search.
        Returns:
            PipelineBase: untrained pipeline instance associated with the best automl search result.
        """
        best = self.rankings.iloc[0]
        return self.get_pipeline(best["id"])

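
A short usage sketch of the reworked get_pipeline, assuming a completed binary search (the pipeline ID, random_state value, objective, and holdout split names are illustrative):

from evalml.exceptions import PipelineNotFoundError

try:
    # Rebuilds an untrained pipeline from the stored pipeline_class and
    # parameters; random_state seeds the new instance (defaults to 0).
    pipeline = automl.get_pipeline(1, random_state=42)
except PipelineNotFoundError:
    # Raised when the ID is missing from the results, or when the stored
    # entry lacks a pipeline class or parameters.
    raise

# Refit on the entire training data, then score on a holdout set.
pipeline.fit(X_train, y_train)
pipeline.score(X_holdout, y_holdout, ["f1"])
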
2 changes: 1 addition & 1 deletion evalml/exceptions/__init__.py
@@ -1,2 +1,2 @@
# flake8:noqa
from .exceptions import MethodPropertyNotFoundError, ObjectiveNotFoundError, IllFormattedClassNameError, MissingComponentError
from .exceptions import *
7 changes: 6 additions & 1 deletion evalml/exceptions/exceptions.py
@@ -3,6 +3,11 @@ class MethodPropertyNotFoundError(Exception):
    pass


class PipelineNotFoundError(Exception):
"""An exception raised when a particular pipeline is not found in automl search results"""
pass


class ObjectiveNotFoundError(Exception):
"""Exception to raise when specified objective does not exist."""
pass
@@ -14,5 +19,5 @@ class IllFormattedClassNameError(Exception):


class MissingComponentError(Exception):
"""An exception thrown when a component is not found in all_components()"""
"""An exception raised when a component is not found in all_components()"""
pass
126 changes: 123 additions & 3 deletions evalml/tests/automl_tests/test_automl.py
@@ -14,40 +14,105 @@
    DataChecks,
    DataCheckWarning
)
from evalml.exceptions import PipelineNotFoundError
from evalml.model_family import ModelFamily
from evalml.objectives import FraudCost
from evalml.pipelines import BinaryClassificationPipeline
from evalml.pipelines import (
    BinaryClassificationPipeline,
    MulticlassClassificationPipeline,
    RegressionPipeline
)
from evalml.pipelines.utils import get_estimators, get_pipelines, make_pipeline
from evalml.problem_types import ProblemTypes
from evalml.tuners import NoParamsException, RandomSearchTuner


def test_pipeline_limits(caplog, X_y):
@pytest.mark.parametrize("automl_type", [ProblemTypes.REGRESSION, ProblemTypes.BINARY, ProblemTypes.MULTICLASS])
def test_search_results(X_y_reg, X_y, X_y_multi, automl_type):
    expected_cv_data_keys = {'all_objective_scores', 'score', 'binary_classification_threshold'}
    automl = AutoMLSearch(problem_type=automl_type, max_pipelines=2)
    if automl_type == ProblemTypes.REGRESSION:
        expected_pipeline_class = RegressionPipeline
        X, y = X_y_reg
    elif automl_type == ProblemTypes.BINARY:
        expected_pipeline_class = BinaryClassificationPipeline
        X, y = X_y
    elif automl_type == ProblemTypes.MULTICLASS:
        expected_pipeline_class = MulticlassClassificationPipeline
        X, y = X_y_multi

    automl.search(X, y)
    assert automl.results.keys() == {'pipeline_results', 'search_order'}
    assert automl.results['search_order'] == [0, 1]
    assert len(automl.results['pipeline_results']) == 2
    for pipeline_id, results in automl.results['pipeline_results'].items():
        assert results.keys() == {'id', 'pipeline_name', 'pipeline_class', 'pipeline_summary', 'parameters', 'score', 'high_variance_cv', 'training_time', 'cv_data'}
        assert results['id'] == pipeline_id
        assert isinstance(results['pipeline_name'], str)
        assert issubclass(results['pipeline_class'], expected_pipeline_class)
        assert isinstance(results['pipeline_summary'], str)
        assert isinstance(results['parameters'], dict)
        assert isinstance(results['score'], float)
        assert isinstance(results['high_variance_cv'], np.bool_)
        assert isinstance(results['cv_data'], list)
        for cv_result in results['cv_data']:
            assert cv_result.keys() == expected_cv_data_keys
            if automl_type == ProblemTypes.BINARY:
                assert isinstance(cv_result['binary_classification_threshold'], float)
            else:
                assert cv_result['binary_classification_threshold'] is None
        assert automl.get_pipeline(pipeline_id).parameters == results['parameters']
    assert isinstance(automl.rankings, pd.DataFrame)
    assert isinstance(automl.full_rankings, pd.DataFrame)
    assert np.all(automl.rankings.dtypes == pd.Series(
        [np.dtype('int64'), np.dtype('O'), np.dtype('float64'), np.dtype('bool'), np.dtype('O')],
        index=['id', 'pipeline_name', 'score', 'high_variance_cv', 'parameters']))
    assert np.all(automl.full_rankings.dtypes == pd.Series(
        [np.dtype('int64'), np.dtype('O'), np.dtype('float64'), np.dtype('bool'), np.dtype('O')],
        index=['id', 'pipeline_name', 'score', 'high_variance_cv', 'parameters']))


@patch('evalml.pipelines.BinaryClassificationPipeline.score')
@patch('evalml.pipelines.BinaryClassificationPipeline.fit')
def test_pipeline_limits(mock_fit, mock_score, caplog, X_y):
    X, y = X_y
    mock_score.return_value = {'Log Loss Binary': 1.0}

    automl = AutoMLSearch(problem_type='binary', max_pipelines=1)
    automl.search(X, y)
    out = caplog.text
    assert "Searching up to 1 pipelines. " in out
    assert len(automl.results['pipeline_results']) == 1

    caplog.clear()
    automl = AutoMLSearch(problem_type='binary', max_time=1)
    automl.search(X, y)
    out = caplog.text
    assert "Will stop searching for new pipelines after 1 seconds" in out
    assert len(automl.results['pipeline_results']) >= 1

    caplog.clear()
    automl = AutoMLSearch(problem_type='multiclass', max_time=1e-16)
    automl.search(X, y)
    out = caplog.text
    assert "Will stop searching for new pipelines after 0 seconds" in out
    # search will always run at least one pipeline
    assert len(automl.results['pipeline_results']) >= 1

    caplog.clear()
    automl = AutoMLSearch(problem_type='binary', max_time=1, max_pipelines=5)
    automl.search(X, y)
    out = caplog.text
    assert "Searching up to 5 pipelines. " in out
    assert "Will stop searching for new pipelines after 1 seconds" in out
    assert len(automl.results['pipeline_results']) <= 5

    caplog.clear()
    automl = AutoMLSearch(problem_type='binary')
    automl.search(X, y)
    out = caplog.text
    assert "Using default limit of max_pipelines=5." in out
    assert len(automl.results['pipeline_results']) <= 5


def test_search_order(X_y):
@@ -434,7 +499,8 @@ def test_automl_serialization(X_y, tmpdir):
    automl.save(path)
    loaded_automl = automl.load(path)
    for i in range(num_max_pipelines):
        assert automl.get_pipeline(i).score(X, y, ['precision']) == loaded_automl.get_pipeline(i).score(X, y, ['precision'])
        assert automl.get_pipeline(i).__class__ == loaded_automl.get_pipeline(i).__class__
        assert automl.get_pipeline(i).parameters == loaded_automl.get_pipeline(i).parameters
    assert automl.results == loaded_automl.results
    pd.testing.assert_frame_equal(automl.rankings, loaded_automl.rankings)

@@ -626,3 +692,57 @@ class CoolBinaryClassificationPipeline(dummy_binary_pipeline_class):
    automl.add_to_rankings(test_pipeline_trained, X, y)

    assert list(automl.rankings['score'].values).count(0.1234) == 2


@patch('evalml.pipelines.BinaryClassificationPipeline.score')
@patch('evalml.pipelines.BinaryClassificationPipeline.fit')
def test_get_pipeline_invalid(mock_fit, mock_score, X_y):
    X, y = X_y
    mock_score.return_value = {'Log Loss Binary': 1.0}

    automl = AutoMLSearch(problem_type='binary')
    with pytest.raises(PipelineNotFoundError, match="Pipeline not found in automl results"):
        automl.get_pipeline(1000)

    automl = AutoMLSearch(problem_type='binary', max_pipelines=1)
    automl.search(X, y)
    assert automl.get_pipeline(0).name == 'Mode Baseline Binary Classification Pipeline'
    automl.results['pipeline_results'][0].pop('pipeline_class')
    with pytest.raises(PipelineNotFoundError, match="Pipeline class or parameters not found in automl results"):
        automl.get_pipeline(0)

    automl = AutoMLSearch(problem_type='binary', max_pipelines=1)
    automl.search(X, y)
    assert automl.get_pipeline(0).name == 'Mode Baseline Binary Classification Pipeline'
    automl.results['pipeline_results'][0].pop('parameters')
    with pytest.raises(PipelineNotFoundError, match="Pipeline class or parameters not found in automl results"):
        automl.get_pipeline(0)


@patch('evalml.pipelines.BinaryClassificationPipeline.score')
@patch('evalml.pipelines.BinaryClassificationPipeline.fit')
def test_describe_pipeline(mock_fit, mock_score, caplog, X_y):
    X, y = X_y
    mock_score.return_value = {'Log Loss Binary': 1.0}

    automl = AutoMLSearch(problem_type='binary', max_pipelines=1)
    automl.search(X, y)
    out = caplog.text
    assert "Searching up to 1 pipelines. " in out

    assert len(automl.results['pipeline_results']) == 1
    caplog.clear()
    automl.describe_pipeline(0)
    out = caplog.text
    assert "Mode Baseline Binary Classification Pipeline" in out
    assert "Problem Type: Binary Classification" in out
    assert "Model Family: Baseline" in out
    assert "* strategy : random_weighted" in out
    assert "Total training time (including CV): " in out
    assert """Log Loss Binary # Training # Testing
0 1.000 66.000 34.000
1 1.000 67.000 33.000
2 1.000 67.000 33.000
mean 1.000 - -
std 0.000 - -
coef of var 0.000 - -""" in out

