Limit model usage for EN, XGB, CB for large multiclass problems (#2982)
* Limit model usage for EN, XGB, CB for large multiclass problems

* update release

* add new test

* update release notes

* update test

* fix typo docstring

* fix import issues

* update tests for comments

* remove unused import
bchen1116 committed Nov 2, 2021
1 parent 981024f commit b7b64b9
Showing 5 changed files with 201 additions and 1 deletion.
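
For context, here is a minimal usage sketch of the opt-in behavior this commit adds. The import path and the toy 80-class dataset are illustrative assumptions, not part of the diff below.

import pandas as pd
from evalml.automl import AutoMLSearch

# Toy multiclass target with 80 unique classes (above the 75-class threshold).
X = pd.DataFrame({"feature": range(400)})
y = pd.Series(list(range(80)) * 5)

# Default behavior after this change: Elastic Net and XGBoost are skipped for this many classes.
automl = AutoMLSearch(X_train=X, y_train=y, problem_type="multiclass")

# Opt in to keep the longer-running models in the search.
automl = AutoMLSearch(
    X_train=X,
    y_train=y,
    problem_type="multiclass",
    allow_long_running_models=True,
)
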
1 change: 1 addition & 0 deletions docs/source/release_notes.rst
@@ -2,6 +2,7 @@ Release Notes
-------------
**Future Releases**
* Enhancements
* Limit computationally-intensive models during ``AutoMLSearch`` for certain multiclass problems, and allow opt-in via the ``allow_long_running_models`` parameter :pr:`2982`
* Added support for stacked ensemble pipelines to prediction explanations module :pr:`2971`
* Fixes
* Fixed bug where ``Oversampler`` didn't consider boolean columns to be categorical :pr:`2980`
33 changes: 32 additions & 1 deletion evalml/automl/automl_algorithm/iterative_algorithm.py
@@ -15,7 +15,7 @@
from evalml.model_family import ModelFamily
from evalml.pipelines.components.utils import get_estimators
from evalml.pipelines.utils import make_pipeline
from evalml.problem_types import is_time_series
from evalml.problem_types import is_multiclass, is_time_series
from evalml.utils import infer_feature_types
from evalml.utils.logger import get_logger

Expand Down Expand Up @@ -62,6 +62,8 @@ class IterativeAlgorithm(AutoMLAlgorithm):
pipeline_params (dict or None): Pipeline-level parameters that should be passed to the proposed pipelines. Defaults to None.
custom_hyperparameters (dict or None): Custom hyperparameter ranges specified for pipelines to iterate over. Defaults to None.
_estimator_family_order (list(ModelFamily) or None): specify the sort order for the first batch. Defaults to None, which uses _ESTIMATOR_FAMILY_ORDER.
allow_long_running_models (bool): Whether or not to allow longer-running models for large multiclass problems. If False and no pipelines, component graphs, or model families are provided,
AutoMLSearch will not use Elastic Net or XGBoost when the target has more than 75 unique classes, and will not use CatBoost when the target has more than 150 unique classes. Defaults to False.
verbose (boolean): Whether or not to display logging information regarding pipeline building. Defaults to False.
"""

@@ -85,6 +87,7 @@ def __init__(
pipeline_params=None,
custom_hyperparameters=None,
_estimator_family_order=None,
allow_long_running_models=False,
verbose=False,
):
self.X = infer_feature_types(X)
@@ -104,6 +107,7 @@
self.text_in_ensembling = text_in_ensembling
self.max_batches = max_batches
self.max_iterations = max_iterations
self.allow_long_running_models = allow_long_running_models
if verbose:
self.logger = get_logger(f"{__name__}.verbose")
else:
@@ -146,6 +150,32 @@ def __init__(
" and Real!"
)

def _filter_estimators(self, estimators):
"""Function to remove computationally expensive and long-running estimators from datasets with large numbers of unique classes. Thresholds were determined empirically."""
estimators_to_drop = []
if (
not is_multiclass(self.problem_type)
or self.allow_long_running_models
or self.allowed_model_families is not None
):
return estimators
unique = self.y.nunique()
if unique > 75:
estimators_to_drop.extend(["Elastic Net Classifier", "XGBoost Classifier"])
if unique > 150:
estimators_to_drop.append("CatBoost Classifier")
dropped_estimators = [e for e in estimators if e.name in estimators_to_drop]
if len(dropped_estimators):
self.logger.info(
"Dropping estimators {} because the number of unique targets is {} and `allow_long_running_models` is set to {}".format(
", ".join(sorted([e.name for e in dropped_estimators])),
unique,
self.allow_long_running_models,
)
)
estimators = [e for e in estimators if e not in dropped_estimators]
return estimators

def _create_pipelines(self):
indices = []
pipelines_to_sort = []
Expand All @@ -156,6 +186,7 @@ def _create_pipelines(self):
allowed_estimators = get_estimators(
self.problem_type, self.allowed_model_families
)
allowed_estimators = self._filter_estimators(allowed_estimators)
if (
is_time_series(self.problem_type)
and self._pipeline_params["pipeline"]["date_index"]
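
In isolation, the filtering rule added above behaves as follows. This is a standalone sketch of the same thresholds; the helper name drop_long_running is made up for illustration and does not exist in evalml.

# Sketch of the threshold logic in _filter_estimators, keyed by estimator name.
def drop_long_running(estimator_names, n_classes, allow_long_running_models=False):
    if allow_long_running_models:
        return estimator_names
    to_drop = {"Elastic Net Classifier", "XGBoost Classifier"} if n_classes > 75 else set()
    if n_classes > 150:
        to_drop.add("CatBoost Classifier")
    return [name for name in estimator_names if name not in to_drop]

# With 100 classes, Elastic Net and XGBoost are dropped but CatBoost is kept.
drop_long_running(
    ["Elastic Net Classifier", "XGBoost Classifier", "CatBoost Classifier"], n_classes=100
)
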
6 changes: 6 additions & 0 deletions evalml/automl/automl_search.py
@@ -379,6 +379,9 @@ class AutoMLSearch:
sampler_balanced_ratio (float): The minority:majority class ratio that we consider balanced, so a 1:4 ratio would be equal to 0.25. If the class balance is larger than this provided value,
then we will not add a sampler since the data is then considered balanced. Overrides the `sampler_ratio` of the samplers. Defaults to 0.25.
allow_long_running_models (bool): Whether or not to allow longer-running models for large multiclass problems. If False and no pipelines, component graphs, or model families are provided,
AutoMLSearch will not use Elastic Net or XGBoost when the target has more than 75 unique classes, and will not use CatBoost when the target has more than 150 unique classes. Defaults to False.
_ensembling_split_size (float): The amount of the training data we'll set aside for training ensemble metalearners. Only used when ensembling is True.
Must be between 0 and 1, exclusive. Defaults to 0.2
@@ -426,6 +429,7 @@ def __init__(
custom_hyperparameters=None,
sampler_method="auto",
sampler_balanced_ratio=0.25,
allow_long_running_models=False,
_ensembling_split_size=0.2,
_pipelines_per_batch=5,
_automl_algorithm="iterative",
@@ -587,6 +591,7 @@ def __init__(
)
self.allowed_component_graphs = allowed_component_graphs
self.allowed_model_families = allowed_model_families
self.allow_long_running_models = allow_long_running_models
self._start = 0.0
self._baseline_cv_scores = {}
self.show_batch_output = False
@@ -690,6 +695,7 @@ def __init__(
text_in_ensembling=text_in_ensembling,
pipeline_params=parameters,
custom_hyperparameters=custom_hyperparameters,
allow_long_running_models=allow_long_running_models,
verbose=self.verbose,
)
elif _automl_algorithm == "default":
28 changes: 28 additions & 0 deletions evalml/tests/automl_tests/test_automl_search_classification.py
@@ -1629,3 +1629,31 @@ def test_time_series_pipeline_parameter_warnings(
assert len(w) == (2 if len(set_values) else 1)
if len(w) == 2:
assert w[1].message.components == set_values


@pytest.mark.parametrize("allow_long_running_models", [True, False])
@pytest.mark.parametrize("unique", [10, 200])
def test_automl_passes_allow_long_running_models_iterative(
unique, allow_long_running_models, caplog, has_minimal_dependencies
):
X = pd.DataFrame()
y = pd.Series([i for i in range(unique)] * 5)
automl = AutoMLSearch(
X_train=X,
y_train=y,
problem_type="multiclass",
allow_long_running_models=allow_long_running_models,
_automl_algorithm="iterative",
verbose=True,
)
assert (
automl._automl_algorithm.allow_long_running_models == allow_long_running_models
)
if allow_long_running_models or unique == 10:
assert "Dropping estimators" not in caplog.text
return
estimators = ["Elastic Net Classifier"]
if not has_minimal_dependencies:
estimators.extend(["CatBoost Classifier", "XGBoost Classifier"])

assert "Dropping estimators {}".format(", ".join(sorted(estimators))) in caplog.text
134 changes: 134 additions & 0 deletions evalml/tests/automl_tests/test_iterative_algorithm.py
@@ -1,6 +1,7 @@
from unittest.mock import patch

import numpy as np
import pandas as pd
import pytest
from skopt.space import Categorical, Integer, Real

@@ -929,3 +930,136 @@ def test_iterative_algorithm_sampling_params(
for component in p.component_graph:
if "sampler" in component.name:
assert component.parameters["sampling_ratio"] == 0.25


@pytest.mark.parametrize("allowed_model_families", [None, [ModelFamily.XGBOOST]])
@pytest.mark.parametrize(
"allowed_component_graphs",
[None, {"Pipeline_1": ["Imputer", "XGBoost Classifier"]}],
)
@pytest.mark.parametrize("allow_long_running_models", [True, False])
@pytest.mark.parametrize(
"length,models_missing",
[
(10, []),
(75, []),
(100, ["Elastic Net Classifier", "XGBoost Classifier"]),
(160, ["Elastic Net Classifier", "XGBoost Classifier", "CatBoost Classifier"]),
],
)
def test_iterative_algorithm_allow_long_running_models(
length,
models_missing,
allow_long_running_models,
allowed_component_graphs,
allowed_model_families,
has_minimal_dependencies,
):
if has_minimal_dependencies:
return
X = pd.DataFrame()
y = pd.Series([i for i in range(length)] * 5)
y_short = pd.Series([i for i in range(10)] * 5)
algo = IterativeAlgorithm(
X=X,
y=y,
problem_type="multiclass",
random_seed=0,
allowed_model_families=allowed_model_families,
allowed_component_graphs=allowed_component_graphs,
allow_long_running_models=allow_long_running_models,
)
if allowed_model_families is not None or allowed_component_graphs is not None:
assert len(algo.allowed_pipelines) == 1
return
algo_short = IterativeAlgorithm(
X=X,
y=y_short,
problem_type="multiclass",
random_seed=0,
allowed_model_families=allowed_model_families,
allowed_component_graphs=allowed_component_graphs,
)
if allow_long_running_models:
assert len(algo_short.allowed_pipelines) == len(algo.allowed_pipelines)
else:
assert len(algo_short.allowed_pipelines) == len(algo.allowed_pipelines) + len(
models_missing
)
for p in algo.allowed_pipelines:
assert all([s not in p.name for s in models_missing])


@pytest.mark.parametrize("problem", ["binary", "multiclass", "regression"])
@pytest.mark.parametrize("allow_long_running_models", [True, False])
@pytest.mark.parametrize(
"length,models_missing", [(10, 0), (75, 0), (100, 2), (160, 3)]
)
def test_iterative_algorithm_allow_long_running_models_problem(
length, models_missing, allow_long_running_models, problem, has_minimal_dependencies
):
X = pd.DataFrame()
y = pd.Series([i for i in range(length)] * 5)
y_short = pd.Series([i for i in range(10)] * 5)
algo = IterativeAlgorithm(
X=X,
y=y,
problem_type=problem,
random_seed=0,
allow_long_running_models=allow_long_running_models,
)
algo_reg = IterativeAlgorithm(
X=X,
y=y_short,
problem_type=problem,
random_seed=0,
)
if problem != "multiclass" or allow_long_running_models:
assert len(algo.allowed_pipelines) == len(algo_reg.allowed_pipelines)
models_missing = 0

if has_minimal_dependencies and models_missing > 0:
# no XGBoost or CatBoost installed
models_missing = 1
assert len(algo.allowed_pipelines) + models_missing == len(
algo_reg.allowed_pipelines
)


def test_iterative_algorithm_allow_long_running_models_next_batch(
has_minimal_dependencies,
):
models_missing = [
"Elastic Net Classifier",
"XGBoost Classifier",
"CatBoost Classifier",
]
if has_minimal_dependencies:
models_missing = ["Elastic Net Classifier"]
X = pd.DataFrame()
y = pd.Series([i for i in range(200)] * 5)

algo = IterativeAlgorithm(
X=X,
y=y,
problem_type="multiclass",
random_seed=0,
allow_long_running_models=False,
)
next_batch = algo.next_batch()

for pipeline in next_batch:
assert all([m not in pipeline.name for m in models_missing])

# the "best" score will be the 1st dummy pipeline
scores = np.arange(0, len(next_batch))
for score, pipeline in zip(scores, next_batch):
algo.add_result(score, pipeline, {"id": algo.pipeline_number})

for i in range(1, 5):
next_batch = algo.next_batch()
for pipeline in next_batch:
assert all([m not in pipeline.name for m in models_missing])
scores = -np.arange(0, len(next_batch))
for score, pipeline in zip(scores, next_batch):
algo.add_result(score, pipeline, {"id": algo.pipeline_number})
