Limit model usage for EN, XGB, CB for large multiclass problems (#2982)
* Limit model usage for EN, XGB, CB for large multiclass problems

* update release

* add new test

* update release notes

* update test

* fix typo docstring

* fix import issues

* update tests for comments

* remove unused import
bchen1116 committed Nov 2, 2021
1 parent 981024f commit b7b64b9
Showing 5 changed files with 201 additions and 1 deletion.
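
For context, here is a minimal usage sketch of the opt-in behavior this commit adds. The import path and the toy 80-class dataset are illustrative assumptions, not part of the diff below.

import pandas as pd
from evalml.automl import AutoMLSearch

# Toy multiclass target with 80 unique classes (above the 75-class threshold).
X = pd.DataFrame({"feature": range(400)})
y = pd.Series(list(range(80)) * 5)

# Default behavior after this change: Elastic Net and XGBoost are skipped for this many classes.
automl = AutoMLSearch(X_train=X, y_train=y, problem_type="multiclass")

# Opt in to keep the longer-running models in the search.
automl = AutoMLSearch(
    X_train=X,
    y_train=y,
    problem_type="multiclass",
    allow_long_running_models=True,
)
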
1 change: 1 addition & 0 deletions docs/source/release_notes.rst
@@ -2,6 +2,7 @@ Release Notes
-------------
**Future Releases**
* Enhancements
* Limit computationally-intensive models during ``AutoMLSearch`` for certain multiclass problems, and allow opt-in via the ``allow_long_running_models`` parameter :pr:`2982`
* Added support for stacked ensemble pipelines to prediction explanations module :pr:`2971`
* Fixes
* Fixed bug where ``Oversampler`` didn't consider boolean columns to be categorical :pr:`2980`
33 changes: 32 additions & 1 deletion evalml/automl/automl_algorithm/iterative_algorithm.py
@@ -15,7 +15,7 @@
from evalml.model_family import ModelFamily
from evalml.pipelines.components.utils import get_estimators
from evalml.pipelines.utils import make_pipeline
from evalml.problem_types import is_time_series
from evalml.problem_types import is_multiclass, is_time_series
from evalml.utils import infer_feature_types
from evalml.utils.logger import get_logger

Expand Down Expand Up @@ -62,6 +62,8 @@ class IterativeAlgorithm(AutoMLAlgorithm):
pipeline_params (dict or None): Pipeline-level parameters that should be passed to the proposed pipelines. Defaults to None.
custom_hyperparameters (dict or None): Custom hyperparameter ranges specified for pipelines to iterate over. Defaults to None.
_estimator_family_order (list(ModelFamily) or None): specify the sort order for the first batch. Defaults to None, which uses _ESTIMATOR_FAMILY_ORDER.
allow_long_running_models (bool): Whether or not to allow longer-running models for large multiclass problems. If False and no pipelines, component graphs, or model families are provided,
AutoMLSearch will not use Elastic Net or XGBoost when the target has more than 75 unique classes, and will not use CatBoost when the target has more than 150 unique classes. Defaults to False.
verbose (boolean): Whether or not to display logging information regarding pipeline building. Defaults to False.
"""

@@ -85,6 +87,7 @@ def __init__(
pipeline_params=None,
custom_hyperparameters=None,
_estimator_family_order=None,
allow_long_running_models=False,
verbose=False,
):
self.X = infer_feature_types(X)
@@ -104,6 +107,7 @@
self.text_in_ensembling = text_in_ensembling
self.max_batches = max_batches
self.max_iterations = max_iterations
self.allow_long_running_models = allow_long_running_models
if verbose:
self.logger = get_logger(f"{__name__}.verbose")
else:
@@ -146,6 +150,32 @@ def __init__(
" and Real!"
)

def _filter_estimators(self, estimators):
"""Function to remove computationally expensive and long-running estimators from datasets with large numbers of unique classes. Thresholds were determined empirically."""
estimators_to_drop = []
if (
not is_multiclass(self.problem_type)
or self.allow_long_running_models
or self.allowed_model_families is not None
):
return estimators
unique = self.y.nunique()
if unique > 75:
estimators_to_drop.extend(["Elastic Net Classifier", "XGBoost Classifier"])
if unique > 150:
estimators_to_drop.append("CatBoost Classifier")
dropped_estimators = [e for e in estimators if e.name in estimators_to_drop]
if len(dropped_estimators):
self.logger.info(
"Dropping estimators {} because the number of unique targets is {} and `allow_long_running_models` is set to {}".format(
", ".join(sorted([e.name for e in dropped_estimators])),
unique,
self.allow_long_running_models,
)
)
estimators = [e for e in estimators if e not in dropped_estimators]
return estimators

def _create_pipelines(self):
indices = []
pipelines_to_sort = []
Expand All @@ -156,6 +186,7 @@ def _create_pipelines(self):
allowed_estimators = get_estimators(
self.problem_type, self.allowed_model_families
)
allowed_estimators = self._filter_estimators(allowed_estimators)
if (
is_time_series(self.problem_type)
and self._pipeline_params["pipeline"]["date_index"]
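
In isolation, the filtering rule added above behaves as follows. This is a standalone sketch of the same thresholds; the helper name drop_long_running is made up for illustration and does not exist in evalml.

# Sketch of the threshold logic in _filter_estimators, keyed by estimator name.
def drop_long_running(estimator_names, n_classes, allow_long_running_models=False):
    if allow_long_running_models:
        return estimator_names
    to_drop = {"Elastic Net Classifier", "XGBoost Classifier"} if n_classes > 75 else set()
    if n_classes > 150:
        to_drop.add("CatBoost Classifier")
    return [name for name in estimator_names if name not in to_drop]

# With 100 classes, Elastic Net and XGBoost are dropped but CatBoost is kept.
drop_long_running(
    ["Elastic Net Classifier", "XGBoost Classifier", "CatBoost Classifier"], n_classes=100
)
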
6 changes: 6 additions & 0 deletions evalml/automl/automl_search.py
@@ -379,6 +379,9 @@ class AutoMLSearch:
sampler_balanced_ratio (float): The minority:majority class ratio that we consider balanced, so a 1:4 ratio would be equal to 0.25. If the class balance is larger than this provided value,
then we will not add a sampler since the data is then considered balanced. Overrides the `sampler_ratio` of the samplers. Defaults to 0.25.
allow_long_running_models (bool): Whether or not to allow longer-running models for large multiclass problems. If False and no pipelines, component graphs, or model families are provided,
AutoMLSearch will not use Elastic Net or XGBoost when the target has more than 75 unique classes, and will not use CatBoost when the target has more than 150 unique classes. Defaults to False.
_ensembling_split_size (float): The amount of the training data we'll set aside for training ensemble metalearners. Only used when ensembling is True.
Must be between 0 and 1, exclusive. Defaults to 0.2
@@ -426,6 +429,7 @@ def __init__(
custom_hyperparameters=None,
sampler_method="auto",
sampler_balanced_ratio=0.25,
allow_long_running_models=False,
_ensembling_split_size=0.2,
_pipelines_per_batch=5,
_automl_algorithm="iterative",
@@ -587,6 +591,7 @@ def __init__(
)
self.allowed_component_graphs = allowed_component_graphs
self.allowed_model_families = allowed_model_families
self.allow_long_running_models = allow_long_running_models
self._start = 0.0
self._baseline_cv_scores = {}
self.show_batch_output = False
@@ -690,6 +695,7 @@ def __init__(
text_in_ensembling=text_in_ensembling,
pipeline_params=parameters,
custom_hyperparameters=custom_hyperparameters,
allow_long_running_models=allow_long_running_models,
verbose=self.verbose,
)
elif _automl_algorithm == "default":
28 changes: 28 additions & 0 deletions evalml/tests/automl_tests/test_automl_search_classification.py
@@ -1629,3 +1629,31 @@ def test_time_series_pipeline_parameter_warnings(
assert len(w) == (2 if len(set_values) else 1)
if len(w) == 2:
assert w[1].message.components == set_values


@pytest.mark.parametrize("allow_long_running_models", [True, False])
@pytest.mark.parametrize("unique", [10, 200])
def test_automl_passes_allow_long_running_models_iterative(
unique, allow_long_running_models, caplog, has_minimal_dependencies
):
X = pd.DataFrame()
y = pd.Series([i for i in range(unique)] * 5)
automl = AutoMLSearch(
X_train=X,
y_train=y,
problem_type="multiclass",
allow_long_running_models=allow_long_running_models,
_automl_algorithm="iterative",
verbose=True,
)
assert (
automl._automl_algorithm.allow_long_running_models == allow_long_running_models
)
if allow_long_running_models or unique == 10:
assert "Dropping estimators" not in caplog.text
return
estimators = ["Elastic Net Classifier"]
if not has_minimal_dependencies:
estimators.extend(["CatBoost Classifier", "XGBoost Classifier"])

assert "Dropping estimators {}".format(", ".join(sorted(estimators))) in caplog.text
134 changes: 134 additions & 0 deletions evalml/tests/automl_tests/test_iterative_algorithm.py
@@ -1,6 +1,7 @@
from unittest.mock import patch

import numpy as np
import pandas as pd
import pytest
from skopt.space import Categorical, Integer, Real

@@ -929,3 +930,136 @@ def test_iterative_algorithm_sampling_params(
for component in p.component_graph:
if "sampler" in component.name:
assert component.parameters["sampling_ratio"] == 0.25


@pytest.mark.parametrize("allowed_model_families", [None, [ModelFamily.XGBOOST]])
@pytest.mark.parametrize(
"allowed_component_graphs",
[None, {"Pipeline_1": ["Imputer", "XGBoost Classifier"]}],
)
@pytest.mark.parametrize("allow_long_running_models", [True, False])
@pytest.mark.parametrize(
"length,models_missing",
[
(10, []),
(75, []),
(100, ["Elastic Net Classifier", "XGBoost Classifier"]),
(160, ["Elastic Net Classifier", "XGBoost Classifier", "CatBoost Classifier"]),
],
)
def test_iterative_algorithm_allow_long_running_models(
length,
models_missing,
allow_long_running_models,
allowed_component_graphs,
allowed_model_families,
has_minimal_dependencies,
):
if has_minimal_dependencies:
return
X = pd.DataFrame()
y = pd.Series([i for i in range(length)] * 5)
y_short = pd.Series([i for i in range(10)] * 5)
algo = IterativeAlgorithm(
X=X,
y=y,
problem_type="multiclass",
random_seed=0,
allowed_model_families=allowed_model_families,
allowed_component_graphs=allowed_component_graphs,
allow_long_running_models=allow_long_running_models,
)
if allowed_model_families is not None or allowed_component_graphs is not None:
assert len(algo.allowed_pipelines) == 1
return
algo_short = IterativeAlgorithm(
X=X,
y=y_short,
problem_type="multiclass",
random_seed=0,
allowed_model_families=allowed_model_families,
allowed_component_graphs=allowed_component_graphs,
)
if allow_long_running_models:
assert len(algo_short.allowed_pipelines) == len(algo.allowed_pipelines)
else:
assert len(algo_short.allowed_pipelines) == len(algo.allowed_pipelines) + len(
models_missing
)
for p in algo.allowed_pipelines:
assert all([s not in p.name for s in models_missing])


@pytest.mark.parametrize("problem", ["binary", "multiclass", "regression"])
@pytest.mark.parametrize("allow_long_running_models", [True, False])
@pytest.mark.parametrize(
"length,models_missing", [(10, 0), (75, 0), (100, 2), (160, 3)]
)
def test_iterative_algorithm_allow_long_running_models_problem(
length, models_missing, allow_long_running_models, problem, has_minimal_dependencies
):
X = pd.DataFrame()
y = pd.Series([i for i in range(length)] * 5)
y_short = pd.Series([i for i in range(10)] * 5)
algo = IterativeAlgorithm(
X=X,
y=y,
problem_type=problem,
random_seed=0,
allow_long_running_models=allow_long_running_models,
)
algo_reg = IterativeAlgorithm(
X=X,
y=y_short,
problem_type=problem,
random_seed=0,
)
if problem != "multiclass" or allow_long_running_models:
assert len(algo.allowed_pipelines) == len(algo_reg.allowed_pipelines)
models_missing = 0

if has_minimal_dependencies and models_missing > 0:
# no XGBoost or CatBoost installed
models_missing = 1
assert len(algo.allowed_pipelines) + models_missing == len(
algo_reg.allowed_pipelines
)


def test_iterative_algorithm_allow_long_running_models_next_batch(
has_minimal_dependencies,
):
models_missing = [
"Elastic Net Classifier",
"XGBoost Classifier",
"CatBoost Classifier",
]
if has_minimal_dependencies:
models_missing = ["Elastic Net Classifier"]
X = pd.DataFrame()
y = pd.Series([i for i in range(200)] * 5)

algo = IterativeAlgorithm(
X=X,
y=y,
problem_type="multiclass",
random_seed=0,
allow_long_running_models=False,
)
next_batch = algo.next_batch()

for pipeline in next_batch:
assert all([m not in pipeline.name for m in models_missing])

# the "best" score will be the 1st dummy pipeline
scores = np.arange(0, len(next_batch))
for score, pipeline in zip(scores, next_batch):
algo.add_result(score, pipeline, {"id": algo.pipeline_number})

for i in range(1, 5):
next_batch = algo.next_batch()
for pipeline in next_batch:
assert all([m not in pipeline.name for m in models_missing])
scores = -np.arange(0, len(next_batch))
for score, pipeline in zip(scores, next_batch):
algo.add_result(score, pipeline, {"id": algo.pipeline_number})
