Revert "Conditionally include the imputer in pipelines (#3657)" (#3672)

* Revert "Conditionally include the imputer in pipelines (#3657)" This reverts commit ceebe08. * Release. * Fixed release.
alteryx · Aug 19, 2022 · 2ae367e · 2ae367e
1 parent 85472b6
commit 2ae367e
Show file tree

Hide file tree

Showing 7 changed files with 62 additions and 63 deletions.
diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst
@@ -4,6 +4,7 @@ Release Notes
     * Enhancements
     * Fixes
         * ``IDColumnsDataCheck`` now only returns an action code to set the first column as the primary key if it contains unique values :pr:`3639`
+        * Reverted the ``make_pipeline`` changes that conditionally included the imputers :pr:`3672`
     * Changes
     * Documentation Changes
     * Testing Changes

diff --git a/evalml/pipelines/classification_pipeline.py b/evalml/pipelines/classification_pipeline.py
@@ -115,7 +115,6 @@ def predict(self, X, objective=None, X_train=None, y_train=None):
         Returns:
             pd.Series: Estimated labels.
         """
-        X = infer_feature_types(X)
         _predictions = self._predict(X, objective=objective)
         predictions = self.inverse_transform(_predictions.astype(int))
         predictions = pd.Series(

diff --git a/evalml/pipelines/utils.py b/evalml/pipelines/utils.py
@@ -168,10 +168,9 @@ def _get_imputer(X, y, problem_type, estimator_class, sampler_name=None):
         logical_types.Datetime,
     }
 
-    if (
-        len(input_logical_types.intersection(types_imputer_handles))
-        or len(text_columns)
-    ) and X.isna().any().any():
+    if len(input_logical_types.intersection(types_imputer_handles)) or len(
+        text_columns,
+    ):
         components.append(Imputer)
 
     return components

diff --git a/evalml/tests/automl_tests/test_automl.py b/evalml/tests/automl_tests/test_automl.py
@@ -3177,6 +3177,7 @@ def test_automl_validate_objective(non_core_objective, X_y_regression):
 def test_automl_pipeline_params_simple(AutoMLTestEnv, X_y_binary):
     X, y = X_y_binary
     params = {
+        "Imputer": {"numeric_impute_strategy": "most_frequent"},
         "Logistic Regression Classifier": {"C": 10, "penalty": "l2"},
         "Elastic Net Classifier": {"l1_ratio": 0.2},
     }
@@ -3192,6 +3193,11 @@ def test_automl_pipeline_params_simple(AutoMLTestEnv, X_y_binary):
     with env.test_context(score_return_value={automl.objective.name: 1.23}):
         automl.search()
     for i, row in automl.rankings.iterrows():
+        if "Imputer" in row["parameters"]:
+            assert (
+                row["parameters"]["Imputer"]["numeric_impute_strategy"]
+                == "most_frequent"
+            )
         if "Logistic Regression Classifier" in row["parameters"]:
             assert row["parameters"]["Logistic Regression Classifier"]["C"] == 10
             assert (
@@ -3203,7 +3209,6 @@ def test_automl_pipeline_params_simple(AutoMLTestEnv, X_y_binary):
 
 def test_automl_pipeline_params_multiple(AutoMLTestEnv, X_y_regression):
     X, y = X_y_regression
-    X[0][2] = None  # introduce a null value to include the imputer
     hyperparams = {
         "Imputer": {
             "numeric_impute_strategy": Categorical(["median", "most_frequent"]),
@@ -3252,6 +3257,7 @@ def test_automl_pipeline_params_multiple(AutoMLTestEnv, X_y_regression):
 def test_automl_pipeline_params_kwargs(AutoMLTestEnv, X_y_multi):
     X, y = X_y_multi
     hyperparams = {
+        "Imputer": {"numeric_impute_strategy": Categorical(["most_frequent"])},
         "Decision Tree Classifier": {
             "max_depth": Integer(1, 2),
             "ccp_alpha": Real(0.1, 0.5),
@@ -3269,6 +3275,11 @@ def test_automl_pipeline_params_kwargs(AutoMLTestEnv, X_y_multi):
     with env.test_context(score_return_value={automl.objective.name: 1.0}):
         automl.search()
     for i, row in automl.rankings.iterrows():
+        if "Imputer" in row["parameters"]:
+            assert (
+                row["parameters"]["Imputer"]["numeric_impute_strategy"]
+                == "most_frequent"
+            )
         if "Decision Tree Classifier" in row["parameters"]:
             assert (
                 0.1 < row["parameters"]["Decision Tree Classifier"]["ccp_alpha"] < 0.5
@@ -4741,16 +4752,13 @@ def test_search_parameters_held_automl(
             break
     assert found_dtc
     for tuners in aml.automl_algorithm._tuners.values():
-        if "Imputer" in tuners._pipeline_hyperparameter_ranges:
-            assert (
-                tuners._pipeline_hyperparameter_ranges["Imputer"][
-                    "numeric_impute_strategy"
-                ]
-                == expected
-            )
-            assert tuners._pipeline_hyperparameter_ranges["Imputer"][
-                "categorical_impute_strategy"
-            ] == ["most_frequent"]
+        assert (
+            tuners._pipeline_hyperparameter_ranges["Imputer"]["numeric_impute_strategy"]
+            == expected
+        )
+        assert tuners._pipeline_hyperparameter_ranges["Imputer"][
+            "categorical_impute_strategy"
+        ] == ["most_frequent"]
         # make sure that there are no set hyperparameters when we don't have defaults
         assert tuners._pipeline_hyperparameter_ranges["Label Encoder"] == {}
         assert tuners.propose()["Label Encoder"] == {}

diff --git a/evalml/tests/automl_tests/test_automl_iterative_algorithm.py b/evalml/tests/automl_tests/test_automl_iterative_algorithm.py
@@ -553,11 +553,10 @@ def test_pipeline_custom_hyperparameters_make_pipeline(
                     "Column_2",
                 ]
             if custom_hyperparameters:
-                if component_graphs:
-                    assert (
-                        row["parameters"]["Imputer"]["numeric_impute_strategy"]
-                        in search_parameters_["Imputer"]["numeric_impute_strategy"]
-                    )
+                assert (
+                    row["parameters"]["Imputer"]["numeric_impute_strategy"]
+                    in search_parameters_["Imputer"]["numeric_impute_strategy"]
+                )
                 assert (
                     4 <= row["parameters"]["Random Forest Classifier"]["max_depth"] <= 7
                 )
@@ -567,12 +566,11 @@ def test_pipeline_custom_hyperparameters_make_pipeline(
                     <= 210
                 )
             else:
-                if component_graphs:
-                    assert row["parameters"]["Imputer"]["numeric_impute_strategy"] in [
-                        "mean",
-                        "median",
-                        "most_frequent",
-                    ]
+                assert row["parameters"]["Imputer"]["numeric_impute_strategy"] in [
+                    "mean",
+                    "median",
+                    "most_frequent",
+                ]
                 assert (
                     1
                     <= row["parameters"]["Random Forest Classifier"]["max_depth"]
@@ -994,12 +992,9 @@ def test_pipeline_parameter_warnings_component_graphs(
         env = AutoMLTestEnv("binary")
         with env.test_context(score_return_value={automl.objective.name: 1.0}):
             automl.search()
-    if len(w) == 1 and not len(set_values):
-        assert "components {'Imputer'} will not be used" in str(w[0].message)
-    else:
-        assert len(w) == (1 if len(set_values) else 0)
-        if len(w):
-            assert w[0].message.components == set_values
+    assert len(w) == (1 if len(set_values) else 0)
+    if len(w):
+        assert w[0].message.components == set_values
 
 
 def test_graph_automl(X_y_multi):

diff --git a/evalml/tests/automl_tests/test_default_algorithm.py b/evalml/tests/automl_tests/test_default_algorithm.py
@@ -377,9 +377,11 @@ def test_make_split_pipeline(X_y_binary):
         "Label Encoder",
         "Categorical Pipeline - Select Columns Transformer",
         "Categorical Pipeline - Label Encoder",
+        "Categorical Pipeline - Imputer",
         "Categorical Pipeline - One Hot Encoder",
         "Numeric Pipeline - Select Columns By Type Transformer",
         "Numeric Pipeline - Label Encoder",
+        "Numeric Pipeline - Imputer",
         "Numeric Pipeline - Select Columns Transformer",
         "Random Forest Classifier",
     ]
@@ -414,6 +416,7 @@ def test_make_split_pipeline_categorical_only(X_y_binary):
     compute_order = [
         "Select Columns Transformer",
         "Label Encoder",
+        "Imputer",
         "One Hot Encoder",
         "Random Forest Classifier",
     ]

diff --git a/evalml/tests/pipeline_tests/test_pipeline_utils.py b/evalml/tests/pipeline_tests/test_pipeline_utils.py
@@ -55,34 +55,33 @@
 @pytest.mark.parametrize("input_type", ["pd", "ww"])
 @pytest.mark.parametrize("problem_type", ProblemTypes.all_problem_types)
 @pytest.mark.parametrize(
-    "test_description, column_names, has_null",
+    "test_description, column_names",
     [
-        ("all nan is not categorical", ["all_null", "numerical"], True),
-        ("mixed types", ["all_null", "categorical", "dates", "numerical"], True),
-        ("no all_null columns", ["numerical", "categorical", "dates"], False),
-        ("date, numerical", ["dates", "numerical"], False),
-        ("only text", ["text"], False),
-        ("only dates", ["dates"], False),
-        ("only numerical", ["numerical"], False),
-        ("only ip", ["ip"], False),
-        ("only all_null", ["all_null"], True),
-        ("only categorical", ["categorical"], False),
-        ("text with other features", ["text", "numerical", "categorical"], False),
-        ("url with other features", ["url", "numerical", "categorical"], False),
-        ("ip with other features", ["ip", "numerical", "categorical"], False),
-        ("email with other features", ["email", "numerical", "categorical"], False),
-        ("only null int", ["int_null"], True),
-        ("only null bool", ["bool_null"], True),
-        ("only null age", ["age_null"], True),
-        ("nullable_types", ["numerical", "int_null", "bool_null", "age_null"], True),
+        ("all nan is not categorical", ["all_null", "numerical"]),
+        ("mixed types", ["all_null", "categorical", "dates", "numerical"]),
+        ("no all_null columns", ["numerical", "categorical", "dates"]),
+        ("date, numerical", ["dates", "numerical"]),
+        ("only text", ["text"]),
+        ("only dates", ["dates"]),
+        ("only numerical", ["numerical"]),
+        ("only ip", ["ip"]),
+        ("only all_null", ["all_null"]),
+        ("only categorical", ["categorical"]),
+        ("text with other features", ["text", "numerical", "categorical"]),
+        ("url with other features", ["url", "numerical", "categorical"]),
+        ("ip with other features", ["ip", "numerical", "categorical"]),
+        ("email with other features", ["email", "numerical", "categorical"]),
+        ("only null int", ["int_null"]),
+        ("only null bool", ["bool_null"]),
+        ("only null age", ["age_null"]),
+        ("nullable_types", ["numerical", "int_null", "bool_null", "age_null"]),
     ],
 )
 def test_make_pipeline(
     problem_type,
     input_type,
     test_description,
     column_names,
-    has_null,
     get_test_data_from_configuration,
 ):
 
@@ -149,11 +148,7 @@ def test_make_pipeline(
             )
             email_featurizer = [EmailFeaturizer] if "email" in column_names else []
             url_featurizer = [URLFeaturizer] if "url" in column_names else []
-            imputer = (
-                []
-                if ((column_names in [["ip"], ["all_null"]]) or not has_null)
-                else [Imputer]
-            )
+            imputer = [] if (column_names in [["ip"], ["all_null"]]) else [Imputer]
             drop_nan_rows_transformer = (
                 [DropNaNRowsTransformer]
                 if is_time_series(problem_type)
@@ -1023,7 +1018,7 @@ def test_make_pipeline_from_multiple_graphs_with_sampler(X_y_binary):
         problem_type=ProblemTypes.BINARY,
     )
     second_pipeline_sampler = (
-        "Pipeline w/ Label Encoder + Undersampler Pipeline 2 - Undersampler.y"
+        "Pipeline w/ Label Encoder + Imputer + Undersampler Pipeline 2 - Undersampler.y"
     )
     assert (
         combined_pipeline.component_graph.get_inputs("Random Forest Classifier")[2]
@@ -1066,14 +1061,13 @@ def test_make_pipeline_from_multiple_graphs_prior_components(X_y_binary):
         prior_components=prior_components,
         sub_pipeline_names=sub_pipeline_names,
     )
+
     assert (
-        combined_pipeline.component_graph.get_inputs("First Pipeline - Undersampler")[0]
+        combined_pipeline.component_graph.get_inputs("First Pipeline - Imputer")[0]
         == "DFS Transformer.x"
     )
     assert (
-        combined_pipeline.component_graph.get_inputs("Second Pipeline - Undersampler")[
-            0
-        ]
+        combined_pipeline.component_graph.get_inputs("Second Pipeline - Imputer")[0]
         == "DFS Transformer.x"
     )