diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst index d529bd5a88..8a127e460a 100644 --- a/docs/source/release_notes.rst +++ b/docs/source/release_notes.rst @@ -4,6 +4,7 @@ Release Notes * Enhancements * Fixes * ``IDColumnsDataCheck`` now only returns an action code to set the first column as the primary key if it contains unique values :pr:`3639` + * Reverted the ``make_pipeline`` changes that conditionally included the imputers :pr:`3672` * Changes * Documentation Changes * Testing Changes diff --git a/evalml/pipelines/classification_pipeline.py b/evalml/pipelines/classification_pipeline.py index 5e3c7922a2..e31393a218 100644 --- a/evalml/pipelines/classification_pipeline.py +++ b/evalml/pipelines/classification_pipeline.py @@ -115,7 +115,6 @@ def predict(self, X, objective=None, X_train=None, y_train=None): Returns: pd.Series: Estimated labels. """ - X = infer_feature_types(X) _predictions = self._predict(X, objective=objective) predictions = self.inverse_transform(_predictions.astype(int)) predictions = pd.Series( diff --git a/evalml/pipelines/utils.py b/evalml/pipelines/utils.py index 134a19bb9e..553c978fad 100644 --- a/evalml/pipelines/utils.py +++ b/evalml/pipelines/utils.py @@ -168,10 +168,9 @@ def _get_imputer(X, y, problem_type, estimator_class, sampler_name=None): logical_types.Datetime, } - if ( - len(input_logical_types.intersection(types_imputer_handles)) - or len(text_columns) - ) and X.isna().any().any(): + if len(input_logical_types.intersection(types_imputer_handles)) or len( + text_columns, + ): components.append(Imputer) return components diff --git a/evalml/tests/automl_tests/test_automl.py b/evalml/tests/automl_tests/test_automl.py index bc4753c9d2..1a46f3aba9 100644 --- a/evalml/tests/automl_tests/test_automl.py +++ b/evalml/tests/automl_tests/test_automl.py @@ -3177,6 +3177,7 @@ def test_automl_validate_objective(non_core_objective, X_y_regression): def test_automl_pipeline_params_simple(AutoMLTestEnv, X_y_binary): X, y = X_y_binary params = { + "Imputer": {"numeric_impute_strategy": "most_frequent"}, "Logistic Regression Classifier": {"C": 10, "penalty": "l2"}, "Elastic Net Classifier": {"l1_ratio": 0.2}, } @@ -3192,6 +3193,11 @@ def test_automl_pipeline_params_simple(AutoMLTestEnv, X_y_binary): with env.test_context(score_return_value={automl.objective.name: 1.23}): automl.search() for i, row in automl.rankings.iterrows(): + if "Imputer" in row["parameters"]: + assert ( + row["parameters"]["Imputer"]["numeric_impute_strategy"] + == "most_frequent" + ) if "Logistic Regression Classifier" in row["parameters"]: assert row["parameters"]["Logistic Regression Classifier"]["C"] == 10 assert ( @@ -3203,7 +3209,6 @@ def test_automl_pipeline_params_simple(AutoMLTestEnv, X_y_binary): def test_automl_pipeline_params_multiple(AutoMLTestEnv, X_y_regression): X, y = X_y_regression - X[0][2] = None # introduce a null value to include the imputer hyperparams = { "Imputer": { "numeric_impute_strategy": Categorical(["median", "most_frequent"]), @@ -3252,6 +3257,7 @@ def test_automl_pipeline_params_multiple(AutoMLTestEnv, X_y_regression): def test_automl_pipeline_params_kwargs(AutoMLTestEnv, X_y_multi): X, y = X_y_multi hyperparams = { + "Imputer": {"numeric_impute_strategy": Categorical(["most_frequent"])}, "Decision Tree Classifier": { "max_depth": Integer(1, 2), "ccp_alpha": Real(0.1, 0.5), @@ -3269,6 +3275,11 @@ def test_automl_pipeline_params_kwargs(AutoMLTestEnv, X_y_multi): with env.test_context(score_return_value={automl.objective.name: 1.0}): automl.search() for i, row in automl.rankings.iterrows(): + if "Imputer" in row["parameters"]: + assert ( + row["parameters"]["Imputer"]["numeric_impute_strategy"] + == "most_frequent" + ) if "Decision Tree Classifier" in row["parameters"]: assert ( 0.1 < row["parameters"]["Decision Tree Classifier"]["ccp_alpha"] < 0.5 @@ -4741,16 +4752,13 @@ def test_search_parameters_held_automl( break assert found_dtc for tuners in aml.automl_algorithm._tuners.values(): - if "Imputer" in tuners._pipeline_hyperparameter_ranges: - assert ( - tuners._pipeline_hyperparameter_ranges["Imputer"][ - "numeric_impute_strategy" - ] - == expected - ) - assert tuners._pipeline_hyperparameter_ranges["Imputer"][ - "categorical_impute_strategy" - ] == ["most_frequent"] + assert ( + tuners._pipeline_hyperparameter_ranges["Imputer"]["numeric_impute_strategy"] + == expected + ) + assert tuners._pipeline_hyperparameter_ranges["Imputer"][ + "categorical_impute_strategy" + ] == ["most_frequent"] # make sure that there are no set hyperparameters when we don't have defaults assert tuners._pipeline_hyperparameter_ranges["Label Encoder"] == {} assert tuners.propose()["Label Encoder"] == {} diff --git a/evalml/tests/automl_tests/test_automl_iterative_algorithm.py b/evalml/tests/automl_tests/test_automl_iterative_algorithm.py index 80936e7b23..e595d49805 100644 --- a/evalml/tests/automl_tests/test_automl_iterative_algorithm.py +++ b/evalml/tests/automl_tests/test_automl_iterative_algorithm.py @@ -553,11 +553,10 @@ def test_pipeline_custom_hyperparameters_make_pipeline( "Column_2", ] if custom_hyperparameters: - if component_graphs: - assert ( - row["parameters"]["Imputer"]["numeric_impute_strategy"] - in search_parameters_["Imputer"]["numeric_impute_strategy"] - ) + assert ( + row["parameters"]["Imputer"]["numeric_impute_strategy"] + in search_parameters_["Imputer"]["numeric_impute_strategy"] + ) assert ( 4 <= row["parameters"]["Random Forest Classifier"]["max_depth"] <= 7 ) @@ -567,12 +566,11 @@ def test_pipeline_custom_hyperparameters_make_pipeline( <= 210 ) else: - if component_graphs: - assert row["parameters"]["Imputer"]["numeric_impute_strategy"] in [ - "mean", - "median", - "most_frequent", - ] + assert row["parameters"]["Imputer"]["numeric_impute_strategy"] in [ + "mean", + "median", + "most_frequent", + ] assert ( 1 <= row["parameters"]["Random Forest Classifier"]["max_depth"] @@ -994,12 +992,9 @@ def test_pipeline_parameter_warnings_component_graphs( env = AutoMLTestEnv("binary") with env.test_context(score_return_value={automl.objective.name: 1.0}): automl.search() - if len(w) == 1 and not len(set_values): - assert "components {'Imputer'} will not be used" in str(w[0].message) - else: - assert len(w) == (1 if len(set_values) else 0) - if len(w): - assert w[0].message.components == set_values + assert len(w) == (1 if len(set_values) else 0) + if len(w): + assert w[0].message.components == set_values def test_graph_automl(X_y_multi): diff --git a/evalml/tests/automl_tests/test_default_algorithm.py b/evalml/tests/automl_tests/test_default_algorithm.py index 4fe6f738c5..5eba1dfc17 100644 --- a/evalml/tests/automl_tests/test_default_algorithm.py +++ b/evalml/tests/automl_tests/test_default_algorithm.py @@ -377,9 +377,11 @@ def test_make_split_pipeline(X_y_binary): "Label Encoder", "Categorical Pipeline - Select Columns Transformer", "Categorical Pipeline - Label Encoder", + "Categorical Pipeline - Imputer", "Categorical Pipeline - One Hot Encoder", "Numeric Pipeline - Select Columns By Type Transformer", "Numeric Pipeline - Label Encoder", + "Numeric Pipeline - Imputer", "Numeric Pipeline - Select Columns Transformer", "Random Forest Classifier", ] @@ -414,6 +416,7 @@ def test_make_split_pipeline_categorical_only(X_y_binary): compute_order = [ "Select Columns Transformer", "Label Encoder", + "Imputer", "One Hot Encoder", "Random Forest Classifier", ] diff --git a/evalml/tests/pipeline_tests/test_pipeline_utils.py b/evalml/tests/pipeline_tests/test_pipeline_utils.py index 6baefc53f5..d186ed4968 100644 --- a/evalml/tests/pipeline_tests/test_pipeline_utils.py +++ b/evalml/tests/pipeline_tests/test_pipeline_utils.py @@ -55,26 +55,26 @@ @pytest.mark.parametrize("input_type", ["pd", "ww"]) @pytest.mark.parametrize("problem_type", ProblemTypes.all_problem_types) @pytest.mark.parametrize( - "test_description, column_names, has_null", + "test_description, column_names", [ - ("all nan is not categorical", ["all_null", "numerical"], True), - ("mixed types", ["all_null", "categorical", "dates", "numerical"], True), - ("no all_null columns", ["numerical", "categorical", "dates"], False), - ("date, numerical", ["dates", "numerical"], False), - ("only text", ["text"], False), - ("only dates", ["dates"], False), - ("only numerical", ["numerical"], False), - ("only ip", ["ip"], False), - ("only all_null", ["all_null"], True), - ("only categorical", ["categorical"], False), - ("text with other features", ["text", "numerical", "categorical"], False), - ("url with other features", ["url", "numerical", "categorical"], False), - ("ip with other features", ["ip", "numerical", "categorical"], False), - ("email with other features", ["email", "numerical", "categorical"], False), - ("only null int", ["int_null"], True), - ("only null bool", ["bool_null"], True), - ("only null age", ["age_null"], True), - ("nullable_types", ["numerical", "int_null", "bool_null", "age_null"], True), + ("all nan is not categorical", ["all_null", "numerical"]), + ("mixed types", ["all_null", "categorical", "dates", "numerical"]), + ("no all_null columns", ["numerical", "categorical", "dates"]), + ("date, numerical", ["dates", "numerical"]), + ("only text", ["text"]), + ("only dates", ["dates"]), + ("only numerical", ["numerical"]), + ("only ip", ["ip"]), + ("only all_null", ["all_null"]), + ("only categorical", ["categorical"]), + ("text with other features", ["text", "numerical", "categorical"]), + ("url with other features", ["url", "numerical", "categorical"]), + ("ip with other features", ["ip", "numerical", "categorical"]), + ("email with other features", ["email", "numerical", "categorical"]), + ("only null int", ["int_null"]), + ("only null bool", ["bool_null"]), + ("only null age", ["age_null"]), + ("nullable_types", ["numerical", "int_null", "bool_null", "age_null"]), ], ) def test_make_pipeline( @@ -82,7 +82,6 @@ def test_make_pipeline( input_type, test_description, column_names, - has_null, get_test_data_from_configuration, ): @@ -149,11 +148,7 @@ def test_make_pipeline( ) email_featurizer = [EmailFeaturizer] if "email" in column_names else [] url_featurizer = [URLFeaturizer] if "url" in column_names else [] - imputer = ( - [] - if ((column_names in [["ip"], ["all_null"]]) or not has_null) - else [Imputer] - ) + imputer = [] if (column_names in [["ip"], ["all_null"]]) else [Imputer] drop_nan_rows_transformer = ( [DropNaNRowsTransformer] if is_time_series(problem_type) @@ -1023,7 +1018,7 @@ def test_make_pipeline_from_multiple_graphs_with_sampler(X_y_binary): problem_type=ProblemTypes.BINARY, ) second_pipeline_sampler = ( - "Pipeline w/ Label Encoder + Undersampler Pipeline 2 - Undersampler.y" + "Pipeline w/ Label Encoder + Imputer + Undersampler Pipeline 2 - Undersampler.y" ) assert ( combined_pipeline.component_graph.get_inputs("Random Forest Classifier")[2] @@ -1066,14 +1061,13 @@ def test_make_pipeline_from_multiple_graphs_prior_components(X_y_binary): prior_components=prior_components, sub_pipeline_names=sub_pipeline_names, ) + assert ( - combined_pipeline.component_graph.get_inputs("First Pipeline - Undersampler")[0] + combined_pipeline.component_graph.get_inputs("First Pipeline - Imputer")[0] == "DFS Transformer.x" ) assert ( - combined_pipeline.component_graph.get_inputs("Second Pipeline - Undersampler")[ - 0 - ] + combined_pipeline.component_graph.get_inputs("Second Pipeline - Imputer")[0] == "DFS Transformer.x" )