Skip to content

Commit

Permalink
Revert "Conditionally include the imputer in pipelines (#3657)" (#3672)
Browse files Browse the repository at this point in the history
* Revert "Conditionally include the imputer in pipelines (#3657)"

This reverts commit ceebe08.

* Release.

* Fixed release.
  • Loading branch information
chukarsten committed Aug 19, 2022
1 parent 85472b6 commit 2ae367e
Show file tree
Hide file tree
Showing 7 changed files with 62 additions and 63 deletions.
1 change: 1 addition & 0 deletions docs/source/release_notes.rst
Expand Up @@ -4,6 +4,7 @@ Release Notes
* Enhancements
* Fixes
* ``IDColumnsDataCheck`` now only returns an action code to set the first column as the primary key if it contains unique values :pr:`3639`
* Reverted the ``make_pipeline`` changes that conditionally included the imputers :pr:`3672`
* Changes
* Documentation Changes
* Testing Changes
Expand Down
1 change: 0 additions & 1 deletion evalml/pipelines/classification_pipeline.py
Expand Up @@ -115,7 +115,6 @@ def predict(self, X, objective=None, X_train=None, y_train=None):
Returns:
pd.Series: Estimated labels.
"""
X = infer_feature_types(X)
_predictions = self._predict(X, objective=objective)
predictions = self.inverse_transform(_predictions.astype(int))
predictions = pd.Series(
Expand Down
7 changes: 3 additions & 4 deletions evalml/pipelines/utils.py
Expand Up @@ -168,10 +168,9 @@ def _get_imputer(X, y, problem_type, estimator_class, sampler_name=None):
logical_types.Datetime,
}

if (
len(input_logical_types.intersection(types_imputer_handles))
or len(text_columns)
) and X.isna().any().any():
if len(input_logical_types.intersection(types_imputer_handles)) or len(
text_columns,
):
components.append(Imputer)

return components
Expand Down
30 changes: 19 additions & 11 deletions evalml/tests/automl_tests/test_automl.py
Expand Up @@ -3177,6 +3177,7 @@ def test_automl_validate_objective(non_core_objective, X_y_regression):
def test_automl_pipeline_params_simple(AutoMLTestEnv, X_y_binary):
X, y = X_y_binary
params = {
"Imputer": {"numeric_impute_strategy": "most_frequent"},
"Logistic Regression Classifier": {"C": 10, "penalty": "l2"},
"Elastic Net Classifier": {"l1_ratio": 0.2},
}
Expand All @@ -3192,6 +3193,11 @@ def test_automl_pipeline_params_simple(AutoMLTestEnv, X_y_binary):
with env.test_context(score_return_value={automl.objective.name: 1.23}):
automl.search()
for i, row in automl.rankings.iterrows():
if "Imputer" in row["parameters"]:
assert (
row["parameters"]["Imputer"]["numeric_impute_strategy"]
== "most_frequent"
)
if "Logistic Regression Classifier" in row["parameters"]:
assert row["parameters"]["Logistic Regression Classifier"]["C"] == 10
assert (
Expand All @@ -3203,7 +3209,6 @@ def test_automl_pipeline_params_simple(AutoMLTestEnv, X_y_binary):

def test_automl_pipeline_params_multiple(AutoMLTestEnv, X_y_regression):
X, y = X_y_regression
X[0][2] = None # introduce a null value to include the imputer
hyperparams = {
"Imputer": {
"numeric_impute_strategy": Categorical(["median", "most_frequent"]),
Expand Down Expand Up @@ -3252,6 +3257,7 @@ def test_automl_pipeline_params_multiple(AutoMLTestEnv, X_y_regression):
def test_automl_pipeline_params_kwargs(AutoMLTestEnv, X_y_multi):
X, y = X_y_multi
hyperparams = {
"Imputer": {"numeric_impute_strategy": Categorical(["most_frequent"])},
"Decision Tree Classifier": {
"max_depth": Integer(1, 2),
"ccp_alpha": Real(0.1, 0.5),
Expand All @@ -3269,6 +3275,11 @@ def test_automl_pipeline_params_kwargs(AutoMLTestEnv, X_y_multi):
with env.test_context(score_return_value={automl.objective.name: 1.0}):
automl.search()
for i, row in automl.rankings.iterrows():
if "Imputer" in row["parameters"]:
assert (
row["parameters"]["Imputer"]["numeric_impute_strategy"]
== "most_frequent"
)
if "Decision Tree Classifier" in row["parameters"]:
assert (
0.1 < row["parameters"]["Decision Tree Classifier"]["ccp_alpha"] < 0.5
Expand Down Expand Up @@ -4741,16 +4752,13 @@ def test_search_parameters_held_automl(
break
assert found_dtc
for tuners in aml.automl_algorithm._tuners.values():
if "Imputer" in tuners._pipeline_hyperparameter_ranges:
assert (
tuners._pipeline_hyperparameter_ranges["Imputer"][
"numeric_impute_strategy"
]
== expected
)
assert tuners._pipeline_hyperparameter_ranges["Imputer"][
"categorical_impute_strategy"
] == ["most_frequent"]
assert (
tuners._pipeline_hyperparameter_ranges["Imputer"]["numeric_impute_strategy"]
== expected
)
assert tuners._pipeline_hyperparameter_ranges["Imputer"][
"categorical_impute_strategy"
] == ["most_frequent"]
# make sure that there are no set hyperparameters when we don't have defaults
assert tuners._pipeline_hyperparameter_ranges["Label Encoder"] == {}
assert tuners.propose()["Label Encoder"] == {}
Expand Down
29 changes: 12 additions & 17 deletions evalml/tests/automl_tests/test_automl_iterative_algorithm.py
Expand Up @@ -553,11 +553,10 @@ def test_pipeline_custom_hyperparameters_make_pipeline(
"Column_2",
]
if custom_hyperparameters:
if component_graphs:
assert (
row["parameters"]["Imputer"]["numeric_impute_strategy"]
in search_parameters_["Imputer"]["numeric_impute_strategy"]
)
assert (
row["parameters"]["Imputer"]["numeric_impute_strategy"]
in search_parameters_["Imputer"]["numeric_impute_strategy"]
)
assert (
4 <= row["parameters"]["Random Forest Classifier"]["max_depth"] <= 7
)
Expand All @@ -567,12 +566,11 @@ def test_pipeline_custom_hyperparameters_make_pipeline(
<= 210
)
else:
if component_graphs:
assert row["parameters"]["Imputer"]["numeric_impute_strategy"] in [
"mean",
"median",
"most_frequent",
]
assert row["parameters"]["Imputer"]["numeric_impute_strategy"] in [
"mean",
"median",
"most_frequent",
]
assert (
1
<= row["parameters"]["Random Forest Classifier"]["max_depth"]
Expand Down Expand Up @@ -994,12 +992,9 @@ def test_pipeline_parameter_warnings_component_graphs(
env = AutoMLTestEnv("binary")
with env.test_context(score_return_value={automl.objective.name: 1.0}):
automl.search()
if len(w) == 1 and not len(set_values):
assert "components {'Imputer'} will not be used" in str(w[0].message)
else:
assert len(w) == (1 if len(set_values) else 0)
if len(w):
assert w[0].message.components == set_values
assert len(w) == (1 if len(set_values) else 0)
if len(w):
assert w[0].message.components == set_values


def test_graph_automl(X_y_multi):
Expand Down
3 changes: 3 additions & 0 deletions evalml/tests/automl_tests/test_default_algorithm.py
Expand Up @@ -377,9 +377,11 @@ def test_make_split_pipeline(X_y_binary):
"Label Encoder",
"Categorical Pipeline - Select Columns Transformer",
"Categorical Pipeline - Label Encoder",
"Categorical Pipeline - Imputer",
"Categorical Pipeline - One Hot Encoder",
"Numeric Pipeline - Select Columns By Type Transformer",
"Numeric Pipeline - Label Encoder",
"Numeric Pipeline - Imputer",
"Numeric Pipeline - Select Columns Transformer",
"Random Forest Classifier",
]
Expand Down Expand Up @@ -414,6 +416,7 @@ def test_make_split_pipeline_categorical_only(X_y_binary):
compute_order = [
"Select Columns Transformer",
"Label Encoder",
"Imputer",
"One Hot Encoder",
"Random Forest Classifier",
]
Expand Down
54 changes: 24 additions & 30 deletions evalml/tests/pipeline_tests/test_pipeline_utils.py
Expand Up @@ -55,34 +55,33 @@
@pytest.mark.parametrize("input_type", ["pd", "ww"])
@pytest.mark.parametrize("problem_type", ProblemTypes.all_problem_types)
@pytest.mark.parametrize(
"test_description, column_names, has_null",
"test_description, column_names",
[
("all nan is not categorical", ["all_null", "numerical"], True),
("mixed types", ["all_null", "categorical", "dates", "numerical"], True),
("no all_null columns", ["numerical", "categorical", "dates"], False),
("date, numerical", ["dates", "numerical"], False),
("only text", ["text"], False),
("only dates", ["dates"], False),
("only numerical", ["numerical"], False),
("only ip", ["ip"], False),
("only all_null", ["all_null"], True),
("only categorical", ["categorical"], False),
("text with other features", ["text", "numerical", "categorical"], False),
("url with other features", ["url", "numerical", "categorical"], False),
("ip with other features", ["ip", "numerical", "categorical"], False),
("email with other features", ["email", "numerical", "categorical"], False),
("only null int", ["int_null"], True),
("only null bool", ["bool_null"], True),
("only null age", ["age_null"], True),
("nullable_types", ["numerical", "int_null", "bool_null", "age_null"], True),
("all nan is not categorical", ["all_null", "numerical"]),
("mixed types", ["all_null", "categorical", "dates", "numerical"]),
("no all_null columns", ["numerical", "categorical", "dates"]),
("date, numerical", ["dates", "numerical"]),
("only text", ["text"]),
("only dates", ["dates"]),
("only numerical", ["numerical"]),
("only ip", ["ip"]),
("only all_null", ["all_null"]),
("only categorical", ["categorical"]),
("text with other features", ["text", "numerical", "categorical"]),
("url with other features", ["url", "numerical", "categorical"]),
("ip with other features", ["ip", "numerical", "categorical"]),
("email with other features", ["email", "numerical", "categorical"]),
("only null int", ["int_null"]),
("only null bool", ["bool_null"]),
("only null age", ["age_null"]),
("nullable_types", ["numerical", "int_null", "bool_null", "age_null"]),
],
)
def test_make_pipeline(
problem_type,
input_type,
test_description,
column_names,
has_null,
get_test_data_from_configuration,
):

Expand Down Expand Up @@ -149,11 +148,7 @@ def test_make_pipeline(
)
email_featurizer = [EmailFeaturizer] if "email" in column_names else []
url_featurizer = [URLFeaturizer] if "url" in column_names else []
imputer = (
[]
if ((column_names in [["ip"], ["all_null"]]) or not has_null)
else [Imputer]
)
imputer = [] if (column_names in [["ip"], ["all_null"]]) else [Imputer]
drop_nan_rows_transformer = (
[DropNaNRowsTransformer]
if is_time_series(problem_type)
Expand Down Expand Up @@ -1023,7 +1018,7 @@ def test_make_pipeline_from_multiple_graphs_with_sampler(X_y_binary):
problem_type=ProblemTypes.BINARY,
)
second_pipeline_sampler = (
"Pipeline w/ Label Encoder + Undersampler Pipeline 2 - Undersampler.y"
"Pipeline w/ Label Encoder + Imputer + Undersampler Pipeline 2 - Undersampler.y"
)
assert (
combined_pipeline.component_graph.get_inputs("Random Forest Classifier")[2]
Expand Down Expand Up @@ -1066,14 +1061,13 @@ def test_make_pipeline_from_multiple_graphs_prior_components(X_y_binary):
prior_components=prior_components,
sub_pipeline_names=sub_pipeline_names,
)

assert (
combined_pipeline.component_graph.get_inputs("First Pipeline - Undersampler")[0]
combined_pipeline.component_graph.get_inputs("First Pipeline - Imputer")[0]
== "DFS Transformer.x"
)
assert (
combined_pipeline.component_graph.get_inputs("Second Pipeline - Undersampler")[
0
]
combined_pipeline.component_graph.get_inputs("Second Pipeline - Imputer")[0]
== "DFS Transformer.x"
)

Expand Down

0 comments on commit 2ae367e

Please sign in to comment.