Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Don't add Imputer to make_pipeline if no numeric / categorical columns #1967

Merged
merged 4 commits into from Mar 11, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/source/release_notes.rst
Expand Up @@ -3,6 +3,7 @@ Release Notes
**Future Releases**
* Enhancements
* Added multiple oversampling and undersampling methods as data splitters for imbalanced classification :pr:`1775`
* Updated ``make_pipeline`` to not add ``Imputer`` if input data does not have numeric or categorical columns :pr:`1967`
* Fixes
* Changes
* Documentation Changes
Expand Down
8 changes: 6 additions & 2 deletions evalml/pipelines/utils.py
@@ -1,5 +1,7 @@
import json

from woodwork import logical_types

from .binary_classification_pipeline import BinaryClassificationPipeline
from .multiclass_classification_pipeline import (
MulticlassClassificationPipeline
Expand Down Expand Up @@ -60,8 +62,10 @@ def _get_preprocessing_components(X, y, problem_type, estimator_class):
all_null_cols = X_pd.columns[X_pd.isnull().all()]
if len(all_null_cols) > 0:
pp_components.append(DropNullColumns)

pp_components.append(Imputer)
input_logical_types = set(X.logical_types.values())
types_imputer_handles = {logical_types.Boolean, logical_types.Categorical, logical_types.Double, logical_types.Integer}
if len(input_logical_types.intersection(types_imputer_handles)) > 0:
pp_components.append(Imputer)

text_columns = list(X.select('natural_language').columns)
if len(text_columns) > 0:
Expand Down
82 changes: 66 additions & 16 deletions evalml/tests/pipeline_tests/test_pipeline_utils.py
Expand Up @@ -92,10 +92,9 @@ def test_make_pipeline_all_nan_no_categoricals(input_type, problem_type):
pipeline = make_pipeline(X, y, estimator_class, problem_type)
assert isinstance(pipeline, type(pipeline_class))
assert pipeline.custom_hyperparameters is None
delayed_features = []
if is_time_series(problem_type):
delayed_features = [DelayedFeatureTransformer]
else:
delayed_features = []
if estimator_class.model_family == ModelFamily.LINEAR_MODEL:
estimator_components = [StandardScaler, estimator_class]
elif estimator_class.model_family == ModelFamily.CATBOOST:
Expand Down Expand Up @@ -126,10 +125,9 @@ def test_make_pipeline(input_type, problem_type):
pipeline = make_pipeline(X, y, estimator_class, problem_type)
assert isinstance(pipeline, type(pipeline_class))
assert pipeline.custom_hyperparameters is None
delayed_features = []
if is_time_series(problem_type):
delayed_features = [DelayedFeatureTransformer]
else:
delayed_features = []
if estimator_class.model_family == ModelFamily.LINEAR_MODEL:
estimator_components = [OneHotEncoder, StandardScaler, estimator_class]
elif estimator_class.model_family == ModelFamily.CATBOOST:
Expand Down Expand Up @@ -160,10 +158,9 @@ def test_make_pipeline_no_nulls(input_type, problem_type):
pipeline = make_pipeline(X, y, estimator_class, problem_type)
assert isinstance(pipeline, type(pipeline_class))
assert pipeline.custom_hyperparameters is None
delayed_features = []
if is_time_series(problem_type):
delayed_features = [DelayedFeatureTransformer]
else:
delayed_features = []
if estimator_class.model_family == ModelFamily.LINEAR_MODEL:
estimator_components = [OneHotEncoder, StandardScaler, estimator_class]
elif estimator_class.model_family == ModelFamily.CATBOOST:
Expand Down Expand Up @@ -194,10 +191,9 @@ def test_make_pipeline_no_datetimes(input_type, problem_type):
pipeline = make_pipeline(X, y, estimator_class, problem_type)
assert isinstance(pipeline, type(pipeline_class))
assert pipeline.custom_hyperparameters is None
delayed_features = []
if is_time_series(problem_type):
delayed_features = [DelayedFeatureTransformer]
else:
delayed_features = []
if estimator_class.model_family == ModelFamily.LINEAR_MODEL:
estimator_components = [OneHotEncoder, StandardScaler, estimator_class]
elif estimator_class.model_family == ModelFamily.CATBOOST:
Expand Down Expand Up @@ -225,10 +221,9 @@ def test_make_pipeline_no_column_names(input_type, problem_type):
pipeline = make_pipeline(X, y, estimator_class, problem_type)
assert isinstance(pipeline, type(pipeline_class))
assert pipeline.custom_hyperparameters is None
delayed_features = []
if is_time_series(problem_type):
delayed_features = [DelayedFeatureTransformer]
else:
delayed_features = []
if estimator_class.model_family == ModelFamily.LINEAR_MODEL:
estimator_components = [OneHotEncoder, StandardScaler, estimator_class]
elif estimator_class.model_family == ModelFamily.CATBOOST:
Expand Down Expand Up @@ -259,10 +254,9 @@ def test_make_pipeline_text_columns(input_type, problem_type):
pipeline = make_pipeline(X, y, estimator_class, problem_type)
assert isinstance(pipeline, type(pipeline_class))
assert pipeline.custom_hyperparameters is None
delayed_features = []
if is_time_series(problem_type):
delayed_features = [DelayedFeatureTransformer]
else:
delayed_features = []
if estimator_class.model_family == ModelFamily.LINEAR_MODEL:
estimator_components = [OneHotEncoder, StandardScaler, estimator_class]
elif estimator_class.model_family == ModelFamily.CATBOOST:
Expand All @@ -272,6 +266,64 @@ def test_make_pipeline_text_columns(input_type, problem_type):
assert pipeline.component_graph == [Imputer, TextFeaturizer] + delayed_features + estimator_components


@pytest.mark.parametrize("input_type", ["pd", "ww"])
@pytest.mark.parametrize("problem_type", ProblemTypes.all_problem_types)
def test_make_pipeline_only_text_columns(input_type, problem_type):
    """Check the pipeline built by ``make_pipeline`` when every feature is text.

    With only text columns in the input, the expected component graph is
    ``TextFeaturizer``, then ``DelayedFeatureTransformer`` for time series
    problems, then ``StandardScaler`` for linear-model estimators, then the
    estimator itself. No ``Imputer`` is expected.

    Arguments:
        input_type (str): "pd" to pass pandas objects, "ww" to pass woodwork
            ``DataTable`` / ``DataColumn`` objects.
        problem_type (ProblemTypes): Problem type under test.
    """
    X = pd.DataFrame({"text": ["string one", "the evalml team is full of wonderful people", "text for a column, this should be a text column!!", "text string", "hello world"],
                      "another text": ["ladidididididida", "cats are great", "text for a column, this should be a text column!!", "text string", "goodbye world"]})
    # Choose the target before any woodwork conversion so that it always has
    # one label per row of X and honours the requested input type.
    if problem_type == ProblemTypes.MULTICLASS:
        y = pd.Series([0, 2, 1, 2, 0])
    else:
        y = pd.Series([0, 0, 1, 1, 0])
    if input_type == 'ww':
        X = ww.DataTable(X)
        y = ww.DataColumn(y)

    estimators = get_estimators(problem_type=problem_type)
    pipeline_class = _get_pipeline_base_class(problem_type)

    for estimator_class in estimators:
        if problem_type not in estimator_class.supported_problem_types:
            continue
        pipeline = make_pipeline(X, y, estimator_class, problem_type)
        assert isinstance(pipeline, type(pipeline_class))
        assert pipeline.custom_hyperparameters is None

        delayed_features = [DelayedFeatureTransformer] if is_time_series(problem_type) else []
        standard_scaler = []
        if estimator_class.model_family == ModelFamily.LINEAR_MODEL:
            standard_scaler = [StandardScaler]
        assert pipeline.component_graph == [TextFeaturizer] + delayed_features + standard_scaler + [estimator_class]


@pytest.mark.parametrize("input_type", ["pd", "ww"])
@pytest.mark.parametrize("problem_type", ProblemTypes.all_problem_types)
def test_make_pipeline_only_datetime_columns(input_type, problem_type):
    """Check the pipeline built by ``make_pipeline`` when every feature is a datetime.

    With only datetime columns in the input, the expected component graph is
    ``DateTimeFeaturizer``, then ``DelayedFeatureTransformer`` for time series
    problems, then ``StandardScaler`` for linear-model estimators, then the
    estimator itself. No ``Imputer`` is expected.

    Arguments:
        input_type (str): "pd" to pass pandas objects, "ww" to pass woodwork
            ``DataTable`` / ``DataColumn`` objects.
        problem_type (ProblemTypes): Problem type under test.
    """
    X = pd.DataFrame({"some dates": pd.date_range('2000-02-03', periods=5, freq='W'),
                      "some other dates": pd.date_range('2000-05-19', periods=5, freq='W')})
    # Choose the target before any woodwork conversion so that it always has
    # one label per row of X and honours the requested input type.
    if problem_type == ProblemTypes.MULTICLASS:
        y = pd.Series([0, 2, 1, 2, 0])
    else:
        y = pd.Series([0, 0, 1, 1, 0])
    if input_type == 'ww':
        X = ww.DataTable(X)
        y = ww.DataColumn(y)

    estimators = get_estimators(problem_type=problem_type)
    pipeline_class = _get_pipeline_base_class(problem_type)

    for estimator_class in estimators:
        if problem_type not in estimator_class.supported_problem_types:
            continue
        pipeline = make_pipeline(X, y, estimator_class, problem_type)
        assert isinstance(pipeline, type(pipeline_class))
        assert pipeline.custom_hyperparameters is None

        delayed_features = [DelayedFeatureTransformer] if is_time_series(problem_type) else []
        standard_scaler = []
        if estimator_class.model_family == ModelFamily.LINEAR_MODEL:
            standard_scaler = [StandardScaler]
        assert pipeline.component_graph == [DateTimeFeaturizer] + delayed_features + standard_scaler + [estimator_class]


@pytest.mark.parametrize("problem_type", ProblemTypes.all_problem_types)
def test_make_pipeline_numpy_input(problem_type):
X = np.array([[1, 2, 0, np.nan], [2, 2, 1, np.nan], [5, 1, np.nan, np.nan]])
Expand All @@ -286,10 +338,9 @@ def test_make_pipeline_numpy_input(problem_type):
if problem_type in estimator_class.supported_problem_types:
pipeline = make_pipeline(X, y, estimator_class, problem_type)
assert isinstance(pipeline, type(pipeline_class))
delayed_features = []
if is_time_series(problem_type):
delayed_features = [DelayedFeatureTransformer]
else:
delayed_features = []
if estimator_class.model_family == ModelFamily.LINEAR_MODEL:
estimator_components = [StandardScaler, estimator_class]
else:
Expand Down Expand Up @@ -317,10 +368,9 @@ def test_make_pipeline_datetime_no_categorical(input_type, problem_type):
pipeline = make_pipeline(X, y, estimator_class, problem_type)
assert isinstance(pipeline, type(pipeline_class))
assert pipeline.custom_hyperparameters is None
delayed_features = []
if is_time_series(problem_type):
delayed_features = [DelayedFeatureTransformer]
else:
delayed_features = []
if estimator_class.model_family == ModelFamily.LINEAR_MODEL:
estimator_components = [StandardScaler, estimator_class]
elif estimator_class.model_family == ModelFamily.CATBOOST:
Expand Down