Remove dead code in _get_preprocessing_components (#896)
* init name update

* linting and fixing tests

* Changelog

* past tense

* add comment

* clean up y for binary

* lint
angela97lin committed Jul 1, 2020
1 parent f30a457 commit 3f20a06
Showing 3 changed files with 20 additions and 3 deletions.
1 change: 1 addition & 0 deletions docs/source/changelog.rst
@@ -48,6 +48,7 @@ Changelog
* Renamed feature_importances and permutation_importances methods to use singular names (feature_importance and permutation_importance) :pr:`883`
* Updated `automl` default data splitter to train/validation split for large datasets :pr:`877`
* Added open source license, update some repo metadata :pr:`887`
* Removed dead code in `_get_preprocessing_components` :pr:`896`
* Documentation Changes
* Fix some typos and update the EvalML logo :pr:`872`
* Testing Changes
1 change: 0 additions & 1 deletion evalml/pipelines/utils.py
@@ -215,7 +215,6 @@ def _get_preprocessing_components(X, y, problem_type, estimator_class):
    all_null_cols = X.columns[X.isnull().all()]
    if len(all_null_cols) > 0:
        pp_components.append(DropNullColumns)
        X = X.drop(all_null_cols, axis=1)
    pp_components.append(SimpleImputer)

    datetime_cols = X.select_dtypes(include=[np.datetime64])
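A note on why the deleted line was dead code: the assignment only changed the local `X` inside `_get_preprocessing_components`, and a column that is entirely NaN is stored with a float64 dtype, so the dtype-based selections that follow (the `np.datetime64` check shown above and, presumably, an object-dtype check for categorical columns) never match it in the first place. A minimal standalone pandas sketch of that behavior, illustrative only and not EvalML code:

import numpy as np
import pandas as pd

X = pd.DataFrame({"all_null": [np.nan, np.nan, np.nan, np.nan, np.nan],
                  "num": [1, 2, 3, 4, 5]})

# A column of all NaN is float64, so it is neither datetime nor object/categorical.
print(X.dtypes["all_null"])                                     # float64
print(list(X.select_dtypes(include=[np.datetime64]).columns))   # []
print(list(X.select_dtypes(include=["object"]).columns))        # []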
21 changes: 19 additions & 2 deletions evalml/tests/pipeline_tests/test_pipelines.py
@@ -144,11 +144,25 @@ def test_get_pipelines_core_dependencies_mock():
        get_pipelines(problem_type="Not A Valid Problem Type")


def test_make_pipeline_all_nan_no_categoricals():
    # testing that all_null column is not considered categorical
    X = pd.DataFrame({"all_null": [np.nan, np.nan, np.nan, np.nan, np.nan],
                      "num": [1, 2, 3, 4, 5]})
    y = pd.Series([0, 0, 1, 1, 0])
    binary_pipeline = make_pipeline(X, y, LogisticRegressionClassifier, ProblemTypes.BINARY)
    assert isinstance(binary_pipeline, type(BinaryClassificationPipeline))
    assert binary_pipeline.component_graph == [DropNullColumns, SimpleImputer, StandardScaler, LogisticRegressionClassifier]

    binary_pipeline = make_pipeline(X, y, RandomForestClassifier, ProblemTypes.BINARY)
    assert isinstance(binary_pipeline, type(BinaryClassificationPipeline))
    assert binary_pipeline.component_graph == [DropNullColumns, SimpleImputer, RandomForestClassifier]


def test_make_pipeline():
    X = pd.DataFrame({"all_null": [np.nan, np.nan, np.nan, np.nan, np.nan],
                      "categorical": ["a", "b", "a", "c", "c"],
                      "some dates": pd.date_range('2000-02-03', periods=5, freq='W')})
    y = pd.Series([0, 0, 1, 2, 0])
    y = pd.Series([0, 0, 1, 0, 0])
    binary_pipeline = make_pipeline(X, y, LogisticRegressionClassifier, ProblemTypes.BINARY)
    assert isinstance(binary_pipeline, type(BinaryClassificationPipeline))
    assert binary_pipeline.component_graph == [DropNullColumns, SimpleImputer, DateTimeFeaturization, OneHotEncoder, StandardScaler, LogisticRegressionClassifier]
@@ -157,6 +171,7 @@ def test_make_pipeline():
    assert isinstance(binary_pipeline, type(BinaryClassificationPipeline))
    assert binary_pipeline.component_graph == [DropNullColumns, SimpleImputer, DateTimeFeaturization, OneHotEncoder, RandomForestClassifier]

    y = pd.Series([0, 2, 1, 2, 0])
    multiclass_pipeline = make_pipeline(X, y, LogisticRegressionClassifier, ProblemTypes.MULTICLASS)
    assert isinstance(multiclass_pipeline, type(MulticlassClassificationPipeline))
    assert multiclass_pipeline.component_graph == [DropNullColumns, SimpleImputer, DateTimeFeaturization, OneHotEncoder, StandardScaler, LogisticRegressionClassifier]
@@ -195,11 +210,13 @@ def test_make_pipeline_no_nulls():
    X = pd.DataFrame({"numerical": [1, 2, 3, 1, 2],
                      "categorical": ["a", "b", "a", "c", "c"],
                      "some dates": pd.date_range('2000-02-03', periods=5, freq='W')})
    y = pd.Series([0, 0, 1, 2, 0])
    y = pd.Series([0, 1, 1, 0, 0])
    binary_pipeline = make_pipeline(X, y, LogisticRegressionClassifier, ProblemTypes.BINARY)
    assert isinstance(binary_pipeline, type(BinaryClassificationPipeline))
    assert binary_pipeline.component_graph == [SimpleImputer, DateTimeFeaturization, OneHotEncoder, StandardScaler, LogisticRegressionClassifier]
    assert binary_pipeline.custom_hyperparameters == {'Simple Imputer': {'impute_strategy': ['most_frequent']}}

    y = pd.Series([0, 2, 1, 2, 0])
    multiclass_pipeline = make_pipeline(X, y, LogisticRegressionClassifier, ProblemTypes.MULTICLASS)
    assert isinstance(multiclass_pipeline, type(MulticlassClassificationPipeline))
    assert multiclass_pipeline.component_graph == [SimpleImputer, DateTimeFeaturization, OneHotEncoder, StandardScaler, LogisticRegressionClassifier]
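On the "clean up y for binary" change reflected in the test targets above: the binary-problem tests now use a target with exactly two classes, while the multiclass tests get a separate three-class target; the previous `[0, 0, 1, 2, 0]` series contains three classes and so was not a proper binary target. A quick illustrative check with plain pandas, nothing EvalML-specific:

import pandas as pd

print(pd.Series([0, 0, 1, 0, 0]).nunique())  # 2 classes: suitable for the binary tests
print(pd.Series([0, 2, 1, 2, 0]).nunique())  # 3 classes: suitable for the multiclass tests
print(pd.Series([0, 0, 1, 2, 0]).nunique())  # 3 classes: the old value, not a two-class target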
