Remove dead code in _get_preprocessing_components (#896)
* init name update

* linting and fixing tests

* Changelog

* past tense

* add comment

* clean up y for binary

* lint
angela97lin committed Jul 1, 2020
1 parent f30a457 commit 3f20a06
Showing 3 changed files with 20 additions and 3 deletions.
1 change: 1 addition & 0 deletions docs/source/changelog.rst
@@ -48,6 +48,7 @@ Changelog
* Renamed feature_importances and permutation_importances methods to use singular names (feature_importance and permutation_importance) :pr:`883`
* Updated `automl` default data splitter to train/validation split for large datasets :pr:`877`
* Added open source license, update some repo metadata :pr:`887`
* Removed dead code in `_get_preprocessing_components` :pr:`896`
* Documentation Changes
* Fix some typos and update the EvalML logo :pr:`872`
* Testing Changes
1 change: 0 additions & 1 deletion evalml/pipelines/utils.py
@@ -215,7 +215,6 @@ def _get_preprocessing_components(X, y, problem_type, estimator_class):
    all_null_cols = X.columns[X.isnull().all()]
    if len(all_null_cols) > 0:
        pp_components.append(DropNullColumns)
        X = X.drop(all_null_cols, axis=1)
    pp_components.append(SimpleImputer)

    datetime_cols = X.select_dtypes(include=[np.datetime64])
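A note on why the deleted line was dead code: the assignment only changed the local `X` inside `_get_preprocessing_components`, and a column that is entirely NaN is stored with a float64 dtype, so the dtype-based selections that follow (the `np.datetime64` check shown above and, presumably, an object-dtype check for categorical columns) never match it in the first place. A minimal standalone pandas sketch of that behavior, illustrative only and not EvalML code:

import numpy as np
import pandas as pd

X = pd.DataFrame({"all_null": [np.nan, np.nan, np.nan, np.nan, np.nan],
                  "num": [1, 2, 3, 4, 5]})

# A column of all NaN is float64, so it is neither datetime nor object/categorical.
print(X.dtypes["all_null"])                                     # float64
print(list(X.select_dtypes(include=[np.datetime64]).columns))   # []
print(list(X.select_dtypes(include=["object"]).columns))        # []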
21 changes: 19 additions & 2 deletions evalml/tests/pipeline_tests/test_pipelines.py
@@ -144,11 +144,25 @@ def test_get_pipelines_core_dependencies_mock():
        get_pipelines(problem_type="Not A Valid Problem Type")


def test_make_pipeline_all_nan_no_categoricals():
    # testing that all_null column is not considered categorical
    X = pd.DataFrame({"all_null": [np.nan, np.nan, np.nan, np.nan, np.nan],
                      "num": [1, 2, 3, 4, 5]})
    y = pd.Series([0, 0, 1, 1, 0])
    binary_pipeline = make_pipeline(X, y, LogisticRegressionClassifier, ProblemTypes.BINARY)
    assert isinstance(binary_pipeline, type(BinaryClassificationPipeline))
    assert binary_pipeline.component_graph == [DropNullColumns, SimpleImputer, StandardScaler, LogisticRegressionClassifier]

    binary_pipeline = make_pipeline(X, y, RandomForestClassifier, ProblemTypes.BINARY)
    assert isinstance(binary_pipeline, type(BinaryClassificationPipeline))
    assert binary_pipeline.component_graph == [DropNullColumns, SimpleImputer, RandomForestClassifier]


def test_make_pipeline():
    X = pd.DataFrame({"all_null": [np.nan, np.nan, np.nan, np.nan, np.nan],
                      "categorical": ["a", "b", "a", "c", "c"],
                      "some dates": pd.date_range('2000-02-03', periods=5, freq='W')})
    y = pd.Series([0, 0, 1, 2, 0])
    y = pd.Series([0, 0, 1, 0, 0])
    binary_pipeline = make_pipeline(X, y, LogisticRegressionClassifier, ProblemTypes.BINARY)
    assert isinstance(binary_pipeline, type(BinaryClassificationPipeline))
    assert binary_pipeline.component_graph == [DropNullColumns, SimpleImputer, DateTimeFeaturization, OneHotEncoder, StandardScaler, LogisticRegressionClassifier]
@@ -157,6 +171,7 @@ def test_make_pipeline():
    assert isinstance(binary_pipeline, type(BinaryClassificationPipeline))
    assert binary_pipeline.component_graph == [DropNullColumns, SimpleImputer, DateTimeFeaturization, OneHotEncoder, RandomForestClassifier]

    y = pd.Series([0, 2, 1, 2, 0])
    multiclass_pipeline = make_pipeline(X, y, LogisticRegressionClassifier, ProblemTypes.MULTICLASS)
    assert isinstance(multiclass_pipeline, type(MulticlassClassificationPipeline))
    assert multiclass_pipeline.component_graph == [DropNullColumns, SimpleImputer, DateTimeFeaturization, OneHotEncoder, StandardScaler, LogisticRegressionClassifier]
@@ -195,11 +210,13 @@ def test_make_pipeline_no_nulls():
    X = pd.DataFrame({"numerical": [1, 2, 3, 1, 2],
                      "categorical": ["a", "b", "a", "c", "c"],
                      "some dates": pd.date_range('2000-02-03', periods=5, freq='W')})
    y = pd.Series([0, 0, 1, 2, 0])
    y = pd.Series([0, 1, 1, 0, 0])
    binary_pipeline = make_pipeline(X, y, LogisticRegressionClassifier, ProblemTypes.BINARY)
    assert isinstance(binary_pipeline, type(BinaryClassificationPipeline))
    assert binary_pipeline.component_graph == [SimpleImputer, DateTimeFeaturization, OneHotEncoder, StandardScaler, LogisticRegressionClassifier]
    assert binary_pipeline.custom_hyperparameters == {'Simple Imputer': {'impute_strategy': ['most_frequent']}}

    y = pd.Series([0, 2, 1, 2, 0])
    multiclass_pipeline = make_pipeline(X, y, LogisticRegressionClassifier, ProblemTypes.MULTICLASS)
    assert isinstance(multiclass_pipeline, type(MulticlassClassificationPipeline))
    assert multiclass_pipeline.component_graph == [SimpleImputer, DateTimeFeaturization, OneHotEncoder, StandardScaler, LogisticRegressionClassifier]
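On the "clean up y for binary" change reflected in the test targets above: the binary-problem tests now use a target with exactly two classes, while the multiclass tests get a separate three-class target; the previous `[0, 0, 1, 2, 0]` series contains three classes and so was not a proper binary target. A quick illustrative check with plain pandas, nothing EvalML-specific:

import pandas as pd

print(pd.Series([0, 0, 1, 0, 0]).nunique())  # 2 classes: suitable for the binary tests
print(pd.Series([0, 2, 1, 2, 0]).nunique())  # 3 classes: suitable for the multiclass tests
print(pd.Series([0, 0, 1, 2, 0]).nunique())  # 3 classes: the old value, not a two-class target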
