Merged

34 commits
09944f3
handle woodwork unknown
bchen1116 Jul 8, 2021
7cf15e4
update release notes
bchen1116 Jul 8, 2021
f31bf9e
update ww version
bchen1116 Jul 8, 2021
285e125
fix deps
bchen1116 Jul 8, 2021
83bbaa3
Merge branch 'main' into bc_2426_unknown
bchen1116 Jul 8, 2021
63c16c9
update reqs
bchen1116 Jul 8, 2021
422ee1c
fixing tests
bchen1116 Jul 9, 2021
0eeff5e
Merge branch 'main' into bc_2426_unknown
bchen1116 Jul 9, 2021
515e818
Merge branch 'main' into bc_2426_unknown
bchen1116 Jul 9, 2021
0260c6f
update docs and clean tests
bchen1116 Jul 10, 2021
0391219
Merge branch 'main' into bc_2426_unknown
bchen1116 Jul 10, 2021
f6a179f
fixing test
bchen1116 Jul 10, 2021
e063860
Merge branch 'main' into bc_2426_unknown
bchen1116 Jul 12, 2021
1009baf
Merge branch 'main' into bc_2426_unknown
bchen1116 Jul 12, 2021
7e70340
update deps
bchen1116 Jul 12, 2021
99ec0ac
add line
bchen1116 Jul 12, 2021
ae36364
updat deps
bchen1116 Jul 13, 2021
d2d760e
Merge branch 'main' into bc_2426_unknown
bchen1116 Jul 13, 2021
2bcb251
update dependencies
bchen1116 Jul 13, 2021
5dd3480
Merge branch 'bc_2426_unknown' of github.com:alteryx/evalml into bc_2…
bchen1116 Jul 13, 2021
6aac33b
Merge branch 'main' into bc_2426_unknown
bchen1116 Jul 13, 2021
78233b1
fixing deps
bchen1116 Jul 13, 2021
837cb0f
Merge branch 'main' into bc_2426_unknown
bchen1116 Jul 14, 2021
4cd8368
adding test coverage
bchen1116 Jul 14, 2021
6c8d7c4
Merge branch 'main' into bc_2426_unknown
bchen1116 Jul 14, 2021
531e447
update breaking changes
bchen1116 Jul 14, 2021
c3778a3
fixing formatting
bchen1116 Jul 15, 2021
0472355
address comments
bchen1116 Jul 19, 2021
ea38c79
string to regex
bchen1116 Jul 19, 2021
ebefee5
address test failures
bchen1116 Jul 19, 2021
93363f8
update dep order
bchen1116 Jul 19, 2021
3ad383b
Merge branch 'main' into bc_2426_unknown
bchen1116 Jul 19, 2021
9c18416
Merge branch 'main' into bc_2426_unknown
bchen1116 Jul 19, 2021
bbc1021
Merge branch 'main' into bc_2426_unknown
bchen1116 Jul 20, 2021
2 changes: 1 addition & 1 deletion core-requirements.txt
@@ -11,7 +11,7 @@ psutil>=5.6.6
requirements-parser>=0.2.0
shap>=0.36.0
texttable>=1.6.2
woodwork>=0.4.1,<0.5.0
woodwork>=0.5.0
dask>=2.12.0
featuretools>=0.21.0
nlp-primitives>=1.1.0
16 changes: 16 additions & 0 deletions docs/source/demos/text_input.ipynb
@@ -62,6 +62,22 @@
"y.value_counts(normalize=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"In order to properly utilize Woodwork's 'Natural Language' typing, we need to pass this argument in during initialization. Otherwise, this will be treated as an 'Unknown' type and dropped in the search."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"X.ww.init(logical_types={\"Message\": \"NaturalLanguage\"})"
]
},
{
"cell_type": "markdown",
"metadata": {},
2 changes: 2 additions & 0 deletions docs/source/release_notes.rst
@@ -6,6 +6,7 @@ Release Notes
* Added details on how to fix error caused by broken ww schema :pr:`2466`
* Added ability to use built-in pickle for saving AutoMLSearch :pr:`2463`
* Updated our components and component graphs to use the latest features of ww 0.4.1, e.g. ``concat_columns`` and drop in-place :pr:`2465`
* Added support for new Woodwork ``Unknown`` type in AutoMLSearch :pr:`2477`
* Updated our components with an attribute that describes if they modify features or targets and can be used in list API for pipeline initialization :pr:`2504`
* Updated ``ComponentGraph`` to accept X and y as inputs :pr:`2507`
* Removed unused ``TARGET_BINARY_INVALID_VALUES`` from ``DataCheckMessageCode`` enum and fixed formatting of objective documentation :pr:`2520`
@@ -32,6 +33,7 @@ Release Notes
.. warning::

**Breaking Changes**
* `NaN` values in the `Natural Language` type are no longer supported by the Imputer with the pandas upgrade. :pr:`2477`

**v0.28.0 Jul. 2, 2021**
* Enhancements
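To make the breaking change above concrete, here is a minimal, hypothetical sketch (the data is made up; assumes woodwork>=0.5.0 and pandas>=1.3.0, and uses evalml's `Imputer` component):

```python
import numpy as np
import pandas as pd
import woodwork as ww  # noqa: F401 -- registers the DataFrame.ww accessor
from evalml.pipelines.components import Imputer

# With pandas>=1.3.0, initializing this column as NaturalLanguage (pandas
# "string" dtype) converts np.nan to pd.NA.
X = pd.DataFrame({"text": ["hello", np.nan, "world"]})
X.ww.init(logical_types={"text": "NaturalLanguage"})

imputer = Imputer()
# Per the breaking change above, this is now expected to raise rather than
# impute the missing value as it did before the upgrade.
imputer.fit_transform(X)
```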
2 changes: 2 additions & 0 deletions docs/source/user_guide/automl.ipynb
@@ -64,6 +64,8 @@
"\n",
"EvalML also accepts ``pandas`` input, and will run type inference on top of the input ``pandas`` data. If you'd like to change the types inferred by EvalML, you can use the `infer_feature_types` utility method, which takes pandas or numpy input and converts it to a Woodwork data structure. The `feature_types` parameter can be used to specify what types specific columns should be.\n",
"\n",
"Feature types such as `Natural Language` must be specified in this way, otherwise Woodwork will infer it as `Unknown` type and drop it during the AutoMLSearch.\n",
"\n",
"In the example below, we reformat a couple features to make them easily consumable by the model, and then specify that the provider, which would have otherwise been inferred as a column with natural language, is a categorical column."
]
},
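As a hedged illustration of the workflow this section describes (column names and data are hypothetical), declaring the types up front keeps a text column from being inferred as `Unknown` and dropped:

```python
import pandas as pd
from evalml.utils import infer_feature_types

X = pd.DataFrame(
    {
        "provider": ["Mastercard", "VISA", "Mastercard"],
        "Message": ["Free entry to win", "Nah I don't think so", "WINNER! Claim your prize"],
    }
)

# Without these overrides, a free-text column may be inferred as Unknown and
# dropped during AutoMLSearch; "provider" is pinned to categorical, as in the
# example the guide walks through.
X = infer_feature_types(
    X, feature_types={"provider": "categorical", "Message": "NaturalLanguage"}
)
print(X.ww.types)
```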
21 changes: 16 additions & 5 deletions evalml/automl/automl_search.py
@@ -491,12 +491,23 @@ def __init__(
if "Drop Columns Transformer" in self.pipeline_parameters
else None
)
index_columns = list(
self.X_train.ww.select("index", return_schema=True).columns
index_and_unknown_columns = list(
self.X_train.ww.select(["index", "unknown"], return_schema=True).columns
)
index_columns = _put_into_original_order(self.X_train, index_columns)
if len(index_columns) > 0 and drop_columns is None:
parameters["Drop Columns Transformer"] = {"columns": index_columns}
unknown_columns = list(
self.X_train.ww.select("unknown", return_schema=True).columns
)
index_and_unknown_columns = _put_into_original_order(
self.X_train, index_and_unknown_columns
)
if len(index_and_unknown_columns) > 0 and drop_columns is None:
parameters["Drop Columns Transformer"] = {
"columns": index_and_unknown_columns
}
if len(unknown_columns):
logger.info(
f"Removing columns {unknown_columns} because they are of 'Unknown' type"
)
self.allowed_pipelines = [
make_pipeline(
self.X_train,
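A minimal sketch of the selection logic added above (column names are hypothetical; assumes woodwork>=0.5.0):

```python
import pandas as pd
import woodwork as ww  # noqa: F401 -- registers the DataFrame.ww accessor

X = pd.DataFrame(
    {
        "id": range(5),
        "mystery": ["a1", "b2", "c3", "d4", "e5"],
        "amount": [1.0, 2.0, 3.0, 4.0, 5.0],
    }
)
X.ww.init(index="id", logical_types={"mystery": "Unknown"})

# One select call picks up both the index column (semantic tag "index") and
# any Unknown-typed columns; these are what AutoMLSearch now hands to the
# Drop Columns Transformer.
cols = list(X.ww.select(["index", "unknown"], return_schema=True).columns)
print(cols)  # expected: ['id', 'mystery']
```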
7 changes: 6 additions & 1 deletion evalml/model_understanding/graphs.py
@@ -652,6 +652,12 @@ def partial_dependence(
X_features = (
X.ww.iloc[:, [features]] if isinstance(features, int) else X.ww[[features]]
)
X_unknown = X_features.ww.select("unknown")
if len(X_unknown.columns):
# We drop the unknown columns in the pipelines, so we cannot calculate partial dependence for these
raise ValueError(
f"Columns {X_unknown.columns.values} are of type 'Unknown', which cannot be used for partial dependence"
)

X_cats = X_features.ww.select("categorical")
if any(is_categorical):
@@ -679,7 +685,6 @@
)

feature_list = X[feature_names]

_raise_value_error_if_any_features_all_nan(feature_list)

if feature_list.isnull().sum().any():
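A hedged end-to-end sketch of the new guard (data and pipeline are made up; the pipeline drops the `Unknown` column, mirroring what AutoMLSearch now does, so training succeeds and only the partial dependence request fails):

```python
import pandas as pd
import woodwork as ww  # noqa: F401 -- registers the DataFrame.ww accessor
from evalml.model_understanding.graphs import partial_dependence
from evalml.pipelines import BinaryClassificationPipeline

X = pd.DataFrame(
    {
        "mystery": ["a1", "b2", "c3", "d4"],
        "amount": [1.0, 2.0, 3.0, 4.0],
    }
)
X.ww.init(logical_types={"mystery": "Unknown"})
y = pd.Series([0, 1, 0, 1])

pipeline = BinaryClassificationPipeline(
    component_graph=["Drop Columns Transformer", "Random Forest Classifier"],
    parameters={"Drop Columns Transformer": {"columns": ["mystery"]}},
)
pipeline.fit(X, y)

try:
    partial_dependence(pipeline, X, features="mystery")
except ValueError as err:
    # "Columns ['mystery'] are of type 'Unknown', which cannot be used for
    # partial dependence"
    print(err)
```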
6 changes: 4 additions & 2 deletions evalml/pipelines/utils.py
@@ -84,8 +84,10 @@ def _get_preprocessing_components(
if len(text_columns) > 0:
pp_components.append(TextFeaturizer)

index_columns = list(X.ww.select("index", return_schema=True).columns)
if len(index_columns) > 0:
index_and_unknown_columns = list(
X.ww.select(["index", "unknown"], return_schema=True).columns
)
if len(index_and_unknown_columns) > 0:
pp_components.append(DropColumns)

datetime_cols = list(X.ww.select(["Datetime"], return_schema=True).columns)
41 changes: 41 additions & 0 deletions evalml/tests/automl_tests/test_automl.py
@@ -3077,6 +3077,7 @@ def test_search_with_text(AutoMLTestEnv):
],
}
)
X.ww.init(logical_types={"col_1": "NaturalLanguage", "col_2": "NaturalLanguage"})
y = [0, 1, 1, 0, 1, 0]
automl = AutoMLSearch(
X_train=X, y_train=y, problem_type="binary", optimize_thresholds=False
@@ -3134,6 +3135,7 @@ def test_search_with_text_and_ensembling(

if df_text:
X = X_with_text
X.ww.init(logical_types={"col_1": "NaturalLanguage"})
else:
X = X_no_text
if problem_type == "binary":
@@ -4790,6 +4792,45 @@ def test_automl_thresholding_train_pipelines(mock_objective, threshold, X_y_binary
assert all([p.threshold is None for p in pipes.values()])


@pytest.mark.parametrize("columns", [[], ["unknown_col"], ["unknown1, unknown2"]])
def test_automl_drop_unknown_columns(columns, AutoMLTestEnv, X_y_binary, caplog):
X, y = X_y_binary
X = pd.DataFrame(X)
for col in columns:
X[col] = pd.Series(range(len(X)))
X.ww.init()
X.ww.set_types({col: "Unknown" for col in columns})
automl = AutoMLSearch(
X_train=X,
y_train=y,
problem_type="binary",
optimize_thresholds=False,
max_batches=2,
)
env = AutoMLTestEnv("binary")
with env.test_context(score_return_value={automl.objective.name: 1.0}):
automl.search()
if not len(columns):
for pipeline in automl.allowed_pipelines:
assert "Drop Columns Transformer" not in pipeline.name
assert "because they are of 'Unknown'" not in caplog.text
return

assert "because they are of 'Unknown'" in caplog.text
for pipeline in automl.allowed_pipelines:
assert pipeline.get_component("Drop Columns Transformer")
assert "Drop Columns Transformer" in pipeline.parameters
assert pipeline.parameters["Drop Columns Transformer"] == {"columns": columns}

all_drop_column_params = []
for _, row in automl.full_rankings.iterrows():
if "Baseline" not in row.pipeline_name:
all_drop_column_params.append(
row.parameters["Drop Columns Transformer"]["columns"]
)
assert all(param == columns for param in all_drop_column_params)


@pytest.mark.parametrize(
"automl_type",
[
8 changes: 6 additions & 2 deletions evalml/tests/component_tests/test_lsa.py
@@ -34,6 +34,7 @@ def test_lsa_only_text(text_df):
def test_lsa_with_nontext(text_df):
X = text_df
X["col_3"] = [73.7, 67.213, 92]
X.ww.init(logical_types={"col_1": "NaturalLanguage", "col_2": "NaturalLanguage"})
lsa = LSA()

lsa.fit(X)
@@ -120,7 +121,8 @@ def test_index_col_names():
]
)
lsa = LSA()

X = pd.DataFrame(X)
X.ww.init(logical_types={0: "NaturalLanguage", 1: "NaturalLanguage"})
lsa.fit(X)
expected_col_names = set(["LSA(0)[0]", "LSA(0)[1]", "LSA(1)[0]", "LSA(1)[1]"])
X_t = lsa.transform(X)
@@ -146,6 +148,7 @@ def test_float_col_names():
],
}
)
X.ww.init(logical_types={-1: "NaturalLanguage", 4.75: "NaturalLanguage"})
lsa = LSA()
lsa.fit(X)
expected_col_names = set(
@@ -169,6 +172,7 @@ def test_lsa_output():
]
}
)
X.ww.init(logical_types={"lsa": "NaturalLanguage"})
lsa = LSA()
lsa.fit(X)
expected_features = pd.DataFrame(
@@ -218,7 +222,7 @@ def test_lsa_woodwork_custom_overrides_returned_by_components(X_df):
for logical_type in override_types:
try:
X = X_df
X.ww.init(logical_types={0: logical_type})
X.ww.init(logical_types={0: logical_type, "text col": "NaturalLanguage"})
except ww.exceptions.TypeConversionError:
continue

2 changes: 1 addition & 1 deletion evalml/tests/component_tests/test_per_column_imputer.py
@@ -262,7 +262,7 @@ def test_per_column_imputer_woodwork_custom_overrides_returned_by_components(
override_types = [Integer, Double, Categorical, NaturalLanguage, Boolean]
for logical_type in override_types:
# Column with Nans to boolean used to fail. Now it doesn't
if has_nan and logical_type == Boolean:
if has_nan and logical_type in [Boolean, NaturalLanguage]:
Review thread on this change:

bchen1116 (author): Leave out NaturalLanguage, since casting this will result in np.nan becoming pd.NA, which fails the imputer.

freddyaboulton: But how come this wouldn't happen before? Woodwork doesn't convert np.nan to pd.NA in 0.4.2? Is this because of the pandas upgrade?

bchen1116 (Jul 14, 2021): @freddyaboulton I believe it's due to the pandas upgrade! [screenshot of the pandas 1.3.0 release notes] Looking at the 1.3.0 release docs, it seems there are a lot of changes to NaN handling, and they're using <NA> for scalar types.

freddyaboulton: Thanks for looking into this @bchen1116. Let's list this as a breaking change for now. I imagine we might want to file an issue to discuss whether there are any changes we need to make to the SimpleImputer. If users run it on natural language after this PR, they'll get a stacktrace they didn't get before.

bchen1116: @freddyaboulton updated the release notes with the breaking change and filed the issue here!

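A minimal sketch of the behavior discussed in this thread (assumes pandas>=1.3.0 and woodwork>=0.5.0):

```python
import numpy as np
import pandas as pd
import woodwork as ww  # noqa: F401 -- registers the DataFrame.ww accessor

X = pd.DataFrame({"text": ["hello world", np.nan]})
X.ww.init(logical_types={"text": "NaturalLanguage"})

# NaturalLanguage is backed by pandas' "string" dtype; under pandas>=1.3.0
# the cast turns np.nan into pd.NA, which is what trips up the imputer.
print(X["text"].dtype)             # string
print(X["text"].iloc[1])           # <NA>
print(X["text"].iloc[1] is pd.NA)  # True
```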
continue
try:
X = X_df.copy()
17 changes: 15 additions & 2 deletions evalml/tests/component_tests/test_text_featurizer.py
@@ -41,6 +41,7 @@ def test_featurizer_only_text(text_df):
def test_featurizer_with_nontext(text_df):
X = text_df
X["col_3"] = [73.7, 67.213, 92]
X.ww.init(logical_types={"col_1": "NaturalLanguage", "col_2": "NaturalLanguage"})
tf = TextFeaturizer()

tf.fit(X)
@@ -147,6 +148,7 @@ def test_no_null_output():
],
}
)
X.ww.init(logical_types={"col_1": "NaturalLanguage", "col_2": "NaturalLanguage"})
tf = TextFeaturizer()
tf.fit(X)
X_t = tf.transform(X)
Expand All @@ -170,6 +172,8 @@ def test_index_col_names():
],
]
)
X = pd.DataFrame(X)
X.ww.init(logical_types={0: "NaturalLanguage", 1: "NaturalLanguage"})
tf = TextFeaturizer()

tf.fit(X)
@@ -210,6 +214,7 @@ def test_float_col_names():
],
}
)
X.ww.init(logical_types={4.75: "NaturalLanguage", -1: "NaturalLanguage"})
tf = TextFeaturizer()
tf.fit(X)
expected_col_names = set(
@@ -249,6 +254,7 @@ def test_output_null():
],
}
)
X.ww.init(logical_types={"col_1": "NaturalLanguage", "col_2": "NaturalLanguage"})
tf = TextFeaturizer()
tf.fit(X)
X_t = tf.transform(X)
@@ -265,6 +271,7 @@ def test_diversity_primitive_output():
]
}
)
X.ww.init(logical_types={"diverse": "NaturalLanguage"})
tf = TextFeaturizer()
tf.fit(X)

@@ -284,6 +291,7 @@ def test_lsa_primitive_output():
]
}
)
X.ww.init(logical_types={"lsa": "NaturalLanguage"})
tf = TextFeaturizer()
tf.fit(X)

@@ -300,7 +308,9 @@ def test_featurizer_custom_types(text_df):
# force one of the two provided columns to be a user-specified type.
# if the output contains text features for col_2, then the text featurizer didn't pass the right
# ww types to LSA, because LSA still thought col_2 was natural language even though the user said otherwise.
X = infer_feature_types(text_df, {"col_2": "categorical"})
X = infer_feature_types(
pd.DataFrame(text_df), {"col_1": "NaturalLanguage", "col_2": "categorical"}
)
tf = TextFeaturizer()
tf.fit(X)

@@ -339,6 +349,7 @@ def test_mean_characters_primitive_output():
]
}
)
X.ww.init(logical_types={"mean_characters": "NaturalLanguage"})
tf = TextFeaturizer()
tf.fit(X)

@@ -361,6 +372,7 @@ def test_polarity_primitive_output():
]
}
)
X.ww.init(logical_types={"polarity": "NaturalLanguage"})
tf = TextFeaturizer()
tf.fit(X)

@@ -415,7 +427,7 @@ def test_text_featurizer_woodwork_custom_overrides_returned_by_components(X_df):
for logical_type in override_types:
try:
X = X_df.copy()
X.ww.init(logical_types={0: logical_type})
X.ww.init(logical_types={0: logical_type, "text col": "NaturalLanguage"})
except ww.exceptions.TypeConversionError:
continue

@@ -443,6 +455,7 @@ def test_text_featurizer_sets_max_depth_1(mock_dfs):
]
}
)
X.ww.init(logical_types={"polarity": "NaturalLanguage"})
tf = TextFeaturizer()
tf.fit(X)
_, kwargs = mock_dfs.call_args
1 change: 1 addition & 0 deletions evalml/tests/conftest.py
@@ -283,6 +283,7 @@ def text_df():
],
}
)
df.ww.init(logical_types={"col_1": "NaturalLanguage", "col_2": "NaturalLanguage"})
yield df


5 changes: 2 additions & 3 deletions evalml/tests/data_checks_tests/test_data_checks.py
@@ -224,8 +224,8 @@ def test_default_data_checks_classification(input_type):

y = pd.Series([0, 1, np.nan, 1, 0])
y_multiclass = pd.Series([0, 1, np.nan, 2, 0])
X.ww.init(logical_types={"natural_language_nan": "NaturalLanguage"})
if input_type == "ww":
X.ww.init()
y = ww.init_series(y)
y_multiclass = ww.init_series(y_multiclass)

@@ -336,9 +336,8 @@ def test_default_data_checks_regression(input_type):
X["nan_dt_col"][0] = None
y = pd.Series([0.3, 100.0, np.nan, 1.0, 0.2])
y_no_variance = pd.Series([5] * 5)

X.ww.init(logical_types={"natural_language_nan": "NaturalLanguage"})
if input_type == "ww":
X.ww.init()
y = ww.init_series(y)
y_no_variance = ww.init_series(y_no_variance)
null_leakage = [