Merged

34 commits
09944f3
handle woodwork unknown
bchen1116 Jul 8, 2021
7cf15e4
update release notes
bchen1116 Jul 8, 2021
f31bf9e
update ww version
bchen1116 Jul 8, 2021
285e125
fix deps
bchen1116 Jul 8, 2021
83bbaa3
Merge branch 'main' into bc_2426_unknown
bchen1116 Jul 8, 2021
63c16c9
update reqs
bchen1116 Jul 8, 2021
422ee1c
fixing tests
bchen1116 Jul 9, 2021
0eeff5e
Merge branch 'main' into bc_2426_unknown
bchen1116 Jul 9, 2021
515e818
Merge branch 'main' into bc_2426_unknown
bchen1116 Jul 9, 2021
0260c6f
update docs and clean tests
bchen1116 Jul 10, 2021
0391219
Merge branch 'main' into bc_2426_unknown
bchen1116 Jul 10, 2021
f6a179f
fixing test
bchen1116 Jul 10, 2021
e063860
Merge branch 'main' into bc_2426_unknown
bchen1116 Jul 12, 2021
1009baf
Merge branch 'main' into bc_2426_unknown
bchen1116 Jul 12, 2021
7e70340
update deps
bchen1116 Jul 12, 2021
99ec0ac
add line
bchen1116 Jul 12, 2021
ae36364
updat deps
bchen1116 Jul 13, 2021
d2d760e
Merge branch 'main' into bc_2426_unknown
bchen1116 Jul 13, 2021
2bcb251
update dependencies
bchen1116 Jul 13, 2021
5dd3480
Merge branch 'bc_2426_unknown' of github.com:alteryx/evalml into bc_2…
bchen1116 Jul 13, 2021
6aac33b
Merge branch 'main' into bc_2426_unknown
bchen1116 Jul 13, 2021
78233b1
fixing deps
bchen1116 Jul 13, 2021
837cb0f
Merge branch 'main' into bc_2426_unknown
bchen1116 Jul 14, 2021
4cd8368
adding test coverage
bchen1116 Jul 14, 2021
6c8d7c4
Merge branch 'main' into bc_2426_unknown
bchen1116 Jul 14, 2021
531e447
update breaking changes
bchen1116 Jul 14, 2021
c3778a3
fixing formatting
bchen1116 Jul 15, 2021
0472355
address comments
bchen1116 Jul 19, 2021
ea38c79
string to regex
bchen1116 Jul 19, 2021
ebefee5
address test failures
bchen1116 Jul 19, 2021
93363f8
update dep order
bchen1116 Jul 19, 2021
3ad383b
Merge branch 'main' into bc_2426_unknown
bchen1116 Jul 19, 2021
9c18416
Merge branch 'main' into bc_2426_unknown
bchen1116 Jul 19, 2021
bbc1021
Merge branch 'main' into bc_2426_unknown
bchen1116 Jul 20, 2021
2 changes: 1 addition & 1 deletion core-requirements.txt
@@ -11,7 +11,7 @@ psutil>=5.6.6
requirements-parser>=0.2.0
shap>=0.36.0
texttable>=1.6.2
woodwork>=0.4.1,<0.5.0
woodwork>=0.5.0
dask>=2.12.0
featuretools>=0.21.0
nlp-primitives>=1.1.0
16 changes: 16 additions & 0 deletions docs/source/demos/text_input.ipynb
@@ -62,6 +62,22 @@
"y.value_counts(normalize=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"In order to properly utilize Woodwork's 'Natural Language' typing, we need to pass this argument in during initialization. Otherwise, this will be treated as an 'Unknown' type and dropped in the search."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"X.ww.init(logical_types={\"Message\": \"NaturalLanguage\"})"
]
},
{
"cell_type": "markdown",
"metadata": {},
2 changes: 2 additions & 0 deletions docs/source/release_notes.rst
@@ -6,6 +6,7 @@ Release Notes
* Added details on how to fix error caused by broken ww schema :pr:`2466`
* Added ability to use built-in pickle for saving AutoMLSearch :pr:`2463`
* Updated our components and component graphs to use the latest features of ww 0.4.1, e.g. ``concat_columns`` and drop in-place :pr:`2465`
* Added support for new Woodwork ``Unknown`` type in AutoMLSearch :pr:`2477`
* Updated our components with an attribute that describes if they modify features or targets and can be used in list API for pipeline initialization :pr:`2504`
* Updated ``ComponentGraph`` to accept X and y as inputs :pr:`2507`
* Removed unused ``TARGET_BINARY_INVALID_VALUES`` from ``DataCheckMessageCode`` enum and fixed formatting of objective documentation :pr:`2520`
@@ -32,6 +33,7 @@ Release Notes
.. warning::

**Breaking Changes**
* `NaN` values in the `Natural Language` type are no longer supported by the Imputer with the pandas upgrade. :pr:`2477`

**v0.28.0 Jul. 2, 2021**
* Enhancements
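To make the breaking change above concrete, here is a minimal, hypothetical sketch (the data is made up; assumes woodwork>=0.5.0 and pandas>=1.3.0, and uses evalml's `Imputer` component):

```python
import numpy as np
import pandas as pd
import woodwork as ww  # noqa: F401 -- registers the DataFrame.ww accessor
from evalml.pipelines.components import Imputer

# With pandas>=1.3.0, initializing this column as NaturalLanguage (pandas
# "string" dtype) converts np.nan to pd.NA.
X = pd.DataFrame({"text": ["hello", np.nan, "world"]})
X.ww.init(logical_types={"text": "NaturalLanguage"})

imputer = Imputer()
# Per the breaking change above, this is now expected to raise rather than
# impute the missing value as it did before the upgrade.
imputer.fit_transform(X)
```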
2 changes: 2 additions & 0 deletions docs/source/user_guide/automl.ipynb
@@ -64,6 +64,8 @@
"\n",
"EvalML also accepts ``pandas`` input, and will run type inference on top of the input ``pandas`` data. If you'd like to change the types inferred by EvalML, you can use the `infer_feature_types` utility method, which takes pandas or numpy input and converts it to a Woodwork data structure. The `feature_types` parameter can be used to specify what types specific columns should be.\n",
"\n",
"Feature types such as `Natural Language` must be specified in this way, otherwise Woodwork will infer it as `Unknown` type and drop it during the AutoMLSearch.\n",
"\n",
"In the example below, we reformat a couple features to make them easily consumable by the model, and then specify that the provider, which would have otherwise been inferred as a column with natural language, is a categorical column."
]
},
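As a hedged illustration of the workflow this section describes (column names and data are hypothetical), declaring the types up front keeps a text column from being inferred as `Unknown` and dropped:

```python
import pandas as pd
from evalml.utils import infer_feature_types

X = pd.DataFrame(
    {
        "provider": ["Mastercard", "VISA", "Mastercard"],
        "Message": ["Free entry to win", "Nah I don't think so", "WINNER! Claim your prize"],
    }
)

# Without these overrides, a free-text column may be inferred as Unknown and
# dropped during AutoMLSearch; "provider" is pinned to categorical, as in the
# example the guide walks through.
X = infer_feature_types(
    X, feature_types={"provider": "categorical", "Message": "NaturalLanguage"}
)
print(X.ww.types)
```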
21 changes: 16 additions & 5 deletions evalml/automl/automl_search.py
@@ -491,12 +491,23 @@ def __init__(
if "Drop Columns Transformer" in self.pipeline_parameters
else None
)
index_columns = list(
self.X_train.ww.select("index", return_schema=True).columns
index_and_unknown_columns = list(
self.X_train.ww.select(["index", "unknown"], return_schema=True).columns
)
index_columns = _put_into_original_order(self.X_train, index_columns)
if len(index_columns) > 0 and drop_columns is None:
parameters["Drop Columns Transformer"] = {"columns": index_columns}
unknown_columns = list(
self.X_train.ww.select("unknown", return_schema=True).columns
)
index_and_unknown_columns = _put_into_original_order(
self.X_train, index_and_unknown_columns
)
if len(index_and_unknown_columns) > 0 and drop_columns is None:
parameters["Drop Columns Transformer"] = {
"columns": index_and_unknown_columns
}
if len(unknown_columns):
logger.info(
f"Removing columns {unknown_columns} because they are of 'Unknown' type"
)
self.allowed_pipelines = [
make_pipeline(
self.X_train,
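A minimal sketch of the selection logic added above (column names are hypothetical; assumes woodwork>=0.5.0):

```python
import pandas as pd
import woodwork as ww  # noqa: F401 -- registers the DataFrame.ww accessor

X = pd.DataFrame(
    {
        "id": range(5),
        "mystery": ["a1", "b2", "c3", "d4", "e5"],
        "amount": [1.0, 2.0, 3.0, 4.0, 5.0],
    }
)
X.ww.init(index="id", logical_types={"mystery": "Unknown"})

# One select call picks up both the index column (semantic tag "index") and
# any Unknown-typed columns; these are what AutoMLSearch now hands to the
# Drop Columns Transformer.
cols = list(X.ww.select(["index", "unknown"], return_schema=True).columns)
print(cols)  # expected: ['id', 'mystery']
```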
7 changes: 6 additions & 1 deletion evalml/model_understanding/graphs.py
@@ -652,6 +652,12 @@ def partial_dependence(
X_features = (
X.ww.iloc[:, [features]] if isinstance(features, int) else X.ww[[features]]
)
X_unknown = X_features.ww.select("unknown")
if len(X_unknown.columns):
# We drop the unknown columns in the pipelines, so we cannot calculate partial dependence for these
raise ValueError(
f"Columns {X_unknown.columns.values} are of type 'Unknown', which cannot be used for partial dependence"
)

X_cats = X_features.ww.select("categorical")
if any(is_categorical):
@@ -679,7 +685,6 @@
)

feature_list = X[feature_names]

_raise_value_error_if_any_features_all_nan(feature_list)

if feature_list.isnull().sum().any():
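A hedged end-to-end sketch of the new guard (data and pipeline are made up; the pipeline drops the `Unknown` column, mirroring what AutoMLSearch now does, so training succeeds and only the partial dependence request fails):

```python
import pandas as pd
import woodwork as ww  # noqa: F401 -- registers the DataFrame.ww accessor
from evalml.model_understanding.graphs import partial_dependence
from evalml.pipelines import BinaryClassificationPipeline

X = pd.DataFrame(
    {
        "mystery": ["a1", "b2", "c3", "d4"],
        "amount": [1.0, 2.0, 3.0, 4.0],
    }
)
X.ww.init(logical_types={"mystery": "Unknown"})
y = pd.Series([0, 1, 0, 1])

pipeline = BinaryClassificationPipeline(
    component_graph=["Drop Columns Transformer", "Random Forest Classifier"],
    parameters={"Drop Columns Transformer": {"columns": ["mystery"]}},
)
pipeline.fit(X, y)

try:
    partial_dependence(pipeline, X, features="mystery")
except ValueError as err:
    # "Columns ['mystery'] are of type 'Unknown', which cannot be used for
    # partial dependence"
    print(err)
```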
6 changes: 4 additions & 2 deletions evalml/pipelines/utils.py
@@ -84,8 +84,10 @@ def _get_preprocessing_components(
if len(text_columns) > 0:
pp_components.append(TextFeaturizer)

index_columns = list(X.ww.select("index", return_schema=True).columns)
if len(index_columns) > 0:
index_and_unknown_columns = list(
X.ww.select(["index", "unknown"], return_schema=True).columns
)
if len(index_and_unknown_columns) > 0:
pp_components.append(DropColumns)

datetime_cols = list(X.ww.select(["Datetime"], return_schema=True).columns)
41 changes: 41 additions & 0 deletions evalml/tests/automl_tests/test_automl.py
@@ -3077,6 +3077,7 @@ def test_search_with_text(AutoMLTestEnv):
],
}
)
X.ww.init(logical_types={"col_1": "NaturalLanguage", "col_2": "NaturalLanguage"})
y = [0, 1, 1, 0, 1, 0]
automl = AutoMLSearch(
X_train=X, y_train=y, problem_type="binary", optimize_thresholds=False
@@ -3134,6 +3135,7 @@ def test_search_with_text_and_ensembling(

if df_text:
X = X_with_text
X.ww.init(logical_types={"col_1": "NaturalLanguage"})
else:
X = X_no_text
if problem_type == "binary":
@@ -4790,6 +4792,45 @@ def test_automl_thresholding_train_pipelines(mock_objective, threshold, X_y_binary
assert all([p.threshold is None for p in pipes.values()])


@pytest.mark.parametrize("columns", [[], ["unknown_col"], ["unknown1, unknown2"]])
def test_automl_drop_unknown_columns(columns, AutoMLTestEnv, X_y_binary, caplog):
X, y = X_y_binary
X = pd.DataFrame(X)
for col in columns:
X[col] = pd.Series(range(len(X)))
X.ww.init()
X.ww.set_types({col: "Unknown" for col in columns})
automl = AutoMLSearch(
X_train=X,
y_train=y,
problem_type="binary",
optimize_thresholds=False,
max_batches=2,
)
env = AutoMLTestEnv("binary")
with env.test_context(score_return_value={automl.objective.name: 1.0}):
automl.search()
if not len(columns):
for pipeline in automl.allowed_pipelines:
assert "Drop Columns Transformer" not in pipeline.name
assert "because they are of 'Unknown'" not in caplog.text
return

assert "because they are of 'Unknown'" in caplog.text
for pipeline in automl.allowed_pipelines:
assert pipeline.get_component("Drop Columns Transformer")
assert "Drop Columns Transformer" in pipeline.parameters
assert pipeline.parameters["Drop Columns Transformer"] == {"columns": columns}

all_drop_column_params = []
for _, row in automl.full_rankings.iterrows():
if "Baseline" not in row.pipeline_name:
all_drop_column_params.append(
row.parameters["Drop Columns Transformer"]["columns"]
)
assert all(param == columns for param in all_drop_column_params)


@pytest.mark.parametrize(
"automl_type",
[
8 changes: 6 additions & 2 deletions evalml/tests/component_tests/test_lsa.py
@@ -34,6 +34,7 @@ def test_lsa_only_text(text_df):
def test_lsa_with_nontext(text_df):
X = text_df
X["col_3"] = [73.7, 67.213, 92]
X.ww.init(logical_types={"col_1": "NaturalLanguage", "col_2": "NaturalLanguage"})
lsa = LSA()

lsa.fit(X)
@@ -120,7 +121,8 @@ def test_index_col_names():
]
)
lsa = LSA()

X = pd.DataFrame(X)
X.ww.init(logical_types={0: "NaturalLanguage", 1: "NaturalLanguage"})
lsa.fit(X)
expected_col_names = set(["LSA(0)[0]", "LSA(0)[1]", "LSA(1)[0]", "LSA(1)[1]"])
X_t = lsa.transform(X)
@@ -146,6 +148,7 @@ def test_float_col_names():
],
}
)
X.ww.init(logical_types={-1: "NaturalLanguage", 4.75: "NaturalLanguage"})
lsa = LSA()
lsa.fit(X)
expected_col_names = set(
@@ -169,6 +172,7 @@ def test_lsa_output():
]
}
)
X.ww.init(logical_types={"lsa": "NaturalLanguage"})
lsa = LSA()
lsa.fit(X)
expected_features = pd.DataFrame(
@@ -218,7 +222,7 @@ def test_lsa_woodwork_custom_overrides_returned_by_components(X_df):
for logical_type in override_types:
try:
X = X_df
X.ww.init(logical_types={0: logical_type})
X.ww.init(logical_types={0: logical_type, "text col": "NaturalLanguage"})
except ww.exceptions.TypeConversionError:
continue

2 changes: 1 addition & 1 deletion evalml/tests/component_tests/test_per_column_imputer.py
@@ -262,7 +262,7 @@ def test_per_column_imputer_woodwork_custom_overrides_returned_by_components(
override_types = [Integer, Double, Categorical, NaturalLanguage, Boolean]
for logical_type in override_types:
# Column with Nans to boolean used to fail. Now it doesn't
if has_nan and logical_type == Boolean:
if has_nan and logical_type in [Boolean, NaturalLanguage]:
Review thread on this change:

bchen1116 (author): Leave out NaturalLanguage, since casting this will result in np.nan becoming pd.NA, which fails the imputer.

freddyaboulton: But how come this wouldn't happen before? Woodwork doesn't convert np.nan to pd.NA in 0.4.2? Is this because of the pandas upgrade?

bchen1116 (Jul 14, 2021): @freddyaboulton I believe it's due to the pandas upgrade! [screenshot of the pandas 1.3.0 release notes] Looking at the 1.3.0 release docs, it seems there are a lot of changes to NaN handling, and they're using <NA> for scalar types.

freddyaboulton: Thanks for looking into this @bchen1116. Let's list this as a breaking change for now. I imagine we might want to file an issue to discuss whether there are any changes we need to make to the SimpleImputer. If users run it on natural language after this PR, they'll get a stacktrace they didn't get before.

bchen1116: @freddyaboulton updated the release notes with the breaking change and filed the issue here!

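A minimal sketch of the behavior discussed in this thread (assumes pandas>=1.3.0 and woodwork>=0.5.0):

```python
import numpy as np
import pandas as pd
import woodwork as ww  # noqa: F401 -- registers the DataFrame.ww accessor

X = pd.DataFrame({"text": ["hello world", np.nan]})
X.ww.init(logical_types={"text": "NaturalLanguage"})

# NaturalLanguage is backed by pandas' "string" dtype; under pandas>=1.3.0
# the cast turns np.nan into pd.NA, which is what trips up the imputer.
print(X["text"].dtype)             # string
print(X["text"].iloc[1])           # <NA>
print(X["text"].iloc[1] is pd.NA)  # True
```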
continue
try:
X = X_df.copy()
17 changes: 15 additions & 2 deletions evalml/tests/component_tests/test_text_featurizer.py
@@ -41,6 +41,7 @@ def test_featurizer_only_text(text_df):
def test_featurizer_with_nontext(text_df):
X = text_df
X["col_3"] = [73.7, 67.213, 92]
X.ww.init(logical_types={"col_1": "NaturalLanguage", "col_2": "NaturalLanguage"})
tf = TextFeaturizer()

tf.fit(X)
@@ -147,6 +148,7 @@ def test_no_null_output():
],
}
)
X.ww.init(logical_types={"col_1": "NaturalLanguage", "col_2": "NaturalLanguage"})
tf = TextFeaturizer()
tf.fit(X)
X_t = tf.transform(X)
Expand All @@ -170,6 +172,8 @@ def test_index_col_names():
],
]
)
X = pd.DataFrame(X)
X.ww.init(logical_types={0: "NaturalLanguage", 1: "NaturalLanguage"})
tf = TextFeaturizer()

tf.fit(X)
@@ -210,6 +214,7 @@ def test_float_col_names():
],
}
)
X.ww.init(logical_types={4.75: "NaturalLanguage", -1: "NaturalLanguage"})
tf = TextFeaturizer()
tf.fit(X)
expected_col_names = set(
@@ -249,6 +254,7 @@ def test_output_null():
],
}
)
X.ww.init(logical_types={"col_1": "NaturalLanguage", "col_2": "NaturalLanguage"})
tf = TextFeaturizer()
tf.fit(X)
X_t = tf.transform(X)
@@ -265,6 +271,7 @@ def test_diversity_primitive_output():
]
}
)
X.ww.init(logical_types={"diverse": "NaturalLanguage"})
tf = TextFeaturizer()
tf.fit(X)

@@ -284,6 +291,7 @@ def test_lsa_primitive_output():
]
}
)
X.ww.init(logical_types={"lsa": "NaturalLanguage"})
tf = TextFeaturizer()
tf.fit(X)

@@ -300,7 +308,9 @@ def test_featurizer_custom_types(text_df):
# force one of the two provided columns to be a user-specified type.
# if the output contains text features for col_2, then the text featurizer didn't pass the right
# ww types to LSA, because LSA still thought col_2 was natural language even though the user said otherwise.
X = infer_feature_types(text_df, {"col_2": "categorical"})
X = infer_feature_types(
pd.DataFrame(text_df), {"col_1": "NaturalLanguage", "col_2": "categorical"}
)
tf = TextFeaturizer()
tf.fit(X)

@@ -339,6 +349,7 @@ def test_mean_characters_primitive_output():
]
}
)
X.ww.init(logical_types={"mean_characters": "NaturalLanguage"})
tf = TextFeaturizer()
tf.fit(X)

@@ -361,6 +372,7 @@ def test_polarity_primitive_output():
]
}
)
X.ww.init(logical_types={"polarity": "NaturalLanguage"})
tf = TextFeaturizer()
tf.fit(X)

@@ -415,7 +427,7 @@ def test_text_featurizer_woodwork_custom_overrides_returned_by_components(X_df):
for logical_type in override_types:
try:
X = X_df.copy()
X.ww.init(logical_types={0: logical_type})
X.ww.init(logical_types={0: logical_type, "text col": "NaturalLanguage"})
except ww.exceptions.TypeConversionError:
continue

@@ -443,6 +455,7 @@ def test_text_featurizer_sets_max_depth_1(mock_dfs):
]
}
)
X.ww.init(logical_types={"polarity": "NaturalLanguage"})
tf = TextFeaturizer()
tf.fit(X)
_, kwargs = mock_dfs.call_args
1 change: 1 addition & 0 deletions evalml/tests/conftest.py
@@ -283,6 +283,7 @@ def text_df():
],
}
)
df.ww.init(logical_types={"col_1": "NaturalLanguage", "col_2": "NaturalLanguage"})
yield df


5 changes: 2 additions & 3 deletions evalml/tests/data_checks_tests/test_data_checks.py
@@ -224,8 +224,8 @@ def test_default_data_checks_classification(input_type):

y = pd.Series([0, 1, np.nan, 1, 0])
y_multiclass = pd.Series([0, 1, np.nan, 2, 0])
X.ww.init(logical_types={"natural_language_nan": "NaturalLanguage"})
if input_type == "ww":
X.ww.init()
y = ww.init_series(y)
y_multiclass = ww.init_series(y_multiclass)

@@ -336,9 +336,8 @@ def test_default_data_checks_regression(input_type):
X["nan_dt_col"][0] = None
y = pd.Series([0.3, 100.0, np.nan, 1.0, 0.2])
y_no_variance = pd.Series([5] * 5)

X.ww.init(logical_types={"natural_language_nan": "NaturalLanguage"})
if input_type == "ww":
X.ww.init()
y = ww.init_series(y)
y_no_variance = ww.init_series(y_no_variance)
null_leakage = [