Make EvalML compatible with Woodwork changes #4066

Merged · 23 commits · Mar 14, 2023
2 changes: 1 addition & 1 deletion .github/meta.yaml
@@ -36,7 +36,7 @@ outputs:
     - click>=8.0.0
     - shap >=0.40.0
     - texttable >=1.6.2
-    - woodwork >=0.21.1
+    - woodwork >=0.22.0
     - featuretools>=1.16.0
     - nlp-primitives>=2.9.0
     - python >=3.8.*
2 changes: 2 additions & 0 deletions docs/source/release_notes.rst
@@ -23,6 +23,8 @@ Release Notes
     * Testing Changes
         * Use ``release.yaml`` for performance tests on merge to main :pr:`4007`
         * Pin ``github-action-check-linked-issues`` at v1.4.5 :pr:`4042`
+        * Updated tests to support Woodwork's object dtype inference for numeric columns :pr:`4066`
+        * Updated ``TargetLeakageDataCheck`` tests to handle boolean targets properly :pr:`4066`

.. warning::
1 change: 1 addition & 0 deletions docs/source/user_guide/timeseries.ipynb
@@ -131,6 +131,7 @@
 "outputs": [],
 "source": [
 "X[\"Categorical\"] = [str(i % 4) for i in range(len(X))]\n",
+"X[\"Categorical\"] = X[\"Categorical\"].astype(\"category\")\n",
 "X[\"Numeric\"] = [i for i in range(len(X))]\n",
 "\n",
 "# Re-split the data since we modified X\n",

Review comment (Contributor), on the added line: do you want to add a comment here and below explaining why we're setting it as category instead of leaving it as object? Might be good to remember.
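As context for the reviewer's question: the likely motivation, per the release note above about Woodwork's object dtype inference for numeric columns, is that Woodwork >= 0.22 may infer an object-dtype column of digit-only strings as an integer type, while an explicit category cast pins the intended logical type. A minimal sketch of that assumption:

```python
import pandas as pd
import woodwork as ww

# Digit-only strings in an object column: under Woodwork >= 0.22 these may
# be inferred as an integer logical type rather than categorical.
s = pd.Series([str(i % 4) for i in range(20)])
print(ww.init_series(s).ww.logical_type)

# Casting to pandas' category dtype pins the categorical interpretation.
print(ww.init_series(s.astype("category")).ww.logical_type)
```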
3 changes: 1 addition & 2 deletions evalml/pipelines/classification_pipeline.py
@@ -1,7 +1,6 @@
 """Pipeline subclass for all classification pipelines."""
 import numpy as np
 import pandas as pd
-import woodwork as ww

 from evalml.pipelines import PipelineBase
 from evalml.problem_types import is_binary, is_multiclass
@@ -71,7 +70,7 @@ def fit(self, X, y):

        # TODO: Added this in because numpy's unique() does not support pandas.NA
        try:
-            self._classes_ = list(ww.init_series(np.unique(y)))
+            self._classes_ = list(np.unique(y))
        except TypeError as e:
            if "boolean value of NA is ambiguous" in str(e):
                self._classes_ = y.unique()
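For context on the try/except this change preserves, here is a minimal sketch of the failure mode it guards against (illustrative names, not EvalML's code):

```python
import numpy as np
import pandas as pd

y = pd.Series([True, False, pd.NA], dtype="boolean")
try:
    # Sorting inside np.unique compares elements against pd.NA and raises.
    classes = list(np.unique(y))
except TypeError as e:
    assert "boolean value of NA is ambiguous" in str(e)
    classes = list(y.unique())  # pandas' unique() tolerates NA
print(classes)
```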
@@ -71,7 +71,7 @@ def test_select_by_type_empty_X():
            [
                lambda X, X_t: X_t.empty,
                lambda X, X_t: X_t.empty,
-                lambda X, X_t: X_t.equals(X[["three"]].astype("int64")),
+                lambda X, X_t: X_t.equals(X[["one", "three"]].astype("int64")),
                lambda X, X_t: X_t.astype(str).equals(X.astype(str)),
            ],
        ),
@@ -135,7 +135,7 @@ def test_column_transformer_transform(class_to_test, checking_functions):
            SelectByType,
            [
                lambda X, X_t: X_t.empty,
-                lambda X, X_t: X_t.equals(X[["three"]].astype("int64")),
+                lambda X, X_t: X_t.equals(X[["one", "three"]].astype("int64")),
                lambda X, X_t: X_t.astype(str).equals(X.astype(str)),
            ],
        ),
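The new expectation that column "one" is also selected as int64 is consistent with Woodwork >= 0.22 typing numeric-looking object columns as integers. A sketch under that assumption — the fixture is hypothetical, and the `column_types` argument is my reading of SelectByType's API:

```python
import pandas as pd
from evalml.pipelines.components import SelectByType

# Hypothetical fixture: "one" holds digit-only strings, "three" real integers.
X = pd.DataFrame({"one": ["1", "2", "3"], "two": ["a", "b", "c"], "three": [1, 2, 3]})

selector = SelectByType(column_types=["Integer"])
selector.fit(X)
X_t = selector.transform(X)
# With Woodwork >= 0.22 typing "one" as an integer column, both survive:
print(X_t.columns.tolist())  # expected: ["one", "three"]
```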
15 changes: 9 additions & 6 deletions evalml/tests/component_tests/test_target_encoder.py
@@ -121,29 +121,32 @@ def test_cols():
        {
            "col_1": [1, 2, 1, 1, 2] * 2,
            "col_2": ["2", "1", "1", "1", "1"] * 2,
-            "col_3": ["a", "a", "a", "a", "a"] * 2,
+            "col_3": ["a", "a", "a", "a", "b"] * 2,
        },
    )
-    X_expected = X.astype({"col_1": "int64", "col_2": "category", "col_3": "category"})
+    X_expected = X.astype({"col_1": "int64", "col_2": "int64", "col_3": "category"})
    y = pd.Series([0, 1, 1, 1, 0] * 2)
    encoder = TargetEncoder(cols=[])
    encoder.fit(X, y)
    X_t = encoder.transform(X)
    assert_frame_equal(X_expected, X_t)

-    encoder = TargetEncoder(cols=["col_2"])
+    encoder = TargetEncoder(cols=["col_3"])
    encoder.fit(X, y)
    X_t = encoder.transform(X)
    X_expected = pd.DataFrame(
        {
            "col_1": pd.Series([1, 2, 1, 1, 2] * 2, dtype="int64"),
-            "col_2": [0.161365, 0.749863, 0.749863, 0.749863, 0.749863] * 2,
-            "col_3": pd.Series(["a", "a", "a", "a", "a"] * 2, dtype="category"),
+            "col_2": [2, 1, 1, 1, 1] * 2,
+            "col_3": pd.Series(
+                [0.749863, 0.749863, 0.749863, 0.749863, 0.161365] * 2,
+                dtype="float64",
+            ),
        },
    )
    assert_frame_equal(X_expected, X_t, check_less_precise=True)

-    encoder = TargetEncoder(cols=["col_2", "col_3"])
+    encoder = TargetEncoder(cols=["col_3"])
    encoder.fit(X, y)
    X_t = encoder.transform(X)
    encoder2 = TargetEncoder()
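For reference, the expected values encode standard smoothed target encoding: each category maps to a blend of its in-category target mean and the global mean. A sketch, assuming EvalML's TargetEncoder delegates to the category_encoders package:

```python
import pandas as pd
from category_encoders import TargetEncoder  # assumed backend

X = pd.DataFrame({"col_3": ["a", "a", "a", "a", "b"] * 2})
y = pd.Series([0, 1, 1, 1, 0] * 2)

# "a" (raw target mean 0.75) stays near 0.75; "b" (raw mean 0.0, only two
# rows) is shrunk toward the global mean of 0.6 -- hence the ~0.16 above.
encoded = TargetEncoder(cols=["col_3"]).fit_transform(X, y)
print(encoded["col_3"].unique())
```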
18 changes: 12 additions & 6 deletions evalml/tests/data_checks_tests/test_id_columns_data_check.py
@@ -127,15 +127,15 @@ def test_id_columns_strings():
    id_cols_check = IDColumnsDataCheck(id_threshold=1.0)
    assert id_cols_check.validate(X) == [
        DataCheckWarning(
-            message="Columns 'Id' are 100.0% or more likely to be an ID column",
+            message="Columns 'Id', 'col_3_id' are 100.0% or more likely to be an ID column",
            data_check_name=id_data_check_name,
            message_code=DataCheckMessageCode.HAS_ID_COLUMN,
-            details={"columns": ["Id"]},
+            details={"columns": ["Id", "col_3_id"]},
            action_options=[
                DataCheckActionOption(
                    DataCheckActionCode.DROP_COL,
                    data_check_name=id_data_check_name,
-                    metadata={"columns": ["Id"]},
+                    metadata={"columns": ["Id", "col_3_id"]},
                ),
            ],
        ).to_dict(),
@@ -293,17 +293,23 @@ def test_unidentified_first_col_primary_key(
    )

    id_cols_check = IDColumnsDataCheck(id_threshold=0.95)
+    if input_type == "string":
+        order = ["col_2", "col_3_id", "col_1_id"]
+    else:
+        order = ["col_2", "col_1_id", "col_3_id"]
+    order_msg = f"Columns '{order[0]}', '{order[1]}', '{order[2]}' are 95.0% or more likely to be an ID column"
+
    assert id_cols_check.validate(X) == [
        DataCheckWarning(
-            message="Columns 'col_2', 'col_1_id', 'col_3_id' are 95.0% or more likely to be an ID column",
+            message=order_msg,
            data_check_name=id_data_check_name,
            message_code=DataCheckMessageCode.HAS_ID_COLUMN,
-            details={"columns": ["col_2", "col_1_id", "col_3_id"]},
+            details={"columns": order},
            action_options=[
                DataCheckActionOption(
                    DataCheckActionCode.DROP_COL,
                    data_check_name=id_data_check_name,
-                    metadata={"columns": ["col_2", "col_1_id", "col_3_id"]},
+                    metadata={"columns": order},
                ),
            ],
        ).to_dict(),
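The first hunk's expectation grows because a column of unique, digit-only strings ('col_3_id') can now be typed numerically and so trips the ID heuristic too. A sketch of the check under that assumption (the fixture is hypothetical):

```python
import pandas as pd
from evalml.data_checks import IDColumnsDataCheck

X = pd.DataFrame({
    "Id": range(10),                          # unique integers
    "col_3_id": [str(i) for i in range(10)],  # unique digit strings
    "feature": [1, 2] * 5,
})
messages = IDColumnsDataCheck(id_threshold=1.0).validate(X)
print([m["details"]["columns"] for m in messages])  # expect both ID-like columns flagged
```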
26 changes: 18 additions & 8 deletions evalml/tests/data_checks_tests/test_target_leakage_data_check.py
@@ -193,15 +193,15 @@ def test_target_leakage_types():

    expected = [
        DataCheckWarning(
-            message="Columns 'a', 'b' are 80.0% or more correlated with the target",
+            message="Columns 'a', 'b', 'c' are 80.0% or more correlated with the target",
            data_check_name=target_leakage_data_check_name,
            message_code=DataCheckMessageCode.TARGET_LEAKAGE,
-            details={"columns": ["a", "b"]},
+            details={"columns": ["a", "b", "c"]},
            action_options=[
                DataCheckActionOption(
                    DataCheckActionCode.DROP_COL,
                    data_check_name=target_leakage_data_check_name,
-                    metadata={"columns": ["a", "b"]},
+                    metadata={"columns": ["a", "b", "c"]},
                ),
            ],
        ).to_dict(),
@@ -356,8 +356,21 @@ def test_target_leakage_data_check_warnings_pearson():
    y = y.astype(bool)

    leakage_check = TargetLeakageDataCheck(pct_corr_threshold=0.5, method="pearson")
-    # pearsons does not support boolean columns
-    assert leakage_check.validate(X, y) == []
+    assert leakage_check.validate(X, y) == [
+        DataCheckWarning(
+            message="Columns 'a', 'b', 'c', 'd' are 50.0% or more correlated with the target",
+            data_check_name=target_leakage_data_check_name,
+            message_code=DataCheckMessageCode.TARGET_LEAKAGE,
+            details={"columns": ["a", "b", "c", "d"]},
+            action_options=[
+                DataCheckActionOption(
+                    DataCheckActionCode.DROP_COL,
+                    data_check_name=target_leakage_data_check_name,
+                    metadata={"columns": ["a", "b", "c", "d"]},
+                ),
+            ],
+        ).to_dict(),
+    ]

    y = y.astype(int)
    assert leakage_check.validate(X, y) == [
@@ -447,9 +460,6 @@ def test_target_leakage_none_measures(measures):
    X["b"] = y
    y = y.astype(bool)

-    if measures in ["pearson", "spearman"]:
-        assert leakage_check.validate(X, y) == []
-        return
    assert len(leakage_check.validate(X, y))
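The flip from "pearson returns nothing for boolean targets" to a full warning matches boolean data now participating in numeric correlation. A rough illustration of the idea (not EvalML's internal code):

```python
import pandas as pd

# Viewed as 0/1, a boolean target supports Pearson correlation directly.
y = pd.Series([True, False, True, True, False])
x = pd.Series([0.9, 0.1, 1.0, 1.1, 0.0])
print(x.corr(y.astype(int), method="pearson"))  # near 1.0 -> above a 0.5 threshold
```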
@@ -33,5 +33,5 @@ statsmodels==0.13.5
 texttable==1.6.7
 tomli==2.0.1
 vowpalwabbit==9.7.0
-woodwork==0.21.2
+woodwork==0.22.0
 xgboost==1.7.4
@@ -32,5 +32,5 @@ statsmodels==0.12.2
 texttable==1.6.2
 tomli==2.0.1
 vowpalwabbit==8.11.0
-woodwork==0.21.1
+woodwork==0.22.0
 xgboost==1.7.0
@@ -41,5 +41,5 @@ statsmodels==0.12.2
 texttable==1.6.2
 tomli==2.0.1
 vowpalwabbit==8.11.0
-woodwork==0.21.1
+woodwork==0.22.0
 xgboost==1.7.0
@@ -1132,11 +1132,11 @@ def test_json_serialization(
        pipeline = linear_regression_pipeline
    elif problem_type == problem_type.BINARY:
        X, y = X_y_binary
-        y = pd.Series(y).astype("str")
+        y = pd.Series(y).astype("string")
        pipeline = logistic_regression_binary_pipeline
    else:
        X, y = X_y_multi
-        y = pd.Series(y).astype("str")
+        y = pd.Series(y).astype("string")
        pipeline = logistic_regression_multiclass_pipeline

    pipeline.fit(X, y)
@@ -1148,6 +1148,7 @@
        num_to_explain=1,
        output_format="dict",
    )
+
    assert json.loads(json.dumps(best_worst)) == best_worst

    report = explain_predictions(
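Why "str" became "string": casting with astype("str") produces a plain object column of digit strings, which Woodwork >= 0.22 may re-infer as numeric, while pandas' dedicated string dtype keeps the text interpretation explicit. A quick check of the dtype difference:

```python
import pandas as pd

y = pd.Series([0, 1, 0, 1])
print(y.astype("str").dtype)     # object -- digit strings, open to re-inference
print(y.astype("string").dtype)  # string -- pandas' explicit StringDtype
```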
23 changes: 17 additions & 6 deletions evalml/tests/model_understanding_tests/test_partial_dependence.py
@@ -1425,10 +1425,14 @@ def test_graph_partial_dependence_regression_and_binary_categorical(

    X = pd.DataFrame(X)
    X.columns = [str(i) for i in range(X.shape[1])]
-    X["categorical_column"] = pd.Series([i % 3 for i in range(X.shape[0])]).astype(
+    X["categorical_column"] = pd.Series(
+        [f"cat_{i % 3}" for i in range(X.shape[0])],
+    ).astype(
        "str",
    )
-    X["categorical_column_2"] = pd.Series([i % 6 for i in range(X.shape[0])]).astype(
+    X["categorical_column_2"] = pd.Series(
+        [f"cat_{i % 6}" for i in range(X.shape[0])],
+    ).astype(
        "str",
    )

@@ -1442,7 +1446,7 @@
    )
    plot_data = fig.to_dict()["data"][0]
    assert plot_data["type"] == "bar"
-    assert list(plot_data["x"]) == ["0", "1", "2"]
+    assert list(plot_data["x"]) == ["cat_0", "cat_1", "cat_2"]

    fig = graph_partial_dependence(
        pipeline,
@@ -1453,7 +1457,7 @@
    fig_dict = fig.to_dict()
    plot_data = fig_dict["data"][0]
    assert plot_data["type"] == "contour"
-    assert fig_dict["layout"]["yaxis"]["ticktext"] == ["0", "1", "2"]
+    assert fig_dict["layout"]["yaxis"]["ticktext"] == ["cat_0", "cat_1", "cat_2"]
    assert (
        fig_dict["layout"]["title"]["text"]
        == "Partial Dependence of 'categorical_column' vs. '0'"
@@ -1468,8 +1472,15 @@
    fig_dict = fig.to_dict()
    plot_data = fig_dict["data"][0]
    assert plot_data["type"] == "contour"
-    assert fig_dict["layout"]["xaxis"]["ticktext"] == ["0", "1", "2"]
-    assert fig_dict["layout"]["yaxis"]["ticktext"] == ["0", "1", "2", "3", "4", "5"]
+    assert fig_dict["layout"]["xaxis"]["ticktext"] == ["cat_0", "cat_1", "cat_2"]
+    assert fig_dict["layout"]["yaxis"]["ticktext"] == [
+        "cat_0",
+        "cat_1",
+        "cat_2",
+        "cat_3",
+        "cat_4",
+        "cat_5",
+    ]
    assert (
        fig_dict["layout"]["title"]["text"]
        == "Partial Dependence of 'categorical_column_2' vs. 'categorical_column'"
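Renaming the levels from "0"/"1"/"2" to "cat_0"/"cat_1"/"cat_2" keeps the column unambiguously categorical: it is the same inference issue addressed in the notebook change above, where digit-only strings may now be typed numerically and would no longer be treated as categories in the partial dependence plots. A brief sketch of the assumed distinction:

```python
import pandas as pd
import woodwork as ww

digits = ww.init_series(pd.Series(["0", "1", "2"] * 10))        # may be typed numeric
labels = ww.init_series(pd.Series(["cat_0", "cat_1", "cat_2"] * 10))  # stays categorical
print(digits.ww.logical_type, labels.ww.logical_type)
```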
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -41,7 +41,7 @@ dependencies = [
    "shap >= 0.40.0",
    "statsmodels >= 0.12.2",
    "texttable >= 1.6.2",
-    "woodwork >= 0.21.1",
+    "woodwork >= 0.22.0",
    "dask >= 2022.2.0, != 2022.10.1",
    "featuretools >= 1.16.0",
    "nlp-primitives >= 2.9.0",