Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Updated partial dependence methods to support non-numeric columns #1150

Merged
merged 26 commits into from Sep 25, 2020
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
d5055ac
init
angela97lin Sep 8, 2020
0421d79
release notes
angela97lin Sep 8, 2020
1f409d4
Merge branch 'main' into 1125_non_numeric
angela97lin Sep 8, 2020
2e512d0
add tests and update
angela97lin Sep 9, 2020
ecd9aa8
Merge branch '1125_non_numeric' of github.com:FeatureLabs/evalml into…
angela97lin Sep 9, 2020
ac6b2d5
fix test
angela97lin Sep 9, 2020
9c5f148
fix
angela97lin Sep 10, 2020
6626fad
Merge branch 'main' into 1125_non_numeric
angela97lin Sep 10, 2020
476f1f3
Merge branch 'main' into 1125_non_numeric
angela97lin Sep 10, 2020
aa94e3d
Merge branch 'main' into 1125_non_numeric
angela97lin Sep 10, 2020
667fc89
Merge branch 'main' into 1125_non_numeric
angela97lin Sep 11, 2020
a48f61b
cleanup, add docstr about support for numeric only
angela97lin Sep 13, 2020
cfed916
Merge branch 'main' into 1125_non_numeric
angela97lin Sep 14, 2020
2272e2a
add test and raise error
angela97lin Sep 14, 2020
e13d5df
fix catboost and categorical
angela97lin Sep 14, 2020
00ffd9a
Merge branch 'main' into 1125_non_numeric
angela97lin Sep 17, 2020
6036fe7
Merge branch 'main' into 1125_non_numeric
angela97lin Sep 17, 2020
d1355da
move release notes
angela97lin Sep 17, 2020
da2014f
empty for circleci
angela97lin Sep 21, 2020
cdc35af
Merge branch 'main' into 1125_non_numeric
angela97lin Sep 24, 2020
875495b
Merge branch 'main' into 1125_non_numeric
angela97lin Sep 24, 2020
91aaa73
fix docs
angela97lin Sep 24, 2020
0da2c13
some cleanup, still need to delete temporarily set attributes
angela97lin Sep 24, 2020
2c41287
Merge branch 'main' into 1125_non_numeric
angela97lin Sep 25, 2020
d04cdc9
linting
angela97lin Sep 25, 2020
9b22e89
remove line about only supporting numerical features
angela97lin Sep 25, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
9 changes: 7 additions & 2 deletions evalml/model_understanding/graphs.py
Expand Up @@ -20,6 +20,7 @@
from evalml.objectives.utils import get_objective
from evalml.problem_types import ProblemTypes
from evalml.utils import import_or_raise, jupyter_check
from evalml.utils.gen_utils import numeric_dtypes


def confusion_matrix(y_true, y_predicted, normalize_method='true'):
Expand Down Expand Up @@ -406,7 +407,7 @@ def partial_dependence(pipeline, X, feature, grid_resolution=100):

Arguments:
pipeline (PipelineBase or subclass): Fitted pipeline
X (pd.DataFrame, npermutation importance.array): The input data used to generate a grid of values
X (pd.DataFrame, np.array): The input data used to generate a grid of values
for feature where partial dependence will be calculated at
feature (int, string): The target features for which to create the partial dependence plot for.
If feature is an int, it must be the index of the feature to use.
Expand All @@ -417,6 +418,10 @@ def partial_dependence(pipeline, X, feature, grid_resolution=100):
over all samples of X and the values used to calculate those predictions.

"""
if not isinstance(X, pd.DataFrame):
X = pd.DataFrame(X)
if X[feature].dtype not in numeric_dtypes:
raise ValueError(f"Partial dependence is is currently only supported for numeric dtypes.")
angela97lin marked this conversation as resolved.
Show resolved Hide resolved
if pipeline.model_family == ModelFamily.BASELINE:
raise ValueError("Partial dependence plots are not supported for Baseline pipelines")
if not pipeline._is_fitted:
Expand Down Expand Up @@ -445,7 +450,7 @@ def graph_partial_dependence(pipeline, X, feature, grid_resolution=100):

Arguments:
pipeline (PipelineBase or subclass): Fitted pipeline
X (pd.DataFrame, npermutation importance.array): The input data used to generate a grid of values
X (pd.DataFrame, np.array): The input data used to generate a grid of values
for feature where partial dependence will be calculated at
feature (int, string): The target feature for which to create the partial dependence plot for.
If feature is an int, it must be the index of the feature to use.
Expand Down
18 changes: 18 additions & 0 deletions evalml/tests/model_understanding_tests/test_graphs.py
Expand Up @@ -638,6 +638,24 @@ def test_partial_dependence_string_feature_name(logistic_regression_binary_pipel
assert not part_dep.isnull().any(axis=None)


def test_partial_dependence_nonsupported_dtypes(logistic_regression_binary_pipeline_class):
    """Check that partial_dependence raises ValueError for non-numeric feature columns.

    A pipeline is fitted on a frame holding categorical, object, bool and int
    columns; requesting partial dependence on each of the first three must
    raise the "only supported for numeric dtypes" error.
    """
    # Must stay in sync with the message raised by partial_dependence.
    expected_error = "Partial dependence is is currently only supported for numeric dtypes"
    features = pd.DataFrame({
        "categorical col": pd.Series(["zero", "one", "two", "zero", "three"], dtype='category'),
        "object col": ["b", "b", "a", "c", "d"],
        "bool col": [True, False, False, True, True],
        "int col": [0, 1, 2, 4, 3]
    })
    target = pd.Series([1, 1, 0, 0, 1])
    fitted_pipeline = logistic_regression_binary_pipeline_class(parameters={})
    fitted_pipeline.fit(features, target)
    # Same order as the original sequential checks: categorical, object, bool.
    for column_name in ("categorical col", "object col", "bool col"):
        with pytest.raises(ValueError, match=expected_error):
            partial_dependence(fitted_pipeline, features, feature=column_name, grid_resolution=20)


def test_partial_dependence_with_non_numeric_columns(linear_regression_pipeline_class):
X = pd.DataFrame({'numeric': [1, 2, 3, 0], 'also numeric': [2, 3, 4, 1], 'string': ['a', 'b', 'a', 'c'], 'also string': ['c', 'b', 'a', 'd']})
y = [0, 0.2, 1.4, 1]
Expand Down