Merge branch 'main' into bc_296_lgbm_regressor

bchen1116 committed Dec 16, 2020
2 parents c426f9b + 7e21b20 commit 6283a2c
Showing 7 changed files with 135 additions and 46 deletions.
2 changes: 1 addition & 1 deletion core-requirements.txt
@@ -10,7 +10,7 @@ psutil>=5.6.3
requirements-parser>=0.2.0
shap>=0.35.0
texttable>=1.6.2
woodwork==0.0.6
woodwork==0.0.7
featuretools>=0.20.0
nlp-primitives>=1.1.0
networkx>=2.5
2 changes: 2 additions & 0 deletions docs/source/release_notes.rst
@@ -17,6 +17,7 @@ Release Notes
* Add woodwork support for more utility and graph methods :pr:`1544`
* Changed ``DateTimeFeaturizer`` to encode features as int :pr:`1479`
* Added `Linear Discriminant Analysis Transformer` component for dimensionality reduction :pr:`1331`
* Added multiclass support for ``partial_dependence`` and ``graph_partial_dependence`` :pr:`1554`
* Fixes
* Fix Windows CI jobs: install ``numba`` via conda, required for ``shap`` :pr:`1490`
* Added custom-index support for `reset-index-get_prediction_vs_actual_over_time_data` :pr:`1494`
@@ -29,6 +30,7 @@ Release Notes
* Added script to generate github markdown for releases :pr:`1487`
* Updated dependencies to fix ``ImportError: cannot import name 'MaskedArray' from 'sklearn.utils.fixes'`` error and to address Woodwork and Featuretools dependencies :pr:`1540`
* Made ``get_prediction_vs_actual_data()`` a public method :pr:`1553`
* Updated ``Woodwork`` version requirement to v0.0.7 :pr:`1560`
* Documentation Changes
* Added partial dependence methods to API reference :pr:`1537`
* Testing Changes
67 changes: 57 additions & 10 deletions evalml/model_understanding/graphs.py
@@ -439,7 +439,12 @@ def partial_dependence(pipeline, X, feature, grid_resolution=100):
Returns:
pd.DataFrame: DataFrame with averaged predictions for all points in the grid averaged
over all samples of X and the values used to calculate those predictions.
over all samples of X and the values used to calculate those predictions. The dataframe will
contain two columns: "feature_values" (grid points at which the partial dependence was calculated) and
"partial_dependence" (the partial dependence at that feature value). For classification problems, there
will be a third column called "class_label" (the class label for which the partial
dependence was calculated). For binary classification, the partial dependence is only calculated for the
"positive" class.
"""
X = _convert_to_woodwork_structure(X)
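
To make the binary-classification behavior described in the new docstring concrete, here is a minimal sketch; `pipeline` is assumed to be a fitted binary classification pipeline and `X` its feature matrix, both placeholders:

# Illustrative sketch, not part of the commit: the frame has one row per grid
# point, and class_label holds only the positive class (pipeline.classes_[1]).
part_dep = partial_dependence(pipeline, X, feature="mean radius", grid_resolution=20)
assert list(part_dep.columns) == ["feature_values", "partial_dependence", "class_label"]
assert len(part_dep) == 20
assert (part_dep["class_label"] == pipeline.classes_[1]).all()
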
@@ -462,11 +467,21 @@ def partial_dependence(pipeline, X, feature, grid_resolution=100):
# Delete scikit-learn attributes that were temporarily set
del pipeline._estimator_type
del pipeline.feature_importances_
return pd.DataFrame({"feature_values": values[0],
"partial_dependence": avg_pred[0]})
classes = None
if isinstance(pipeline, evalml.pipelines.BinaryClassificationPipeline):
classes = [pipeline.classes_[1]]
elif isinstance(pipeline, evalml.pipelines.MulticlassClassificationPipeline):
classes = pipeline.classes_

data = pd.DataFrame({"feature_values": np.tile(values[0], avg_pred.shape[0]),
"partial_dependence": np.concatenate([pred for pred in avg_pred])})
if classes is not None:
data['class_label'] = np.repeat(classes, len(values[0]))

return data
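
The tile/repeat construction above flattens a (n_classes, grid_resolution) prediction array into one long-format frame; a self-contained sketch with illustrative numbers (class names and values are made up):

# Illustrative sketch of the reshaping, not code from the commit.
import numpy as np
import pandas as pd

grid = np.array([1.5, 2.5])                       # stands in for values[0]: two grid points
avg_pred = np.array([[0.2, 0.3],                  # averaged predictions, one row per class
                     [0.5, 0.4],
                     [0.3, 0.3]])
classes = ["class_0", "class_1", "class_2"]

data = pd.DataFrame({"feature_values": np.tile(grid, avg_pred.shape[0]),
                     "partial_dependence": np.concatenate([pred for pred in avg_pred])})
data["class_label"] = np.repeat(classes, len(grid))
# Result: 6 rows, the grid repeated once per class, e.g. (1.5, 0.2, "class_0"),
# (2.5, 0.3, "class_0"), (1.5, 0.5, "class_1"), ... i.e. grid_resolution * n_classes rows.
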

def graph_partial_dependence(pipeline, X, feature, grid_resolution=100):

def graph_partial_dependence(pipeline, X, feature, class_label=None, grid_resolution=100):
"""Create an one-way partial dependence plot.
Arguments:
@@ -476,6 +491,10 @@ def graph_partial_dependence(pipeline, X, feature, grid_resolution=100):
feature (int, string): The target feature for which to create the partial dependence plot.
If feature is an int, it must be the index of the feature to use.
If feature is a string, it must be a valid column name in X.
class_label (string, optional): Name of class to plot for multiclass problems. If None, will plot
the partial dependence for each class. This argument does not change behavior for regression or binary
classification pipelines. For binary classification, the partial dependence for the positive label will
always be displayed. Defaults to None.
Returns:
pd.DataFrame: pd.DataFrame with averaged predictions for all points in the grid averaged
@@ -485,19 +504,47 @@ def graph_partial_dependence(pipeline, X, feature, grid_resolution=100):
_go = import_or_raise("plotly.graph_objects", error_msg="Cannot find dependency plotly.graph_objects")
if jupyter_check():
import_or_raise("ipywidgets", warning=True)
if isinstance(pipeline, evalml.pipelines.MulticlassClassificationPipeline) and class_label is not None:
if class_label not in pipeline.classes_:
msg = f"Class {class_label} is not one of the classes the pipeline was fit on: {', '.join(list(pipeline.classes_))}"
raise ValueError(msg)

part_dep = partial_dependence(pipeline, X, feature=feature, grid_resolution=grid_resolution)
feature_name = str(feature)
title = f"Partial Dependence of '{feature_name}'"
     layout = _go.Layout(title={'text': title},
-                        xaxis={'title': f'{feature_name}', 'range': _calculate_axis_range(part_dep['feature_values'])},
-                        yaxis={'title': 'Partial Dependence', 'range': _calculate_axis_range(part_dep['partial_dependence'])})
-    data = []
-    data.append(_go.Scatter(x=part_dep['feature_values'],
+                        xaxis={'title': f'{feature_name}'},
+                        yaxis={'title': 'Partial Dependence'},
+                        showlegend=False)
+    if isinstance(pipeline, evalml.pipelines.MulticlassClassificationPipeline):
+        class_labels = [class_label] if class_label is not None else pipeline.classes_
+        _subplots = import_or_raise("plotly.subplots", error_msg="Cannot find dependency plotly.subplots")
+
+        # If the user passes in a value for class_label, create a 1 x 1 subplot; otherwise there
+        # would be an empty column in the plot and it would look awkward
+        rows, cols = ((len(class_labels) + 1) // 2, 2) if len(class_labels) > 1 else (1, len(class_labels))
+
+        # Don't specify share_xaxis and share_yaxis so that we get tickmarks in each subplot
+        fig = _subplots.make_subplots(rows=rows, cols=cols, subplot_titles=class_labels)
+        for i, label in enumerate(class_labels):
+
+            # Plotly subplot rows and cols are 1-indexed, so shift i when computing each position
+            fig.add_trace(_go.Scatter(x=part_dep.loc[part_dep.class_label == label, 'feature_values'],
+                                      y=part_dep.loc[part_dep.class_label == label, 'partial_dependence'],
+                                      line=dict(width=3),
+                                      name=label),
+                          row=(i + 2) // 2, col=(i % 2) + 1)
+        fig.update_layout(layout)
+        fig.update_xaxes(title=f'{feature_name}', range=_calculate_axis_range(part_dep['feature_values']))
+        fig.update_yaxes(range=_calculate_axis_range(part_dep['partial_dependence']))
+    else:
+        trace = _go.Scatter(x=part_dep['feature_values'],
                             y=part_dep['partial_dependence'],
                             name='Partial Dependence',
-                            line=dict(width=3)))
-    return _go.Figure(layout=layout, data=data)
+                            line=dict(width=3))
+        fig = _go.Figure(layout=layout, data=[trace])
+
+    return fig
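
The subplot placement above is compact; a small worked sketch of the arithmetic for a three-class pipeline (values illustrative only):

# Illustrative check of the row/col math above, not code from the commit.
class_labels = ["class_0", "class_1", "class_2"]
rows, cols = ((len(class_labels) + 1) // 2, 2) if len(class_labels) > 1 else (1, len(class_labels))
assert (rows, cols) == (2, 2)                     # 2 x 2 grid; the fourth cell stays empty
positions = [((i + 2) // 2, (i % 2) + 1) for i in range(len(class_labels))]
assert positions == [(1, 1), (1, 2), (2, 1)]      # 1-indexed, filled left to right, top to bottom
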


def _calculate_axis_range(arr):
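
An end-to-end sketch of how the new class_label argument could be exercised, mirroring the tests below; `pipeline` is a placeholder for any fitted evalml multiclass pipeline:

# Illustrative usage sketch, not code from the commit.
from evalml.demos import load_wine
from evalml.model_understanding.graphs import graph_partial_dependence

X, y = load_wine()
# `pipeline` is assumed: any evalml MulticlassClassificationPipeline instance.
pipeline.fit(X, y)

# One subplot per class:
fig_all = graph_partial_dependence(pipeline, X, feature='magnesium', grid_resolution=20)

# A single 1 x 1 plot for one class; an unknown label raises ValueError:
fig_one = graph_partial_dependence(pipeline, X, feature='magnesium',
                                   class_label='class_1', grid_resolution=20)
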
4 changes: 2 additions & 2 deletions evalml/tests/data_checks_tests/test_id_columns_data_check.py
@@ -132,11 +132,11 @@ def test_id_cols_data_check_input_formats():
"warnings": [DataCheckWarning(message="Column '0' is 80.0% or more likely to be an ID column",
data_check_name=id_data_check_name,
message_code=DataCheckMessageCode.HAS_ID_COLUMN,
details={"column": '0'}).to_dict(),
details={"column": 0}).to_dict(),
DataCheckWarning(message="Column '1' is 80.0% or more likely to be an ID column",
data_check_name=id_data_check_name,
message_code=DataCheckMessageCode.HAS_ID_COLUMN,
details={"column": '1'}).to_dict()],
details={"column": 1}).to_dict()],
"errors": []
}

@@ -18,5 +18,5 @@ scikit-optimize==0.8.1
scipy==1.5.4
shap==0.37.0
texttable==1.6.3
woodwork==0.0.6
woodwork==0.0.7
xgboost==1.2.1
94 changes: 71 additions & 23 deletions evalml/tests/model_understanding_tests/test_graphs.py
@@ -11,7 +11,7 @@
from sklearn.preprocessing import label_binarize
from skopt.space import Real

from evalml.demos import load_breast_cancer
from evalml.demos import load_breast_cancer, load_wine
from evalml.exceptions import NullsInColumnWarning
from evalml.model_family import ModelFamily
from evalml.model_understanding.graphs import (
@@ -39,6 +39,7 @@
from evalml.objectives import CostBenefitMatrix
from evalml.pipelines import (
BinaryClassificationPipeline,
ClassificationPipeline,
MulticlassClassificationPipeline,
RegressionPipeline
)
@@ -703,6 +704,19 @@ def test_graph_binary_objective_vs_threshold(mock_cb_thresholds, data_type, X_y_
assert np.array_equal(data['y'], mock_cb_thresholds.return_value['score'])


def check_partial_dependence_dataframe(pipeline, part_dep, grid_size=20):
columns = ["feature_values", "partial_dependence"]
if isinstance(pipeline, ClassificationPipeline):
columns.append("class_label")
n_rows_for_class = len(pipeline.classes_) if isinstance(pipeline, MulticlassClassificationPipeline) else 1
assert list(part_dep.columns) == columns
assert len(part_dep["partial_dependence"]) == grid_size * n_rows_for_class
assert len(part_dep["feature_values"]) == grid_size * n_rows_for_class
if isinstance(pipeline, ClassificationPipeline):
per_class_counts = part_dep['class_label'].value_counts()
assert all(value == grid_size for value in per_class_counts.values)


@pytest.mark.parametrize("data_type", ["np", "pd", "ww"])
@pytest.mark.parametrize("problem_type", [ProblemTypes.BINARY, ProblemTypes.MULTICLASS, ProblemTypes.REGRESSION])
def test_partial_dependence_problem_types(data_type, problem_type, X_y_binary, X_y_multi, X_y_regression,
@@ -728,9 +742,7 @@ def test_partial_dependence_problem_types(data_type, problem_type, X_y_binary, X

pipeline.fit(X, y)
part_dep = partial_dependence(pipeline, X, feature=0, grid_resolution=20)
assert list(part_dep.columns) == ["feature_values", "partial_dependence"]
assert len(part_dep["partial_dependence"]) == 20
assert len(part_dep["feature_values"]) == 20
check_partial_dependence_dataframe(pipeline, part_dep)
assert not part_dep.isnull().any(axis=None)
with pytest.raises(AttributeError):
pipeline._estimator_type
@@ -757,7 +769,7 @@ def test_partial_dependence_string_feature_name(logistic_regression_binary_pipel
pipeline = logistic_regression_binary_pipeline_class(parameters={"Logistic Regression Classifier": {"n_jobs": 1}})
pipeline.fit(X, y)
part_dep = partial_dependence(pipeline, X, feature="mean radius", grid_resolution=20)
assert list(part_dep.columns) == ["feature_values", "partial_dependence"]
assert list(part_dep.columns) == ["feature_values", "partial_dependence", "class_label"]
assert len(part_dep["partial_dependence"]) == 20
assert len(part_dep["feature_values"]) == 20
assert not part_dep.isnull().any(axis=None)
@@ -796,29 +808,35 @@ class BaselineTestPipeline(BinaryClassificationPipeline):
partial_dependence(pipeline, X, feature=0, grid_resolution=20)


def test_partial_dependence_catboost(X_y_binary, has_minimal_dependencies):
@pytest.mark.parametrize("problem_type", [ProblemTypes.BINARY, ProblemTypes.MULTICLASS])
def test_partial_dependence_catboost(problem_type, X_y_binary, X_y_multi, has_minimal_dependencies):
if not has_minimal_dependencies:
X, y = X_y_binary

class CatBoostTestPipeline(BinaryClassificationPipeline):
component_graph = ["CatBoost Classifier"]
if problem_type == ProblemTypes.BINARY:
X, y = X_y_binary
y_small = ['a', 'b', 'a']

class CatBoostTestPipeline(BinaryClassificationPipeline):
component_graph = ["CatBoost Classifier"]
else:
X, y = X_y_multi
y_small = ['a', 'b', 'c']

class CatBoostTestPipeline(MulticlassClassificationPipeline):
component_graph = ["CatBoost Classifier"]

pipeline = CatBoostTestPipeline({"CatBoost Classifier": {'thread_count': 1}})
pipeline.fit(X, y)
part_dep = partial_dependence(pipeline, X, feature=0, grid_resolution=20)
assert list(part_dep.columns) == ["feature_values", "partial_dependence"]
assert len(part_dep["partial_dependence"]) == 20
assert len(part_dep["feature_values"]) == 20
check_partial_dependence_dataframe(pipeline, part_dep)
assert not part_dep.isnull().all().all()

# test that CatBoost can natively handle non-numerical columns as feature passed to partial_dependence
X = pd.DataFrame({'numeric': [1, 2, 3], 'also numeric': [2, 3, 4], 'string': ['a', 'b', 'c'], 'also string': ['c', 'b', 'a']})
y = ['a', 'b', 'a']
pipeline = CatBoostTestPipeline({"CatBoost Classifier": {'thread_count': 1}})
pipeline.fit(X, y)
pipeline.fit(X, y_small)
part_dep = partial_dependence(pipeline, X, feature='string')
assert list(part_dep.columns) == ["feature_values", "partial_dependence"]
assert len(part_dep["partial_dependence"]) == 3
assert len(part_dep["feature_values"]) == 3
check_partial_dependence_dataframe(pipeline, part_dep, grid_size=3)
assert not part_dep.isnull().all().all()


@@ -848,15 +866,11 @@ class XGBoostPipeline(MulticlassClassificationPipeline):
pipeline = XGBoostPipeline({'XGBoost Classifier': {'nthread': 1}})
pipeline.fit(X, y)
part_dep = partial_dependence(pipeline, X, feature="<[0]", grid_resolution=20)
assert list(part_dep.columns) == ["feature_values", "partial_dependence"]
assert len(part_dep["partial_dependence"]) == 20
assert len(part_dep["feature_values"]) == 20
check_partial_dependence_dataframe(pipeline, part_dep)
assert not part_dep.isnull().all().all()

part_dep = partial_dependence(pipeline, X, feature=1, grid_resolution=20)
assert list(part_dep.columns) == ["feature_values", "partial_dependence"]
assert len(part_dep["partial_dependence"]) == 20
assert len(part_dep["feature_values"]) == 20
check_partial_dependence_dataframe(pipeline, part_dep)
assert not part_dep.isnull().all().all()


@@ -890,12 +904,46 @@ def test_graph_partial_dependence(test_pipeline):
fig_dict = fig.to_dict()
assert fig_dict['layout']['title']['text'] == "Partial Dependence of 'mean radius'"
assert len(fig_dict['data']) == 1
assert fig_dict['data'][0]['name'] == "Partial Dependence"

part_dep_data = partial_dependence(clf, X, feature='mean radius', grid_resolution=20)
assert np.array_equal(fig_dict['data'][0]['x'], part_dep_data['feature_values'])
assert np.array_equal(fig_dict['data'][0]['y'], part_dep_data['partial_dependence'].values)


def test_graph_partial_dependence_multiclass(logistic_regression_multiclass_pipeline_class):
go = pytest.importorskip('plotly.graph_objects', reason='Skipping plotting test because plotly not installed')
X, y = load_wine()
pipeline = logistic_regression_multiclass_pipeline_class(parameters={"Logistic Regression Classifier": {"n_jobs": 1}})
pipeline.fit(X, y)
fig = graph_partial_dependence(pipeline, X, feature='magnesium', grid_resolution=20)
assert isinstance(fig, go.Figure)
fig_dict = fig.to_dict()
assert len(fig_dict['data']) == len(pipeline.classes_)
for data, label in zip(fig_dict['data'], pipeline.classes_):
assert len(data['x']) == 20
assert len(data['y']) == 20
assert data['name'] == label

# Check that all the subplots axes have the same range
for subplot_1_axis, subplot_2_axis in [('axis2', 'axis3'), ('axis2', 'axis4'), ('axis3', 'axis4')]:
for axis_type in ['x', 'y']:
assert fig_dict['layout'][axis_type + subplot_1_axis]['range'] == fig_dict['layout'][axis_type + subplot_2_axis]['range']

fig = graph_partial_dependence(pipeline, X, feature='magnesium', class_label='class_1', grid_resolution=20)

assert isinstance(fig, go.Figure)
fig_dict = fig.to_dict()
assert len(fig_dict['data']) == 1
assert len(fig_dict['data'][0]['x']) == 20
assert len(fig_dict['data'][0]['y']) == 20
assert fig_dict['data'][0]['name'] == 'class_1'

msg = "Class wine is not one of the classes the pipeline was fit on: class_0, class_1, class_2"
with pytest.raises(ValueError, match=msg):
graph_partial_dependence(pipeline, X, feature='alcohol', class_label='wine')


@patch('evalml.model_understanding.graphs.jupyter_check')
@patch('evalml.model_understanding.graphs.import_or_raise')
def test_jupyter_graph_check(import_check, jupyter_check, X_y_binary, X_y_regression, test_pipeline):
10 changes: 1 addition & 9 deletions evalml/utils/gen_utils.py
@@ -291,20 +291,12 @@ def _convert_to_woodwork_structure(data):
ww_data = data
if isinstance(data, ww.DataTable) or isinstance(data, ww.DataColumn):
return ww_data
# Convert numpy data structures to pandas data structures
if isinstance(data, list):
ww_data = np.array(data)

if isinstance(ww_data, pd.api.extensions.ExtensionArray) or (isinstance(ww_data, np.ndarray) and len(ww_data.shape) == 1):
ww_data = pd.Series(ww_data)
elif isinstance(ww_data, np.ndarray):
ww_data = pd.DataFrame(ww_data)

# Convert pandas data structures to Woodwork data structures
ww_data = ww_data.copy()
if isinstance(ww_data, pd.Series):
if len(ww_data.shape) == 1:
return ww.DataColumn(ww_data)

return ww.DataTable(ww_data)


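
Read together, the surviving lines suggest `_convert_to_woodwork_structure` now relies on Woodwork 0.0.7 accepting numpy data directly; a hedged sketch of the post-change control flow (an approximation, not the verbatim function):

# Approximate sketch of the simplified helper, reconstructed from the kept lines above.
import numpy as np
import woodwork as ww

def _convert_to_woodwork_structure_sketch(data):
    # Pass through data that is already a Woodwork structure.
    if isinstance(data, (ww.DataTable, ww.DataColumn)):
        return data
    # Lists become numpy arrays; other inputs are copied as-is.
    ww_data = np.array(data) if isinstance(data, list) else data
    ww_data = ww_data.copy()
    # 1D data becomes a DataColumn, 2D data a DataTable.
    if len(ww_data.shape) == 1:
        return ww.DataColumn(ww_data)
    return ww.DataTable(ww_data)
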
