Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixes summary for Pipelines without estimators #707

Merged
merged 15 commits into from
Apr 24, 2020
1 change: 1 addition & 0 deletions docs/source/changelog.rst
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ Changelog
* Update make_pipeline_graph to not accidentally create empty file when testing if path is valid :pr:`649`
* Fix pip installation warning about docsutils version, from boto dependency :pr:`664`
* Removed zero division warning for F1/precision/recall metrics :pr:`671`
* Fixed `summary` for pipelines without estimators :pr:`707`
* Changes
* Updated default objective for binary/multiclass classification to log loss :pr:`613`
* Created classification and regression pipeline subclasses and removed objective as an attribute of pipeline classes :pr:`405`
Expand Down
33 changes: 2 additions & 31 deletions docs/source/objectives/custom_objectives.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,6 @@
" name = \"Fraud Cost\"\n",
" needs_fitting = True\n",
" greater_is_better = False\n",
" uses_extra_columns = True\n",
" score_needs_proba = False\n",
"\n",
" def __init__(self, retry_percentage=.5, interchange_fee=.02,\n",
Expand All @@ -63,35 +62,7 @@
"\n",
" def decision_function(self, y_predicted, extra_cols, threshold):\n",
" \"\"\"Determine if transaction is fraud given predicted probabilities,\n",
" dataframe with transaction amount, and threshold\"\"\"\n",
"\n",
" transformed_probs = (y_predicted * extra_cols[self.amount_col])\n",
" return transformed_probs > threshold\n",
"\n",
" def objective_function(self, y_predicted, y_true, extra_cols):\n",
" \"\"\"Calculate amount lost to fraud given predictions, true values, and dataframe\n",
" with transaction amount\"\"\"\n",
"\n",
" # extract transaction using the amount columns in users data\n",
" transaction_amount = extra_cols[self.amount_col]\n",
"\n",
" # amount paid if transaction is fraud\n",
" fraud_cost = transaction_amount * self.fraud_payout_percentage\n",
"\n",
" # money made from interchange fees on transaction\n",
" interchange_cost = transaction_amount * (1 - self.retry_percentage) * self.interchange_fee\n",
"\n",
" # calculate cost of missing fraudulent transactions\n",
" false_negatives = (y_true & ~y_predicted) * fraud_cost\n",
"\n",
" # calculate money lost from fees\n",
" false_positives = (~y_true & y_predicted) * interchange_cost\n",
"\n",
" loss = false_negatives.sum() + false_positives.sum()\n",
"\n",
" loss_per_total_processed = loss / transaction_amount.sum()\n",
"\n",
" return loss_per_total_processed\n"
angela97lin marked this conversation as resolved.
Show resolved Hide resolved
" "
]
}
],
Expand All @@ -116,4 +87,4 @@
},
"nbformat": 4,
"nbformat_minor": 4
}
}
28 changes: 12 additions & 16 deletions evalml/pipelines/pipeline_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,23 +79,19 @@ def summary(cls):
"""Returns a short summary of the pipeline structure, describing the list of components used.
Example: Logistic Regression Classifier w/ Simple Imputer + One Hot Encoder
"""
def _generate_summary(component_graph):
component_graph = copy.copy(component_graph)
component_graph[-1] = handle_component(component_graph[-1])
estimator = component_graph[-1] if isinstance(component_graph[-1], Estimator) else None
if estimator is not None:
summary = "{}".format(estimator.name)
else:
summary = "Pipeline"
for index, component in enumerate(component_graph[:-1]):
component = handle_component(component)
if index == 0:
summary += " w/ {}".format(component.name)
else:
summary += " + {}".format(component.name)
component_graph = copy.copy(cls.component_graph)
if len(component_graph) == 0:
return ""
angela97lin marked this conversation as resolved.
Show resolved Hide resolved
summary = "Pipeline"
component_graph[-1] = handle_component(component_graph[-1])

if isinstance(component_graph[-1], Estimator):
estimator = component_graph.pop()
summary = estimator.name
if len(component_graph) == 0:
return summary

return _generate_summary(cls.component_graph)
component_names = [handle_component(component).name for component in component_graph]
angela97lin marked this conversation as resolved.
Show resolved Hide resolved
return '{} w/ {}'.format(summary, ' + '.join(component_names))

def _validate_estimator_problem_type(self):
"""Validates this pipeline's problem_type against that of the estimator from `self.component_graph`"""
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ def test_catboost_init():
clf = CatBoostBinaryClassificationPipeline(parameters=parameters, random_state=2)
assert clf.parameters == parameters
assert (clf.random_state.get_state()[0] == np.random.RandomState(2).get_state()[0])
assert clf.summary == 'CatBoost Classifier w/ Simple Imputer'
angela97lin marked this conversation as resolved.
Show resolved Hide resolved


def test_catboost_objective_tuning(X_y):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ def test_lor_init(X_y):
clf = LogisticRegressionBinaryPipeline(parameters=parameters, random_state=1)
assert clf.parameters == parameters
assert (clf.random_state.get_state()[0] == np.random.RandomState(1).get_state()[0])
assert clf.summary == 'Logistic Regression Classifier w/ One Hot Encoder + Simple Imputer + Standard Scaler'


def test_lor_objective_tuning(X_y):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ def test_rf_init(X_y):

assert clf.parameters == expected_parameters
assert (clf.random_state.get_state()[0] == np.random.RandomState(2).get_state()[0])
assert clf.summary == 'Random Forest Classifier w/ One Hot Encoder + Simple Imputer + RF Classifier Select From Model'


def test_rf_objective_tuning(X_y):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ def test_xg_init(X_y):

assert clf.parameters == expected_parameters
assert (clf.random_state.get_state()[0] == np.random.RandomState(1).get_state()[0])
assert clf.summary == 'XGBoost Classifier w/ One Hot Encoder + Simple Imputer + RF Classifier Select From Model'


def test_xgboost_objective_tuning(X_y):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ def test_catboost_init():
clf = CatBoostRegressionPipeline(parameters=parameters, random_state=2)
assert clf.parameters == parameters
assert (clf.random_state.get_state()[0] == np.random.RandomState(2).get_state()[0])
assert clf.summary == 'CatBoost Regressor w/ Simple Imputer'


def test_catboost_regression(X_y_reg):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ def test_lr_init(X_y_categorical_regression):
clf = LinearRegressionPipeline(parameters=parameters, random_state=2)
assert clf.parameters == parameters
assert (clf.random_state.get_state()[0] == np.random.RandomState(2).get_state()[0])
assert clf.summary == 'Linear Regressor w/ One Hot Encoder + Simple Imputer + Standard Scaler'


def test_linear_regression(X_y_categorical_regression):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ def test_rf_init(X_y_reg):

assert clf.parameters == expected_parameters
assert (clf.random_state.get_state()[0] == np.random.RandomState(2).get_state()[0])
assert clf.summary == 'Random Forest Regressor w/ One Hot Encoder + Simple Imputer + RF Regressor Select From Model'


def test_rf_regression(X_y_categorical_regression):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ def test_xg_init(X_y_reg):

assert clf.parameters == parameters
assert (clf.random_state.get_state()[0] == np.random.RandomState(1).get_state()[0])
assert clf.summary == 'XGBoost Regressor w/ One Hot Encoder + Simple Imputer + RF Regressor Select From Model'


def test_xgboost_regression(X_y_reg):
Expand Down
29 changes: 22 additions & 7 deletions evalml/tests/pipeline_tests/test_pipelines.py
Original file line number Diff line number Diff line change
Expand Up @@ -243,13 +243,6 @@ class testillformattednamepipeline(BinaryClassificationPipeline):
assert testillformattednamepipeline.name == "Test Illformatted Name Pipeline"


def test_summary(X_y, lr_pipeline):
X, y = X_y
clf = lr_pipeline
assert clf.summary == 'Logistic Regression Classifier w/ One Hot Encoder + Simple Imputer + Standard Scaler'
assert LogisticRegressionBinaryPipeline.summary == 'Logistic Regression Classifier w/ One Hot Encoder + Simple Imputer + Standard Scaler'
angela97lin marked this conversation as resolved.
Show resolved Hide resolved


def test_estimator_not_last(X_y):
X, y = X_y

Expand Down Expand Up @@ -470,3 +463,25 @@ def test_score_with_objective_that_requires_predict_proba(mock_predict, dummy_re
with pytest.raises(ValueError, match="Objective `AUC` does not support score_needs_proba"):
dummy_regression_pipeline.score(X, y, ['recall', 'auc'])
mock_predict.assert_called()


def test_pipeline_summary():
    """Check the `summary` string for several component-graph shapes.

    Covers graphs made only of non-estimator components, a single
    non-estimator component, a single estimator, an empty graph, and a
    graph of components followed by an estimator.
    """
    class MockPipelineWithoutEstimator(PipelineBase):
        component_graph = ["Simple Imputer", "One Hot Encoder"]

    class MockPipelineWithSingleComponent(PipelineBase):
        component_graph = ["Simple Imputer"]

    class MockPipelineWithSingleEstimator(PipelineBase):
        component_graph = ["Random Forest Classifier"]

    class MockPipelineWithNoComponents(PipelineBase):
        component_graph = []

    class MockPipeline(PipelineBase):
        component_graph = ["Simple Imputer", "One Hot Encoder", "Random Forest Classifier"]

    # (pipeline class, expected summary) pairs, checked in declaration order.
    expected_summaries = [
        (MockPipelineWithoutEstimator, "Pipeline w/ Simple Imputer + One Hot Encoder"),
        (MockPipelineWithSingleComponent, "Pipeline w/ Simple Imputer"),
        (MockPipelineWithSingleEstimator, "Random Forest Classifier"),
        (MockPipelineWithNoComponents, ""),
        (MockPipeline, "Random Forest Classifier w/ Simple Imputer + One Hot Encoder"),
    ]
    for pipeline_class, expected_summary in expected_summaries:
        assert pipeline_class.summary == expected_summary
angela97lin marked this conversation as resolved.
Show resolved Hide resolved