diff --git a/evalml/models/auto_base.py b/evalml/models/auto_base.py
index cd69bd63e6..da8854262a 100644
--- a/evalml/models/auto_base.py
+++ b/evalml/models/auto_base.py
@@ -10,6 +10,7 @@
 from evalml import preprocessing
 from evalml.objectives import get_objective, get_objectives
 from evalml.pipelines import get_pipelines
+from evalml.problem_types import ProblemTypes
 from evalml.tuners import SKOptTuner
 from evalml.utils import Logger
 
@@ -21,6 +22,7 @@ def __init__(self, problem_type, tuner, cv, objective, max_pipelines, max_time,
         if tuner is None:
             tuner = SKOptTuner
         self.objective = get_objective(objective)
+        self.problem_type = problem_type
         self.max_pipelines = max_pipelines
         self.max_time = max_time
         self.model_types = model_types
@@ -31,13 +33,16 @@ def __init__(self, problem_type, tuner, cv, objective, max_pipelines, max_time,
         self.verbose = verbose
         self.logger = Logger(self.verbose)
 
-        self.possible_pipelines = get_pipelines(problem_type=problem_type, model_types=model_types)
-        objective = get_objective(objective)
+        self.possible_pipelines = get_pipelines(problem_type=self.problem_type, model_types=model_types)
+        self.objective = get_objective(objective)
+
+        if self.problem_type not in self.objective.problem_types:
+            raise ValueError("Given objective {} is not compatible with a {} problem.".format(self.objective.name, self.problem_type.value))
 
         if additional_objectives is not None:
             additional_objectives = [get_objective(o) for o in additional_objectives]
         else:
-            additional_objectives = get_objectives(problem_type)
+            additional_objectives = get_objectives(self.problem_type)
 
         # if our main objective is part of default set of objectives for problem_type, remove it
         existing_main_objective = next((obj for obj in additional_objectives if obj.name == self.objective.name), None)
@@ -84,6 +89,9 @@ def fit(self, X, y, feature_types=None, raise_errors=False):
         if not isinstance(y, pd.Series):
             y = pd.Series(y)
 
+        if self.problem_type != ProblemTypes.REGRESSION:
+            self.check_multiclass(y)
+
         self.logger.log_title("Beginning pipeline search")
         self.logger.log("Optimizing for %s. " % self.objective.name, new_line=False)
 
@@ -119,6 +127,15 @@ def fit(self, X, y, feature_types=None, raise_errors=False):
 
         self.logger.log("\n✔ Optimization finished")
 
+    def check_multiclass(self, y):
+        if y.nunique() <= 2:
+            return
+        if ProblemTypes.MULTICLASS not in self.objective.problem_types:
+            raise ValueError("Given objective {} is not compatible with a multiclass problem.".format(self.objective.name))
+        for obj in self.additional_objectives:
+            if ProblemTypes.MULTICLASS not in obj.problem_types:
+                raise ValueError("Additional objective {} is not compatible with a multiclass problem.".format(obj.name))
+
     def _do_iteration(self, X, y, pbar, raise_errors):
         # determine which pipeline to build
         pipeline_class = self._select_pipeline()
diff --git a/evalml/models/auto_classifier.py b/evalml/models/auto_classifier.py
index 9734e157e0..d8da1d89c2 100644
--- a/evalml/models/auto_classifier.py
+++ b/evalml/models/auto_classifier.py
@@ -3,6 +3,7 @@
 
 from .auto_base import AutoBase
 
+from evalml.objectives import get_objective
 from evalml.problem_types import ProblemTypes
 
 
@@ -58,15 +59,20 @@ def __init__(self,
 
             verbose (boolean): If True, turn verbosity on. Defaults to True
         """
-        if objective is None:
-            objective = "precision"
 
         if cv is None:
             cv = StratifiedKFold(n_splits=3, random_state=random_state)
 
-        problem_type = ProblemTypes.BINARY
-        if multiclass:
+        # set default objective if none provided
+        if objective is None and not multiclass:
+            objective = "precision"
+            problem_type = ProblemTypes.BINARY
+        elif objective is None and multiclass:
+            objective = "precision_micro"
             problem_type = ProblemTypes.MULTICLASS
+        else:
+            problem_type = self.set_problem_type(objective, multiclass)
+
         super().__init__(
             tuner=tuner,
             objective=objective,
@@ -82,3 +88,18 @@ def __init__(self,
             verbose=verbose,
             additional_objectives=additional_objectives
         )
+
+    def set_problem_type(self, objective, multiclass):
+        """
+        Given an objective, determine the problem type:
+            a. Set problem_type to MULTICLASS if the objective is exclusively multiclass
+            b. Set problem_type to MULTICLASS if multiclass is true
+            c. Otherwise, default to BINARY
+        """
+        problem_type = ProblemTypes.BINARY
+        # if exclusively multiclass: infer
+        if [ProblemTypes.MULTICLASS] == get_objective(objective).problem_types:
+            problem_type = ProblemTypes.MULTICLASS
+        elif multiclass:
+            problem_type = ProblemTypes.MULTICLASS
+        return problem_type
diff --git a/evalml/objectives/fraud_cost.py b/evalml/objectives/fraud_cost.py
index 0e0b99d61c..e8bb17b6cc 100644
--- a/evalml/objectives/fraud_cost.py
+++ b/evalml/objectives/fraud_cost.py
@@ -1,9 +1,12 @@
 from .objective_base import ObjectiveBase
 
+from evalml.problem_types import ProblemTypes
+
 
 class FraudCost(ObjectiveBase):
     """Score the percentage of money lost of the total transaction amount process due to fraud"""
     name = "Fraud Cost"
+    problem_types = [ProblemTypes.BINARY]
     needs_fitting = True
     greater_is_better = False
     uses_extra_columns = True
diff --git a/evalml/objectives/lead_scoring.py b/evalml/objectives/lead_scoring.py
index 631a4059a9..ae7f66e3af 100644
--- a/evalml/objectives/lead_scoring.py
+++ b/evalml/objectives/lead_scoring.py
@@ -1,9 +1,13 @@
 from .objective_base import ObjectiveBase
 
+from evalml.problem_types import ProblemTypes
+
 
 class LeadScoring(ObjectiveBase):
     """Lead scoring"""
     name = "Lead Scoring"
+    problem_types = [ProblemTypes.BINARY]
+
     needs_fitting = True
     greater_is_better = True
     fit_needs_proba = True
diff --git a/evalml/problem_types/problem_types.py b/evalml/problem_types/problem_types.py
index 4e0ea50373..f55e8dd0bb 100644
--- a/evalml/problem_types/problem_types.py
+++ b/evalml/problem_types/problem_types.py
@@ -3,6 +3,6 @@
 
 class ProblemTypes(Enum):
     """Enum for type of machine learning problem: BINARY, MULTICLASS, or REGRESSION"""
-    BINARY = 'BINARY'
-    MULTICLASS = 'MULTICLASS'
-    REGRESSION = 'REGRESSION'
+    BINARY = 'binary'
+    MULTICLASS = 'multiclass'
+    REGRESSION = 'regression'
diff --git a/evalml/tests/automl_tests/test_autoclassifier.py b/evalml/tests/automl_tests/test_autoclassifier.py
index 914b424f48..8e1443a4f1 100644
--- a/evalml/tests/automl_tests/test_autoclassifier.py
+++ b/evalml/tests/automl_tests/test_autoclassifier.py
@@ -106,6 +106,15 @@ def test_binary_auto(X_y):
     assert len(np.unique(y_pred)) == 2
 
 
+def test_multi_error(X_y_multi):
+    X, y = X_y_multi
+    error_clfs = [AutoClassifier(objective='recall'), AutoClassifier(objective='recall_micro', additional_objectives=['recall'], multiclass=True)]
+    error_msg = 'not compatible with a multiclass problem.'
+    for clf in error_clfs:
+        with pytest.raises(ValueError, match=error_msg):
+            clf.fit(X, y)
+
+
 def test_multi_auto(X_y_multi):
     X, y = X_y_multi
     clf = AutoClassifier(objective="recall_micro", multiclass=True)
@@ -125,6 +134,27 @@ def test_multi_auto(X_y_multi):
     assert clf.additional_objectives == expected_additional_objectives
 
 
+def test_multi_objective(X_y_multi):
+    error_msg = 'Given objective Recall is not compatible with a multiclass problem'
+    with pytest.raises(ValueError, match=error_msg):
+        clf = AutoClassifier(objective="recall", multiclass=True)
+
+    clf = AutoClassifier(objective="log_loss")
+    assert clf.problem_type == ProblemTypes.BINARY
+
+    clf = AutoClassifier(objective='recall_micro')
+    assert clf.problem_type == ProblemTypes.MULTICLASS
+
+    clf = AutoClassifier(objective='recall')
+    assert clf.problem_type == ProblemTypes.BINARY
+
+    clf = AutoClassifier(multiclass=True)
+    assert clf.problem_type == ProblemTypes.MULTICLASS
+
+    clf = AutoClassifier()
+    assert clf.problem_type == ProblemTypes.BINARY
+
+
 def test_categorical_classification(X_y_categorical_classification):
     X, y = X_y_categorical_classification
     clf = AutoClassifier(objective="recall", max_pipelines=5, multiclass=False)
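
A minimal sketch of the behavior this diff introduces, distilled from the new tests above. It assumes `AutoClassifier` is importable as `evalml.models.AutoClassifier` (the import path is not shown in this diff):

```python
from evalml.models import AutoClassifier
from evalml.problem_types import ProblemTypes

# An exclusively-multiclass objective (e.g. recall_micro) now infers
# ProblemTypes.MULTICLASS via set_problem_type, without multiclass=True.
clf = AutoClassifier(objective="recall_micro")
assert clf.problem_type == ProblemTypes.MULTICLASS

# A binary-only objective combined with multiclass=True is rejected at
# construction time by the new compatibility check in AutoBase.__init__,
# instead of failing midway through the pipeline search.
try:
    AutoClassifier(objective="recall", multiclass=True)
except ValueError as err:
    print(err)  # Given objective Recall is not compatible with a multiclass problem.
```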