alteryx · bchen1116 · Sep 29, 2020 · Sep 24, 2020 · Sep 24, 2020 · Sep 24, 2020
diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst
@@ -8,6 +8,7 @@ Release Notes
         * Added `detect_problem_type` to `problem_type/utils.py` to automatically detect the problem type given targets :pr:`1194`
         * Added LightGBM to AutoMLSearch :pr:`1199`
         * Updates scikit-learn and scikit-optimize to use latest versions - 0.23.2 and 0.8.1 respectively :pr:`1141`
+        * Included internal target check for both training and validation data in AutoMLSearch :pr:`1226`
         * Added `DecisionTreeClassifier` and `DecisionTreeRegressor` classes :pr:`1223`
         * Added `ProblemTypes.all_problem_types` helper to get list of supported problem types :pr:`1219`
         * `DataChecks` can now be parametrized by passing a list of `DataCheck` classes and a parameter dictionary :pr:`1167`

diff --git a/evalml/automl/automl_search.py b/evalml/automl/automl_search.py
@@ -615,6 +615,13 @@ def _compute_cv_scores(self, pipeline, X, y):
             logger.debug(f"\t\tTraining and scoring on fold {i}")
             X_train, X_test = X.iloc[train], X.iloc[test]
             y_train, y_test = y.iloc[train], y.iloc[test]
+            if self.problem_type in [ProblemTypes.BINARY, ProblemTypes.MULTICLASS]:
+                diff_train = set(np.setdiff1d(y, y_train))
+                diff_test = set(np.setdiff1d(y, y_test))
+                diff_string = f"Missing target values in the training set after data split: {diff_train}. " if diff_train else ""
+                diff_string += f"Missing target values in the test set after data split: {diff_test}." if diff_test else ""
+                if diff_string:
+                    raise Exception(diff_string)
             objectives_to_score = [self.objective] + self.additional_objectives
             cv_pipeline = None
             try:

diff --git a/evalml/tests/automl_tests/test_automl.py b/evalml/tests/automl_tests/test_automl.py
@@ -1082,3 +1082,57 @@ def test_max_batches_must_be_non_negative(max_batches):
 
 def test_can_print_out_automl_objective_names():
     AutoMLSearch.print_objective_names_allowed_in_automl()
+
+
+def test_data_split_binary(X_y_binary):
+    """Check that AutoMLSearch.search raises when a binary class is missing from a data split."""
+    X, y = X_y_binary
+    expected_msg = "Missing target values in the"
+    no_checks = {"data_checks": "disabled"}
+    automl = AutoMLSearch(problem_type='binary')
+
+    # With only one, then two, samples of class 1 the search must fail,
+    # whether the data checks are left at their default or disabled.
+    y[:] = 0
+    for positive_idx in (0, 1):
+        y[positive_idx] = 1
+        for search_kwargs in ({}, no_checks):
+            with pytest.raises(Exception, match=expected_msg):
+                automl.search(X, y, **search_kwargs)
+
+    # A third sample of class 1 is expected to let the search complete.
+    y[2] = 1
+    automl.search(X, y, **no_checks)
+
+
+def test_data_split_multi(X_y_multi):
+    """Check that AutoMLSearch.search raises when a multiclass label is missing from a data split."""
+    X, y = X_y_multi
+    missing_msg = "Missing target values"
+    # the data split has no fixed random seed, so match any message printing either 2 sets
+    # (missing in both train and test) or 1 set of multiple elements (all missing on one side)
+    two_sets_or_pair = r"(\{\d?\}.+\{\d?\})|(\{.+\,.+\})"
+    no_checks = {"data_checks": "disabled"}
+    automl = AutoMLSearch(problem_type='multiclass')
+
+    y[:] = 1
+    y[0] = 0  # a single sample of class 0, every other sample is class 1
+    for search_kwargs in ({}, no_checks):
+        with pytest.raises(Exception, match=missing_msg):
+            automl.search(X, y, **search_kwargs)
+
+    y[1] = 2  # now a single sample each of classes 0 and 2
+    for search_kwargs in ({}, no_checks):
+        with pytest.raises(Exception, match=two_sets_or_pair):
+            automl.search(X, y, **search_kwargs)
+
+    y[1] = 0
+    y[2:4] = 2  # two samples each of classes 0 and 2
+    with pytest.raises(Exception, match=missing_msg):
+        automl.search(X, y, **no_checks)
+    y[4] = 2  # three samples of class 2, still only two of class 0
+    with pytest.raises(Exception, match=missing_msg):
+        automl.search(X, y, **no_checks)
+
+    y[5] = 0  # three samples of each rare class: the search is expected to complete
+    automl.search(X, y, **no_checks)