Add integration tests for end to end flow for data checks --> data check actions #2883

Merged Nov 2, 2021 (36 commits; changes shown below are from 19 commits)

Commits
8ac51dc
init
angela97lin Oct 7, 2021
887d8b7
add empty
angela97lin Oct 14, 2021
2a3e898
in the middle of return row removal test
angela97lin Oct 14, 2021
7fba44d
add integration test command and workflow
angela97lin Oct 18, 2021
fb2c330
ignore integration in other core tests
angela97lin Oct 18, 2021
2281c68
rename and call integration test command
angela97lin Oct 18, 2021
deb121d
Merge branch 'main' into 2815_dc_integration
angela97lin Oct 18, 2021
00fc198
merging
angela97lin Oct 19, 2021
0d40228
Merge branch 'main' into 2815_dc_integration
angela97lin Oct 21, 2021
14d8af2
release notes
angela97lin Oct 21, 2021
0e799c1
Merge branch 'main' into 2815_dc_integration
angela97lin Oct 25, 2021
c11f33e
fix integration matrix
angela97lin Oct 25, 2021
8602e35
Merge branch '2815_dc_integration' of github.com:alteryx/evalml into …
angela97lin Oct 25, 2021
fe1cc9e
clean up row removal test and add new method to convert from dict bac…
angela97lin Oct 25, 2021
38ac09e
add tests
angela97lin Oct 26, 2021
650f7ab
add drop rows test
angela97lin Oct 26, 2021
ff475f3
Merge branch 'main' into 2815_dc_integration
angela97lin Oct 26, 2021
b3b6e4b
linting
angela97lin Oct 26, 2021
259c2bf
cleaning up yaml
angela97lin Oct 26, 2021
a45edde
rename integration tests folder
angela97lin Oct 26, 2021
4ffb6f9
add empty init file
angela97lin Oct 26, 2021
c14ca5e
try to add coverage
angela97lin Oct 26, 2021
c491e7c
remove entirely
angela97lin Oct 26, 2021
c7483f7
attempt to remove
angela97lin Oct 26, 2021
c43327c
move back
angela97lin Oct 26, 2021
8c4bd98
fix yaml
angela97lin Oct 27, 2021
434b08a
add coverage to integration yaml
angela97lin Oct 27, 2021
d063aa4
makefile
angela97lin Oct 27, 2021
6e88b18
try add erase coverage block
angela97lin Oct 27, 2021
515765e
merging main
angela97lin Nov 1, 2021
d463c6f
clean up merging and outdated code
angela97lin Nov 1, 2021
9001544
fix tests from merging
angela97lin Nov 1, 2021
0fe1b8c
Merge branch 'main' into 2815_dc_integration
angela97lin Nov 1, 2021
e49756c
remove unnecessary assignment from tests:
angela97lin Nov 2, 2021
e5f9efc
Merge branch 'main' into 2815_dc_integration
angela97lin Nov 2, 2021
55a7474
Merge branch 'main' into 2815_dc_integration
angela97lin Nov 2, 2021
51 changes: 51 additions & 0 deletions .github/workflows/linux_integration_tests.yml
@@ -0,0 +1,51 @@
name: Integration tests, linux

on:
  pull_request:
    types: [opened, synchronize]
  push:
    branches:
      - main

jobs:
  integration_tests:
    name: ${{ matrix.python_version }} unit ${{matrix.command}} tests
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
      matrix:
        include:
          - python_version: "3.8"
            core_dependencies: false
            command: 'git-test-integration'
    steps:
      - name: Set up Python ${{ matrix.python_version }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python_version }}
      - name: Checkout repository
        uses: actions/checkout@v2
        with:
          ref: ${{ github.event.pull_request.head.ref }}
          repository: ${{ github.event.pull_request.head.repo.full_name }}
          fetch-depth: 2
      - name: Update apt and install Graphviz
        run: sudo apt update && sudo apt install -y graphviz
      - name: Installing Dependencies
        run: |
          pip install virtualenv
          virtualenv test_python -q
          source test_python/bin/activate
          make installdeps
          make installdeps-test
          pip freeze
      - name: Run integration tests
        run: |
          source test_python/bin/activate
          make ${{matrix.command}}
      - name: Upload pytest duration artifact
        uses: actions/upload-artifact@v2
        with:
          name: pytest-duration-report
          path: test-reports/${{matrix.command}}-junit.xml

Review comment from the PR author (angela97lin) on the single-entry Python version matrix: "I think it's fine to run just one python version for now, but curious if there are other opinions!"
11 changes: 8 additions & 3 deletions Makefile
@@ -63,17 +63,22 @@ git-test-modelunderstanding:

.PHONY: git-test-other-core
git-test-other-core:
-	pytest evalml/tests --ignore evalml/tests/automl_tests/ --ignore evalml/tests/tuner_tests/ --ignore evalml/tests/model_understanding_tests/ -n 2 --durations 0 --cov=evalml --junitxml=test-reports/git-test-other-core-junit.xml --has-minimal-dependencies
+	pytest evalml/tests --ignore evalml/tests/automl_tests/ --ignore evalml/tests/tuner_tests/ --ignore evalml/tests/model_understanding_tests/ --ignore evalml/tests/integration/ -n 2 --durations 0 --cov=evalml --junitxml=test-reports/git-test-other-core-junit.xml --has-minimal-dependencies
	make doctests

.PHONY: git-test-other
git-test-other:
-	pytest evalml/tests --ignore evalml/tests/automl_tests/ --ignore evalml/tests/tuner_tests/ --ignore evalml/tests/model_understanding_tests/ --ignore evalml/tests/pipeline_tests/ --ignore evalml/tests/utils_tests/ --ignore evalml/tests/component_tests/test_prophet_regressor.py --ignore evalml/tests/component_tests/test_components.py --ignore evalml/tests/component_tests/test_utils.py -n 2 --durations 0 --timeout 300 --doctest-modules --cov=evalml --junitxml=test-reports/git-test-other-junit.xml
+	pytest evalml/tests --ignore evalml/tests/automl_tests/ --ignore evalml/tests/tuner_tests/ --ignore evalml/tests/model_understanding_tests/ --ignore evalml/tests/pipeline_tests/ --ignore evalml/tests/utils_tests/ --ignore evalml/tests/component_tests/test_prophet_regressor.py --ignore evalml/tests/component_tests/test_components.py --ignore evalml/tests/component_tests/test_utils.py --ignore evalml/tests/integration/ -n 2 --durations 0 --timeout 300 --doctest-modules --cov=evalml --junitxml=test-reports/git-test-other-junit.xml
	make doctests

.PHONY: git-test-prophet
git-test-prophet:
-	pytest evalml/tests/component_tests/test_prophet_regressor.py evalml/tests/component_tests/test_components.py evalml/tests/component_tests/test_utils.py evalml/tests/pipeline_tests/ evalml/tests/utils_tests/ -n 2 --durations 0 --timeout 300 --doctest-modules --cov=evalml --junitxml=test-reports/git-test-prophet-junit.xml --doctest-continue-on-failure
+	pytest evalml/tests/component_tests/test_prophet_regressor.py evalml/tests/component_tests/test_components.py evalml/tests/component_tests/test_utils.py evalml/tests/pipeline_tests/ evalml/tests/utils_tests/ -n 2 --durations 0 --timeout 300 --doctest-modules --junitxml=test-reports/git-test-prophet-junit.xml --doctest-continue-on-failure

+.PHONY: git-test-integration
+git-test-integration:
+	pytest evalml/tests/integration -n 2 --durations 0 --timeout 300 --doctest-modules --cov=evalml --junitxml=test-reports/git-test-integration-junit.xml


.PHONY: installdeps
installdeps:
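Since the new target mirrors the existing git-test-* targets, the integration suite can presumably also be run locally with make git-test-integration, the same command the workflow above invokes through make ${{matrix.command}}.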
2 changes: 2 additions & 0 deletions docs/source/release_notes.rst
@@ -25,6 +25,8 @@ Release Notes
    * Testing Changes
        * Fixed dependency checker to catch full names of packages :pr:`2930`
        * Refactored ``build_conda_pkg`` to work from a local recipe :pr:`2925`
        * Added integration tests for data checks and data checks actions workflow :pr:`2883`


.. warning::

32 changes: 32 additions & 0 deletions evalml/data_checks/data_check_action.py
@@ -1,5 +1,7 @@
"""Recommended action returned by a DataCheck."""

from evalml.data_checks.data_check_action_code import DataCheckActionCode


class DataCheckAction:
"""A recommended action returned by a DataCheck.
Expand Down Expand Up @@ -32,3 +34,33 @@ def to_dict(self):
"""Return a dictionary form of the data check action."""
action_dict = {"code": self.action_code.name, "metadata": self.metadata}
return action_dict

@staticmethod
def convert_dict_to_action(action_dict):
"""Convert a dictionary into a DataCheckAction.

Args:
action_dict: Dictionary to convert into action. Should have keys "code" and "metadata".

Raises:
ValueError: If input dictionary does not have keys `code` and `metadata` and if the `metadata` dictionary does not have keys `columns` and `rows`.

Returns:
DataCheckAction object from the input dictionary.
"""
if "code" not in action_dict or "metadata" not in action_dict:
raise ValueError(
"The input dictionary should have the keys `code` and `metadata`."
)
if (
"columns" not in action_dict["metadata"]
or "rows" not in action_dict["metadata"]
):
raise ValueError(
"The metadata dictionary should have the keys `columns` and `rows`. Set to None if not using."
)

return DataCheckAction(
action_code=DataCheckActionCode._all_values[action_dict["code"]],
metadata=action_dict["metadata"],
)
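As a quick orientation for reviewers (not part of the diff), a minimal sketch of the intended round trip between to_dict and the new convert_dict_to_action, using only names already imported in the tests below:

from evalml.data_checks import DataCheckAction, DataCheckActionCode

# Serialize an action to a plain dictionary, then rebuild an equivalent action from it.
action = DataCheckAction(
    DataCheckActionCode.DROP_COL, metadata={"columns": ["all_null"], "rows": None}
)
action_dict = action.to_dict()
# action_dict == {"code": "DROP_COL", "metadata": {"columns": ["all_null"], "rows": None}}
rebuilt = DataCheckAction.convert_dict_to_action(action_dict)
assert rebuilt == action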
6 changes: 6 additions & 0 deletions evalml/data_checks/data_check_action_code.py
@@ -1,6 +1,8 @@
"""Enum for data check action code."""
from enum import Enum

from evalml.utils import classproperty


class DataCheckActionCode(Enum):
"""Enum for data check action code."""
Expand All @@ -16,3 +18,7 @@ class DataCheckActionCode(Enum):

TRANSFORM_TARGET = "transform_target"
"""Action code for transforming the target data."""

@classproperty
def _all_values(cls):
return {code.value.upper(): code for code in list(cls)}
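A rough illustration (not from the PR) of what the new classproperty exposes, assuming every member follows the pattern shown for TRANSFORM_TARGET above, i.e. the enum value is the lower-cased member name; convert_dict_to_action relies on this mapping to resolve the serialized "code" string back to an enum member:

from evalml.data_checks import DataCheckActionCode

# _all_values maps the upper-cased enum value, which matches the member name emitted by
# DataCheckAction.to_dict(), back to the enum member itself.
lookup = DataCheckActionCode._all_values
assert lookup["TRANSFORM_TARGET"] is DataCheckActionCode.TRANSFORM_TARGET
assert lookup["DROP_COL"] is DataCheckActionCode.DROP_COL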
3 changes: 1 addition & 2 deletions evalml/data_checks/invalid_targets_data_check.py
@@ -55,10 +55,9 @@ def validate(self, X, y):

        Example:
            >>> import pandas as pd
-            >>> X = pd.DataFrame({"col": [1, 2, 3, 1]})
            >>> y = pd.Series([0, 1, None, None])
            >>> target_check = InvalidTargetDataCheck('binary', 'Log Loss Binary')
-            >>> assert target_check.validate(X, y) == {
+            >>> assert target_check.validate(None, y) == {
            ...     "errors": [{"message": "2 row(s) (50.0%) of target values are null",
            ...                 "data_check_name": "InvalidTargetDataCheck",
            ...                 "level": "error",
1 change: 0 additions & 1 deletion evalml/data_checks/target_distribution_data_check.py
@@ -28,7 +28,6 @@ def validate(self, X, y):
            dict (DataCheckError): List with DataCheckErrors if certain distributions are found in the target data.

        Example:
-            >>> from scipy.stats import lognorm
            >>> y = [0.946, 0.972, 1.154, 0.954, 0.969, 1.222, 1.038, 0.999, 0.973, 0.897]
            >>> target_check = TargetDistributionDataCheck()
            >>> assert target_check.validate(None, y) == {
6 changes: 3 additions & 3 deletions evalml/pipelines/utils.py
@@ -381,16 +381,16 @@ def _make_component_list_from_actions(actions):
    cols_to_drop = []
    for action in actions:
        if action.action_code == DataCheckActionCode.DROP_COL:
-            cols_to_drop.append(action.metadata["column"])
+            cols_to_drop.extend(action.metadata["columns"])
        elif action.action_code == DataCheckActionCode.IMPUTE_COL:
            metadata = action.metadata
            if metadata["is_target"]:
                components.append(
                    TargetImputer(impute_strategy=metadata["impute_strategy"])
                )
        elif action.action_code == DataCheckActionCode.DROP_ROWS:
-            indices = action.metadata["indices"]
-            components.append(DropRowsTransformer(indices_to_drop=indices))
+            rows = action.metadata["rows"]
+            components.append(DropRowsTransformer(indices_to_drop=rows))
    if cols_to_drop:
        components.append(DropColumns(columns=cols_to_drop))
    return components

Review comment from the PR author (angela97lin) on the cols_to_drop change: "Cleanup after the standardization in #2869"
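To tie the pieces together, a short sketch (not code from the PR) of how a serialized DROP_ROWS action flows from a data check's output through convert_dict_to_action and _make_component_list_from_actions under the standardized "columns"/"rows" metadata keys; the integration tests below exercise this same path end to end:

from evalml.data_checks import DataCheckAction
from evalml.pipelines.components import DropRowsTransformer
from evalml.pipelines.utils import _make_component_list_from_actions

# A data check would emit a dictionary like this in its "actions" output.
action_dict = {"code": "DROP_ROWS", "metadata": {"columns": None, "rows": [0, 3, 5, 10]}}
action = DataCheckAction.convert_dict_to_action(action_dict)
components = _make_component_list_from_actions([action])
assert components == [DropRowsTransformer(indices_to_drop=[0, 3, 5, 10])]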
60 changes: 60 additions & 0 deletions evalml/tests/data_checks_tests/test_data_check_action.py
@@ -1,3 +1,5 @@
import pytest

from evalml.data_checks import DataCheckAction, DataCheckActionCode


@@ -72,3 +74,61 @@ def test_data_check_action_to_dict():
            "rows": None,
        },
    }


def test_convert_dict_to_action_bad_input():
    data_check_action_dict_no_code = {
        "metadata": {"columns": None, "rows": None},
    }
    with pytest.raises(ValueError, match="The input dictionary should have the keys"):
        DataCheckAction.convert_dict_to_action(data_check_action_dict_no_code)

    data_check_action_dict_no_metadata = {
        "code": DataCheckActionCode.DROP_COL.name,
    }
    with pytest.raises(ValueError, match="The input dictionary should have the keys"):
        DataCheckAction.convert_dict_to_action(data_check_action_dict_no_metadata)

    data_check_action_dict_no_columns = {
        "code": DataCheckActionCode.DROP_COL.name,
        "metadata": {"rows": None},
    }
    with pytest.raises(
        ValueError, match="The metadata dictionary should have the keys"
    ):
        DataCheckAction.convert_dict_to_action(data_check_action_dict_no_columns)

    data_check_action_dict_no_rows = {
        "code": DataCheckActionCode.DROP_COL.name,
        "metadata": {"columns": None},
    }
    with pytest.raises(
        ValueError, match="The metadata dictionary should have the keys"
    ):
        DataCheckAction.convert_dict_to_action(data_check_action_dict_no_rows)


def test_convert_dict_to_action():
    data_check_action_dict = {
        "code": DataCheckActionCode.DROP_COL.name,
        "metadata": {"columns": None, "rows": None},
    }
    expected_data_check_action = DataCheckAction(DataCheckActionCode.DROP_COL)
    data_check_action = DataCheckAction.convert_dict_to_action(data_check_action_dict)
    assert data_check_action == expected_data_check_action

    data_check_action_dict_with_other_metadata = {
        "code": DataCheckActionCode.DROP_COL.name,
        "metadata": {
            "some detail": ["this is different"],
            "columns": None,
            "rows": None,
        },
    }
    expected_data_check_action = DataCheckAction(
        DataCheckActionCode.DROP_COL, metadata={"some detail": ["this is different"]}
    )
    data_check_action = DataCheckAction.convert_dict_to_action(
        data_check_action_dict_with_other_metadata
    )
    assert data_check_action == expected_data_check_action
132 changes: 132 additions & 0 deletions evalml/tests/integration/test_data_checks_and_actions_integration.py
@@ -0,0 +1,132 @@
import numpy as np
import pandas as pd
import woodwork as ww
from pandas.testing import assert_frame_equal, assert_series_equal

from evalml.automl import get_default_primary_search_objective
from evalml.data_checks import (
    DataCheckAction,
    DefaultDataChecks,
    OutliersDataCheck,
)
from evalml.data_checks.highly_null_data_check import HighlyNullDataCheck
from evalml.data_checks.invalid_targets_data_check import (
    InvalidTargetDataCheck,
)
from evalml.pipelines.components import (
    DropColumns,
    DropRowsTransformer,
    TargetImputer,
)
from evalml.pipelines.utils import _make_component_list_from_actions


def test_data_checks_with_healthy_data(X_y_binary):
    # Checks do not return any error.
    X, y = X_y_binary
    data_check = DefaultDataChecks(
        "binary", get_default_primary_search_objective("binary")
    )
    data_check_output = data_check.validate(X, y)
    assert _make_component_list_from_actions(data_check_output["actions"]) == []


def test_data_checks_suggests_drop_cols():
    X = pd.DataFrame(
        {
            "lots_of_null": [None, 2, None, 3, 5],
            "all_null": [None, None, None, None, None],
            "no_null": [1, 2, 3, 4, 5],
        }
    )
    y = pd.Series([1, 0, 0, 1, 1])
    data_check = HighlyNullDataCheck()
    data_checks_output = data_check.validate(X, y)

    actions = [
        DataCheckAction.convert_dict_to_action(action)
        for action in data_checks_output["actions"]
    ]

    action_components = _make_component_list_from_actions(actions)
    assert action_components == [DropColumns(columns=["all_null"])]

    X_t = pd.DataFrame(
        {
            "lots_of_null": [None, 2, None, 3, 5],
            "all_null": [None, None, None, None, None],
            "no_null": [1, 2, 3, 4, 5],
        }
    )
    X_expected = pd.DataFrame(
        {
            "lots_of_null": [None, 2, None, 3, 5],
            "no_null": [1, 2, 3, 4, 5],
        }
    )
    for component in action_components:
        X_t = component.fit_transform(X_t)
    assert_frame_equal(X_expected, X_t)


def test_data_checks_impute_cols():
    y = ww.init_series(pd.Series([0, 1, 1, None, None]))

    data_check = InvalidTargetDataCheck("binary", "Log Loss Binary")
    data_checks_output = data_check.validate(None, y)

    actions = [
        DataCheckAction.convert_dict_to_action(action)
        for action in data_checks_output["actions"]
    ]

    action_components = _make_component_list_from_actions(actions)
    assert action_components == [
        TargetImputer(impute_strategy="most_frequent", fill_value=None)
    ]

    y_expected = ww.init_series(pd.Series([0, 1, 1, 1, 1]), logical_type="double")
    y_t = ww.init_series(pd.Series([0, 1, 1, None, None]))
    for component in action_components:
        _, y_t = component.fit_transform(None, y_t)
    assert_series_equal(y_expected, y_t)


def test_data_checks_suggests_drop_rows():
    a = np.arange(10) * 0.01
    data = np.tile(a, (100, 10))

    X = pd.DataFrame(data=data)
    X.iloc[0, 3] = 1000
    X.iloc[3, 25] = 1000
    X.iloc[5, 55] = 10000
    X.iloc[10, 72] = -1000
    X.iloc[:, 90] = "string_values"
    y = pd.Series(np.tile([0, 1], 50))

    outliers_check = OutliersDataCheck()
    data_checks_output = outliers_check.validate(X)

    actions = [
        DataCheckAction.convert_dict_to_action(action)
        for action in data_checks_output["actions"]
    ]
    action_components = _make_component_list_from_actions(actions)
    assert action_components == [DropRowsTransformer()]

    X_t = pd.DataFrame(data=data)
    X_t.iloc[0, 3] = 1000
    X_t.iloc[3, 25] = 1000
    X_t.iloc[5, 55] = 10000
    X_t.iloc[10, 72] = -1000
    X_t.iloc[:, 90] = "string_values"
    X_t.ww.init()
    y_t = pd.Series(np.tile([0, 1], 50))

    X_expected = X.drop([0, 3, 5, 10])
    X_expected.ww.init()
    y_expected = y.drop([0, 3, 5, 10])
    for component in action_components:
        X_t, y_t = component.fit_transform(X_t, y_t)
    assert_frame_equal(X_expected, X_t)
    assert_series_equal(y_expected, y_t)