Add integration tests for end to end flow for data checks --> data check actions #2883

Merged 36 commits on Nov 2, 2021

Commits:
8ac51dc - init (angela97lin, Oct 7, 2021)
887d8b7 - add empty (angela97lin, Oct 14, 2021)
2a3e898 - in the middle of return row removal test (angela97lin, Oct 14, 2021)
7fba44d - add integration test command and workflow (angela97lin, Oct 18, 2021)
fb2c330 - ignore integration in other core tests (angela97lin, Oct 18, 2021)
2281c68 - rename and call integration test command (angela97lin, Oct 18, 2021)
deb121d - Merge branch 'main' into 2815_dc_integration (angela97lin, Oct 18, 2021)
00fc198 - merging (angela97lin, Oct 19, 2021)
0d40228 - Merge branch 'main' into 2815_dc_integration (angela97lin, Oct 21, 2021)
14d8af2 - release notes (angela97lin, Oct 21, 2021)
0e799c1 - Merge branch 'main' into 2815_dc_integration (angela97lin, Oct 25, 2021)
c11f33e - fix integration matrix (angela97lin, Oct 25, 2021)
8602e35 - Merge branch '2815_dc_integration' of github.com:alteryx/evalml into … (angela97lin, Oct 25, 2021)
fe1cc9e - clean up row removal test and add new method to convert from dict bac… (angela97lin, Oct 25, 2021)
38ac09e - add tests (angela97lin, Oct 26, 2021)
650f7ab - add drop rows test (angela97lin, Oct 26, 2021)
ff475f3 - Merge branch 'main' into 2815_dc_integration (angela97lin, Oct 26, 2021)
b3b6e4b - linting (angela97lin, Oct 26, 2021)
259c2bf - cleaning up yaml (angela97lin, Oct 26, 2021)
a45edde - rename integration tests folder (angela97lin, Oct 26, 2021)
4ffb6f9 - add empty init file (angela97lin, Oct 26, 2021)
c14ca5e - try to add coverage (angela97lin, Oct 26, 2021)
c491e7c - remove entirely (angela97lin, Oct 26, 2021)
c7483f7 - attempt to remove (angela97lin, Oct 26, 2021)
c43327c - move back (angela97lin, Oct 26, 2021)
8c4bd98 - fix yaml (angela97lin, Oct 27, 2021)
434b08a - add coverage to integration yaml (angela97lin, Oct 27, 2021)
d063aa4 - makefile (angela97lin, Oct 27, 2021)
6e88b18 - try add erase coverage block (angela97lin, Oct 27, 2021)
515765e - merging main (angela97lin, Nov 1, 2021)
d463c6f - clean up merging and outdated code (angela97lin, Nov 1, 2021)
9001544 - fix tests from merging (angela97lin, Nov 1, 2021)
0fe1b8c - Merge branch 'main' into 2815_dc_integration (angela97lin, Nov 1, 2021)
e49756c - remove unnecessary assignment from tests: (angela97lin, Nov 2, 2021)
e5f9efc - Merge branch 'main' into 2815_dc_integration (angela97lin, Nov 2, 2021)
55a7474 - Merge branch 'main' into 2815_dc_integration (angela97lin, Nov 2, 2021)
2 changes: 1 addition & 1 deletion .github/workflows/lint_tests.yml
@@ -13,7 +13,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python_version: ["3.9" ]
python_version: ["3.9"]
steps:
- name: Set up Python ${{ matrix.python_version }}
uses: actions/setup-python@v2
61 changes: 61 additions & 0 deletions .github/workflows/linux_integration_tests.yml
@@ -0,0 +1,61 @@
name: Integration tests, linux

on:
pull_request:
types: [opened, synchronize]
push:
branches:
- main

jobs:
integration_tests:
name: ${{ matrix.python_version }} unit ${{matrix.command}} tests
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
include:
- python_version: "3.8"
Review comment (angela97lin, author): I think it's fine to run just one python version for now, but curious if there are other opinions!

command: 'git-test-integration'

steps:
- name: Set up Python ${{ matrix.python_version }}
uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python_version }}
- name: Checkout repository
uses: actions/checkout@v2
with:
ref: ${{ github.event.pull_request.head.ref }}
repository: ${{ github.event.pull_request.head.repo.full_name }}
fetch-depth: 2
- name: Update apt and install Graphviz
run: sudo apt update && sudo apt install -y graphviz
- name: Installing Dependencies
run: |
pip install virtualenv
virtualenv test_python -q
source test_python/bin/activate
make installdeps
make installdeps-test
pip freeze
- name: Erase Coverage
Review comment (angela97lin, Oct 27, 2021): Honestly, surprised I can call coverage erase here. I thought the two workflows would step on each others' toes and we'd end up with a subpar coverage score, but I guess not 🤷‍♀️

run: |
source test_python/bin/activate
coverage erase
- name: Run integration tests
run: |
source test_python/bin/activate
make ${{matrix.command}}
- name: Upload pytest duration artifact
uses: actions/upload-artifact@v2
with:
name: pytest-duration-report
path: test-reports/${{matrix.command}}-junit.xml
- name: install coverage
run: pip install coverage
- name: Upload coverage to Codecov
uses: codecov/codecov-action@v1
with:
token: ${{ secrets.CODECOV_TOKEN }}
fail_ci_if_error: true
1 change: 1 addition & 0 deletions .github/workflows/linux_unit_tests_with_latest_deps.yml
@@ -107,3 +107,4 @@ jobs:
with:
token: ${{ secrets.CODECOV_TOKEN }}
fail_ci_if_error: true

9 changes: 7 additions & 2 deletions Makefile
@@ -63,18 +63,23 @@ git-test-modelunderstanding:

.PHONY: git-test-other-core
git-test-other-core:
-	pytest evalml/tests --ignore evalml/tests/automl_tests/ --ignore evalml/tests/tuner_tests/ --ignore evalml/tests/model_understanding_tests/ -n 2 --durations 0 --cov=evalml --junitxml=test-reports/git-test-other-core-junit.xml --has-minimal-dependencies
+	pytest evalml/tests --ignore evalml/tests/automl_tests/ --ignore evalml/tests/tuner_tests/ --ignore evalml/tests/model_understanding_tests/ --ignore evalml/tests/integration_tests/ -n 2 --durations 0 --cov=evalml --junitxml=test-reports/git-test-other-core-junit.xml --has-minimal-dependencies
make doctests

.PHONY: git-test-other
git-test-other:
-	pytest evalml/tests --ignore evalml/tests/automl_tests/ --ignore evalml/tests/tuner_tests/ --ignore evalml/tests/model_understanding_tests/ --ignore evalml/tests/pipeline_tests/ --ignore evalml/tests/utils_tests/ --ignore evalml/tests/component_tests/test_prophet_regressor.py --ignore evalml/tests/component_tests/test_components.py --ignore evalml/tests/component_tests/test_utils.py -n 2 --durations 0 --timeout 300 --cov=evalml --junitxml=test-reports/git-test-other-junit.xml
+	pytest evalml/tests --ignore evalml/tests/automl_tests/ --ignore evalml/tests/tuner_tests/ --ignore evalml/tests/model_understanding_tests/ --ignore evalml/tests/pipeline_tests/ --ignore evalml/tests/utils_tests/ --ignore evalml/tests/component_tests/test_prophet_regressor.py --ignore evalml/tests/component_tests/test_components.py --ignore evalml/tests/component_tests/test_utils.py --ignore evalml/tests/integration_tests/ -n 2 --durations 0 --timeout 300 --cov=evalml --junitxml=test-reports/git-test-other-junit.xml
make doctests

.PHONY: git-test-prophet
git-test-prophet:
pytest evalml/tests/component_tests/test_prophet_regressor.py evalml/tests/component_tests/test_components.py evalml/tests/component_tests/test_utils.py evalml/tests/pipeline_tests/ evalml/tests/utils_tests/ -n 2 --durations 0 --timeout 300 --cov=evalml --junitxml=test-reports/git-test-prophet-junit.xml

.PHONY: git-test-integration
git-test-integration:
pytest evalml/tests/integration_tests -n 2 --durations 0 --timeout 300 --cov=evalml --junitxml=test-reports/git-test-integration-junit.xml


.PHONY: installdeps
installdeps:
pip install --upgrade pip -q
1 change: 1 addition & 0 deletions docs/source/release_notes.rst
@@ -4,6 +4,7 @@ Release Notes
* Enhancements
* Limit computationally-intensive models during ``AutoMLSearch`` for certain multiclass problems, allow for opt-in with parameter ``allow_long_running_models`` :pr:`2982`
* Added support for stacked ensemble pipelines to prediction explanations module :pr:`2971`
+    * Added integration tests for data checks and data checks actions workflow :pr:`2883`
* Fixes
* Fixed bug where ``Oversampler`` didn't consider boolean columns to be categorical :pr:`2980`
* Changes
32 changes: 32 additions & 0 deletions evalml/data_checks/data_check_action.py
@@ -1,5 +1,7 @@
"""Recommended action returned by a DataCheck."""

from evalml.data_checks.data_check_action_code import DataCheckActionCode


class DataCheckAction:
"""A recommended action returned by a DataCheck.
@@ -32,3 +34,33 @@ def to_dict(self):
"""Return a dictionary form of the data check action."""
action_dict = {"code": self.action_code.name, "metadata": self.metadata}
return action_dict

@staticmethod
def convert_dict_to_action(action_dict):
"""Convert a dictionary into a DataCheckAction.

Args:
action_dict: Dictionary to convert into action. Should have keys "code" and "metadata".

Raises:
ValueError: If input dictionary does not have keys `code` and `metadata` and if the `metadata` dictionary does not have keys `columns` and `rows`.

Returns:
DataCheckAction object from the input dictionary.
"""
if "code" not in action_dict or "metadata" not in action_dict:
raise ValueError(
"The input dictionary should have the keys `code` and `metadata`."
)
if (
"columns" not in action_dict["metadata"]
or "rows" not in action_dict["metadata"]
):
raise ValueError(
"The metadata dictionary should have the keys `columns` and `rows`. Set to None if not using."
)

return DataCheckAction(
action_code=DataCheckActionCode._all_values[action_dict["code"]],
metadata=action_dict["metadata"],
)
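
Together with the existing to_dict, this gives actions a full round trip through their dictionary form. A minimal usage sketch (not from the diff; the column name is a placeholder, and the equality semantics follow the tests further down):

from evalml.data_checks import DataCheckAction, DataCheckActionCode

# Build an action, serialize it with to_dict, then rehydrate it.
action = DataCheckAction(
    DataCheckActionCode.DROP_COL,
    metadata={"columns": ["all_null"], "rows": None},
)
action_dict = action.to_dict()
# {"code": "DROP_COL", "metadata": {"columns": ["all_null"], "rows": None}}
assert DataCheckAction.convert_dict_to_action(action_dict) == action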
6 changes: 6 additions & 0 deletions evalml/data_checks/data_check_action_code.py
@@ -1,6 +1,8 @@
"""Enum for data check action code."""
from enum import Enum

from evalml.utils import classproperty


class DataCheckActionCode(Enum):
"""Enum for data check action code."""
@@ -16,3 +18,7 @@ class DataCheckActionCode(Enum):

TRANSFORM_TARGET = "transform_target"
"""Action code for transforming the target data."""

@classproperty
def _all_values(cls):
return {code.value.upper(): code for code in list(cls)}
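
The classproperty builds a reverse lookup from each member's string value, upper-cased, to the member itself; this is what lets convert_dict_to_action resolve the code name that to_dict stored. A small illustration (assuming, as TRANSFORM_TARGET = "transform_target" above suggests, every value is the lowercase form of its member name):

from evalml.data_checks import DataCheckActionCode

# Upper-casing each value recovers the member name, so the table is
# effectively {"DROP_COL": DataCheckActionCode.DROP_COL, ...,
# "TRANSFORM_TARGET": DataCheckActionCode.TRANSFORM_TARGET}.
code = DataCheckActionCode._all_values["TRANSFORM_TARGET"]
assert code == DataCheckActionCode.TRANSFORM_TARGET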
3 changes: 1 addition & 2 deletions evalml/data_checks/invalid_targets_data_check.py
@@ -55,10 +55,9 @@ def validate(self, X, y):

Example:
>>> import pandas as pd
-    >>> X = pd.DataFrame({"col": [1, 2, 3, 1]})
>>> y = pd.Series([0, 1, None, None])
>>> target_check = InvalidTargetDataCheck('binary', 'Log Loss Binary')
-    >>> assert target_check.validate(X, y) == {
+    >>> assert target_check.validate(None, y) == {
... "errors": [{"message": "2 row(s) (50.0%) of target values are null",
... "data_check_name": "InvalidTargetDataCheck",
... "level": "error",
1 change: 0 additions & 1 deletion evalml/data_checks/target_distribution_data_check.py
@@ -28,7 +28,6 @@ def validate(self, X, y):
dict (DataCheckError): List with DataCheckErrors if certain distributions are found in the target data.

Example:
-    >>> from scipy.stats import lognorm
>>> y = [0.946, 0.972, 1.154, 0.954, 0.969, 1.222, 1.038, 0.999, 0.973, 0.897]
>>> target_check = TargetDistributionDataCheck()
>>> assert target_check.validate(None, y) == {
4 changes: 2 additions & 2 deletions evalml/pipelines/utils.py
@@ -404,15 +404,15 @@ def _make_component_list_from_actions(actions):
indices_to_drop = []
for action in actions:
if action.action_code == DataCheckActionCode.DROP_COL:
-            cols_to_drop.append(action.metadata["column"])
+            cols_to_drop.extend(action.metadata["columns"])
Review comment (angela97lin, author): Cleanup after the standardization in #2869

elif action.action_code == DataCheckActionCode.IMPUTE_COL:
metadata = action.metadata
if metadata["is_target"]:
components.append(
TargetImputer(impute_strategy=metadata["impute_strategy"])
)
elif action.action_code == DataCheckActionCode.DROP_ROWS:
-            indices_to_drop.extend(action.metadata["indices"])
+            indices_to_drop.extend(action.metadata["rows"])
if cols_to_drop:
cols_to_drop = sorted(set(cols_to_drop))
components.append(DropColumns(columns=cols_to_drop))
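
With the metadata keys standardized to the plural columns/rows, a single action can now carry multiple columns or row indices. A hedged sketch of the updated mapping (column names are placeholders; the expected components mirror the tests below):

from evalml.data_checks import DataCheckAction, DataCheckActionCode
from evalml.pipelines.utils import _make_component_list_from_actions

actions = [
    DataCheckAction(
        DataCheckActionCode.DROP_COL,
        metadata={"columns": ["all_null", "mostly_null"], "rows": None},
    ),
    DataCheckAction(
        DataCheckActionCode.DROP_ROWS,
        metadata={"columns": None, "rows": [0, 3]},
    ),
]
components = _make_component_list_from_actions(actions)
# Expected: a DropColumns(columns=["all_null", "mostly_null"]) and a
# DropRowsTransformer(indices_to_drop=[0, 3]).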
60 changes: 60 additions & 0 deletions evalml/tests/data_checks_tests/test_data_check_action.py
@@ -1,3 +1,5 @@
import pytest

from evalml.data_checks import DataCheckAction, DataCheckActionCode


@@ -72,3 +74,61 @@ def test_data_check_action_to_dict():
"rows": None,
},
}


def test_convert_dict_to_action_bad_input():
data_check_action_dict_no_code = {
"metadata": {"columns": None, "rows": None},
}
with pytest.raises(ValueError, match="The input dictionary should have the keys"):
DataCheckAction.convert_dict_to_action(data_check_action_dict_no_code)

data_check_action_dict_no_metadata = {
"code": DataCheckActionCode.DROP_COL.name,
}
with pytest.raises(ValueError, match="The input dictionary should have the keys"):
DataCheckAction.convert_dict_to_action(data_check_action_dict_no_metadata)

data_check_action_dict_no_columns = {
"code": DataCheckActionCode.DROP_COL.name,
"metadata": {"rows": None},
}
with pytest.raises(
ValueError, match="The metadata dictionary should have the keys"
):
DataCheckAction.convert_dict_to_action(data_check_action_dict_no_columns)

data_check_action_dict_no_rows = {
"code": DataCheckActionCode.DROP_COL.name,
"metadata": {"columns": None},
}
with pytest.raises(
ValueError, match="The metadata dictionary should have the keys"
):
DataCheckAction.convert_dict_to_action(data_check_action_dict_no_rows)


def test_convert_dict_to_action():
data_check_action_dict = {
"code": DataCheckActionCode.DROP_COL.name,
"metadata": {"columns": None, "rows": None},
}
expected_data_check_action = DataCheckAction(DataCheckActionCode.DROP_COL)
data_check_action = DataCheckAction.convert_dict_to_action(data_check_action_dict)
assert data_check_action == expected_data_check_action

data_check_action_dict_with_other_metadata = {
"code": DataCheckActionCode.DROP_COL.name,
"metadata": {
"some detail": ["this is different"],
"columns": None,
"rows": None,
},
}
expected_data_check_action = DataCheckAction(
DataCheckActionCode.DROP_COL, metadata={"some detail": ["this is different"]}
)
data_check_action = DataCheckAction.convert_dict_to_action(
data_check_action_dict_with_other_metadata
)
assert data_check_action == expected_data_check_action
Empty file added: evalml/tests/integration_tests/__init__.py

124 changes: 124 additions & 0 deletions (new integration test file under evalml/tests/integration_tests/)
@@ -0,0 +1,124 @@
import numpy as np
import pandas as pd
import woodwork as ww
from pandas.testing import assert_frame_equal, assert_series_equal

from evalml.automl import get_default_primary_search_objective
from evalml.data_checks import (
DataCheckAction,
DefaultDataChecks,
OutliersDataCheck,
)
from evalml.data_checks.highly_null_data_check import HighlyNullDataCheck
from evalml.data_checks.invalid_targets_data_check import (
InvalidTargetDataCheck,
)
from evalml.pipelines.components import (
DropColumns,
DropRowsTransformer,
TargetImputer,
)
from evalml.pipelines.utils import _make_component_list_from_actions


def test_data_checks_with_healthy_data(X_y_binary):
# Checks do not return any error.
X, y = X_y_binary
data_check = DefaultDataChecks(
"binary", get_default_primary_search_objective("binary")
)
data_check_output = data_check.validate(X, y)
assert _make_component_list_from_actions(data_check_output["actions"]) == []


def test_data_checks_suggests_drop_cols():
X = pd.DataFrame(
{
"lots_of_null": [None, 2, None, 3, 5],
"all_null": [None, None, None, None, None],
"no_null": [1, 2, 3, 4, 5],
}
)
y = pd.Series([1, 0, 0, 1, 1])
data_check = HighlyNullDataCheck()
data_checks_output = data_check.validate(X, y)

actions = [
DataCheckAction.convert_dict_to_action(action)
for action in data_checks_output["actions"]
]

action_components = _make_component_list_from_actions(actions)
assert action_components == [DropColumns(columns=["all_null"])]

X_t = pd.DataFrame(
{
"lots_of_null": [None, 2, None, 3, 5],
"all_null": [None, None, None, None, None],
"no_null": [1, 2, 3, 4, 5],
}
)
X_expected = pd.DataFrame(
{
"lots_of_null": [None, 2, None, 3, 5],
"no_null": [1, 2, 3, 4, 5],
}
)
for component in action_components:
X_t = component.fit_transform(X_t)
Review comment (Contributor): This test highlights how nicely DataCheck/Actions fits into the fit/transform syntax. Very nicely done. I like this, very intuitive.

assert_frame_equal(X_expected, X_t)


def test_data_checks_impute_cols():
y = ww.init_series(pd.Series([0, 1, 1, None, None]))

data_check = InvalidTargetDataCheck("binary", "Log Loss Binary")
data_checks_output = data_check.validate(None, y)

actions = [
DataCheckAction.convert_dict_to_action(action)
for action in data_checks_output["actions"]
]

action_components = _make_component_list_from_actions(actions)
assert action_components == [
TargetImputer(impute_strategy="most_frequent", fill_value=None)
]

y_expected = ww.init_series(pd.Series([0, 1, 1, 1, 1]), logical_type="double")
y_t = ww.init_series(pd.Series([0, 1, 1, None, None]))
for component in action_components:
_, y_t = component.fit_transform(None, y_t)
assert_series_equal(y_expected, y_t)


def test_data_checks_suggests_drop_rows():
a = np.arange(10) * 0.01
data = np.tile(a, (100, 10))

X = pd.DataFrame(data=data)
X.iloc[0, 3] = 1000
X.iloc[3, 25] = 1000
X.iloc[5, 55] = 10000
X.iloc[10, 72] = -1000
X.iloc[:, 90] = "string_values"
y = pd.Series(np.tile([0, 1], 50))

outliers_check = OutliersDataCheck()
data_checks_output = outliers_check.validate(X)

actions = [
DataCheckAction.convert_dict_to_action(action)
for action in data_checks_output["actions"]
]
action_components = _make_component_list_from_actions(actions)
assert action_components == [DropRowsTransformer(indices_to_drop=[0, 3, 5, 10])]

X_expected = X.drop([0, 3, 5, 10])
X_expected.ww.init()
y_expected = y.drop([0, 3, 5, 10])

for component in action_components:
X, y = component.fit_transform(X, y)
assert_frame_equal(X_expected, X)
assert_series_equal(y_expected, y)
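
Taken together, these tests pin down the end-to-end contract named in the PR title: validate emits action dictionaries, convert_dict_to_action rehydrates them, and _make_component_list_from_actions yields components to apply. A condensed sketch of that loop for the null-column case, mirroring the drop-columns test above (X and y as defined there; note the tests show the fit_transform return shape varies by component):

from evalml.data_checks import DataCheckAction
from evalml.data_checks.highly_null_data_check import HighlyNullDataCheck
from evalml.pipelines.utils import _make_component_list_from_actions

# 1. Run the check; its output carries serialized actions.
data_checks_output = HighlyNullDataCheck().validate(X, y)

# 2. Rehydrate the dictionaries into DataCheckAction objects.
actions = [
    DataCheckAction.convert_dict_to_action(action)
    for action in data_checks_output["actions"]
]

# 3. Map actions onto components and apply them (DropColumns returns X only).
for component in _make_component_list_from_actions(actions):
    X = component.fit_transform(X, y)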