-
Notifications
You must be signed in to change notification settings - Fork 86
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add integration tests for end to end flow for data checks --> data check actions #2883
Changes from all commits
8ac51dc
887d8b7
2a3e898
7fba44d
fb2c330
2281c68
deb121d
00fc198
0d40228
14d8af2
0e799c1
c11f33e
8602e35
fe1cc9e
38ac09e
650f7ab
ff475f3
b3b6e4b
259c2bf
a45edde
4ffb6f9
c14ca5e
c491e7c
c7483f7
c43327c
8c4bd98
434b08a
d063aa4
6e88b18
515765e
d463c6f
9001544
0fe1b8c
e49756c
e5f9efc
55a7474
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,61 @@ | ||
# Workflow that runs the integration test target on Linux and uploads the
# resulting test report and coverage data.
name: Integration tests, linux

# Triggered when a pull request is opened or updated, and on pushes to main.
on:
  pull_request:
    types: [opened, synchronize]
  push:
    branches:
      - main

jobs:
  integration_tests:
    # NOTE(review): the label says "unit" although the only matrix command is
    # the integration target - confirm whether the wording is intentional.
    name: ${{ matrix.python_version }} unit ${{matrix.command}} tests
    runs-on: ubuntu-latest
    strategy:
      # Keep the remaining matrix entries running if one of them fails.
      fail-fast: false
      matrix:
        # A single entry for now: Python 3.8 running the integration make target.
        include:
          - python_version: "3.8"
            command: 'git-test-integration'

    steps:
      - name: Set up Python ${{ matrix.python_version }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python_version }}
      - name: Checkout repository
        uses: actions/checkout@v2
        with:
          # Both values come from the pull_request event payload.
          # NOTE(review): presumably empty on push events, in which case the
          # action falls back to its defaults - confirm.
          ref: ${{ github.event.pull_request.head.ref }}
          repository: ${{ github.event.pull_request.head.repo.full_name }}
          fetch-depth: 2
      - name: Update apt and install Graphviz
        run: sudo apt update && sudo apt install -y graphviz
      - name: Installing Dependencies
        # Creates the test_python virtualenv that the later steps activate,
        # installs the project and test dependencies into it, then prints the
        # installed package versions.
        run: |
          pip install virtualenv
          virtualenv test_python -q
          source test_python/bin/activate
          make installdeps
          make installdeps-test
          pip freeze
      - name: Erase Coverage
        # Reviewer remark (truncated in the captured page): "Honestly,
        # surprised I can call ..."
        run: |
          source test_python/bin/activate
          coverage erase
      - name: Run integration tests
        # Runs the make target named by the matrix entry inside the virtualenv.
        run: |
          source test_python/bin/activate
          make ${{matrix.command}}
      - name: Upload pytest duration artifact
        uses: actions/upload-artifact@v2
        with:
          name: pytest-duration-report
          # NOTE(review): assumes the make target writes this junit file - confirm.
          path: test-reports/${{matrix.command}}-junit.xml
      - name: install coverage
        # The virtualenv is not activated in this step, so this uses the
        # runner's default pip.
        run: pip install coverage
      - name: Upload coverage to Codecov
        uses: codecov/codecov-action@v1
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
          fail_ci_if_error: true
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -107,3 +107,4 @@ jobs: | |
with: | ||
token: ${{ secrets.CODECOV_TOKEN }} | ||
fail_ci_if_error: true | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -404,15 +404,15 @@ def _make_component_list_from_actions(actions): | |
indices_to_drop = [] | ||
for action in actions: | ||
if action.action_code == DataCheckActionCode.DROP_COL: | ||
cols_to_drop.append(action.metadata["column"]) | ||
cols_to_drop.extend(action.metadata["columns"]) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Cleanup after the standardization in #2869 |
||
elif action.action_code == DataCheckActionCode.IMPUTE_COL: | ||
metadata = action.metadata | ||
if metadata["is_target"]: | ||
components.append( | ||
TargetImputer(impute_strategy=metadata["impute_strategy"]) | ||
) | ||
elif action.action_code == DataCheckActionCode.DROP_ROWS: | ||
indices_to_drop.extend(action.metadata["indices"]) | ||
indices_to_drop.extend(action.metadata["rows"]) | ||
if cols_to_drop: | ||
cols_to_drop = sorted(set(cols_to_drop)) | ||
components.append(DropColumns(columns=cols_to_drop)) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,124 @@ | ||
import numpy as np | ||
import pandas as pd | ||
import woodwork as ww | ||
from pandas.testing import assert_frame_equal, assert_series_equal | ||
|
||
from evalml.automl import get_default_primary_search_objective | ||
from evalml.data_checks import ( | ||
DataCheckAction, | ||
DefaultDataChecks, | ||
OutliersDataCheck, | ||
) | ||
from evalml.data_checks.highly_null_data_check import HighlyNullDataCheck | ||
from evalml.data_checks.invalid_targets_data_check import ( | ||
InvalidTargetDataCheck, | ||
) | ||
from evalml.pipelines.components import ( | ||
DropColumns, | ||
DropRowsTransformer, | ||
TargetImputer, | ||
) | ||
from evalml.pipelines.utils import _make_component_list_from_actions | ||
|
||
|
||
def test_data_checks_with_healthy_data(X_y_binary):
    """Check that the default data checks suggest no components for clean data.

    Runs ``DefaultDataChecks`` for a binary problem on the ``X_y_binary``
    fixture and asserts that the resulting actions produce an empty
    component list.
    """
    X, y = X_y_binary
    data_check = DefaultDataChecks(
        "binary", get_default_primary_search_objective("binary")
    )
    data_checks_output = data_check.validate(X, y)

    # Convert each entry to a DataCheckAction before building components, as
    # the other tests in this module do. If a check ever does return an
    # action, the test then fails on the assertion below instead of on an
    # attribute lookup against a raw dict.
    actions = [
        DataCheckAction.convert_dict_to_action(action)
        for action in data_checks_output["actions"]
    ]
    assert _make_component_list_from_actions(actions) == []
|
||
|
||
def test_data_checks_suggests_drop_cols():
    """Check the HighlyNullDataCheck -> DropColumns flow end to end.

    The check's actions must translate into a single ``DropColumns``
    component for ``all_null``, and applying that component must leave only
    the other two columns.
    """
    raw_columns = {
        "lots_of_null": [None, 2, None, 3, 5],
        "all_null": [None, None, None, None, None],
        "no_null": [1, 2, 3, 4, 5],
    }
    X = pd.DataFrame(raw_columns)
    y = pd.Series([1, 0, 0, 1, 1])

    validation = HighlyNullDataCheck().validate(X, y)
    actions = []
    for action_dict in validation["actions"]:
        actions.append(DataCheckAction.convert_dict_to_action(action_dict))

    action_components = _make_component_list_from_actions(actions)
    assert action_components == [DropColumns(columns=["all_null"])]

    # Fresh frame with the same contents as X for the transform step.
    X_t = pd.DataFrame(raw_columns)
    X_expected = pd.DataFrame(
        {name: raw_columns[name] for name in ("lots_of_null", "no_null")}
    )

    # Reviewer remark from the pull request: "This test highlights how nicely
    # DataCheck/Actions fits into the fit/transform syntax. Very nicely done.
    # I like this, very intuitive."
    for component in action_components:
        X_t = component.fit_transform(X_t)

    assert_frame_equal(X_expected, X_t)
|
||
|
||
def test_data_checks_impute_cols():
    """Check the InvalidTargetDataCheck -> TargetImputer flow end to end.

    A binary target with missing values must yield a ``most_frequent``
    ``TargetImputer``, and applying it must fill the missing entries with 1.
    """

    def _target_with_nulls():
        # Build a new woodwork-initialized target on every call.
        return ww.init_series(pd.Series([0, 1, 1, None, None]))

    y = _target_with_nulls()
    validation = InvalidTargetDataCheck("binary", "Log Loss Binary").validate(
        None, y
    )

    actions = []
    for action_dict in validation["actions"]:
        actions.append(DataCheckAction.convert_dict_to_action(action_dict))

    action_components = _make_component_list_from_actions(actions)
    expected_components = [
        TargetImputer(impute_strategy="most_frequent", fill_value=None)
    ]
    assert action_components == expected_components

    y_expected = ww.init_series(pd.Series([0, 1, 1, 1, 1]), logical_type="double")
    y_t = _target_with_nulls()
    for component in action_components:
        # The first element of the returned pair is discarded.
        _, y_t = component.fit_transform(None, y_t)

    assert_series_equal(y_expected, y_t)
|
||
|
||
def test_data_checks_suggests_drop_rows():
    """Check the OutliersDataCheck -> DropRowsTransformer flow end to end.

    Four extreme values are planted in rows 0, 3, 5 and 10 of a 100x100
    frame. The check's actions must translate into one
    ``DropRowsTransformer`` for exactly those rows, and applying it must
    remove them from both the features and the target.
    """
    outlier_rows = (0, 3, 5, 10)
    planted_values = [
        (0, 3, 1000),
        (3, 25, 1000),
        (5, 55, 10000),
        (10, 72, -1000),
    ]

    X = pd.DataFrame(data=np.tile(np.arange(10) * 0.01, (100, 10)))
    for row, col, value in planted_values:
        X.iloc[row, col] = value
    # One column holds strings instead of numbers.
    X.iloc[:, 90] = "string_values"
    y = pd.Series(np.tile([0, 1], 50))

    validation = OutliersDataCheck().validate(X)
    actions = []
    for action_dict in validation["actions"]:
        actions.append(DataCheckAction.convert_dict_to_action(action_dict))

    action_components = _make_component_list_from_actions(actions)
    assert action_components == [
        DropRowsTransformer(indices_to_drop=list(outlier_rows))
    ]

    # Expected results are taken from the inputs before any component runs.
    X_expected = X.drop(list(outlier_rows))
    X_expected.ww.init()
    y_expected = y.drop(list(outlier_rows))

    X_t, y_t = X, y
    for component in action_components:
        X_t, y_t = component.fit_transform(X_t, y_t)

    assert_frame_equal(X_expected, X_t)
    assert_series_equal(y_expected, y_t)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think it's fine to run just one python version for now, but curious if there are other opinions!