Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update data checks to accept Woodwork data structures #1481

Merged
merged 19 commits into from Dec 3, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/source/release_notes.rst
Expand Up @@ -4,6 +4,7 @@ Release Notes
**Future Releases**
* Enhancements
* Added ``graph_prediction_vs_actual_over_time`` and ``get_prediction_vs_actual_over_time_data`` to the model understanding module for time series problems :pr:`1483`
* Updated data checks to accept ``Woodwork`` data structures :pr:`1481`
* Added parameter to ``InvalidTargetDataCheck`` to show only top unique values rather than all unique values :pr:`1485`
* Fixes
* Fix Windows CI jobs: install ``numba`` via conda, required for ``shap`` :pr:`1490`
Expand Down
18 changes: 12 additions & 6 deletions evalml/data_checks/class_imbalance_data_check.py
@@ -1,11 +1,14 @@
import pandas as pd

from evalml.data_checks import (
DataCheck,
DataCheckError,
DataCheckMessageCode,
DataCheckWarning
)
from evalml.utils.gen_utils import (
_convert_to_woodwork_structure,
_convert_woodwork_types_wrapper
)


class ClassImbalanceDataCheck(DataCheck):
Expand Down Expand Up @@ -33,15 +36,16 @@ def validate(self, X, y):
Ignores NaN values in target labels if they appear.

Arguments:
X (pd.DataFrame, pd.Series, np.ndarray, list): Features. Ignored.
y: Target labels to check for imbalanced data.
X (ww.DataTable, pd.DataFrame, np.ndarray): Features. Ignored.
y (ww.DataColumn, pd.Series, np.ndarray): Target labels to check for imbalanced data.

Returns:
dict: Dictionary with DataCheckWarnings if imbalance in classes is less than the threshold,
and DataCheckErrors if the number of values for each target is below 2 * num_cv_folds.

Example:
>>> X = pd.DataFrame({})
>>> import pandas as pd
>>> X = pd.DataFrame()
>>> y = pd.Series([0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
>>> target_check = ClassImbalanceDataCheck(threshold=0.10)
>>> assert target_check.validate(X, y) == {"errors": [{"message": "The number of instances of these targets is less than 2 * the number of cross folds = 6 instances: [0]",\
Expand All @@ -59,8 +63,10 @@ def validate(self, X, y):
"warnings": [],
"errors": []
}
if not isinstance(y, pd.Series):
y = pd.Series(y)

y = _convert_to_woodwork_structure(y)
y = _convert_woodwork_types_wrapper(y.to_series())

fold_counts = y.value_counts(normalize=False)
# search for targets that occur less than twice the number of cv folds first
below_threshold_folds = fold_counts.where(fold_counts < self.cv_folds).dropna()
Expand Down
13 changes: 9 additions & 4 deletions evalml/data_checks/data_checks.py
@@ -1,8 +1,8 @@
import inspect

from .data_check import DataCheck

from evalml.data_checks import DataCheck
from evalml.exceptions import DataCheckInitError
from evalml.utils.gen_utils import _convert_to_woodwork_structure


def _has_defaults_for_all_args(init):
Expand Down Expand Up @@ -77,8 +77,8 @@ def validate(self, X, y=None):
Inspects and validates the input data against data checks and returns a list of warnings and errors if applicable.

Arguments:
X (pd.DataFrame): The input data of shape [n_samples, n_features]
y (pd.Series): The target data of length [n_samples]
X (ww.DataTable, pd.DataFrame, np.ndarray): The input data of shape [n_samples, n_features]
y (ww.DataColumn, pd.Series, np.ndarray): The target data of length [n_samples]

Returns:
dict: Dictionary containing DataCheckMessage objects
Expand All @@ -88,6 +88,11 @@ def validate(self, X, y=None):
"warnings": [],
"errors": []
}

X = _convert_to_woodwork_structure(X)
if y is not None:
y = _convert_to_woodwork_structure(y)

for data_check in self.data_checks:
messages_new = data_check.validate(X, y)
messages["warnings"].extend(messages_new["warnings"])
Expand Down
17 changes: 11 additions & 6 deletions evalml/data_checks/highly_null_data_check.py
@@ -1,10 +1,12 @@
import pandas as pd

from evalml.data_checks import (
DataCheck,
DataCheckMessageCode,
DataCheckWarning
)
from evalml.utils.gen_utils import (
_convert_to_woodwork_structure,
_convert_woodwork_types_wrapper
)


class HighlyNullDataCheck(DataCheck):
Expand All @@ -26,13 +28,14 @@ def validate(self, X, y=None):
"""Checks if there are any highly-null columns in the input.

Arguments:
X (pd.DataFrame, pd.Series, np.ndarray, list): Features
y: Ignored.
X (ww.DataTable, pd.DataFrame, np.ndarray): Features
y (ww.DataColumn, pd.Series, np.ndarray): Ignored.

Returns:
dict (DataCheckWarning): dict with a DataCheckWarning if there are any highly-null columns.

Example:
>>> import pandas as pd
>>> df = pd.DataFrame({
... 'lots_of_null': [None, None, None, None, 5],
... 'no_null': [1, 2, 3, 4, 5]
Expand All @@ -49,8 +52,10 @@ def validate(self, X, y=None):
"warnings": [],
"errors": []
}
if not isinstance(X, pd.DataFrame):
X = pd.DataFrame(X)

X = _convert_to_woodwork_structure(X)
X = _convert_woodwork_types_wrapper(X.to_dataframe())

percent_null = (X.isnull().mean()).to_dict()
if self.pct_null_threshold == 0.0:
all_null_cols = {key: value for key, value in percent_null.items() if value > 0.0}
Expand Down
14 changes: 10 additions & 4 deletions evalml/data_checks/id_columns_data_check.py
@@ -1,10 +1,13 @@
import pandas as pd

from evalml.data_checks import (
DataCheck,
DataCheckMessageCode,
DataCheckWarning
)
from evalml.utils.gen_utils import (
_convert_to_woodwork_structure,
_convert_woodwork_types_wrapper
)


class IDColumnsDataCheck(DataCheck):
Expand All @@ -28,13 +31,14 @@ def validate(self, X, y=None):
- column contains all unique values (and is not float / boolean)

Arguments:
X (pd.DataFrame): The input features to check
X (ww.DataTable, pd.DataFrame, np.ndarray): The input features to check
threshold (float): The probability threshold to be considered an ID column. Defaults to 1.0

Returns:
dict: A dictionary of features with column name or index and their probability of being ID columns

Example:
>>> import pandas as pd
>>> df = pd.DataFrame({
... 'df_id': [0, 1, 2, 3, 4],
... 'x': [10, 42, 31, 51, 61],
Expand All @@ -52,8 +56,10 @@ def validate(self, X, y=None):
"warnings": [],
"errors": []
}
if not isinstance(X, pd.DataFrame):
X = pd.DataFrame(X)

X = _convert_to_woodwork_structure(X)
X = _convert_woodwork_types_wrapper(X.to_dataframe())

col_names = [col for col in X.columns.tolist()]
cols_named_id = [col for col in col_names if (str(col).lower() == "id")] # columns whose name is "id"
id_cols = {col: 0.95 for col in cols_named_id}
Expand Down
15 changes: 10 additions & 5 deletions evalml/data_checks/invalid_targets_data_check.py
@@ -1,4 +1,3 @@
import pandas as pd

from evalml.data_checks import (
DataCheck,
Expand All @@ -8,6 +7,8 @@
)
from evalml.problem_types import ProblemTypes, handle_problem_types
from evalml.utils.gen_utils import (
_convert_to_woodwork_structure,
_convert_woodwork_types_wrapper,
categorical_dtypes,
numeric_and_boolean_dtypes
)
Expand All @@ -32,13 +33,14 @@ def validate(self, X, y):
"""Checks if the target data contains missing or invalid values.

Arguments:
X (pd.DataFrame, pd.Series, np.ndarray, list): Features. Ignored.
y: Target data to check for invalid values.
X (ww.DataTable, pd.DataFrame, np.ndarray): Features. Ignored.
y (ww.DataColumn, pd.Series, np.ndarray): Target data to check for invalid values.

Returns:
dict (DataCheckError): Dictionary with DataCheckErrors if any invalid values are found in the target data.

Example:
>>> import pandas as pd
>>> X = pd.DataFrame({})
>>> y = pd.Series([0, 1, None, None])
>>> target_check = InvalidTargetDataCheck('binary')
Expand All @@ -53,9 +55,12 @@ def validate(self, X, y):
"warnings": [],
"errors": []
}
if y is None:
raise ValueError("y cannot be None")

y = _convert_to_woodwork_structure(y)
y = _convert_woodwork_types_wrapper(y.to_series())

if not isinstance(y, pd.Series):
y = pd.Series(y)
null_rows = y.isnull()
if null_rows.any():
num_null_rows = null_rows.sum()
Expand Down
28 changes: 16 additions & 12 deletions evalml/data_checks/no_variance_data_check.py
@@ -1,9 +1,13 @@
import pandas as pd

from .data_check import DataCheck
from .data_check_message import DataCheckError, DataCheckWarning
from .data_check_message_code import DataCheckMessageCode

from evalml.data_checks import (
DataCheck,
DataCheckError,
DataCheckMessageCode,
DataCheckWarning
)
from evalml.utils.gen_utils import (
_convert_to_woodwork_structure,
_convert_woodwork_types_wrapper
)
from evalml.utils.logger import get_logger

logger = get_logger(__file__)
Expand Down Expand Up @@ -53,8 +57,8 @@ def validate(self, X, y):
"""Check if the target or any of the features have no variance (1 unique value).

Arguments:
X (pd.DataFrame): The input features.
y (pd.Series): The target data.
X (ww.DataTable, pd.DataFrame, np.ndarray): The input features.
y (ww.DataColumn, pd.Series, np.ndarray): The target data.

Returns:
dict (DataCheckWarning or DataCheckError): dict of warnings/errors corresponding to features or target with no variance.
Expand All @@ -64,10 +68,10 @@ def validate(self, X, y):
"errors": []
}

if not isinstance(X, pd.DataFrame):
X = pd.DataFrame(X)
if not isinstance(y, pd.Series):
y = pd.Series(y)
X = _convert_to_woodwork_structure(X)
X = _convert_woodwork_types_wrapper(X.to_dataframe())
y = _convert_to_woodwork_structure(y)
y = _convert_woodwork_types_wrapper(y.to_series())

unique_counts = X.nunique(dropna=self._dropnan).to_dict()
any_nulls = (X.isnull().any()).to_dict()
Expand Down
17 changes: 11 additions & 6 deletions evalml/data_checks/outliers_data_check.py
Expand Up @@ -6,7 +6,11 @@
DataCheckWarning
)
from evalml.utils import get_random_state
from evalml.utils.gen_utils import numeric_dtypes
from evalml.utils.gen_utils import (
_convert_to_woodwork_structure,
_convert_woodwork_types_wrapper,
numeric_dtypes
)


class OutliersDataCheck(DataCheck):
Expand All @@ -24,8 +28,8 @@ def validate(self, X, y=None):
"""Checks if there are any outliers in a dataframe by using IQR to determine column anomalies. Column with anomalies are considered to contain outliers.

Arguments:
X (pd.DataFrame): Features
y: Ignored.
X (ww.DataTable, pd.DataFrame, np.ndarray): Features
y (ww.DataColumn, pd.Series, np.ndarray): Ignored.

Returns:
dict: A dictionary of columns that may have outlier data.
Expand All @@ -48,10 +52,11 @@ def validate(self, X, y=None):
"warnings": [],
"errors": []
}
if not isinstance(X, pd.DataFrame):
X = pd.DataFrame(X)
X = X.select_dtypes(include=numeric_dtypes)

X = _convert_to_woodwork_structure(X)
X = _convert_woodwork_types_wrapper(X.to_dataframe())

X = X.select_dtypes(include=numeric_dtypes)
if len(X.columns) == 0:
return messages

Expand Down
22 changes: 13 additions & 9 deletions evalml/data_checks/target_leakage_data_check.py
@@ -1,11 +1,13 @@
import pandas as pd

from evalml.data_checks import (
DataCheck,
DataCheckMessageCode,
DataCheckWarning
)
from evalml.utils.gen_utils import numeric_and_boolean_dtypes
from evalml.utils.gen_utils import (
_convert_to_woodwork_structure,
_convert_woodwork_types_wrapper,
numeric_and_boolean_dtypes
)


class TargetLeakageDataCheck(DataCheck):
Expand All @@ -30,13 +32,14 @@ def validate(self, X, y):
Currently only supports binary and numeric targets and features.

Arguments:
X (pd.DataFrame): The input features to check
y (pd.Series): The target data
X (ww.DataTable, pd.DataFrame, np.ndarray): The input features to check
y (ww.DataColumn, pd.Series, np.ndarray): The target data

Returns:
dict (DataCheckWarning): dict with a DataCheckWarning if target leakage is detected.

Example:
>>> import pandas as pd
>>> X = pd.DataFrame({
... 'leak': [10, 42, 31, 51, 61],
... 'x': [42, 54, 12, 64, 12],
Expand All @@ -55,10 +58,11 @@ def validate(self, X, y):
"warnings": [],
"errors": []
}
if not isinstance(X, pd.DataFrame):
X = pd.DataFrame(X)
if not isinstance(y, pd.Series):
y = pd.Series(y)

X = _convert_to_woodwork_structure(X)
y = _convert_to_woodwork_structure(y)
X = _convert_woodwork_types_wrapper(X.to_dataframe())
y = _convert_woodwork_types_wrapper(y.to_series())

if y.dtype not in numeric_and_boolean_dtypes:
return messages
Expand Down