Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update data checks to accept Woodwork data structures #1481

Merged
merged 19 commits into from Dec 3, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/source/release_notes.rst
Expand Up @@ -4,6 +4,7 @@ Release Notes
**Future Releases**
* Enhancements
* Added ``graph_prediction_vs_actual_over_time`` and ``get_prediction_vs_actual_over_time_data`` to the model understanding module for time series problems :pr:`1483`
* Updated data checks to accept ``Woodwork`` data structures :pr:`1481`
* Added parameter to ``InvalidTargetDataCheck`` to show only top unique values rather than all unique values :pr:`1485`
* Fixes
* Fix Windows CI jobs: install ``numba`` via conda, required for ``shap`` :pr:`1490`
Expand Down
18 changes: 12 additions & 6 deletions evalml/data_checks/class_imbalance_data_check.py
@@ -1,11 +1,14 @@
import pandas as pd

from evalml.data_checks import (
DataCheck,
DataCheckError,
DataCheckMessageCode,
DataCheckWarning
)
from evalml.utils.gen_utils import (
_convert_to_woodwork_structure,
_convert_woodwork_types_wrapper
)


class ClassImbalanceDataCheck(DataCheck):
Expand Down Expand Up @@ -33,15 +36,16 @@ def validate(self, X, y):
Ignores NaN values in target labels if they appear.

Arguments:
X (pd.DataFrame, pd.Series, np.ndarray, list): Features. Ignored.
y: Target labels to check for imbalanced data.
X (ww.DataTable, pd.DataFrame, np.ndarray): Features. Ignored.
y (ww.DataColumn, pd.Series, np.ndarray): Target labels to check for imbalanced data.

Returns:
dict: Dictionary with DataCheckWarnings if imbalance in classes is less than the threshold,
and DataCheckErrors if the number of values for each target is below 2 * num_cv_folds.

Example:
>>> X = pd.DataFrame({})
>>> import pandas as pd
>>> X = pd.DataFrame()
>>> y = pd.Series([0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
>>> target_check = ClassImbalanceDataCheck(threshold=0.10)
>>> assert target_check.validate(X, y) == {"errors": [{"message": "The number of instances of these targets is less than 2 * the number of cross folds = 6 instances: [0]",\
Expand All @@ -59,8 +63,10 @@ def validate(self, X, y):
"warnings": [],
"errors": []
}
if not isinstance(y, pd.Series):
y = pd.Series(y)

y = _convert_to_woodwork_structure(y)
y = _convert_woodwork_types_wrapper(y.to_series())

fold_counts = y.value_counts(normalize=False)
# search for targets that occur less than twice the number of cv folds first
below_threshold_folds = fold_counts.where(fold_counts < self.cv_folds).dropna()
Expand Down
13 changes: 9 additions & 4 deletions evalml/data_checks/data_checks.py
@@ -1,8 +1,8 @@
import inspect

from .data_check import DataCheck

from evalml.data_checks import DataCheck
from evalml.exceptions import DataCheckInitError
from evalml.utils.gen_utils import _convert_to_woodwork_structure


def _has_defaults_for_all_args(init):
Expand Down Expand Up @@ -77,8 +77,8 @@ def validate(self, X, y=None):
Inspects and validates the input data against data checks and returns a list of warnings and errors if applicable.

Arguments:
X (pd.DataFrame): The input data of shape [n_samples, n_features]
y (pd.Series): The target data of length [n_samples]
X (ww.DataTable, pd.DataFrame, np.ndarray): The input data of shape [n_samples, n_features]
y (ww.DataColumn, pd.Series, np.ndarray): The target data of length [n_samples]

Returns:
dict: Dictionary containing DataCheckMessage objects
Expand All @@ -88,6 +88,11 @@ def validate(self, X, y=None):
"warnings": [],
"errors": []
}

X = _convert_to_woodwork_structure(X)
if y is not None:
y = _convert_to_woodwork_structure(y)

for data_check in self.data_checks:
messages_new = data_check.validate(X, y)
messages["warnings"].extend(messages_new["warnings"])
Expand Down
17 changes: 11 additions & 6 deletions evalml/data_checks/highly_null_data_check.py
@@ -1,10 +1,12 @@
import pandas as pd

from evalml.data_checks import (
DataCheck,
DataCheckMessageCode,
DataCheckWarning
)
from evalml.utils.gen_utils import (
_convert_to_woodwork_structure,
_convert_woodwork_types_wrapper
)


class HighlyNullDataCheck(DataCheck):
Expand All @@ -26,13 +28,14 @@ def validate(self, X, y=None):
"""Checks if there are any highly-null columns in the input.

Arguments:
X (pd.DataFrame, pd.Series, np.ndarray, list): Features
y: Ignored.
X (ww.DataTable, pd.DataFrame, np.ndarray): Features
y (ww.DataColumn, pd.Series, np.ndarray): Ignored.

Returns:
dict (DataCheckWarning): dict with a DataCheckWarning if there are any highly-null columns.

Example:
>>> import pandas as pd
>>> df = pd.DataFrame({
... 'lots_of_null': [None, None, None, None, 5],
... 'no_null': [1, 2, 3, 4, 5]
Expand All @@ -49,8 +52,10 @@ def validate(self, X, y=None):
"warnings": [],
"errors": []
}
if not isinstance(X, pd.DataFrame):
X = pd.DataFrame(X)

X = _convert_to_woodwork_structure(X)
X = _convert_woodwork_types_wrapper(X.to_dataframe())

percent_null = (X.isnull().mean()).to_dict()
if self.pct_null_threshold == 0.0:
all_null_cols = {key: value for key, value in percent_null.items() if value > 0.0}
Expand Down
14 changes: 10 additions & 4 deletions evalml/data_checks/id_columns_data_check.py
@@ -1,10 +1,13 @@
import pandas as pd

from evalml.data_checks import (
DataCheck,
DataCheckMessageCode,
DataCheckWarning
)
from evalml.utils.gen_utils import (
_convert_to_woodwork_structure,
_convert_woodwork_types_wrapper
)


class IDColumnsDataCheck(DataCheck):
Expand All @@ -28,13 +31,14 @@ def validate(self, X, y=None):
- column contains all unique values (and is not float / boolean)

Arguments:
X (pd.DataFrame): The input features to check
X (ww.DataTable, pd.DataFrame, np.ndarray): The input features to check
threshold (float): The probability threshold to be considered an ID column. Defaults to 1.0

Returns:
dict: A dictionary of features with column name or index and their probability of being ID columns

Example:
>>> import pandas as pd
>>> df = pd.DataFrame({
... 'df_id': [0, 1, 2, 3, 4],
... 'x': [10, 42, 31, 51, 61],
Expand All @@ -52,8 +56,10 @@ def validate(self, X, y=None):
"warnings": [],
"errors": []
}
if not isinstance(X, pd.DataFrame):
X = pd.DataFrame(X)

X = _convert_to_woodwork_structure(X)
X = _convert_woodwork_types_wrapper(X.to_dataframe())

col_names = [col for col in X.columns.tolist()]
cols_named_id = [col for col in col_names if (str(col).lower() == "id")] # columns whose name is "id"
id_cols = {col: 0.95 for col in cols_named_id}
Expand Down
15 changes: 10 additions & 5 deletions evalml/data_checks/invalid_targets_data_check.py
@@ -1,4 +1,3 @@
import pandas as pd

from evalml.data_checks import (
DataCheck,
Expand All @@ -8,6 +7,8 @@
)
from evalml.problem_types import ProblemTypes, handle_problem_types
from evalml.utils.gen_utils import (
_convert_to_woodwork_structure,
_convert_woodwork_types_wrapper,
categorical_dtypes,
numeric_and_boolean_dtypes
)
Expand All @@ -32,13 +33,14 @@ def validate(self, X, y):
"""Checks if the target data contains missing or invalid values.

Arguments:
X (pd.DataFrame, pd.Series, np.ndarray, list): Features. Ignored.
y: Target data to check for invalid values.
X (ww.DataTable, pd.DataFrame, np.ndarray): Features. Ignored.
y (ww.DataColumn, pd.Series, np.ndarray): Target data to check for invalid values.

Returns:
dict (DataCheckError): Dictionary with DataCheckErrors if any invalid values are found in the target data.

Example:
>>> import pandas as pd
>>> X = pd.DataFrame({})
>>> y = pd.Series([0, 1, None, None])
>>> target_check = InvalidTargetDataCheck('binary')
Expand All @@ -53,9 +55,12 @@ def validate(self, X, y):
"warnings": [],
"errors": []
}
if y is None:
raise ValueError("y cannot be None")

y = _convert_to_woodwork_structure(y)
y = _convert_woodwork_types_wrapper(y.to_series())

if not isinstance(y, pd.Series):
y = pd.Series(y)
null_rows = y.isnull()
if null_rows.any():
num_null_rows = null_rows.sum()
Expand Down
28 changes: 16 additions & 12 deletions evalml/data_checks/no_variance_data_check.py
@@ -1,9 +1,13 @@
import pandas as pd

from .data_check import DataCheck
from .data_check_message import DataCheckError, DataCheckWarning
from .data_check_message_code import DataCheckMessageCode

from evalml.data_checks import (
DataCheck,
DataCheckError,
DataCheckMessageCode,
DataCheckWarning
)
from evalml.utils.gen_utils import (
_convert_to_woodwork_structure,
_convert_woodwork_types_wrapper
)
from evalml.utils.logger import get_logger

logger = get_logger(__file__)
Expand Down Expand Up @@ -53,8 +57,8 @@ def validate(self, X, y):
"""Check if the target or any of the features have no variance (1 unique value).

Arguments:
X (pd.DataFrame): The input features.
y (pd.Series): The target data.
X (ww.DataTable, pd.DataFrame, np.ndarray): The input features.
y (ww.DataColumn, pd.Series, np.ndarray): The target data.

Returns:
dict (DataCheckWarning or DataCheckError): dict of warnings/errors corresponding to features or target with no variance.
Expand All @@ -64,10 +68,10 @@ def validate(self, X, y):
"errors": []
}

if not isinstance(X, pd.DataFrame):
X = pd.DataFrame(X)
if not isinstance(y, pd.Series):
y = pd.Series(y)
X = _convert_to_woodwork_structure(X)
X = _convert_woodwork_types_wrapper(X.to_dataframe())
y = _convert_to_woodwork_structure(y)
y = _convert_woodwork_types_wrapper(y.to_series())

unique_counts = X.nunique(dropna=self._dropnan).to_dict()
any_nulls = (X.isnull().any()).to_dict()
Expand Down
17 changes: 11 additions & 6 deletions evalml/data_checks/outliers_data_check.py
Expand Up @@ -6,7 +6,11 @@
DataCheckWarning
)
from evalml.utils import get_random_state
from evalml.utils.gen_utils import numeric_dtypes
from evalml.utils.gen_utils import (
_convert_to_woodwork_structure,
_convert_woodwork_types_wrapper,
numeric_dtypes
)


class OutliersDataCheck(DataCheck):
Expand All @@ -24,8 +28,8 @@ def validate(self, X, y=None):
"""Checks if there are any outliers in a dataframe by using IQR to determine column anomalies. Column with anomalies are considered to contain outliers.

Arguments:
X (pd.DataFrame): Features
y: Ignored.
X (ww.DataTable, pd.DataFrame, np.ndarray): Features
y (ww.DataColumn, pd.Series, np.ndarray): Ignored.

Returns:
dict: A dictionary of columns that may have outlier data.
Expand All @@ -48,10 +52,11 @@ def validate(self, X, y=None):
"warnings": [],
"errors": []
}
if not isinstance(X, pd.DataFrame):
X = pd.DataFrame(X)
X = X.select_dtypes(include=numeric_dtypes)

X = _convert_to_woodwork_structure(X)
X = _convert_woodwork_types_wrapper(X.to_dataframe())

X = X.select_dtypes(include=numeric_dtypes)
if len(X.columns) == 0:
return messages

Expand Down
22 changes: 13 additions & 9 deletions evalml/data_checks/target_leakage_data_check.py
@@ -1,11 +1,13 @@
import pandas as pd

from evalml.data_checks import (
DataCheck,
DataCheckMessageCode,
DataCheckWarning
)
from evalml.utils.gen_utils import numeric_and_boolean_dtypes
from evalml.utils.gen_utils import (
_convert_to_woodwork_structure,
_convert_woodwork_types_wrapper,
numeric_and_boolean_dtypes
)


class TargetLeakageDataCheck(DataCheck):
Expand All @@ -30,13 +32,14 @@ def validate(self, X, y):
Currently only supports binary and numeric targets and features.

Arguments:
X (pd.DataFrame): The input features to check
y (pd.Series): The target data
X (ww.DataTable, pd.DataFrame, np.ndarray): The input features to check
y (ww.DataColumn, pd.Series, np.ndarray): The target data

Returns:
dict (DataCheckWarning): dict with a DataCheckWarning if target leakage is detected.

Example:
>>> import pandas as pd
>>> X = pd.DataFrame({
... 'leak': [10, 42, 31, 51, 61],
... 'x': [42, 54, 12, 64, 12],
Expand All @@ -55,10 +58,11 @@ def validate(self, X, y):
"warnings": [],
"errors": []
}
if not isinstance(X, pd.DataFrame):
X = pd.DataFrame(X)
if not isinstance(y, pd.Series):
y = pd.Series(y)

X = _convert_to_woodwork_structure(X)
y = _convert_to_woodwork_structure(y)
X = _convert_woodwork_types_wrapper(X.to_dataframe())
y = _convert_woodwork_types_wrapper(y.to_series())

if y.dtype not in numeric_and_boolean_dtypes:
return messages
Expand Down