diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst index d2ec75d58c..fb97f837b4 100644 --- a/docs/source/release_notes.rst +++ b/docs/source/release_notes.rst @@ -15,6 +15,7 @@ Release Notes * Updated estimator and pipelines' ``predict``, ``predict_proba``, ``transform``, ``inverse_transform`` methods to preserve input indices :pr:`2979` * Updated demo dataset link for daily min temperatures :pr:`3023` * Changes + * Updated ``OutliersDataCheck`` and ``UniquenessDataCheck`` to allow for the suspension of the Nullable types error :pr:`3018` * Documentation Changes * Fixed cost benefit matrix demo formatting :pr:`2990` * Update ReadMe.md with new badge links and updated installation instructions for conda :pr:`2998` diff --git a/evalml/data_checks/highly_null_data_check.py b/evalml/data_checks/highly_null_data_check.py index 19a3705b43..debbd71044 100644 --- a/evalml/data_checks/highly_null_data_check.py +++ b/evalml/data_checks/highly_null_data_check.py @@ -105,7 +105,7 @@ def validate(self, X, y=None): """ results = {"warnings": [], "errors": [], "actions": []} - X = infer_feature_types(X) + X = infer_feature_types(X, ignore_nullable_types=True) percent_null_rows = X.isnull().mean(axis=1) highly_null_rows = percent_null_rows[ diff --git a/evalml/data_checks/no_variance_data_check.py b/evalml/data_checks/no_variance_data_check.py index 2ea0d40ba8..e881ffa5a4 100644 --- a/evalml/data_checks/no_variance_data_check.py +++ b/evalml/data_checks/no_variance_data_check.py @@ -108,8 +108,8 @@ def validate(self, X, y): """ results = {"warnings": [], "errors": [], "actions": []} - X = infer_feature_types(X) - y = infer_feature_types(y) + X = infer_feature_types(X, ignore_nullable_types=True) + y = infer_feature_types(y, ignore_nullable_types=True) unique_counts = X.nunique(dropna=self._dropnan).to_dict() any_nulls = (X.isnull().any()).to_dict() diff --git a/evalml/data_checks/outliers_data_check.py b/evalml/data_checks/outliers_data_check.py index 
cf7df9ebb0..4d6fa83cd5 100644 --- a/evalml/data_checks/outliers_data_check.py +++ b/evalml/data_checks/outliers_data_check.py @@ -128,8 +128,7 @@ def get_boxplot_data(data_): Returns: dict: A payload of box plot statistics. """ - if not data_.ww._schema: - data_.ww.init() + data_ = infer_feature_types(data_, ignore_nullable_types=True) num_records = data_.count() box_plot_dict = data_.ww.box_plot_dict() quantiles = box_plot_dict["quantiles"] diff --git a/evalml/data_checks/uniqueness_data_check.py b/evalml/data_checks/uniqueness_data_check.py index 119655112e..2b254f4f5d 100644 --- a/evalml/data_checks/uniqueness_data_check.py +++ b/evalml/data_checks/uniqueness_data_check.py @@ -143,18 +143,21 @@ def validate(self, X, y=None): return results @staticmethod - def uniqueness_score(col): + def uniqueness_score(col, drop_na=True): """Calculate a uniqueness score for the provided field. NaN values are not considered as unique values in the calculation. Based on the Herfindahl–Hirschman Index. Args: col (pd.Series): Feature values. + drop_na (bool): Whether to drop null values when computing the uniqueness score. Defaults to True. Returns: (float): Uniqueness score. """ - norm_counts = col.value_counts() / col.value_counts().sum() + norm_counts = ( + col.value_counts(dropna=drop_na) / col.value_counts(dropna=drop_na).sum() + ) square_counts = norm_counts ** 2 score = 1 - square_counts.sum() return score diff --git a/evalml/utils/woodwork_utils.py b/evalml/utils/woodwork_utils.py index 225a118870..609f7ad030 100644 --- a/evalml/utils/woodwork_utils.py +++ b/evalml/utils/woodwork_utils.py @@ -45,7 +45,7 @@ def _raise_value_error_if_nullable_types_detected(data): ) -def infer_feature_types(data, feature_types=None): +def infer_feature_types(data, feature_types=None, ignore_nullable_types=False): """Create a Woodwork structure from the given list, pandas, or numpy input, with specified types for columns. If a column's type is not specified, it will be inferred by Woodwork. 
Args: @@ -53,6 +53,7 @@ def infer_feature_types(data, feature_types=None): feature_types (string, ww.logical_type obj, dict, optional): If data is a 2D structure, feature_types must be a dictionary mapping column names to the type of data represented in the column. If data is a 1D structure, then feature_types must be a Woodwork logical type or a string representing a Woodwork logical type ("Double", "Integer", "Boolean", "Categorical", "Datetime", "NaturalLanguage") + ignore_nullable_types (bool): Whether to ignore raising an error upon detection of Nullable types. Defaults to False. Returns: A Woodwork data structure where the data type of each column was either specified or inferred. @@ -65,7 +66,8 @@ def infer_feature_types(data, feature_types=None): elif isinstance(data, np.ndarray): data = _numpy_to_pandas(data) - _raise_value_error_if_nullable_types_detected(data) + if not ignore_nullable_types: + _raise_value_error_if_nullable_types_detected(data) def convert_all_nan_unknown_to_double(data): def is_column_pd_na(data, col):