Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/source/release_notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ Release Notes
* Updated estimator and pipelines' ``predict``, ``predict_proba``, ``transform``, ``inverse_transform`` methods to preserve input indices :pr:`2979`
* Updated demo dataset link for daily min temperatures :pr:`3023`
* Changes
* Updated ``OutliersDataCheck`` and ``UniquenessDataCheck`` and added the option to suppress the Nullable types error :pr:`3018`
* Documentation Changes
* Fixed cost benefit matrix demo formatting :pr:`2990`
* Update ReadMe.md with new badge links and updated installation instructions for conda :pr:`2998`
Expand Down
2 changes: 1 addition & 1 deletion evalml/data_checks/highly_null_data_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ def validate(self, X, y=None):
"""
results = {"warnings": [], "errors": [], "actions": []}

X = infer_feature_types(X)
X = infer_feature_types(X, ignore_nullable_types=True)

percent_null_rows = X.isnull().mean(axis=1)
highly_null_rows = percent_null_rows[
Expand Down
4 changes: 2 additions & 2 deletions evalml/data_checks/no_variance_data_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,8 +108,8 @@ def validate(self, X, y):

"""
results = {"warnings": [], "errors": [], "actions": []}
X = infer_feature_types(X)
y = infer_feature_types(y)
X = infer_feature_types(X, ignore_nullable_types=True)
y = infer_feature_types(y, ignore_nullable_types=True)

unique_counts = X.nunique(dropna=self._dropnan).to_dict()
any_nulls = (X.isnull().any()).to_dict()
Expand Down
3 changes: 1 addition & 2 deletions evalml/data_checks/outliers_data_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,8 +128,7 @@ def get_boxplot_data(data_):
Returns:
dict: A payload of box plot statistics.
"""
if not data_.ww._schema:
data_.ww.init()
data_ = infer_feature_types(data_, ignore_nullable_types=True)
num_records = data_.count()
box_plot_dict = data_.ww.box_plot_dict()
quantiles = box_plot_dict["quantiles"]
Expand Down
7 changes: 5 additions & 2 deletions evalml/data_checks/uniqueness_data_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,18 +143,21 @@ def validate(self, X, y=None):
return results

@staticmethod
def uniqueness_score(col):
def uniqueness_score(col, drop_na=True):
"""Calculate a uniqueness score for the provided field. By default (``drop_na=True``), NaN values are not considered as unique values in the calculation.

Based on the Herfindahl–Hirschman Index.

Args:
col (pd.Series): Feature values.
drop_na (bool): Whether to drop null values when computing the uniqueness score. Defaults to True.

Returns:
(float): Uniqueness score.
"""
norm_counts = col.value_counts() / col.value_counts().sum()
norm_counts = (
col.value_counts(dropna=drop_na) / col.value_counts(dropna=drop_na).sum()
)
square_counts = norm_counts ** 2
score = 1 - square_counts.sum()
return score
6 changes: 4 additions & 2 deletions evalml/utils/woodwork_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,14 +45,15 @@ def _raise_value_error_if_nullable_types_detected(data):
)


def infer_feature_types(data, feature_types=None):
def infer_feature_types(data, feature_types=None, ignore_nullable_types=False):
"""Create a Woodwork structure from the given list, pandas, or numpy input, with specified types for columns. If a column's type is not specified, it will be inferred by Woodwork.

Args:
data (pd.DataFrame, pd.Series): Input data to convert to a Woodwork data structure.
feature_types (string, ww.logical_type obj, dict, optional): If data is a 2D structure, feature_types must be a dictionary
mapping column names to the type of data represented in the column. If data is a 1D structure, then feature_types must be
a Woodwork logical type or a string representing a Woodwork logical type ("Double", "Integer", "Boolean", "Categorical", "Datetime", "NaturalLanguage")
ignore_nullable_types (bool): Whether to skip raising an error when Nullable types are detected. Defaults to False.

Returns:
A Woodwork data structure where the data type of each column was either specified or inferred.
Expand All @@ -65,7 +66,8 @@ def infer_feature_types(data, feature_types=None):
elif isinstance(data, np.ndarray):
data = _numpy_to_pandas(data)

_raise_value_error_if_nullable_types_detected(data)
if not ignore_nullable_types:
_raise_value_error_if_nullable_types_detected(data)

def convert_all_nan_unknown_to_double(data):
def is_column_pd_na(data, col):
Expand Down