Skip to content

Commit

Permalink
Updated tests and docstrings and added a test.
Browse files Browse the repository at this point in the history
  • Loading branch information
chukarsten committed Feb 9, 2021
1 parent 13d1f40 commit 24f79ba
Show file tree
Hide file tree
Showing 3 changed files with 19 additions and 8 deletions.
4 changes: 2 additions & 2 deletions docs/source/user_guide/data_checks.ipynb
Expand Up @@ -610,9 +610,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.0"
"version": "3.8.6"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
}
15 changes: 9 additions & 6 deletions evalml/data_checks/uniqueness_data_check.py
Expand Up @@ -18,7 +18,7 @@


class UniquenessDataCheck(DataCheck):
"""Checks if there are any columns in the input that are either too unique for categorical problems
"""Checks if there are any columns in the input that are either too unique for classification problems
or not unique enough for regression problems."""

def __init__(self, problem_type, threshold=0.50):
Expand All @@ -27,8 +27,8 @@ def __init__(self, problem_type, threshold=0.50):
Arguments:
problem_type (str or ProblemTypes): The specific problem type to data check for.
e.g. 'binary', 'multiclass', 'regression', 'time series regression'
threshold(float): The threshold to set as either an upper or lower bound on uniqueness.
Defaults to 0.50.
threshold(float): The threshold to set as an upper bound on uniqueness for classification type problems
or as a lower bound for regression type problems. Defaults to 0.50.
"""
self.problem_type = handle_problem_types(problem_type)
Expand All @@ -37,12 +37,12 @@ def __init__(self, problem_type, threshold=0.50):
self.threshold = threshold

def validate(self, X, y=None):
"""Checks if there are any columns in the input that are too unique in the case of categorical
"""Checks if there are any columns in the input that are too unique in the case of classification
problems or not unique enough in the case of regression problems.
Arguments:
X (ww.DataTable, pd.DataFrame, np.ndarray): Features.
y (ww.DataColumn, pd.Series, np.ndarray): Ignored.
y (ww.DataColumn, pd.Series, np.ndarray): Ignored. Defaults to None.
Returns:
dict: dict with a DataCheckWarning if there are any too unique or not
Expand Down Expand Up @@ -92,7 +92,10 @@ def validate(self, X, y=None):

@staticmethod
def uniqueness_score(col):
"""This function calculates a uniqueness score for the provided field.
"""This function calculates a uniqueness score for the provided field. NaN values are
not considered as unique values in the calculation.
Based on the Herfindahl–Hirschman Index.
Arguments:
col (pd.Series): Feature values.
Expand Down
8 changes: 8 additions & 0 deletions evalml/tests/data_checks_tests/test_uniqueness_data_check.py
@@ -1,3 +1,4 @@
import numpy as np
import pandas as pd
import pytest

Expand Down Expand Up @@ -46,6 +47,13 @@ def test_uniqueness_data_check_uniqueness_score():
ans = 0.66
assert scores == ans

# Test uniqueness for a simple series with NaN.
# [0,1,2,0,1,2,0,1,2,0]
data = pd.Series([x % 3 for x in range(10)] + [np.nan])
scores = uniqueness_score(data)
ans = 0.66
assert scores == ans

# Test uniqueness in each column of a DataFrame
data = pd.DataFrame({'most_unique': [float(x) for x in range(10)], # [0,1,2,3,4,5,6,7,8,9]
'more_unique': [x % 5 for x in range(10)], # [0,1,2,3,4,0,1,2,3,4]
Expand Down

0 comments on commit 24f79ba

Please sign in to comment.