Skip to content

Commit

Permalink
Updated tests and docstrings and added a test.
Browse files Browse the repository at this point in the history
  • Loading branch information
chukarsten committed Feb 9, 2021
1 parent 13d1f40 commit 24f79ba
Show file tree
Hide file tree
Showing 3 changed files with 19 additions and 8 deletions.
4 changes: 2 additions & 2 deletions docs/source/user_guide/data_checks.ipynb
Expand Up @@ -610,9 +610,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.0"
"version": "3.8.6"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
}
15 changes: 9 additions & 6 deletions evalml/data_checks/uniqueness_data_check.py
Expand Up @@ -18,7 +18,7 @@


class UniquenessDataCheck(DataCheck):
"""Checks if there are any columns in the input that are either too unique for categorical problems
"""Checks if there are any columns in the input that are either too unique for classification problems
or not unique enough for regression problems."""

def __init__(self, problem_type, threshold=0.50):
Expand All @@ -27,8 +27,8 @@ def __init__(self, problem_type, threshold=0.50):
Arguments:
problem_type (str or ProblemTypes): The specific problem type to data check for.
e.g. 'binary', 'multiclass', 'regression', 'time series regression'
threshold(float): The threshold to set as either an upper or lower bound on uniqueness.
Defaults to 0.50.
threshold(float): The threshold to set as an upper bound on uniqueness for classification type problems
or as a lower bound for regression type problems. Defaults to 0.50.
"""
self.problem_type = handle_problem_types(problem_type)
Expand All @@ -37,12 +37,12 @@ def __init__(self, problem_type, threshold=0.50):
self.threshold = threshold

def validate(self, X, y=None):
"""Checks if there are any columns in the input that are too unique in the case of categorical
"""Checks if there are any columns in the input that are too unique in the case of classification
problems or not unique enough in the case of regression problems.
Arguments:
X (ww.DataTable, pd.DataFrame, np.ndarray): Features.
y (ww.DataColumn, pd.Series, np.ndarray): Ignored.
y (ww.DataColumn, pd.Series, np.ndarray): Ignored. Defaults to None.
Returns:
dict: dict with a DataCheckWarning if there are any too unique or not
Expand Down Expand Up @@ -92,7 +92,10 @@ def validate(self, X, y=None):

@staticmethod
def uniqueness_score(col):
"""This function calculates a uniqueness score for the provided field.
"""This function calculates a uniqueness score for the provided field. NaN values are
not considered as unique values in the calculation.
Based on the Herfindahl–Hirschman Index.
Arguments:
col (pd.Series): Feature values.
Expand Down
8 changes: 8 additions & 0 deletions evalml/tests/data_checks_tests/test_uniqueness_data_check.py
@@ -1,3 +1,4 @@
import numpy as np
import pandas as pd
import pytest

Expand Down Expand Up @@ -46,6 +47,13 @@ def test_uniqueness_data_check_uniqueness_score():
ans = 0.66
assert scores == ans

# Test uniqueness for a simple series with NaN.
# [0,1,2,0,1,2,0,1,2,0]
data = pd.Series([x % 3 for x in range(10)] + [np.nan])
scores = uniqueness_score(data)
ans = 0.66
assert scores == ans

# Test uniqueness in each column of a DataFrame
data = pd.DataFrame({'most_unique': [float(x) for x in range(10)], # [0,1,2,3,4,5,6,7,8,9]
'more_unique': [x % 5 for x in range(10)], # [0,1,2,3,4,0,1,2,3,4]
Expand Down

0 comments on commit 24f79ba

Please sign in to comment.