Fix CLA

alteryx · Nov 3, 2020 · 183f118 · 183f118
1 parent b16000e
commit 183f118
Show file tree

Hide file tree

Showing 3 changed files with 37 additions and 33 deletions.
diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst
@@ -7,6 +7,7 @@ Release Notes
     * Fixes
         * Updated enum classes to show possible enum values as attributes :pr:`1391`
     * Changes
+        * Changed ``OutliersDataCheck`` to return the list of columns, rather than rows, that contain outliers :pr:`1377`
     * Documentation Changes
         * Added description of CLA to contributing guide, updated description of draft PRs :pr:`1402`
     * Testing Changes

diff --git a/evalml/data_checks/outliers_data_check.py b/evalml/data_checks/outliers_data_check.py
@@ -1,5 +1,4 @@
 import pandas as pd
-from sklearn.ensemble import IsolationForest
 
 from .data_check import DataCheck
 from .data_check_message import DataCheckWarning
@@ -9,8 +8,7 @@
 
 
 class OutliersDataCheck(DataCheck):
-    """Checks if there are any outliers in input data by using an Isolation Forest to obtain the anomaly score
-        of each index and then using IQR to determine score anomalies. Indices with score anomalies are considered outliers."""
+    """Checks if there are any outliers in input data by using IQR to determine column anomalies. Columns with anomalies are considered to contain outliers."""
 
     def __init__(self, random_state=0):
         """Checks if there are any outliers in the input data.
@@ -21,24 +19,23 @@ def __init__(self, random_state=0):
         self.random_state = get_random_state(random_state)
 
     def validate(self, X, y=None):
-        """Checks if there are any outliers in a dataframe by using an Isolation Forest to obtain the anomaly score
-        of each index and then using IQR to determine score anomalies. Indices with score anomalies are considered outliers.
+        """Checks if there are any outliers in a dataframe by using IQR to determine column anomalies. Columns with anomalies are considered to contain outliers.
 
         Arguments:
             X (pd.DataFrame): Features
             y: Ignored.
 
         Returns:
-            A set of indices that may have outlier data.
+            A list of DataCheckWarning objects, one for each column that may have outlier data.
 
         Example:
             >>> df = pd.DataFrame({
-            ...     'x': [1, 2, 3, 40, 5],
-            ...     'y': [6, 7, 8, 990, 10],
+            ...     'x': [1, 2, 3, 4, 5],
+            ...     'y': [6, 7, 8, 9, 10],
             ...     'z': [-1, -2, -3, -1201, -4]
             ... })
             >>> outliers_check = OutliersDataCheck()
-            >>> assert outliers_check.validate(df) == [DataCheckWarning("Row '3' is likely to have outlier data", "OutliersDataCheck")]
+            >>> assert outliers_check.validate(df) == [DataCheckWarning("Column 'z' is likely to have outlier data", "OutliersDataCheck")]
         """
 
         if not isinstance(X, pd.DataFrame):
@@ -56,11 +53,16 @@ def get_IQR(df, k=2.0):
             upper_bound = q3 + (k * iqr)
             return (lower_bound, upper_bound)
 
-        clf = IsolationForest(random_state=self.random_state)
-        clf.fit(X)
-        scores = pd.Series(clf.decision_function(X))
-        lower_bound, upper_bound = get_IQR(scores, k=2)
-        outliers = (scores < lower_bound) | (scores > upper_bound)
-        outliers_indices = outliers[outliers].index.values.tolist()
-        warning_msg = "Row '{}' is likely to have outlier data"
-        return [DataCheckWarning(warning_msg.format(row_index), self.name) for row_index in outliers_indices]
+        lower_bound, upper_bound = get_IQR(X)
+        indices = set()
+        # get the columns that fall out of the bounds, which means they contain outliers
+        for idx, bound in enumerate([lower_bound, upper_bound]):
+            cols_in_range = (X >= bound.values) if idx == 0 else (X <= bound.values)
+            cols_in_range = cols_in_range.all()
+            outlier_cols = cols_in_range[~cols_in_range].keys()
+            indices.update(outlier_cols.tolist())
+        # order the columns by how they appear in the dataframe
+        indices = sorted(list(indices), key=lambda x: X.columns.tolist().index(x))
+        warning_msg = "Column '{}' is likely to have outlier data"
+        s = [DataCheckWarning(warning_msg.format(row_index), self.name) for row_index in indices]
+        return s
diff --git a/evalml/tests/data_checks_tests/test_outliers_data_check.py b/evalml/tests/data_checks_tests/test_outliers_data_check.py
@@ -19,16 +19,17 @@ def test_outliers_data_check_warnings():
     data = np.tile(a, (100, 10))
 
     X = pd.DataFrame(data=data)
-    X.iloc[3, :] = pd.Series(np.random.randn(100) * 1000)
-    X.iloc[25, :] = pd.Series(np.random.randn(100) * 1000)
-    X.iloc[55, :] = pd.Series(np.random.randn(100) * 1000)
-    X.iloc[72, :] = pd.Series(np.random.randn(100) * 1000)
+    X.iloc[0, 3] = 1000
+    X.iloc[3, 25] = 1000
+    X.iloc[5, 55] = 10000
+    X.iloc[10, 72] = -1000
+    X.iloc[:, 90] = 'string_values'
 
     outliers_check = OutliersDataCheck()
-    assert outliers_check.validate(X) == [DataCheckWarning("Row '3' is likely to have outlier data", "OutliersDataCheck"),
-                                          DataCheckWarning("Row '25' is likely to have outlier data", "OutliersDataCheck"),
-                                          DataCheckWarning("Row '55' is likely to have outlier data", "OutliersDataCheck"),
-                                          DataCheckWarning("Row '72' is likely to have outlier data", "OutliersDataCheck")]
+    assert outliers_check.validate(X) == [DataCheckWarning("Column '3' is likely to have outlier data", "OutliersDataCheck"),
+                                          DataCheckWarning("Column '25' is likely to have outlier data", "OutliersDataCheck"),
+                                          DataCheckWarning("Column '55' is likely to have outlier data", "OutliersDataCheck"),
+                                          DataCheckWarning("Column '72' is likely to have outlier data", "OutliersDataCheck")]
 
 
 def test_outliers_data_check_input_formats():
@@ -42,13 +43,13 @@ def test_outliers_data_check_input_formats():
     data = np.tile(a, (100, 10))
 
     X = pd.DataFrame(data=data)
-    X.iloc[3, :] = pd.Series(np.random.randn(100) * 1000)
-    X.iloc[25, :] = pd.Series(np.random.randn(100) * 1000)
-    X.iloc[55, :] = pd.Series(np.random.randn(100) * 1000)
-    X.iloc[72, :] = pd.Series(np.random.randn(100) * 1000)
+    X.iloc[0, 3] = 1000
+    X.iloc[3, 25] = 1000
+    X.iloc[5, 55] = 10000
+    X.iloc[10, 72] = -1000
 
     outliers_check = OutliersDataCheck()
-    assert outliers_check.validate(X.to_numpy()) == [DataCheckWarning("Row '3' is likely to have outlier data", "OutliersDataCheck"),
-                                                     DataCheckWarning("Row '25' is likely to have outlier data", "OutliersDataCheck"),
-                                                     DataCheckWarning("Row '55' is likely to have outlier data", "OutliersDataCheck"),
-                                                     DataCheckWarning("Row '72' is likely to have outlier data", "OutliersDataCheck")]
+    assert outliers_check.validate(X.to_numpy()) == [DataCheckWarning("Column '3' is likely to have outlier data", "OutliersDataCheck"),
+                                                     DataCheckWarning("Column '25' is likely to have outlier data", "OutliersDataCheck"),
+                                                     DataCheckWarning("Column '55' is likely to have outlier data", "OutliersDataCheck"),
+                                                     DataCheckWarning("Column '72' is likely to have outlier data", "OutliersDataCheck")]