alteryx · bchen1116 · Nov 6, 2020 · Nov 3, 2020 · Nov 5, 2020 · Nov 5, 2020
diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst
@@ -8,6 +8,7 @@ Release Notes
     * Fixes
         * Updated enum classes to show possible enum values as attributes :pr:`1391`
     * Changes
+        * Changed ``OutliersDataCheck`` to return the list of columns, rather than rows, that contain outliers :pr:`1377`
         * Simplified and cleaned output for Code Generation :pr:`1371`
         * Reverted changes from :pr:`1337` :pr:`1409`
     * Documentation Changes

diff --git a/evalml/data_checks/outliers_data_check.py b/evalml/data_checks/outliers_data_check.py
@@ -1,5 +1,4 @@
 import pandas as pd
-from sklearn.ensemble import IsolationForest
 
 from .data_check import DataCheck
 from .data_check_message import DataCheckWarning
@@ -9,8 +8,7 @@
 
 
 class OutliersDataCheck(DataCheck):
-    """Checks if there are any outliers in input data by using an Isolation Forest to obtain the anomaly score
-        of each index and then using IQR to determine score anomalies. Indices with score anomalies are considered outliers."""
+    """Checks if there are any outliers in input data by using IQR to determine column anomalies. Columns with anomalies are considered to contain outliers."""
 
     def __init__(self, random_state=0):
         """Checks if there are any outliers in the input data.
@@ -21,24 +19,23 @@ def __init__(self, random_state=0):
         self.random_state = get_random_state(random_state)
 
     def validate(self, X, y=None):
-        """Checks if there are any outliers in a dataframe by using an Isolation Forest to obtain the anomaly score
-        of each index and then using IQR to determine score anomalies. Indices with score anomalies are considered outliers.
+        """Checks if there are any outliers in a dataframe by using IQR to determine column anomalies. Columns with anomalies are considered to contain outliers.
 
         Arguments:
             X (pd.DataFrame): Features
             y: Ignored.
 
         Returns:
-            A set of indices that may have outlier data.
+            A list of DataCheckWarning objects, one for each column that may have outlier data.
 
         Example:
             >>> df = pd.DataFrame({
-            ...     'x': [1, 2, 3, 40, 5],
-            ...     'y': [6, 7, 8, 990, 10],
+            ...     'x': [1, 2, 3, 4, 5],
+            ...     'y': [6, 7, 8, 9, 10],
             ...     'z': [-1, -2, -3, -1201, -4]
             ... })
             >>> outliers_check = OutliersDataCheck()
-            >>> assert outliers_check.validate(df) == [DataCheckWarning("Row '3' is likely to have outlier data", "OutliersDataCheck")]
+            >>> assert outliers_check.validate(df) == [DataCheckWarning("Column 'z' is likely to have outlier data", "OutliersDataCheck")]
         """
 
         if not isinstance(X, pd.DataFrame):
@@ -52,15 +49,12 @@ def get_IQR(df, k=2.0):
             q1 = df.quantile(0.25)
             q3 = df.quantile(0.75)
             iqr = q3 - q1
-            lower_bound = q1 - (k * iqr)
-            upper_bound = q3 + (k * iqr)
-            return (lower_bound, upper_bound)
-
-        clf = IsolationForest(random_state=self.random_state)
-        clf.fit(X)
-        scores = pd.Series(clf.decision_function(X))
-        lower_bound, upper_bound = get_IQR(scores, k=2)
-        outliers = (scores < lower_bound) | (scores > upper_bound)
-        outliers_indices = outliers[outliers].index.values.tolist()
-        warning_msg = "Row '{}' is likely to have outlier data"
-        return [DataCheckWarning(warning_msg.format(row_index), self.name) for row_index in outliers_indices]
+            lower_bound = pd.Series(q1 - (k * iqr), name='lower_bound')
+            upper_bound = pd.Series(q3 + (k * iqr), name='upper_bound')
+            return pd.concat([lower_bound, upper_bound], axis=1)
+
+        iqr = get_IQR(X, k=2.0)
+        has_outliers = ((X < iqr['lower_bound']) | (X > iqr['upper_bound'])).any()
+        warning_msg = "Column '{}' is likely to have outlier data"
+        cols = has_outliers.index[has_outliers]
+        return [DataCheckWarning(warning_msg.format(col), self.name) for col in cols]
diff --git a/evalml/tests/data_checks_tests/test_outliers_data_check.py b/evalml/tests/data_checks_tests/test_outliers_data_check.py
@@ -1,3 +1,5 @@
+import string
+
 import numpy as np
 import pandas as pd
 
@@ -19,16 +21,17 @@ def test_outliers_data_check_warnings():
     data = np.tile(a, (100, 10))
 
     X = pd.DataFrame(data=data)
-    X.iloc[3, :] = pd.Series(np.random.randn(100) * 1000)
-    X.iloc[25, :] = pd.Series(np.random.randn(100) * 1000)
-    X.iloc[55, :] = pd.Series(np.random.randn(100) * 1000)
-    X.iloc[72, :] = pd.Series(np.random.randn(100) * 1000)
+    X.iloc[0, 3] = 1000
+    X.iloc[3, 25] = 1000
+    X.iloc[5, 55] = 10000
+    X.iloc[10, 72] = -1000
+    X.iloc[:, 90] = 'string_values'
 
     outliers_check = OutliersDataCheck()
-    assert outliers_check.validate(X) == [DataCheckWarning("Row '3' is likely to have outlier data", "OutliersDataCheck"),
-                                          DataCheckWarning("Row '25' is likely to have outlier data", "OutliersDataCheck"),
-                                          DataCheckWarning("Row '55' is likely to have outlier data", "OutliersDataCheck"),
-                                          DataCheckWarning("Row '72' is likely to have outlier data", "OutliersDataCheck")]
+    assert outliers_check.validate(X) == [DataCheckWarning("Column '3' is likely to have outlier data", "OutliersDataCheck"),
+                                          DataCheckWarning("Column '25' is likely to have outlier data", "OutliersDataCheck"),
+                                          DataCheckWarning("Column '55' is likely to have outlier data", "OutliersDataCheck"),
+                                          DataCheckWarning("Column '72' is likely to have outlier data", "OutliersDataCheck")]
 
 
 def test_outliers_data_check_input_formats():
@@ -42,13 +45,25 @@ def test_outliers_data_check_input_formats():
     data = np.tile(a, (100, 10))
 
     X = pd.DataFrame(data=data)
-    X.iloc[3, :] = pd.Series(np.random.randn(100) * 1000)
-    X.iloc[25, :] = pd.Series(np.random.randn(100) * 1000)
-    X.iloc[55, :] = pd.Series(np.random.randn(100) * 1000)
-    X.iloc[72, :] = pd.Series(np.random.randn(100) * 1000)
+    X.iloc[0, 3] = 1000
+    X.iloc[3, 25] = 1000
+    X.iloc[5, 55] = 10000
+    X.iloc[10, 72] = -1000
+
+    outliers_check = OutliersDataCheck()
+    assert outliers_check.validate(X.to_numpy()) == [DataCheckWarning("Column '3' is likely to have outlier data", "OutliersDataCheck"),
+                                                     DataCheckWarning("Column '25' is likely to have outlier data", "OutliersDataCheck"),
+                                                     DataCheckWarning("Column '55' is likely to have outlier data", "OutliersDataCheck"),
+                                                     DataCheckWarning("Column '72' is likely to have outlier data", "OutliersDataCheck")]
+
+
+def test_outliers_data_check_string_cols():
+    a = np.arange(10) * 0.01
+    data = np.tile(a, (100, 2))
+    n_cols = 20
+
+    X = pd.DataFrame(data=data, columns=[string.ascii_lowercase[i] for i in range(n_cols)])
+    X.iloc[0, 3] = 1000
 
     outliers_check = OutliersDataCheck()
-    assert outliers_check.validate(X.to_numpy()) == [DataCheckWarning("Row '3' is likely to have outlier data", "OutliersDataCheck"),
-                                                     DataCheckWarning("Row '25' is likely to have outlier data", "OutliersDataCheck"),
-                                                     DataCheckWarning("Row '55' is likely to have outlier data", "OutliersDataCheck"),
-                                                     DataCheckWarning("Row '72' is likely to have outlier data", "OutliersDataCheck")]
+    assert outliers_check.validate(X) == [DataCheckWarning("Column 'd' is likely to have outlier data", "OutliersDataCheck")]