Skip to content

Commit

Permalink
Fix CLA
Browse files Browse the repository at this point in the history
  • Loading branch information
bchen1116 committed Nov 3, 2020
1 parent b16000e commit 183f118
Show file tree
Hide file tree
Showing 3 changed files with 37 additions and 33 deletions.
1 change: 1 addition & 0 deletions docs/source/release_notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ Release Notes
* Fixes
* Updated enum classes to show possible enum values as attributes :pr:`1391`
* Changes
* Changed ``OutliersDataCheck`` to return the list of columns, rather than rows, that contain outliers :pr:`1377`
* Documentation Changes
* Added description of CLA to contributing guide, updated description of draft PRs :pr:`1402`
* Testing Changes
Expand Down
36 changes: 19 additions & 17 deletions evalml/data_checks/outliers_data_check.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import pandas as pd
from sklearn.ensemble import IsolationForest

from .data_check import DataCheck
from .data_check_message import DataCheckWarning
Expand All @@ -9,8 +8,7 @@


class OutliersDataCheck(DataCheck):
"""Checks if there are any outliers in input data by using an Isolation Forest to obtain the anomaly score
of each index and then using IQR to determine score anomalies. Indices with score anomalies are considered outliers."""
"""Checks if there are any outliers in input data by using IQR to determine score anomalies. Columns with score anomalies are considered to contain outliers."""

def __init__(self, random_state=0):
"""Checks if there are any outliers in the input data.
Expand All @@ -21,24 +19,23 @@ def __init__(self, random_state=0):
self.random_state = get_random_state(random_state)

def validate(self, X, y=None):
"""Checks if there are any outliers in a dataframe by using an Isolation Forest to obtain the anomaly score
of each index and then using IQR to determine score anomalies. Indices with score anomalies are considered outliers.
"""Checks if there are any outliers in a dataframe by using IQR to determine column anomalies. Column with anomalies are considered to contain outliers.
Arguments:
X (pd.DataFrame): Features
y: Ignored.
Returns:
A set of indices that may have outlier data.
A list of DataCheckWarning objects, one for each column that may have outlier data.
Example:
>>> df = pd.DataFrame({
... 'x': [1, 2, 3, 40, 5],
... 'y': [6, 7, 8, 990, 10],
... 'x': [1, 2, 3, 4, 5],
... 'y': [6, 7, 8, 9, 10],
... 'z': [-1, -2, -3, -1201, -4]
... })
>>> outliers_check = OutliersDataCheck()
>>> assert outliers_check.validate(df) == [DataCheckWarning("Row '3' is likely to have outlier data", "OutliersDataCheck")]
>>> assert outliers_check.validate(df) == [DataCheckWarning("Column 'z' is likely to have outlier data", "OutliersDataCheck")]
"""

if not isinstance(X, pd.DataFrame):
Expand All @@ -56,11 +53,16 @@ def get_IQR(df, k=2.0):
upper_bound = q3 + (k * iqr)
return (lower_bound, upper_bound)

clf = IsolationForest(random_state=self.random_state)
clf.fit(X)
scores = pd.Series(clf.decision_function(X))
lower_bound, upper_bound = get_IQR(scores, k=2)
outliers = (scores < lower_bound) | (scores > upper_bound)
outliers_indices = outliers[outliers].index.values.tolist()
warning_msg = "Row '{}' is likely to have outlier data"
return [DataCheckWarning(warning_msg.format(row_index), self.name) for row_index in outliers_indices]
lower_bound, upper_bound = get_IQR(X)
indices = set()
# get the columns that fall out of the bounds, which means they contain outliers
for idx, bound in enumerate([lower_bound, upper_bound]):
cols_in_range = (X >= bound.values) if idx == 0 else (X <= bound.values)
cols_in_range = cols_in_range.all()
outlier_cols = cols_in_range[~cols_in_range].keys()
indices.update(outlier_cols.tolist())
# order the columns by how they appear in the dataframe
indices = sorted(list(indices), key=lambda x: X.columns.tolist().index(x))
warning_msg = "Column '{}' is likely to have outlier data"
s = [DataCheckWarning(warning_msg.format(row_index), self.name) for row_index in indices]
return s
33 changes: 17 additions & 16 deletions evalml/tests/data_checks_tests/test_outliers_data_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,16 +19,17 @@ def test_outliers_data_check_warnings():
data = np.tile(a, (100, 10))

X = pd.DataFrame(data=data)
X.iloc[3, :] = pd.Series(np.random.randn(100) * 1000)
X.iloc[25, :] = pd.Series(np.random.randn(100) * 1000)
X.iloc[55, :] = pd.Series(np.random.randn(100) * 1000)
X.iloc[72, :] = pd.Series(np.random.randn(100) * 1000)
X.iloc[0, 3] = 1000
X.iloc[3, 25] = 1000
X.iloc[5, 55] = 10000
X.iloc[10, 72] = -1000
X.iloc[:, 90] = 'string_values'

outliers_check = OutliersDataCheck()
assert outliers_check.validate(X) == [DataCheckWarning("Row '3' is likely to have outlier data", "OutliersDataCheck"),
DataCheckWarning("Row '25' is likely to have outlier data", "OutliersDataCheck"),
DataCheckWarning("Row '55' is likely to have outlier data", "OutliersDataCheck"),
DataCheckWarning("Row '72' is likely to have outlier data", "OutliersDataCheck")]
assert outliers_check.validate(X) == [DataCheckWarning("Column '3' is likely to have outlier data", "OutliersDataCheck"),
DataCheckWarning("Column '25' is likely to have outlier data", "OutliersDataCheck"),
DataCheckWarning("Column '55' is likely to have outlier data", "OutliersDataCheck"),
DataCheckWarning("Column '72' is likely to have outlier data", "OutliersDataCheck")]


def test_outliers_data_check_input_formats():
Expand All @@ -42,13 +43,13 @@ def test_outliers_data_check_input_formats():
data = np.tile(a, (100, 10))

X = pd.DataFrame(data=data)
X.iloc[3, :] = pd.Series(np.random.randn(100) * 1000)
X.iloc[25, :] = pd.Series(np.random.randn(100) * 1000)
X.iloc[55, :] = pd.Series(np.random.randn(100) * 1000)
X.iloc[72, :] = pd.Series(np.random.randn(100) * 1000)
X.iloc[0, 3] = 1000
X.iloc[3, 25] = 1000
X.iloc[5, 55] = 10000
X.iloc[10, 72] = -1000

outliers_check = OutliersDataCheck()
assert outliers_check.validate(X.to_numpy()) == [DataCheckWarning("Row '3' is likely to have outlier data", "OutliersDataCheck"),
DataCheckWarning("Row '25' is likely to have outlier data", "OutliersDataCheck"),
DataCheckWarning("Row '55' is likely to have outlier data", "OutliersDataCheck"),
DataCheckWarning("Row '72' is likely to have outlier data", "OutliersDataCheck")]
assert outliers_check.validate(X.to_numpy()) == [DataCheckWarning("Column '3' is likely to have outlier data", "OutliersDataCheck"),
DataCheckWarning("Column '25' is likely to have outlier data", "OutliersDataCheck"),
DataCheckWarning("Column '55' is likely to have outlier data", "OutliersDataCheck"),
DataCheckWarning("Column '72' is likely to have outlier data", "OutliersDataCheck")]

0 comments on commit 183f118

Please sign in to comment.