alteryx · angela97lin · Oct 18, 2019 · Oct 10, 2019 · Oct 10, 2019 · Oct 10, 2019
diff --git a/docs/source/api_reference.rst b/docs/source/api_reference.rst
@@ -20,7 +20,6 @@ Demo Datasets
 
 .. currentmodule:: evalml.preprocessing
 
-
 Preprocessing
 =============
 
@@ -31,7 +30,6 @@ Preprocessing
 
     load_data
     split_data
-    detect_label_leakage
 
 
 .. currentmodule:: evalml
@@ -57,6 +55,7 @@ Model Types
 
     list_model_types
 
+
 .. currentmodule:: evalml.pipelines
 
 Pipelines
@@ -76,11 +75,10 @@ Pipelines
     RFRegressionPipeline
 
 
-Objective Functions
-====================
-
 .. currentmodule:: evalml.objectives
 
+Objective Functions
+====================
 
 Domain Specific
 ~~~~~~~~~~~~~~~
@@ -131,6 +129,7 @@ Regression
 
     R2
 
+
 .. currentmodule:: evalml.problem_types
 
 Problem Types
@@ -144,6 +143,7 @@ Problem Types
     ProblemTypes
     handle_problem_types
 
+
 .. currentmodule:: evalml.tuners
 
 Tuners
@@ -157,4 +157,15 @@ Tuners
     SKOptTuner
 
 
+.. currentmodule:: evalml.guardrails
+
+Guardrails
+=============
 
+.. autosummary::
+    :toctree: generated
+    :template: class.rst
+    :nosignatures:
+
+    detect_highly_null
+    detect_label_leakage
diff --git a/evalml/__init__.py b/evalml/__init__.py
@@ -18,6 +18,7 @@
 import evalml.pipelines
 import evalml.model_types
 import evalml.utils
+import evalml.guardrails
 
 from evalml.pipelines import list_model_types, save_pipeline, load_pipeline
 from evalml.models import AutoClassifier, AutoRegressor

diff --git a/evalml/guardrails/__init__.py b/evalml/guardrails/__init__.py
@@ -0,0 +1,2 @@
+# flake8:noqa
+from .utils import *
diff --git a/evalml/guardrails/utils.py b/evalml/guardrails/utils.py
@@ -0,0 +1,45 @@
+import pandas as pd
+
+
+def detect_label_leakage(X, y, threshold=.95):
+    """Check if any of the features are highly correlated with the target.
+
+    Currently only supports binary and numeric targets and features.
+
+    Args:
+        X (pd.DataFrame): The input features to check
+        y (pd.Series): the labels
+        threshold (float): the correlation threshold to be considered leakage. Defaults to .95
+
+    Returns:
+        dict: maps each feature whose absolute correlation with the target is at least ``threshold`` to that correlation
+    """
+
+    # only select numeric
+    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64', 'bool']
+    X = X.select_dtypes(include=numerics)
+
+    if len(X.columns) == 0:
+        return {}
+
+    corrs = X.corrwith(y).abs()
+    out = corrs[corrs >= threshold]
+    return out.to_dict()
+
+
+def detect_highly_null(X, percent_threshold=.95):
+    """Check whether any columns in a dataframe are highly null.
+
+    Args:
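+        X (pd.DataFrame): The input features to check. Any other input is wrapped with ``pd.DataFrame`` first
+        percent_threshold (float): The minimum fraction of null values, in [0, 1], for a column to be considered "highly-null". Defaults to .95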
+
+    Returns:
+        dict: maps the name (or index) of each highly-null column to its fraction of null values
+    """
+    if not isinstance(X, pd.DataFrame):
+        X = pd.DataFrame(X)
+
+    percent_null = (X.isnull().mean()).to_dict()
+    highly_null_cols = {key: value for key, value in percent_null.items() if value >= percent_threshold}
+    return highly_null_cols
diff --git a/evalml/models/auto_base.py b/evalml/models/auto_base.py
@@ -7,7 +7,7 @@
 import pandas as pd
 from tqdm import tqdm
 
-from evalml import preprocessing
+from evalml import guardrails
 from evalml.objectives import get_objective, get_objectives
 from evalml.pipelines import get_pipelines
 from evalml.problem_types import ProblemTypes
@@ -18,7 +18,7 @@
 class AutoBase:
     def __init__(self, problem_type, tuner, cv, objective, max_pipelines, max_time,
                  model_types, detect_label_leakage, start_iteration_callback,
-                 add_result_callback, additional_objectives, random_state, verbose):
+                 add_result_callback, additional_objectives, null_threshold, random_state, verbose):
         if tuner is None:
             tuner = SKOptTuner
         self.objective = get_objective(objective)
@@ -29,9 +29,9 @@ def __init__(self, problem_type, tuner, cv, objective, max_pipelines, max_time,
         self.start_iteration_callback = start_iteration_callback
         self.add_result_callback = add_result_callback
         self.cv = cv
+        self.null_threshold = null_threshold
         self.verbose = verbose
         self.logger = Logger(self.verbose)
-
         self.possible_pipelines = get_pipelines(problem_type=self.problem_type, model_types=model_types)
         self.objective = get_objective(objective)
 
@@ -114,11 +114,16 @@ def fit(self, X, y, feature_types=None, raise_errors=False):
         self.logger.log("Possible model types: %s\n" % ", ".join([model.value for model in self.possible_model_types]))
 
         if self.detect_label_leakage:
-            leaked = preprocessing.detect_label_leakage(X, y)
+            leaked = guardrails.detect_label_leakage(X, y)
             if len(leaked) > 0:
                 leaked = [str(k) for k in leaked.keys()]
                 self.logger.log("WARNING: Possible label leakage: %s" % ", ".join(leaked))
 
+        if self.null_threshold is not None:
+            highly_null_columns = guardrails.detect_highly_null(X, percent_threshold=self.null_threshold)
+            if len(highly_null_columns) > 0:
+                self.logger.log("WARNING: {} columns are at least {}% null.".format(', '.join(highly_null_columns), self.null_threshold * 100))
+
         pbar = tqdm(range(self.max_pipelines), disable=not self.verbose, file=stdout, bar_format='{desc}   {percentage:3.0f}%|{bar}| Elapsed:{elapsed}')
         start = time.time()
         for n in pbar:

diff --git a/evalml/models/auto_classifier.py b/evalml/models/auto_classifier.py
@@ -22,6 +22,7 @@ def __init__(self,
                  start_iteration_callback=None,
                  add_result_callback=None,
                  additional_objectives=None,
+                 null_threshold=0.95,
                  random_state=0,
                  verbose=True):
         """Automated classifier pipeline search
@@ -55,6 +56,9 @@ def __init__(self,
             additional_objectives (list): Custom set of objectives to score on.
                 Will override default objectives for problem type if not empty.
 
+            null_threshold (float): Float in range [0,1] that represents what fraction of a feature needs to be
+                null values for the feature to be considered "highly-null". Default is 0.95. If None, the check is skipped.
+
             random_state (int): the random_state
 
             verbose (boolean): If True, turn verbosity on. Defaults to True
@@ -84,9 +88,10 @@ def __init__(self,
             detect_label_leakage=detect_label_leakage,
             start_iteration_callback=start_iteration_callback,
             add_result_callback=add_result_callback,
+            additional_objectives=additional_objectives,
+            null_threshold=null_threshold,
             random_state=random_state,
             verbose=verbose,
-            additional_objectives=additional_objectives
         )
 
     def set_problem_type(self, objective, multiclass):

diff --git a/evalml/models/auto_regressor.py b/evalml/models/auto_regressor.py
@@ -19,6 +19,7 @@ def __init__(self,
                  start_iteration_callback=None,
                  add_result_callback=None,
                  additional_objectives=None,
+                 null_threshold=0.95,
                  random_state=0,
                  verbose=True):
         """Automated regressors pipeline search
@@ -50,6 +51,9 @@ def __init__(self,
             additional_objectives (list): Custom set of objectives to score on.
                 Will override default objectives for problem type if not empty.
 
+            null_threshold (float): Float in range [0,1] that represents what fraction of a feature needs to be
+                null values for the feature to be considered "highly-null". Default is 0.95. If None, the check is skipped.
+
             random_state (int): the random_state
 
             verbose (boolean): If True, turn verbosity on. Defaults to True
@@ -74,7 +78,8 @@ def __init__(self,
             detect_label_leakage=detect_label_leakage,
             start_iteration_callback=start_iteration_callback,
             add_result_callback=add_result_callback,
+            additional_objectives=additional_objectives,
+            null_threshold=null_threshold,
             random_state=random_state,
-            verbose=verbose,
-            additional_objectives=additional_objectives
+            verbose=verbose
         )
diff --git a/evalml/pipelines/__init__.py b/evalml/pipelines/__init__.py
@@ -11,4 +11,4 @@
     list_model_types,
     load_pipeline,
     save_pipeline
-)
+)
diff --git a/evalml/preprocessing/utils.py b/evalml/preprocessing/utils.py
@@ -90,29 +90,3 @@ def number_of_features(dtypes):
 def label_distribution(labels):
     distribution = labels.value_counts() / len(labels)
     return distribution.mul(100).apply('{:.2f}%'.format).rename_axis('Labels')
-
-
-def detect_label_leakage(X, y, threshold=.95):
-    """Check if any of the features are highly correlated with the target.
-
-    Currently only supports binary and numeric targets and features
-
-    Args:
-        X (pd.DataFrame): The input features to check
-        y (pd.Series): the labels
-        threshold (float): the correlation threshold to be considered leakage. Defaults to .95
-
-    Returns:
-        leakage, dictionary of features with leakage and corresponding threshold
-    """
-
-    # only select numeric
-    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64', 'bool']
-    X = X.select_dtypes(include=numerics)
-
-    if len(X.columns) == 0:
-        return {}
-
-    corrs = X.corrwith(y).abs()
-    out = corrs[corrs >= threshold]
-    return out.to_dict()
diff --git a/...essing_tests/test_detect_label_leakage.py → ...rdrail_tests/test_detect_label_leakage.py b/...essing_tests/test_detect_label_leakage.py → ...rdrail_tests/test_detect_label_leakage.py
@@ -1,6 +1,6 @@
 import pandas as pd
 
-from evalml.preprocessing import detect_label_leakage
+from evalml.guardrails import detect_label_leakage
 
 
 def test_detect_label_leakage():

diff --git a/evalml/tests/guardrail_tests/test_drop_null.py b/evalml/tests/guardrail_tests/test_drop_null.py
@@ -0,0 +1,17 @@
+import numpy as np
+import pandas as pd
+
+from evalml.guardrails import detect_highly_null
+
+
+def test_detect_highly_null():
+    df = pd.DataFrame(np.random.random((100, 5)), columns=list("ABCDE"))
+    df.loc[:11, 'A'] = np.nan
+    df.loc[:9, 'B'] = np.nan
+    df.loc[:30, 'C'] = np.nan
+    df.loc[:, 'D'] = np.nan
+    df.loc[:89, 'E'] = np.nan
+
+    expected = {'D': 1.0, 'E': 0.9}
+    highly_null_set = detect_highly_null(df, percent_threshold=.9)
+    assert expected == highly_null_set
diff --git a/evalml/utils/__init__.py b/evalml/utils/__init__.py
@@ -1,3 +1,3 @@
 # flake8:noqa
 from .logging_utils import Logger
-from .convert_time import convert_to_seconds
+from .convert_time import convert_to_seconds