alteryx · angela97lin · Oct 18, 2019 · Oct 10, 2019 · Oct 10, 2019 · Oct 10, 2019
diff --git a/evalml/models/auto_base.py b/evalml/models/auto_base.py
@@ -18,7 +18,8 @@
 class AutoBase:
     def __init__(self, problem_type, tuner, cv, objective, max_pipelines, max_time,
                  model_types, detect_label_leakage, start_iteration_callback,
-                 add_result_callback, additional_objectives, random_state, verbose):
+                 add_result_callback, additional_objectives, random_state, verbose,
+                 detect_highly_null, null_threshold):
         if tuner is None:
             tuner = SKOptTuner
         self.objective = get_objective(objective)
@@ -30,8 +31,10 @@ def __init__(self, problem_type, tuner, cv, objective, max_pipelines, max_time,
         self.add_result_callback = add_result_callback
         self.cv = cv
         self.verbose = verbose
-        self.logger = Logger(self.verbose)
+        self.detect_highly_null = detect_highly_null
+        self.null_threshold = null_threshold
 
+        self.logger = Logger(self.verbose)
         self.possible_pipelines = get_pipelines(problem_type=self.problem_type, model_types=model_types)
         self.objective = get_objective(objective)
 
@@ -119,6 +122,10 @@ def fit(self, X, y, feature_types=None, raise_errors=False):
                 leaked = [str(k) for k in leaked.keys()]
                 self.logger.log("WARNING: Possible label leakage: %s" % ", ".join(leaked))
 
+        if self.detect_highly_null:
+            highly_null_columns = preprocessing.detect_highly_null(X, percent_threshold=self.null_threshold)
+            self.logger.log("WARNING: {} columns are at least {}% null.".format(', '.join(highly_null_columns), self.null_threshold * 100))
+
         pbar = tqdm(range(self.max_pipelines), disable=not self.verbose, file=stdout, bar_format='{desc}   {percentage:3.0f}%|{bar}| Elapsed:{elapsed}')
         start = time.time()
         for n in pbar:

diff --git a/evalml/models/auto_classifier.py b/evalml/models/auto_classifier.py
@@ -23,7 +23,9 @@ def __init__(self,
                  add_result_callback=None,
                  additional_objectives=None,
                  random_state=0,
-                 verbose=True):
+                 verbose=True,
+                 detect_highly_null=True,
+                 null_threshold=0.95):
         """Automated classifier pipeline search
 
         Arguments:
@@ -86,7 +88,9 @@ def __init__(self,
             add_result_callback=add_result_callback,
             random_state=random_state,
             verbose=verbose,
-            additional_objectives=additional_objectives
+            additional_objectives=additional_objectives,
+            detect_highly_null=detect_highly_null,
+            null_threshold=null_threshold
         )
 
     def set_problem_type(self, objective, multiclass):

diff --git a/evalml/models/auto_regressor.py b/evalml/models/auto_regressor.py
@@ -20,7 +20,9 @@ def __init__(self,
                  add_result_callback=None,
                  additional_objectives=None,
                  random_state=0,
-                 verbose=True):
+                 verbose=True,
+                 detect_highly_null=True,
+                 null_threshold=0.95):
         """Automated regressors pipeline search
 
         Arguments:
@@ -76,5 +78,7 @@ def __init__(self,
             add_result_callback=add_result_callback,
             random_state=random_state,
             verbose=verbose,
-            additional_objectives=additional_objectives
+            additional_objectives=additional_objectives,
+            detect_highly_null=detect_highly_null,
+            null_threshold=null_threshold
         )
diff --git a/evalml/preprocessing/utils.py b/evalml/preprocessing/utils.py
@@ -116,3 +116,19 @@ def detect_label_leakage(X, y, threshold=.95):
     corrs = X.corrwith(y).abs()
     out = corrs[corrs >= threshold]
     return out.to_dict()
+
+
+def detect_highly_null(X, percent_threshold=.95):
+    """Find the columns of a dataframe whose non-null fraction is below a threshold.
+
+    Args:
+        X (DataFrame) : features
+        percent_threshold (float) : minimum fraction of non-null rows a column
+            needs in order to NOT be reported as highly-null, defaults to .95
+
+    Returns:
+        set of labels of the columns with fewer than len(X) * percent_threshold non-null values
+    """
+    min_non_null = percent_threshold * len(X)
+    non_null_counts = X.count()
+    return {label for label, n in non_null_counts.items() if n < min_non_null}
diff --git a/evalml/tests/preprocessing_tests/test_drop_null.py b/evalml/tests/preprocessing_tests/test_drop_null.py
@@ -0,0 +1,17 @@
+import numpy as np
+import pandas as pd
+
+from evalml.preprocessing import detect_highly_null
+
+
+def test_detect_highly_null():
+    """Columns with under 90 of 100 non-null rows are flagged; 'E' (exactly 90) is not."""
+    df = pd.DataFrame(np.random.random((100, 5)), columns=list("ABCDE"))
+    # Inclusive .loc end labels: A loses 12 rows, B 11, C 31, E 10; D is emptied below.
+    for column, last_null_row in {'A': 11, 'B': 10, 'C': 30, 'E': 9}.items():
+        df.loc[:last_null_row, column] = np.nan
+    df.loc[:, 'D'] = np.nan
+
+    expected = {'A', 'B', 'C', 'D'}
+    highly_null_columns = detect_highly_null(df, percent_threshold=.9)
+    assert highly_null_columns == expected