Merge pull request #89 from ballet/discover-nan-targets

Support missing targets in discover and elsewhere
ballet · Jun 28, 2021 · 41d6f75 · 41d6f75
2 parents c9348e6 + 7e91d09
commit 41d6f75
Show file tree

Hide file tree

Showing 7 changed files with 175 additions and 20 deletions.
diff --git a/ballet/discovery.py b/ballet/discovery.py
@@ -7,7 +7,7 @@
 
 import ballet
 from ballet.transformer import get_transformer_primitives
-from ballet.util import asarray2d, dont_log_nonnegative
+from ballet.util import asarray2d, dont_log_nonnegative, skipna
 from ballet.validation.entropy import (
     _get_cont_columns, _get_disc_columns, estimate_conditional_information,
     estimate_mutual_information,)
@@ -37,13 +37,14 @@ def _summarize_feature(
             else feature.input
             if not callable(feature.input)
             else [],
-        'transformer': feature.transformer,
+        'transformer': repr(feature.transformer),
         'primitives': get_transformer_primitives(feature.transformer),
         'output': feature.output,
         'author': feature.author,
         'source': feature.source,
         'mutual_information': np.nan,
         'conditional_mutual_information': np.nan,
+        'ninputs': np.nan,
         'nvalues': np.nan,
         'ncontinuous': np.nan,
         'ndiscrete': np.nan,
@@ -71,7 +72,14 @@ def _summarize_feature(
             else:
                 x = np.empty((z.shape[0], 0))
 
-            result['mutual_information'] = estimate_mutual_information(z, y)
+            _y, _z = skipna(y, z, how='left')
+            result['mutual_information'] = estimate_mutual_information(_z, _y)
+
+            if not callable(feature.input):
+                if isinstance(feature.input, str):
+                    result['ninputs'] = 1
+                else:
+                    result['ninputs'] = len(feature.input)
             result['nvalues'] = z.shape[1]
             result['ncontinuous'] = np.sum(_get_cont_columns(z))
             result['ndiscrete'] = np.sum(_get_disc_columns(z))
@@ -84,8 +92,9 @@ def _summarize_feature(
             result['nunique'] = np.mean(countunique(z, axis=0))
 
             if expensive_stats or x.shape[1] < EXPENSIVE_STATS_CMI_MAX_COLS_X:
+                _y, _z, _x = skipna(y, z, x, how='left')
                 result['conditional_mutual_information'] = \
-                    estimate_conditional_information(z, y, x)
+                    estimate_conditional_information(_z, _y, _x)
 
     return result
 
@@ -123,6 +132,7 @@ def discover(
     - conditional_mutual_information: estimated conditional mutual information
         between the feature (or averaged over feature values) and the target
         conditional on all other features on the development dataset split
+    - ninputs: the number of input columns to the feature
     - nvalues: the number of feature values this feature extracts (i.e. 1 for
         a scalar-valued feature and >1 for a vector-valued feature)
     - ncontinuous: the number of feature values this feature extracts that are

diff --git a/ballet/util/__init__.py b/ballet/util/__init__.py
@@ -178,3 +178,62 @@ def filter(record: LogRecord) -> int:
 
 # re-export cookiecutter work_in
 work_in = cookiecutter.utils.work_in
+
+
+def skipna(a: np.ndarray, b: np.ndarray, *c: np.ndarray, how: str = 'left'):
+    """Drop rows of a, b, and any additional arrays that have missing values
+
+    The length of all arrays along the first dimension must be equal.
+
+    Args:
+        a:
+            first array
+        b:
+            second array
+        *c:
+            any additional arrays
+        how:
+            how to determine the rows to drop, one of 'left', 'any', or 'all'.
+            If left, then any row in which a has a missing value is dropped. If
+            any, then any row in which at least one of a, b, or additional
+            arrays has a missing value is dropped. If all, then any row in
+            which every one of a, b, and additional arrays has a missing value
+            is dropped. Defaults to left.
+
+    Returns:
+        tuple of a, b, and any additional arrays, each with the same rows
+        removed according to ``how`` and therefore all of the same length.
+
+    Raises:
+        ValueError: if ``how`` is not one of 'left', 'any', or 'all'.
+    """
+    if how not in ('left', 'any', 'all'):
+        raise ValueError(f'Invalid value for how: {how}')
+
+    def find_nan_inds(arr):
+        # Boolean mask over rows (first axis): True where the row has a NaN.
+        # Trailing axes are reduced explicitly instead of calling squeeze(),
+        # because squeeze() also collapses a single-row input to 0-d, which
+        # made the mask unusable for indexing.
+        nan_inds = np.asarray(np.isnan(arr))
+        if nan_inds.ndim > 1:
+            nan_inds = nan_inds.any(axis=tuple(range(1, nan_inds.ndim)))
+        return nan_inds
+
+    # With how='left' only a is inspected; b and c are never passed to isnan.
+    nan_inds = find_nan_inds(a)
+    if how == 'any':
+        for arr in (b, *c):
+            nan_inds = nan_inds | find_nan_inds(arr)
+    elif how == 'all':
+        for arr in (b, *c):
+            nan_inds = nan_inds & find_nan_inds(arr)
+
+    # The same row mask is applied to every array so they stay aligned.
+    keep = ~nan_inds
+    return (a[keep], b[keep], *(arr[keep] for arr in c))
diff --git a/ballet/validation/base.py b/ballet/validation/base.py
@@ -19,6 +19,11 @@ def validate(self) -> bool:
 class FeaturePerformanceEvaluator(metaclass=ABCMeta):
     """Evaluate the performance of features from an ML point-of-view
 
+    Implementing classes should be clear about their support for missing
+    targets, i.e. NaN values in ``y_val``. For example, the subclass can raise
+    an error indicating that it cannot be used for a problem, or it can choose
+    to skip rows with missing values in the performance evaluation.
+
     Args:
         X_df: entities frame for fitting the features
         y_df: targets frame/series for fitting the features

diff --git a/ballet/validation/feature_acceptance/validator.py b/ballet/validation/feature_acceptance/validator.py
@@ -3,7 +3,7 @@
 
 import numpy as np
 
-from ballet.util import asarray2d
+from ballet.util import asarray2d, skipna
 from ballet.util.log import logger
 from ballet.util.testing import seeded
 from ballet.validation.base import FeatureAcceptanceMixin, FeatureAccepter
@@ -47,6 +47,10 @@ def judge(self):
         Uses lines 1-8 of agGFSSF where we do not remove accepted but
         redundant features on line 8.
         """
+        if np.isnan(self.y_val).any():
+            raise ValueError(
+                f'{self.__class__.__name__} does not support missing targets,'
+                ' please use a different evaluator.')
 
         logger.info(f'Judging feature using {self}')
 
@@ -198,13 +202,11 @@ def judge(self):
         return outcome
 
     def _handle_nans(self, z, y):
-        nans = np.any(np.isnan(y), 1)  # whether there are any nans in this row
-        if np.any(nans):
+        if np.isnan(y).any():
             if self.handle_nan_targets == 'fail':
                 return None, None  # hack
             elif self.handle_nan_targets == 'ignore':
-                z = z[~nans, :]
-                y = y[~nans, :]
+                y, z = skipna(y, z, how='left')
             else:
                 raise ValueError(
                     'Invalid value for handle_nan_targets: '

diff --git a/ballet/validation/feature_pruning/validator.py b/ballet/validation/feature_pruning/validator.py
@@ -1,5 +1,7 @@
 import random
 
+import numpy as np
+
 from ballet.util.log import logger
 from ballet.util.testing import seeded
 from ballet.validation.base import FeaturePruner, FeaturePruningMixin
@@ -36,6 +38,10 @@ def prune(self):
 
         Uses lines 12-13 of agGFSSF
         """
+        if np.isnan(self.y_val).any():
+            raise ValueError(
+                f'{self.__class__.__name__} does not support missing targets,'
+                ' please use a different evaluator.')
 
         logger.info(f'Pruning features using {self}')
 

diff --git a/tests/test_discovery.py b/tests/test_discovery.py
@@ -41,7 +41,7 @@ def test_discover(sample_data, expensive_stats):
     expected_cols = {
         'name', 'description', 'input', 'transformer', 'primitives', 'output',
         'author', 'source', 'mutual_information',
-        'conditional_mutual_information', 'nvalues', 'ncontinuous',
+        'conditional_mutual_information', 'ninputs', 'nvalues', 'ncontinuous',
         'ndiscrete', 'mean', 'std', 'variance', 'min', 'median', 'max',
         'nunique',
     }
@@ -52,8 +52,8 @@ def test_discover(sample_data, expensive_stats):
 
     # test filter
     input = 'size'
-    df = discover(features, X_df, y_df, y, input=input)
-    assert df.shape[0] == len([
+    discovery_df = discover(features, X_df, y_df, y, input=input)
+    assert discovery_df.shape[0] == len([
         feature
         for feature in features
         if feature.input == input or input in feature.input
@@ -62,11 +62,11 @@ def test_discover(sample_data, expensive_stats):
     # test no data available
     # have to clear cache, as values on data already known
     ballet.discovery._summarize_feature.memory.clear()
-    df = discover(features, None, None, None)
-    assert df.shape[0] == len(features)
-    actual_cols = df.columns
+    discovery_df = discover(features, None, None, None)
+    assert discovery_df.shape[0] == len(features)
+    actual_cols = discovery_df.columns
     assert not expected_cols.symmetric_difference(actual_cols)
-    assert np.isnan(df['mean'].at[0])
+    assert np.isnan(discovery_df['mean'].at[0])
 
 
 def test_discover_feature_error(sample_data):
@@ -76,7 +76,23 @@ def test_discover_feature_error(sample_data):
     X_df, y_df = sample_data.X, sample_data.y
     y = np.asfarray(y_df)
 
-    df = discover(features, X_df, y_df, y)
+    discovery_df = discover(features, X_df, y_df, y)
 
-    assert df.shape[0] == len(features)
-    assert np.isnan(df['mean'].at[0])
+    assert discovery_df.shape[0] == len(features)
+    assert np.isnan(discovery_df['mean'].at[0])
+
+
+def test_discover_target_nans(sample_data):
+    """discover should still report target-based stats when y has NaNs"""
+    features = [
+        Feature('size', NullFiller(0)),
+    ]
+    X_df, y_df = sample_data.X, sample_data.y
+    # np.asfarray was removed in NumPy 2.0; asarray with a float dtype is the
+    # documented replacement and behaves the same on older versions
+    y = np.asarray(y_df, dtype=float)
+
+    # introduce nan to target
+    y[0] = np.nan
+
+    discovery_df = discover(features, X_df, y_df, y)
+
+    # stats with target should still be computed
+    assert not np.isnan(discovery_df['mutual_information']).any()
diff --git a/tests/util/test_util.py b/tests/util/test_util.py
@@ -8,7 +8,7 @@
 from ballet.util import (
     DeepcopyMixin, asarray2d, dfilter, dont_log_nonnegative, falsy,
     get_arr_desc, has_nans, indent, load_sklearn_df, make_plural_suffix,
-    nonnegative, quiet, truthy,)
+    nonnegative, quiet, skipna, truthy,)
 from ballet.util.log import logger
 from ballet.util.testing import assert_array_equal
 
@@ -252,3 +252,60 @@ def estimate_something():
         estimate_something
 
     assert not caplog.text
+
+
+@pytest.mark.parametrize('how', ['left', 'any', 'all'])
+@pytest.mark.parametrize(
+    'a,b,c',
+    [
+        (np.ones(10), np.ones(10), None),
+        (np.ones(10), np.ones(10), np.ones(10)),
+        (np.ones((10, 2)), np.ones(10), None),
+        (np.ones(10), np.ones((10, 2)), None),
+        (np.concatenate([np.ones(5), np.full(5, np.nan)]), np.ones(10), None),
+        (np.ones(10), np.concatenate([np.ones(5), np.full(5, np.nan)]), None),
+        (
+            np.concatenate([np.ones(10), np.full(5, np.nan)]),
+            np.concatenate([np.ones(5), np.full(5, np.nan), np.ones(5)]),
+            None,
+        ),
+        (
+            np.ones(10),
+            np.concatenate([np.ones(5), np.full(5, np.nan)]),
+            np.concatenate([np.full(2, np.nan), np.ones(8)]),
+        ),
+    ]
+)
+def test_skipna(a, b, c, how):
+    """Check output shapes and NaN guarantees of skipna for each ``how``"""
+    def rows_with_nan(arr):
+        # per-row mask: True where the row contains at least one NaN
+        mask = np.isnan(arr)
+        if mask.ndim > 1:
+            mask = mask.any(axis=1)
+        return mask
+
+    inputs = [a, b] if c is None else [a, b, c]
+    outputs = skipna(*inputs, how=how)
+    assert len(outputs) == len(inputs)
+    a1 = outputs[0]
+
+    # every output keeps the same rows and its trailing dimensions
+    for arr_in, arr_out in zip(inputs, outputs):
+        assert arr_out.shape[0] == a1.shape[0]
+        assert arr_out.shape[1:] == arr_in.shape[1:]
+
+    if how == 'left':
+        assert not np.isnan(a1).any()
+    elif how == 'any':
+        # no array, including c when given, may retain a missing value
+        for arr_out in outputs:
+            assert not np.isnan(arr_out).any()
+    else:
+        # a row is dropped only if it is missing in every array, so c must
+        # take part in the check whenever it was passed
+        all_missing = rows_with_nan(a1)
+        for arr_out in outputs[1:]:
+            all_missing = all_missing & rows_with_nan(arr_out)
+        assert not all_missing.any()
+
+    # symmetry of b and c
+    if c is not None:
+        _, b2, c2 = skipna(a, b, c, how=how)
+        _, c3, b3 = skipna(a, c, b, how=how)
+        assert_array_equal(b2, b3)
+        assert_array_equal(c2, c3)