Skip to content

Commit

Permalink
Merge pull request #89 from ballet/discover-nan-targets
Browse files Browse the repository at this point in the history
Support missing targets in discover and elsewhere
  • Loading branch information
micahjsmith committed Jun 28, 2021
2 parents c9348e6 + 7e91d09 commit 41d6f75
Show file tree
Hide file tree
Showing 7 changed files with 175 additions and 20 deletions.
18 changes: 14 additions & 4 deletions ballet/discovery.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

import ballet
from ballet.transformer import get_transformer_primitives
from ballet.util import asarray2d, dont_log_nonnegative
from ballet.util import asarray2d, dont_log_nonnegative, skipna
from ballet.validation.entropy import (
_get_cont_columns, _get_disc_columns, estimate_conditional_information,
estimate_mutual_information,)
Expand Down Expand Up @@ -37,13 +37,14 @@ def _summarize_feature(
else feature.input
if not callable(feature.input)
else [],
'transformer': feature.transformer,
'transformer': repr(feature.transformer),
'primitives': get_transformer_primitives(feature.transformer),
'output': feature.output,
'author': feature.author,
'source': feature.source,
'mutual_information': np.nan,
'conditional_mutual_information': np.nan,
'ninputs': np.nan,
'nvalues': np.nan,
'ncontinuous': np.nan,
'ndiscrete': np.nan,
Expand Down Expand Up @@ -71,7 +72,14 @@ def _summarize_feature(
else:
x = np.empty((z.shape[0], 0))

result['mutual_information'] = estimate_mutual_information(z, y)
_y, _z = skipna(y, z, how='left')
result['mutual_information'] = estimate_mutual_information(_z, _y)

if not callable(feature.input):
if isinstance(feature.input, str):
result['ninputs'] = 1
else:
result['ninputs'] = len(feature.input)
result['nvalues'] = z.shape[1]
result['ncontinuous'] = np.sum(_get_cont_columns(z))
result['ndiscrete'] = np.sum(_get_disc_columns(z))
Expand All @@ -84,8 +92,9 @@ def _summarize_feature(
result['nunique'] = np.mean(countunique(z, axis=0))

if expensive_stats or x.shape[1] < EXPENSIVE_STATS_CMI_MAX_COLS_X:
_y, _z, _x = skipna(y, z, x, how='left')
result['conditional_mutual_information'] = \
estimate_conditional_information(z, y, x)
estimate_conditional_information(_z, _y, _x)

return result

Expand Down Expand Up @@ -123,6 +132,7 @@ def discover(
- conditional_mutual_information: estimated conditional mutual information
between the feature (or averaged over feature values) and the target
conditional on all other features on the development dataset split
- ninputs: the number of input columns to the feature
- nvalues: the number of feature values this feature extracts (i.e. 1 for
a scalar-valued feature and >1 for a vector-valued feature)
- ncontinuous: the number of feature values this feature extracts that are
Expand Down
59 changes: 59 additions & 0 deletions ballet/util/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,3 +178,62 @@ def filter(record: LogRecord) -> int:

# re-export cookiecutter work_in
work_in = cookiecutter.utils.work_in


def skipna(a: np.ndarray, b: np.ndarray, *c: np.ndarray, how: str = 'left'):
    """Drop rows of a, b, and any additional arrays with missing values

    All arrays are filtered with the same row mask, so the outputs stay
    aligned with each other. A row of a multi-dimensional array counts as
    missing if any of its entries is NaN. The length of all arrays along the
    first dimension must be equal.

    Args:
        a:
            first array
        b:
            second array
        *c:
            any additional arrays
        how:
            how to determine the rows to drop, one of 'left', 'any', or 'all'.
            If left, then any row in which a has a missing value is dropped.
            If any, then any row in which at least one of a, b, or additional
            arrays has a missing value is dropped. If all, then any row in
            which all of a, b, and additional arrays have a missing value is
            dropped. Defaults to left.

    Returns:
        tuple of a, b, and any additional arrays where a, b, and any
        additional arrays are guaranteed to be the same length with missing
        values removed according to ``how``.

    Raises:
        ValueError: if ``how`` is not one of 'left', 'any', or 'all', or if
            the arrays do not all have the same length along the first
            dimension.
    """
    if how not in ('left', 'any', 'all'):
        raise ValueError(f'Invalid value for how: {how}')

    arrays = (a, b, *c)

    # Validate explicitly rather than with ``assert`` (stripped under -O) or
    # by relying on an IndexError from the boolean indexing further down.
    n_rows = a.shape[0]
    if any(arr.shape[0] != n_rows for arr in arrays):
        lengths = [arr.shape[0] for arr in arrays]
        raise ValueError(
            'All arrays must have the same length along the first '
            f'dimension, got lengths {lengths}')

    def find_nan_inds(arr):
        # Boolean mask of shape (arr.shape[0],) marking rows with any NaN.
        # Reducing over every trailing axis (instead of any(axis=1) followed
        # by squeeze()) keeps the mask 1-d even for single-row inputs and
        # also supports arrays with more than two dimensions.
        nan_inds = np.asarray(np.isnan(arr))
        if nan_inds.ndim > 1:
            nan_inds = nan_inds.any(axis=tuple(range(1, nan_inds.ndim)))
        return nan_inds

    nan_inds = find_nan_inds(a)
    if how != 'left':
        # 'any' drops a row if some array is missing there (logical or);
        # 'all' drops it only if every array is missing there (logical and).
        combine = np.logical_or if how == 'any' else np.logical_and
        for arr in arrays[1:]:
            nan_inds = combine(nan_inds, find_nan_inds(arr))

    keep = ~nan_inds
    return tuple(arr[keep] for arr in arrays)
5 changes: 5 additions & 0 deletions ballet/validation/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,11 @@ def validate(self) -> bool:
class FeaturePerformanceEvaluator(metaclass=ABCMeta):
"""Evaluate the performance of features from an ML point-of-view
Implementing classes should be clear about their support for missing
targets, i.e. NaN values in ``y_val``. For example, the subclass can raise
an error indicating that it cannot be used for a problem, or it can choose
to skip rows with missing values in the performance evaluation.
Args:
X_df: entities frame for fitting the features
y_df: targets frame/series for fitting the features
Expand Down
12 changes: 7 additions & 5 deletions ballet/validation/feature_acceptance/validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

import numpy as np

from ballet.util import asarray2d
from ballet.util import asarray2d, skipna
from ballet.util.log import logger
from ballet.util.testing import seeded
from ballet.validation.base import FeatureAcceptanceMixin, FeatureAccepter
Expand Down Expand Up @@ -47,6 +47,10 @@ def judge(self):
Uses lines 1-8 of agGFSSF where we do not remove accepted but
redundant features on line 8.
"""
if np.isnan(self.y_val).any():
raise ValueError(
f'{self.__class__.__name__} does not support missing targets,'
' please use a different evaluator.')

logger.info(f'Judging feature using {self}')

Expand Down Expand Up @@ -198,13 +202,11 @@ def judge(self):
return outcome

def _handle_nans(self, z, y):
nans = np.any(np.isnan(y), 1) # whether there are any nans in this row
if np.any(nans):
if np.isnan(y).any():
if self.handle_nan_targets == 'fail':
return None, None # hack
elif self.handle_nan_targets == 'ignore':
z = z[~nans, :]
y = y[~nans, :]
y, z = skipna(y, z, how='left')
else:
raise ValueError(
'Invalid value for handle_nan_targets: '
Expand Down
6 changes: 6 additions & 0 deletions ballet/validation/feature_pruning/validator.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import random

import numpy as np

from ballet.util.log import logger
from ballet.util.testing import seeded
from ballet.validation.base import FeaturePruner, FeaturePruningMixin
Expand Down Expand Up @@ -36,6 +38,10 @@ def prune(self):
Uses lines 12-13 of agGFSSF
"""
if np.isnan(self.y_val).any():
raise ValueError(
f'{self.__class__.__name__} does not support missing targets,'
' please use a different evaluator.')

logger.info(f'Pruning features using {self}')

Expand Down
36 changes: 26 additions & 10 deletions tests/test_discovery.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def test_discover(sample_data, expensive_stats):
expected_cols = {
'name', 'description', 'input', 'transformer', 'primitives', 'output',
'author', 'source', 'mutual_information',
'conditional_mutual_information', 'nvalues', 'ncontinuous',
'conditional_mutual_information', 'ninputs', 'nvalues', 'ncontinuous',
'ndiscrete', 'mean', 'std', 'variance', 'min', 'median', 'max',
'nunique',
}
Expand All @@ -52,8 +52,8 @@ def test_discover(sample_data, expensive_stats):

# test filter
input = 'size'
df = discover(features, X_df, y_df, y, input=input)
assert df.shape[0] == len([
discovery_df = discover(features, X_df, y_df, y, input=input)
assert discovery_df.shape[0] == len([
feature
for feature in features
if feature.input == input or input in feature.input
Expand All @@ -62,11 +62,11 @@ def test_discover(sample_data, expensive_stats):
# test no data available
# have to clear cache, as values on data already known
ballet.discovery._summarize_feature.memory.clear()
df = discover(features, None, None, None)
assert df.shape[0] == len(features)
actual_cols = df.columns
discovery_df = discover(features, None, None, None)
assert discovery_df.shape[0] == len(features)
actual_cols = discovery_df.columns
assert not expected_cols.symmetric_difference(actual_cols)
assert np.isnan(df['mean'].at[0])
assert np.isnan(discovery_df['mean'].at[0])


def test_discover_feature_error(sample_data):
Expand All @@ -76,7 +76,23 @@ def test_discover_feature_error(sample_data):
X_df, y_df = sample_data.X, sample_data.y
y = np.asfarray(y_df)

df = discover(features, X_df, y_df, y)
discovery_df = discover(features, X_df, y_df, y)

assert df.shape[0] == len(features)
assert np.isnan(df['mean'].at[0])
assert discovery_df.shape[0] == len(features)
assert np.isnan(discovery_df['mean'].at[0])


def test_discover_target_nans(sample_data):
    """Check that discover still computes target stats when y has a NaN."""
    features = [
        Feature('size', NullFiller(0)),
    ]
    X_df, y_df = sample_data.X, sample_data.y
    # np.asfarray was removed in NumPy 2.0; asarray with an explicit float
    # dtype is the documented replacement and works on older versions too.
    y = np.asarray(y_df, dtype=float)

    # introduce nan to target
    y[0] = np.nan

    discovery_df = discover(features, X_df, y_df, y)

    # stats with target should still be computed
    assert not np.isnan(discovery_df['mutual_information']).any()
59 changes: 58 additions & 1 deletion tests/util/test_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from ballet.util import (
DeepcopyMixin, asarray2d, dfilter, dont_log_nonnegative, falsy,
get_arr_desc, has_nans, indent, load_sklearn_df, make_plural_suffix,
nonnegative, quiet, truthy,)
nonnegative, quiet, skipna, truthy,)
from ballet.util.log import logger
from ballet.util.testing import assert_array_equal

Expand Down Expand Up @@ -252,3 +252,60 @@ def estimate_something():
estimate_something

assert not caplog.text


@pytest.mark.parametrize('how', ['left', 'any', 'all'])
@pytest.mark.parametrize(
    'a,b,c',
    [
        (np.ones(10), np.ones(10), None),
        (np.ones(10), np.ones(10), np.ones(10)),
        (np.ones((10, 2)), np.ones(10), None),
        (np.ones(10), np.ones((10, 2)), None),
        (np.concatenate([np.ones(5), np.full(5, np.nan)]), np.ones(10), None),
        (np.ones(10), np.concatenate([np.ones(5), np.full(5, np.nan)]), None),
        (
            np.concatenate([np.ones(10), np.full(5, np.nan)]),
            np.concatenate([np.ones(5), np.full(5, np.nan), np.ones(5)]),
            None,
        ),
        (
            np.ones(10),
            np.concatenate([np.ones(5), np.full(5, np.nan)]),
            np.concatenate([np.full(2, np.nan), np.ones(8)]),
        ),
    ]
)
def test_skipna(a, b, c, how):
    """Check output shapes and remaining NaNs of skipna for each ``how``."""

    def row_nans(arr):
        # per-row mask: True where the row contains at least one NaN
        nans = np.isnan(arr)
        if nans.ndim > 1:
            nans = nans.any(axis=1)
        return nans

    if c is not None:
        a1, b1, c1 = skipna(a, b, c, how=how)
    else:
        a1, b1 = skipna(a, b, how=how)

    # outputs stay aligned and keep their trailing dimensions
    assert a1.shape[0] == b1.shape[0]
    assert a1.shape[1:] == a.shape[1:]
    assert b1.shape[1:] == b.shape[1:]
    if c is not None:
        assert c1.shape[0] == a1.shape[0]
        assert c1.shape[1:] == c.shape[1:]

    if how == 'left' or how == 'any':
        assert not np.isnan(a1).any()
    if how == 'any':
        assert not np.isnan(b1).any()
        # the additional array must be free of NaNs as well
        if c is not None:
            assert not np.isnan(c1).any()
    if how == 'all':
        # a row is only dropped when *every* array is missing there, so the
        # additional array has to take part in the conjunction too
        all_nans = row_nans(a1) & row_nans(b1)
        if c is not None:
            all_nans = all_nans & row_nans(c1)
        assert not all_nans.any()

    # symmetry of b and c
    if c is not None:
        _, b2, c2 = skipna(a, b, c, how=how)
        _, c3, b3 = skipna(a, c, b, how=how)
        assert_array_equal(b2, b3)
        assert_array_equal(c2, c3)

0 comments on commit 41d6f75

Please sign in to comment.