Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add VarianceThresholdAccepter, MutualInformationAccepter, and CompoundAccepter #76

Merged
merged 3 commits into from
May 12, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
5 changes: 1 addition & 4 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -36,13 +36,10 @@ pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cov/
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/
Expand Down
4 changes: 1 addition & 3 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -51,9 +51,7 @@ clean-pyc: ## remove Python file artifacts
.PHONY: clean-test
clean-test: ## remove test and coverage artifacts
rm -fr .tox
rm -f .coverage
rm -f coverage.xml
rm -fr htmlcov
rm -fr .cov
rm -fr .pytest_cache
rm -fr .mypy_cache

Expand Down
2 changes: 2 additions & 0 deletions ballet/validation/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@ class FeaturePerformanceEvaluator(metaclass=ABCMeta):
y_df: targets frame/series for fitting the features
X_df_val: entities frame for evaluating the features
y_val: target values for evaluating the features
features: all collected features
candidate_feature: the feature to evaluate
"""

def __init__(self,
Expand Down
22 changes: 20 additions & 2 deletions ballet/validation/common.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import pathlib
from types import ModuleType
from typing import (
Callable, Collection, Iterator, List, NamedTuple, Optional, Tuple,)
Callable, Collection, Iterator, List, NamedTuple, Optional, Tuple, Union,)

import git
from funcy import (
Expand All @@ -19,7 +19,8 @@
Differ, LocalMergeBuildDiffer, LocalPullRequestBuildDiffer, NoOpDiffer,
can_use_local_differ, can_use_local_merge_differ,)
from ballet.util.log import logger
from ballet.util.mod import import_module_at_path, relpath_to_modname
from ballet.util.mod import (
import_module_at_path, import_module_from_modname, relpath_to_modname,)
from ballet.validation.base import FeaturePerformanceEvaluator
from ballet.validation.project_structure.checks import ProjectStructureCheck

Expand Down Expand Up @@ -288,3 +289,20 @@ def __init__(self, *args, p=0.3, seed=None):

def __str__(self):
return f'{super().__str__()}: p={self.p}, seed={self.seed}'


def load_spec(spec: Union[str, dict]) -> Tuple[type, dict]:
    """Resolve a class spec into the class object and its keyword params

    A spec is either a dotted path to a class (``'pkg.mod.ClassName'``) or a
    mapping with a ``name`` entry holding such a path and, optionally, a
    ``params`` entry holding keyword arguments for the class.

    Args:
        spec: dotted class path, or mapping with ``name`` and optional
            ``params``

    Returns:
        tuple of the loaded class and a fresh dict of params (empty if the
        spec is a string or has no params)

    Raises:
        ValueError: if the path does not contain a ``.`` separating the
            module name from the class name
        AttributeError: if the module has no attribute with the class name
    """
    if isinstance(spec, str):
        path = spec
        params = {}
    else:
        path = spec['name']
        # ``params`` may be present but null (e.g. an empty key in a config
        # file); normalize that to an empty dict, and copy so that callers
        # never share or mutate the mapping stored in the configuration.
        params = dict(spec.get('params') or {})

    if '.' not in path:
        # without this check the unpacking below fails with an opaque
        # "not enough values to unpack" ValueError
        raise ValueError(
            f'Invalid spec path (expected module.ClassName): {path}')
    modname, clsname = path.rsplit('.', maxsplit=1)
    mod = import_module_from_modname(modname)
    cls = getattr(mod, clsname)
    modfile = getattr(mod, '__file__', '<unknown>')
    logger.debug(
        f'Loaded class {clsname} from module at {modfile} '
        f'with params {params!r}')
    return cls, params
134 changes: 132 additions & 2 deletions ballet/validation/feature_acceptance/validator.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,16 @@
import random
from typing import List, Optional

import numpy as np

from ballet.util import asarray2d
from ballet.util.log import logger
from ballet.util.testing import seeded
from ballet.validation.base import FeatureAcceptanceMixin, FeatureAccepter
from ballet.validation.common import RandomFeaturePerformanceEvaluator
from ballet.validation.entropy import estimate_conditional_information
from ballet.validation.common import (
RandomFeaturePerformanceEvaluator, load_spec,)
from ballet.validation.entropy import (
estimate_conditional_information, estimate_mutual_information,)
from ballet.validation.gfssf import (
GFSSFIterationInfo, GFSSFPerformanceEvaluator, _compute_lmbdas,
_compute_threshold, _concat_datasets,)
Expand Down Expand Up @@ -121,3 +127,127 @@ def judge(self):
f'Rejected feature: best marginal conditional mutual information was not greater than threshold ({cmi_closest:0.3e} - {omitted_cmi_closest:0.3e} = {statistic_closest:0.3e}, vs needed {threshold_closest:0.3e}).') # noqa

return False


class VarianceThresholdAccepter(FeatureAccepter):
    """Accepter that requires the feature values to vary enough

    The candidate feature is fit on ``X_df``/``y_df`` and applied to
    ``X_df_val``; the feature is accepted when the variance of the resulting
    values is at least ``threshold``.

    Args:
        threshold: minimum variance needed for acceptance
    """

    def __init__(self, *args, threshold=0.05):
        super().__init__(*args)
        self.threshold = threshold

    def judge(self):
        logger.info(f'Judging feature using {self}')
        pipeline = self.candidate_feature.as_feature_engineering_pipeline()
        fitted = pipeline.fit(self.X_df, y=self.y_df)
        feature_values = fitted.transform(self.X_df_val)
        # np.var is called without an axis, so a single variance is computed
        # over all of the transformed values
        variance = np.var(feature_values)
        return variance >= self.threshold

    def __str__(self):
        base = super().__str__()
        return f'{base}: threshold={self.threshold}'


class MutualInformationAccepter(FeatureAccepter):
    """Accept features with mutual information with the target above a threshold

    The candidate feature is fit on ``X_df``/``y_df`` and applied to
    ``X_df_val``; the estimate returned by ``estimate_mutual_information``
    for the resulting values and ``y_val`` is compared to ``threshold``.

    Args:
        threshold: mutual information threshold
        handle_nan_targets: one of ``'fail'`` or ``'ignore'``, whether to
            fail validation if NaN-valued targets are discovered or to drop
            those rows in calculation of the mutual information score
    """

    _NAN_TARGET_POLICIES = ('fail', 'ignore')

    def __init__(self, *args, threshold=0.05, handle_nan_targets='fail'):
        super().__init__(*args)
        self.threshold = threshold
        self.handle_nan_targets = handle_nan_targets

    def judge(self):
        """Return whether the feature's estimated MI meets the threshold

        Raises:
            ValueError: if ``handle_nan_targets`` is not a supported value
        """
        logger.info(f'Judging feature using {self}')
        z = (
            self.candidate_feature
            .as_feature_engineering_pipeline()
            .fit(self.X_df, y=self.y_df)
            .transform(self.X_df_val)
        )
        y = self.y_val
        z, y = asarray2d(z), asarray2d(y)
        z, y = self._handle_nans(z, y)
        if z is None and y is None:
            # nans were found and handle_nan_targets == 'fail'
            logger.info(
                'Rejected feature: NaN-valued targets were found and '
                "handle_nan_targets is 'fail'.")
            return False
        mi = estimate_mutual_information(z, y)
        return mi >= self.threshold

    def _handle_nans(self, z, y):
        """Apply the NaN-target policy to the feature values and targets

        Returns ``(None, None)`` when NaN targets are present and the policy
        is ``'fail'``; otherwise returns ``z`` and ``y``, with rows that have
        any NaN target removed when the policy is ``'ignore'``.

        Raises:
            ValueError: if ``handle_nan_targets`` is not a supported value
        """
        # Validate the policy before looking at the data so that a
        # misconfiguration is reported consistently, not only on the runs
        # where NaN targets happen to be present.
        if self.handle_nan_targets not in self._NAN_TARGET_POLICIES:
            raise ValueError(
                'Invalid value for handle_nan_targets: '
                f'{self.handle_nan_targets}'
            )

        # NOTE(review): np.isnan presumably requires numeric targets --
        # confirm behavior for non-numeric y
        nans = np.any(np.isnan(y), 1)  # whether there are any nans in this row
        if np.any(nans):
            if self.handle_nan_targets == 'fail':
                return None, None  # sentinel interpreted by judge()
            # policy is 'ignore': drop the affected rows from both arrays
            z = z[~nans, :]
            y = y[~nans, :]

        return z, y

    def __str__(self):
        return (
            f'{super().__str__()}: threshold={self.threshold}, '
            f'handle_nan_targets={self.handle_nan_targets}'
        )


class CompoundAccepter(FeatureAccepter):
    """A compound accepter that runs a list of individual accepters

    An accepter spec is just a simple serialization of a class and its kwargs::

        name: ballet.validation.feature_acceptance.validator.CompoundAccepter
        params:
          agg: any
          specs:
            - name: ballet.validation.feature_acceptance.validator.VarianceThresholdAccepter
              params:
                threshold: 0.1
            - name: ballet.validation.feature_acceptance.validator.MutualInformationAccepter
              params:
                threshold: 0.1

    Each underlying accepter is constructed with the same positional
    arguments as this accepter, plus the params from its own spec.

    Args:
        agg: one of ``'all'`` or ``'any'``; whether to accept if all
            underlying accepters accept or if any accepter accepts.
        specs: list of dicts of accepter specs

    Raises:
        ValueError: if ``specs`` is missing or empty, or if ``agg`` is not a
            supported value
    """  # noqa

    def __init__(self, *args, agg='all',
                 specs: Optional[List[dict]] = None):
        super().__init__(*args)
        if not specs:
            raise ValueError('Missing list of accepter specs!')

        # Check ``agg`` before loading any spec so that a bad value is
        # reported without first importing and constructing every accepter.
        if agg == 'all':
            self.agg = all
        elif agg == 'any':
            self.agg = any
        else:
            raise ValueError(f'Unsupported value for parameter agg: {agg}')

        self.accepters = []
        for spec in specs:
            cls, params = load_spec(spec)
            self.accepters.append(cls(*args, **params))

    def judge(self):
        """Aggregate the judgments of the underlying accepters

        The judgments are produced lazily, so ``all``/``any`` may stop
        before every underlying accepter has been run.
        """
        logger.info(f'Judging feature using {self}')
        return self.agg(
            accepter.judge()
            for accepter in self.accepters
        )

    def __str__(self):
        accepter_str = ', '.join(str(accepter) for accepter in self.accepters)
        return f'{super().__str__()} ({accepter_str})'
21 changes: 3 additions & 18 deletions ballet/validation/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,8 @@
from ballet.feature import Feature
from ballet.project import Project
from ballet.util.log import logger
from ballet.util.mod import import_module_from_modname
from ballet.validation.common import (
get_accepted_features, get_proposed_feature,)
get_accepted_features, get_proposed_feature, load_spec,)

# helpful for log parsing
PRUNER_MESSAGE = 'Found Redundant Feature: '
Expand Down Expand Up @@ -64,22 +63,8 @@ def _load_validator_class_params(
make_validator(arg)
baz.qux.MyFeatureAccepter(arg, key1=value1)
""" # noqa E501
entry = project.config.get(config_key)
if isinstance(entry, str):
path = entry
params = {}
else:
path = entry.get('name')
params = dict(entry.get('params'))

modname, clsname = path.rsplit('.', maxsplit=1)
mod = import_module_from_modname(modname)
cls = getattr(mod, clsname)
clsname = getattr(cls, '__name__', '<unknown>')
modfile = getattr(mod, '__file__', '<unknown>')
logger.debug(
'Loaded class %s from module at %s with params %r',
clsname, modfile, params)
spec = project.config.get(config_key)
cls, params = load_spec(spec)
return func_partial(cls, **params)


Expand Down
17 changes: 12 additions & 5 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -38,34 +38,41 @@ skip_glob = **/ballet/templates/**, **/ballet/compat.py, **/ballet/__init__.py,
test = pytest

[tool:pytest]
filterwarnings =
filterwarnings =
ignore::FutureWarning
addopts =
addopts =
--strict-markers
--cov-config=setup.cfg
--cov-report=html
--cov-report=xml
--cov-report=term
--cov=ballet
--ignore=setup.py
markers =
markers =
slow

[coverage:run]
branch = True
source = ballet
omit =
data_file = .cov/.coverage
omit =
ballet/templates/*
ballet/eng/external/*

[coverage:report]
exclude_lines =
exclude_lines =
if self.debug:
pragma: no cover
raise NotImplementedError
if __name__ == .__main__.:
ignore_errors = True

[coverage:xml]
output = .cov/coverage.xml

[coverage:html]
directory = .cov/html

[mypy]
ignore_missing_imports = True
show_error_codes = True
Expand Down