Add discover features method to client #80

Merged · 15 commits · May 22, 2021
12 changes: 12 additions & 0 deletions ballet/client.py
@@ -5,6 +5,7 @@
import pandas as pd
from funcy import cached_property

from ballet.discovery import discover as _discover
from ballet.feature import Feature
from ballet.project import FeatureEngineeringProject, Project
from ballet.validation.common import subsample_data_for_validation
@@ -96,5 +97,16 @@ def validate_feature_acceptance(
accepter_class, feature, result.features, result.X_df,
result.y_df, result.X_df, result.y, False)

def discover(self, input=None, primitive=None) -> pd.DataFrame:
features = self.api.features
X_df, y_df = self.api.load_data()
encoder = self.api.encoder
y = encoder.fit_transform(y_df)

return _discover(
features, X_df, y_df, y, input=input, primitive=primitive)

    discover.__doc__ = _discover.__doc__  # reuse the standalone discover docstring on the client method


b = Client()
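
With this method in place, discovery from the interactive client looks like the following (a sketch; 'size' and 'SimpleImputer' are hypothetical input and primitive names, not from this diff):

    from ballet import b

    features_df = b.discover()             # summarize all existing features
    b.discover(input='size')               # only features that use the 'size' input
    b.discover(primitive='SimpleImputer')  # only features with a SimpleImputer step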
133 changes: 133 additions & 0 deletions ballet/discovery.py
@@ -0,0 +1,133 @@
import funcy as fy
import numpy as np
import pandas as pd

from ballet.transformer import get_transformer_primitives
from ballet.util import asarray2d
from ballet.validation.entropy import (
estimate_conditional_information, estimate_mutual_information,)


def countunique(z, axis=0):
return np.apply_along_axis(
lambda arr: len(np.unique(arr)), axis, z)
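
# For example, on a small 2-column array (sketch):
#
#     z = np.array([[1, 10],
#                   [1, 20],
#                   [2, 20]])
#     countunique(z)  # -> array([2, 2]): unique values per column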


# summarize each Feature object at most once, keyed on its identity
@fy.memoize(key_func=lambda feature, values, y: id(feature))
def _summarize_feature(feature, values, y) -> dict:
z = values[feature]

    # gather the values of every *other* feature: the conditioning set used
    # for conditional mutual information
    feature_values_list = [
        feature_values
        for other_feature, feature_values in values.items()
        if other_feature is not feature
    ]
    if feature_values_list:
        x = np.concatenate(feature_values_list, axis=1)
    else:
        # no other features to condition on
        x = np.empty((z.shape[0], 0))

mutual_information = estimate_mutual_information(z, y)
conditional_mutual_information = \
estimate_conditional_information(z, y, x)
mean = np.mean(z, axis=0)
std = np.std(z, axis=0)
variance = np.var(z, axis=0)
nunique = countunique(z, axis=0)
return {
'name': feature.name,
'description': feature.description,
'input':
[feature.input]
if isinstance(feature.input, str)
else feature.input,
'transformer': feature.transformer,
'primitives': get_transformer_primitives(feature.transformer),
'output': feature.output,
'author': feature.author,
'source': feature.source,
'mutual_information': mutual_information,
'conditional_mutual_information':
conditional_mutual_information,
        'mean': np.mean(mean),  # equivalent to the mean over the flattened array
'std': np.mean(std),
'variance': np.mean(variance),
'nunique': np.mean(nunique),
}
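
# To make the averaging concrete (sketch): for a feature extracting two
# feature values,
#
#     z = np.array([[1.0, 10.0],
#                   [2.0, 20.0],
#                   [3.0, 30.0]])
#     np.mean(z, axis=0)           # array([ 2., 20.]): one mean per feature value
#     np.mean(np.mean(z, axis=0))  # 11.0: the single 'mean' reported above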


def discover(
features, X_df, y_df, y, input=None, primitive=None
) -> pd.DataFrame:
"""Discover existing features

Display information about existing features including summary statistics.
If the feature extracts multiple feature values, then the summary
statistics (e.g. mean, std, nunique) are computed for each feature value
and then averaged.

The following information is shown:
- name: the name of the feature
- description: the description of the feature
- input: the variables that are used as input to the feature
- transformer: the transformer/transformer pipeline
- output: the output columns of the feature (not usually specified)
- author: the GitHub username of the feature's author
- source: the fully-qualified name of the Python module that contains the
feature
- mutual_information: estimated mutual information between the feature (or
averaged over feature values) and the target on the development
dataset split
- conditional_mutual_information: estimated conditional mutual information
between the feature (or averaged over feature values) and the target
conditional on all other features on the development dataset split
- mean: mean of the feature on the development dataset split
- std: standard deviation of the feature (or averaged over feature values)
on the development dataset split
- variance: variance of the feature (or averaged over feature values) on the
development dataset split
- nunique: number of unique values of the feature (or averaged over
feature values) on the development dataset split

The following query operators are supported:
- input (str): filter to only features that have ``input`` as their single
  input or in their list of inputs
- primitive (str): filter to only features that use primitive
``primitive`` (i.e. a class with name ``primitive``) in the
transformer/transformer pipeline

For other queries, you should just use normal DataFrame indexing::

>>> features_df[features_df['author'] == 'jane']
>>> features_df[features_df['name'].str.contains('married')]
>>> features_df[features_df['mutual_information'] > 0.05]
>>> features_df[features_df['input'].apply(
...     lambda input: 'A' in input and 'B' in input)]

Returns:
data frame with features on the row index and columns as described
above
"""
y = asarray2d(y)
records = []
values = {
feature: asarray2d(
feature
.as_feature_engineering_pipeline()
.fit_transform(X_df, y_df)
)
for feature in features
}
    for feature in features:
        # input filter: match against either a single str input or a list of inputs
        if input and input not in feature.input and input != feature.input:
            continue
        # primitive filter: match against the class names in the transformer pipeline
        if (
            primitive
            and primitive not in get_transformer_primitives(
                feature.transformer)
        ):
            continue
summary = _summarize_feature(feature, values, y)
records.append(summary)

return pd.DataFrame.from_records(records)
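
For example, the two query operators behave as follows (a sketch; the feature and primitive names are hypothetical):

    # feature_a.input == ['size', 'strength']; feature_b.input == 'size'
    discover(features, X_df, y_df, y, input='size')        # keeps feature_a and feature_b
    discover(features, X_df, y_df, y, primitive='Imputer') # keeps only features whose
                                                           # transformer contains an 'Imputer' step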
24 changes: 21 additions & 3 deletions ballet/feature.py
@@ -5,7 +5,7 @@

import ballet.pipeline
from ballet.transformer import RobustTransformer, make_robust_transformer
-from ballet.util.typing import OneOrMore, Pathy, TransformerLike
+from ballet.util.typing import OneOrMore, TransformerLike

__all__ = ('Feature', )

@@ -34,7 +34,7 @@ class Feature:
description: description of the feature
output: ordered sequence of names of features
produced by this transformer
-        source: the source file in which this feature was defined
+        source: the module in which this feature was defined
options: options
"""

@@ -45,7 +45,7 @@ def __init__(
name: Optional[str] = None,
description: Optional[str] = None,
output: OneOrMore[str] = None,
-        source: Pathy = None,
+        source: Optional[str] = None,
options: dict = None
):
self.input = input
@@ -84,3 +84,21 @@ def as_feature_engineering_pipeline(
) -> ballet.pipeline.FeatureEngineeringPipeline:
"""Return standalone FeatureEngineeringPipeline with this feature"""
return ballet.pipeline.FeatureEngineeringPipeline(self)

@property
def author(self) -> Optional[str]:
"""The author of this feature if it can be inferred from its source

The author can be inferred if the module the feature was defined in
follows the pattern
``package.subpackage.user_username.feature_featurename``. Otherwise,
returns ``None``.
"""
if self.source:
pieces = self.source.rsplit('.', maxsplit=2)
if len(pieces) >= 2:
user_str = pieces[-2]
if user_str.startswith('user_'):
return user_str[len('user_'):]

return None
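
For example (a sketch; the module path is hypothetical, and passing `transformer=None` is assumed to be accepted, as `_replace_callable_or_none_with_transformer` in this diff suggests):

    feature = Feature(
        input='size',
        transformer=None,
        source='myproject.features.contrib.user_jane.feature_size_log',
    )
    assert feature.author == 'jane'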
(project template notebook)
@@ -39,6 +39,7 @@
"metadata": {},
"outputs": [],
"source": [
"# automagical client\n",
"from ballet import b"
]
},
@@ -80,7 +81,9 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"## Explore existing features"
"## Explore existing features\n",
"\n",
"To discover existing features, you can use the `discover` method of the `b` client, which returns information and summary statistics about each existing feature in a data frame."
]
},
{
@@ -89,16 +92,24 @@
"metadata": {},
"outputs": [],
"source": [
"result = b.api.engineer_features(X_df, y_df)\n",
"X_train, y_train = result.X, result.y"
"b.discover()"
]
},
{
"source": [
"You can also use the `engineer_features` method of {{ cookiecutter.package_slug }}'s API, which is also exposed by `b`. The resulting object is a named tuple that allows you to access the transformed development dataset (feature matrix `X` and target `y`), the feature engineering pipeline (`pipeline`), the target encoder (`encoder`), and the set of existing features (`features`)."
],
"cell_type": "markdown",
"metadata": {}
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"result = b.api.engineer_features(X_df, y_df)\n",
"X_train, y_train = result.X, result.y\n",
"print('Number of existing features: ', len(result.features))\n",
"print('Number of columns in feature matrix: ', X_train.shape[1])"
]
@@ -230,4 +241,4 @@
},
"nbformat": 4,
"nbformat_minor": 4
}
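
In brief, the fields of that named tuple (a sketch following the markdown cell above):

    result = b.api.engineer_features(X_df, y_df)
    result.X, result.y   # transformed development dataset
    result.pipeline      # the feature engineering pipeline
    result.encoder       # the target encoder
    result.features      # the set of existing features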
24 changes: 24 additions & 0 deletions ballet/transformer.py
@@ -25,6 +25,12 @@
def make_robust_transformer(
transformer: OneOrMore[TransformerLike]
) -> RobustTransformer:
"""Convert to robust transformer or pipeline

Convert to either a single DelegatingRobustTransformer or a
TransformerPipeline where each transformer in the pipeline is a
DelegatingRobustTransformer.
"""
if is_seqcont(transformer):
transformer = cast(Collection[TransformerLike], transformer)
transformers = list(
@@ -304,3 +310,21 @@ def _replace_callable_or_none_with_transformer(
else:
transformer = cast(BaseTransformer, transformer)
return transformer


def get_transformer_primitives(
transformer: TransformerLike
) -> List[str]:
"""Get the primitives used in this transformer or pipeline

The primitives are just the class names underlying the transformer or
pipeline.
"""
if isinstance(transformer, DelegatingRobustTransformer):
return [transformer._tname]
else:
_transformer = cast(TransformerPipeline, transformer)
return [
t._tname
for _, t in _transformer.steps
]
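
Together, the two helpers compose like this (a sketch, assuming scikit-learn transformers are accepted as `TransformerLike`; the classes shown come from scikit-learn, not this diff):

    from sklearn.impute import SimpleImputer
    from sklearn.preprocessing import StandardScaler

    robust = make_robust_transformer([SimpleImputer(), StandardScaler()])
    get_transformer_primitives(robust)  # ['SimpleImputer', 'StandardScaler']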
4 changes: 3 additions & 1 deletion ballet/validation/feature_acceptance/validator.py
@@ -169,6 +169,7 @@ class MutualInformationAccepter(FeatureAccepter):
fail validation if NaN-valued targets are discovered or to drop
those rows in calculation of the mutual information score
"""

def __init__(self, *args, threshold=0.05, handle_nan_targets='fail'):
super().__init__(*args)
self.threshold = threshold
@@ -236,7 +237,8 @@ class CompoundAccepter(FeatureAccepter):
agg: one of ``'all'`` or ``'any'``; whether to accept if all
underlying accepters accept or if any accepter accepts.
specs: list of dicts of accepter specs
""" # noqa
""" # noqa

def __init__(self, *args, agg='all', specs: List[dict] = []):
super().__init__(*args)
self._agg = agg
24 changes: 24 additions & 0 deletions tests/conftest.py
@@ -5,6 +5,8 @@
from unittest.mock import patch

import git
import numpy as np
import pandas as pd
import pytest
import responses as _responses

@@ -97,3 +99,25 @@ def project_template_copy(tempdir):
def responses():
with _responses.RequestsMock() as rsps:
yield rsps


# assumes `from typing import NamedTuple` appears in the imports above (not shown in this hunk)
class SampleData(NamedTuple):
df: pd.DataFrame
X: pd.DataFrame
y: pd.DataFrame


@pytest.fixture
def sample_data():
df = pd.DataFrame(
data={
'country': ['USA', 'USA', 'Canada', 'Japan'],
'year': [2001, 2002, 2001, 2002],
'size': [np.nan, -11, 12, 0.0],
'strength': [18, 110, np.nan, 101],
'happy': [False, True, False, False]
}
).set_index(['country', 'year'])
X = df[['size', 'strength']]
y = df[['happy']]
return SampleData(df, X, y)
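
A test consuming this fixture might look like the following (a sketch):

    def test_sample_data_shapes(sample_data):
        assert sample_data.df.shape == (4, 3)
        assert sample_data.X.shape == (4, 2)
        assert sample_data.y.shape == (4, 1)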
16 changes: 16 additions & 0 deletions tests/test_client.py
@@ -0,0 +1,16 @@
import pytest


@pytest.mark.xfail
def test_validate_feature_api():
raise NotImplementedError


@pytest.mark.xfail
def test_validate_feature_acceptance():
raise NotImplementedError


@pytest.mark.xfail
def test_discover():
    raise NotImplementedError
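
The `test_discover` placeholder above might eventually be filled in along these lines (a sketch; it assumes a plain callable is accepted as a transformer, per `_replace_callable_or_none_with_transformer`, and reuses the `sample_data` fixture added in this PR):

    from ballet.discovery import discover
    from ballet.feature import Feature

    def test_discover(sample_data):
        feature = Feature(
            input='strength',
            transformer=lambda df: df.fillna(0),
            source='test.features.contrib.user_test.feature_strength',
        )
        df = discover(
            [feature], sample_data.X, sample_data.y, sample_data.y.values)
        assert len(df) == 1
        assert df.loc[0, 'author'] == 'test'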