Supported nested features and transformers that operate on column subsets #82

Merged
merged 9 commits into from
May 24, 2021
29 changes: 29 additions & 0 deletions ballet/eng/base.py
@@ -7,7 +7,9 @@
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import FunctionTransformer
from sklearn.utils.validation import check_is_fitted
from sklearn_pandas import DataFrameMapper

import ballet.transformer # avoid circular import
from ballet.exc import BalletError
from ballet.util import get_arr_desc
from ballet.util.typing import OneOrMore, TransformerLike
@@ -19,6 +21,7 @@
'GroupwiseTransformer',
'NoFitMixin',
'SimpleFunctionTransformer',
'SubsetTransformer',
)


@@ -320,3 +323,29 @@ def transform(self, X, **transform_args):
raise TypeError(
f'Couldn\'t apply transformer on features in '
f'{get_arr_desc(X)}.')


class SubsetTransformer(DataFrameMapper):
"""Transform a subset of columns with another transformer

Args:
input: column or columns of the raw data frame to select
transformer: transformer-like to apply to the selected columns
alias: optional name to use for the resulting output column(s)
"""

def __init__(self,
input: OneOrMore[str],
transformer: TransformerLike,
alias: Optional[str] = None):
self.input = input
self.transformer = transformer
self.alias = alias
super().__init__(
[(input,
ballet.transformer.desugar_transformer(transformer),
{'alias': alias})],
default=None,
input_df=True,
df_out=True,
)
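
As a usage sketch (not part of this changeset), the new class can be applied on its own; the toy DataFrame and column names below are assumptions for illustration, mirroring the added tests:

import pandas as pd

from ballet.eng import SubsetTransformer

# toy data for illustration only
df = pd.DataFrame({'size': [1.0, 2.0, 3.0], 'value': [10.0, 20.0, 30.0]})

# transform only the 'size' column; every other column passes through unchanged
t = SubsetTransformer('size', lambda x: x + 1)
out = t.fit_transform(df)  # 'size' is incremented, 'value' is untouched
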
22 changes: 22 additions & 0 deletions ballet/feature.py
@@ -102,3 +102,25 @@ def author(self) -> Optional[str]:
return user_str[len('user_'):]

return None

_pipeline = None

@property
def pipeline(self) -> ballet.pipeline.FeatureEngineeringPipeline:
"""A feature engineering pipeline containing just this feature"""
if self._pipeline is None:
self._pipeline = self.as_feature_engineering_pipeline()

return self._pipeline

def fit(self, X, y=None):
"""Fit feature.pipeline"""
return self.pipeline.fit(X, y=y)

def transform(self, X):
"""Transform data using feature.pipeline"""
return self.pipeline.transform(X)

def fit_transform(self, X, y=None):
"""Fit feature.pipeline and then transform data"""
return self.fit(X, y=y).transform(X)
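
With these delegating methods, a Feature can itself be used like a fit/transform object. A minimal sketch (the toy data and column names are assumptions, not part of the diff):

import pandas as pd

from ballet.eng.misc import IdentityTransformer
from ballet.feature import Feature

df = pd.DataFrame({'foo': [1, 2, 3], 'bar': [4, 5, 6]})  # toy data

feature = Feature('foo', IdentityTransformer())
values = feature.fit_transform(df)  # same as feature.pipeline.fit(df).transform(df)
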
18 changes: 14 additions & 4 deletions ballet/transformer.py
@@ -13,7 +13,7 @@
from sklearn.preprocessing import FunctionTransformer
from sklearn_pandas.pipeline import TransformerPipeline

from ballet.eng import BaseTransformer, IdentityTransformer
from ballet.eng import BaseTransformer, IdentityTransformer, SubsetTransformer
from ballet.exc import UnsuccessfulInputConversionError
from ballet.util import DeepcopyMixin, asarray2d, indent, quiet
from ballet.util.log import TRACE, logger
@@ -34,13 +34,13 @@ def make_robust_transformer(
if is_seqcont(transformer):
transformer = cast(Collection[TransformerLike], transformer)
transformers = list(
map(_replace_callable_or_none_with_transformer, transformer))
map(desugar_transformer, transformer))
for t in transformers:
_validate_transformer_api(t)
return make_robust_transformer_pipeline(transformers)
else:
transformer = cast(TransformerLike, transformer)
transformer = _replace_callable_or_none_with_transformer(transformer)
transformer = desugar_transformer(transformer)
_validate_transformer_api(transformer)
return DelegatingRobustTransformer(transformer)

@@ -300,13 +300,23 @@ def _validate_transformer_api(transformer: BaseTransformer):
f'Invalid signature for transformer.transform: {sig_transform}')


def _replace_callable_or_none_with_transformer(
def desugar_transformer(
transformer: TransformerLike,
) -> BaseTransformer:
"""Replace transformer syntactic sugar with actual transformer

The following syntactic sugar is supported:
- `None` is replaced with an IdentityTransformer
- a callable (function or lambda) is replaced with a FunctionTransformer
that wraps that callable
- a tuple (input, transformer) is replaced with a SubsetTransformer
"""
if transformer is None:
return IdentityTransformer()
elif callable(transformer) and not isinstance(transformer, type):
return FunctionTransformer(transformer)
elif isinstance(transformer, tuple):
return SubsetTransformer(*transformer)
else:
transformer = cast(BaseTransformer, transformer)
return transformer
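
A quick sketch of the sugar this function now accepts (mirroring the new test_robust_transformer_desugar below; the lambda and the column name 'A' are illustrative):

from ballet.eng.misc import IdentityTransformer
from ballet.transformer import desugar_transformer

desugar_transformer(None)                          # -> IdentityTransformer()
desugar_transformer(lambda x: x + 1)               # -> FunctionTransformer wrapping the lambda
desugar_transformer(('A', IdentityTransformer()))  # -> SubsetTransformer('A', IdentityTransformer())
desugar_transformer(IdentityTransformer())         # already a transformer: returned unchanged
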
30 changes: 13 additions & 17 deletions docs/feature_engineering_guide.rst
@@ -3,7 +3,7 @@ Feature Engineering Guide
=========================

Feature engineering is the process of transforming raw variables into
feature values that can be input to a learning algorithm. We include every step that is needed to go from the raw dataset to the learning algorithm: cleaning missing values and outliers, scaling values, deriving complex features from multiple variables, reducing dimensionality, encoding categorical and ordinal variables, and more.
feature values that can be input to a learning algorithm. This process includes every step needed to go from the raw dataset to the learning algorithm: cleaning missing values and outliers, scaling values, deriving complex features from multiple variables, reducing dimensionality, encoding categorical and ordinal variables, and more.

In Ballet, feature engineering is centered around creating feature definitions.
These are modular, flexible, and expressive and will allow us to compose an
@@ -24,24 +24,18 @@ columns and a transformer to apply on them.
Feature Definitions
-------------------

A feature definition (or simply "feature") is the semantics and implementation of code to extract feature values from raw data. It is a learned map from raw variables in one data instance to feature values.
A feature definition (or simply "feature") is the code to extract feature values from raw data, paired with meta-information about the transformation.

Less formally, a feature has
A feature can produce either a scalar feature value for each instance, or a vector of feature values (e.g. the embedding of a categorical variable).

- a meaning, like "column 1 after it has been cleaned using information from column 2"
- a code representation, like a Python object that takes as input rows of raw data and produces as output rows of feature values. It also has a separate stage to *learn* import parameters from the rows of training data before it can be applied to the training data or to unseen test data.

A feature can produce either

- a scalar feature value for each instance
- a vector of feature values, as in the case of the embedding of a categorical variable.

Each feature is "parameterized" by a dataset, usually the training dataset, indicating that it learns any information it uses, such as variable means and variances. This formalizes the separation between training and testing data to avoid any "leakage" of information during the feature engineering process.
.. Each feature is "parameterized" by a dataset, usually the training dataset, indicating that it learns any information it uses, such as variable means and variances. This formalizes the separation between training and testing data to avoid any "leakage" of information during the feature engineering process.

In Ballet, features are realized in Python as instances of :py:class:`~ballet.feature.Feature` with the following attributes:

- ``input``: the input to the feature, in terms of columns of the raw dataset.
- ``transformer``: the transformation applied to the raw data. The transformer is an object (or sequence of objects) that provide (or each provide) a fit/transform interface.
- ``input``: the input columns to the feature from the raw dataset.
- ``transformer``: the transformation applied to the selected columns. The transformer is an object (or sequence of objects) that provides (or that each provide) a fit/transform interface.
- ``name``: the name of the feature.
- ``description``: a longer human-readable description of the feature.
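
For example, a feature with these attributes might be declared as follows (a hypothetical illustration with a made-up column name, not one of the guide's fragments):

.. code-block:: python

   from ballet.feature import Feature

   feature = Feature(
       input='Lot Area',          # a made-up raw column name
       transformer=None,          # shorthand for the identity transformer
       name='Lot area',
       description='Lot size in square feet, passed through unchanged',
   )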

Why?
^^^^
@@ -53,10 +47,10 @@ hoops to use :py:class:`~ballet.feature.Feature` objects?
#. *Modularity.* Each feature stands alone and can be reasoned about,
validated, and implemented separately.

#. *Avoid leakage.* By writing all features as learned transformations (with
#. *Leakage.* By writing all features as learned transformations (with
separate fit and transform stages) and enforcing a train-test split, we
ensure that feature engineering code never sees test data before it applies
transformations on new instances.
transformations on new instances, helping better estimate generalization performance.

#. *Clearly declare inputs and outputs.* Each feature declares its own inputs
(and optionally outputs) and can operate on them only. Thus a feature can
@@ -147,7 +141,7 @@ Let's take a look at another example.
.. include:: fragments/feature-engineering-guide-third-feature.py
:code: python

The feature requests three inputs, which are various measures of square footage in the house (basement, first floor, and second floor). The combined transformer is a sequence of two "transformer-likes". The first transformer in is a function that will receive as its input a DataFrame with three columns, and it sums across rows (``axis=1``), returning a single column with the total square footage. The second transformer is a utility object that replaces missing values. In this case, neither transformer learns anything from data (i.e. it does not need to save parameters learned from the training data) so both can be simple functions. Here, the first function is implicitly converted into a :py:class:`~ballet.eng.sklearn.FunctionTransformer` and the second transformer is already a thin wrapper around ``pd.fillna``.
The feature requests three inputs, which are various measures of square footage in the house (basement, first floor, and second floor -- not shown in the sample dataset). The combined transformer is a sequence of two "transformer-like" steps. The first step is a function that receives as its input a DataFrame with three columns and sums across rows (``axis=1``), returning a single column with the total square footage. The second step is a utility object that replaces missing values. In this case, neither transformer learns anything from the data (i.e. neither needs to save parameters learned from the training data), so neither needs a separate fit stage. Here, the first step is implicitly converted into a :py:class:`~ballet.eng.sklearn.FunctionTransformer` and the second is already a thin wrapper around ``pd.fillna``.
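
A rough sketch of what such a feature could look like is below; this is not the actual fragment file, the column names are illustrative, and a plain function stands in for the missing-value utility to keep the sketch self-contained:

.. code-block:: python

   from ballet.feature import Feature

   feature = Feature(
       input=['Total Bsmt SF', '1st Flr SF', '2nd Flr SF'],  # illustrative names
       transformer=[
           lambda df: df.sum(axis=1),   # total square footage across the three columns
           lambda ser: ser.fillna(0),   # stand-in for the missing-value utility
       ],
       name='Total living area',
   )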

In this feature, the sum is equivalent to a weighted sum with the weights all equal to 1. But maybe you have the intuition that not all living area is created equal? You might apply custom weights as follows:

@@ -192,6 +186,8 @@ A *transformer-like* is any of the following:
- an object that satisfies the scikit-learn `Transformer API`_, having ``fit``, ``transform``, and ``fit_transform`` methods.
- a callable that accepts the ``X`` DataFrame as input and produces an array-like as output. This can be thought of as a transformer that does not have a fit stage. Ballet will take care of converting it into a :py:class:`~ballet.eng.sklearn.FunctionTransformer` object.
- the value ``None``, shorthand to indicate the identity transformer. Ballet will convert it into a :py:class:`~ballet.eng.IdentityTransformer` object.
- a tuple of ``(input, transformer)``. This allows nested transformations that operate on only a subset of the inputs that your feature is already working on. Both elements of the tuple are interpreted the same as if they were passed to the :py:class:`~ballet.feature.Feature` constructor. Internally, they will be converted to a :py:class:`~ballet.eng.base.SubsetTransformer`, which you can also use directly.
- another :py:class:`~ballet.feature.Feature` instance itself! This is another way to nest transformations: you can import a feature from another module and use it within your own transformer (see the sketch below).
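
A small sketch of both forms (the column names and the helper feature are illustrative):

.. code-block:: python

   from ballet.eng import IdentityTransformer
   from ballet.feature import Feature

   # a feature that could just as well be imported from another module
   helper = Feature(['foo', 'bar'], IdentityTransformer())

   feature = Feature(
       input=['foo', 'bar'],
       transformer=[
           ('foo', lambda ser: ser.fillna(0)),  # tuple sugar -> SubsetTransformer on 'foo' only
           helper,                              # a nested feature used as a step
       ],
   )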

Feature engineering pipelines
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
31 changes: 31 additions & 0 deletions tests/eng/test_base.py
@@ -305,3 +305,34 @@ def test_conditional_transformer_unsatisfy_transform(sample_data):
# value is transformed by unsatisfy condition and is not equal
assert_series_not_equal(result_tr['value'], X_tr['value'])
assert_series_not_equal(result_te['value'], X_te['value'])


def test_subset_transformer_identity(sample_data):
"""After passing through a column unchanged, the entire df is the same as before""" # noqa
X_tr, X_te = sample_data

t = ballet.eng.SubsetTransformer('value', None)
result_tr = t.fit_transform(X_tr)
result_te = t.transform(X_te)

assert_frame_equal(result_tr, X_tr)
assert_frame_equal(result_te, X_te)


def test_subset_transformer_mutate(sample_data):
"""After modifying one column, that column is different and the complement is the same""" # noqa
X_tr, X_te = sample_data

input = 'size'
t = ballet.eng.SubsetTransformer(input, lambda x: x+1)
result_tr = t.fit_transform(X_tr)
result_te = t.transform(X_te)

# the input col is modified
assert_series_not_equal(result_tr[input], X_tr[input])
assert_series_not_equal(result_te[input], X_te[input])

# the complement is passed through unchanged
complement = [col for col in X_tr.columns if col != input]
assert_frame_equal(result_tr[complement], X_tr[complement])
assert_frame_equal(result_te[complement], X_te[complement])
9 changes: 9 additions & 0 deletions tests/test_feature.py
@@ -70,3 +70,12 @@ def test_feature_as_feature_engineering_pipeline(inputs):
feature = Feature(input, transformer)
mapper = feature.as_feature_engineering_pipeline()
assert isinstance(mapper, FeatureEngineeringPipeline)


def test_feature_pipeline(inputs):
input, transformer = inputs
feature = Feature(input, transformer)
pipeline = feature.pipeline
assert isinstance(pipeline, FeatureEngineeringPipeline)
pipeline2 = feature.pipeline
assert pipeline is pipeline2
36 changes: 22 additions & 14 deletions tests/test_pipeline.py
@@ -7,22 +7,30 @@
from ballet.pipeline import FeatureEngineeringPipeline


@pytest.fixture
def inputs():
input = 'foo'
transformer = IdentityTransformer()
@pytest.fixture(
params=[
IdentityTransformer(),
[IdentityTransformer()],
[None, IdentityTransformer(), lambda x: x],
Feature(['foo', 'bar'], IdentityTransformer()),
[None, IdentityTransformer(),
Feature(['foo', 'bar'], IdentityTransformer())],
],
ids=[
'scalar',
'list of transformer',
'list of mixed',
'nested feature',
'list of mixed and nested features',
]
)
def inputs(request):
input = ['foo', 'bar']
transformer = request.param
return input, transformer


def test_init_seqcont(inputs):
input, transformer = inputs
feature = Feature(input, transformer)
features = [feature]
mapper = FeatureEngineeringPipeline(features)
assert isinstance(mapper, FeatureEngineeringPipeline)


def test_init_scalar(inputs):
def test_init(inputs):
input, transformer = inputs
feature = Feature(input, transformer)
mapper = FeatureEngineeringPipeline(feature)
@@ -46,4 +54,4 @@ def test_transform(inputs):
df.columns = ['foo', 'bar']
mapper.fit(df)
X = mapper.transform(df)
assert np.shape(X) == (5, 1)
assert np.shape(X) == (5, len(inputs))
16 changes: 16 additions & 0 deletions tests/test_transformer.py
@@ -3,9 +3,11 @@
import pandas as pd
import pytest
import sklearn.preprocessing
from sklearn_pandas.pipeline import TransformerPipeline

from ballet.compat import SimpleImputer
from ballet.eng.misc import IdentityTransformer
from ballet.feature import Feature
from ballet.transformer import (
DelegatingRobustTransformer, get_transformer_primitives,
make_robust_transformer,)
@@ -140,3 +142,17 @@ def test_get_transformer_primitives(transformer, expected):
robust_transformer = make_robust_transformer(transformer)
primitives = get_transformer_primitives(robust_transformer)
assert primitives == expected


def test_robust_transformer_desugar():
"""Should be able to "desugar" multiple things into a valid transformer pipeline""" # noqa
transformer = [
None,
IdentityTransformer(),
lambda x: x,
Feature('A', IdentityTransformer()),
('A', IdentityTransformer()),
('A', [None, IdentityTransformer()]),
]
robust_transformer = make_robust_transformer(transformer)
assert isinstance(robust_transformer, TransformerPipeline)