Add discover features method to client #80

Merged · 15 commits · May 22, 2021
12 changes: 12 additions & 0 deletions ballet/client.py
@@ -5,6 +5,7 @@
import pandas as pd
from funcy import cached_property

from ballet.discovery import discover as _discover
from ballet.feature import Feature
from ballet.project import FeatureEngineeringProject, Project
from ballet.validation.common import subsample_data_for_validation
@@ -96,5 +97,16 @@ def validate_feature_acceptance(
accepter_class, feature, result.features, result.X_df,
result.y_df, result.X_df, result.y, False)

def discover(self, input=None, primitive=None) -> pd.DataFrame:
features = self.api.features
X_df, y_df = self.api.load_data()
encoder = self.api.encoder
y = encoder.fit_transform(y_df)

return _discover(
features, X_df, y_df, y, input=input, primitive=primitive)

    discover.__doc__ = _discover.__doc__  # reuse the standalone discover docstring on the client method


b = Client()
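
With this method in place, discovery from the interactive client looks like the following (a sketch; 'size' and 'SimpleImputer' are hypothetical input and primitive names, not from this diff):

    from ballet import b

    features_df = b.discover()             # summarize all existing features
    b.discover(input='size')               # only features that use the 'size' input
    b.discover(primitive='SimpleImputer')  # only features with a SimpleImputer step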
133 changes: 133 additions & 0 deletions ballet/discovery.py
@@ -0,0 +1,133 @@
import funcy as fy
import numpy as np
import pandas as pd

from ballet.transformer import get_transformer_primitives
from ballet.util import asarray2d
from ballet.validation.entropy import (
estimate_conditional_information, estimate_mutual_information,)


def countunique(z, axis=0):
return np.apply_along_axis(
lambda arr: len(np.unique(arr)), axis, z)
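
# For example, on a small 2-column array (sketch):
#
#     z = np.array([[1, 10],
#                   [1, 20],
#                   [2, 20]])
#     countunique(z)  # -> array([2, 2]): unique values per column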


# summarize each Feature object at most once, keyed on its identity
@fy.memoize(key_func=lambda feature, values, y: id(feature))
def _summarize_feature(feature, values, y) -> dict:
z = values[feature]

    # gather the values of every *other* feature: the conditioning set used
    # for conditional mutual information
    feature_values_list = [
        feature_values
        for other_feature, feature_values in values.items()
        if other_feature is not feature
    ]
    if feature_values_list:
        x = np.concatenate(feature_values_list, axis=1)
    else:
        # no other features to condition on
        x = np.empty((z.shape[0], 0))

mutual_information = estimate_mutual_information(z, y)
conditional_mutual_information = \
estimate_conditional_information(z, y, x)
mean = np.mean(z, axis=0)
std = np.std(z, axis=0)
variance = np.var(z, axis=0)
nunique = countunique(z, axis=0)
return {
'name': feature.name,
'description': feature.description,
'input':
[feature.input]
if isinstance(feature.input, str)
else feature.input,
'transformer': feature.transformer,
'primitives': get_transformer_primitives(feature.transformer),
'output': feature.output,
'author': feature.author,
'source': feature.source,
'mutual_information': mutual_information,
'conditional_mutual_information':
conditional_mutual_information,
        'mean': np.mean(mean),  # equivalent to the mean over the flattened array
'std': np.mean(std),
'variance': np.mean(variance),
'nunique': np.mean(nunique),
}
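
# To make the averaging concrete (sketch): for a feature extracting two
# feature values,
#
#     z = np.array([[1.0, 10.0],
#                   [2.0, 20.0],
#                   [3.0, 30.0]])
#     np.mean(z, axis=0)           # array([ 2., 20.]): one mean per feature value
#     np.mean(np.mean(z, axis=0))  # 11.0: the single 'mean' reported above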


def discover(
features, X_df, y_df, y, input=None, primitive=None
) -> pd.DataFrame:
"""Discover existing features

Display information about existing features including summary statistics.
If the feature extracts multiple feature values, then the summary
statistics (e.g. mean, std, nunique) are computed for each feature value
and then averaged.

The following information is shown:
- name: the name of the feature
- description: the description of the feature
- input: the variables that are used as input to the feature
- transformer: the transformer/transformer pipeline
- output: the output columns of the feature (not usually specified)
- author: the GitHub username of the feature's author
- source: the fully-qualified name of the Python module that contains the
feature
- mutual_information: estimated mutual information between the feature (or
averaged over feature values) and the target on the development
dataset split
- conditional_mutual_information: estimated conditional mutual information
between the feature (or averaged over feature values) and the target
conditional on all other features on the development dataset split
- mean: mean of the feature on the development dataset split
- std: standard deviation of the feature (or averaged over feature values)
on the development dataset split
- variance: variance of the feature (or averaged over feature values) on the
development dataset split
- nunique: number of unique values of the feature (or averaged over
feature values) on the development dataset split

The following query operators are supported:
- input (str): filter to only features that have ``input`` as their single
  input or in their list of inputs
- primitive (str): filter to only features that use primitive
``primitive`` (i.e. a class with name ``primitive``) in the
transformer/transformer pipeline

For other queries, you should just use normal DataFrame indexing::

>>> features_df[features_df['author'] == 'jane']
>>> features_df[features_df['name'].str.contains('married')]
>>> features_df[features_df['mutual_information'] > 0.05]
>>> features_df[features_df['input'].apply(
...     lambda input: 'A' in input and 'B' in input)]

Returns:
data frame with features on the row index and columns as described
above
"""
y = asarray2d(y)
records = []
values = {
feature: asarray2d(
feature
.as_feature_engineering_pipeline()
.fit_transform(X_df, y_df)
)
for feature in features
}
    for feature in features:
        # input filter: match against either a single str input or a list of inputs
        if input and input not in feature.input and input != feature.input:
            continue
        # primitive filter: match against the class names in the transformer pipeline
        if (
            primitive
            and primitive not in get_transformer_primitives(
                feature.transformer)
        ):
            continue
summary = _summarize_feature(feature, values, y)
records.append(summary)

return pd.DataFrame.from_records(records)
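
For example, the two query operators behave as follows (a sketch; the feature and primitive names are hypothetical):

    # feature_a.input == ['size', 'strength']; feature_b.input == 'size'
    discover(features, X_df, y_df, y, input='size')        # keeps feature_a and feature_b
    discover(features, X_df, y_df, y, primitive='Imputer') # keeps only features whose
                                                           # transformer contains an 'Imputer' step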
24 changes: 21 additions & 3 deletions ballet/feature.py
@@ -5,7 +5,7 @@

import ballet.pipeline
from ballet.transformer import RobustTransformer, make_robust_transformer
-from ballet.util.typing import OneOrMore, Pathy, TransformerLike
+from ballet.util.typing import OneOrMore, TransformerLike

__all__ = ('Feature', )

@@ -34,7 +34,7 @@ class Feature:
description: description of the feature
output: ordered sequence of names of features
produced by this transformer
-        source: the source file in which this feature was defined
+        source: the module in which this feature was defined
options: options
"""

@@ -45,7 +45,7 @@ def __init__(
name: Optional[str] = None,
description: Optional[str] = None,
output: OneOrMore[str] = None,
-        source: Pathy = None,
+        source: Optional[str] = None,
options: dict = None
):
self.input = input
@@ -84,3 +84,21 @@ def as_feature_engineering_pipeline(
) -> ballet.pipeline.FeatureEngineeringPipeline:
"""Return standalone FeatureEngineeringPipeline with this feature"""
return ballet.pipeline.FeatureEngineeringPipeline(self)

@property
def author(self) -> Optional[str]:
"""The author of this feature if it can be inferred from its source

The author can be inferred if the module the feature was defined in
follows the pattern
``package.subpackage.user_username.feature_featurename``. Otherwise,
returns ``None``.
"""
if self.source:
pieces = self.source.rsplit('.', maxsplit=2)
if len(pieces) >= 2:
user_str = pieces[-2]
if user_str.startswith('user_'):
return user_str[len('user_'):]

return None
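
For example (a sketch; the module path is hypothetical, and passing `transformer=None` is assumed to be accepted, as `_replace_callable_or_none_with_transformer` in this diff suggests):

    feature = Feature(
        input='size',
        transformer=None,
        source='myproject.features.contrib.user_jane.feature_size_log',
    )
    assert feature.author == 'jane'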
(project template notebook)
@@ -39,6 +39,7 @@
"metadata": {},
"outputs": [],
"source": [
"# automagical client\n",
"from ballet import b"
]
},
@@ -80,7 +81,9 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"## Explore existing features"
"## Explore existing features\n",
"\n",
"To discover existing features, you can use the `discover` method of the `b` client, which returns information and summary statistics about each existing feature in a data frame."
]
},
{
@@ -89,16 +92,24 @@
"metadata": {},
"outputs": [],
"source": [
"result = b.api.engineer_features(X_df, y_df)\n",
"X_train, y_train = result.X, result.y"
"b.discover()"
]
},
{
"source": [
"You can also use the `engineer_features` method of {{ cookiecutter.package_slug }}'s API, which is also exposed by `b`. The resulting object is a named tuple that allows you to access the transformed development dataset (feature matrix `X` and target `y`), the feature engineering pipeline (`pipeline`), the target encoder (`encoder`), and the set of existing features (`features`)."
],
"cell_type": "markdown",
"metadata": {}
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"result = b.api.engineer_features(X_df, y_df)\n",
"X_train, y_train = result.X, result.y\n",
"print('Number of existing features: ', len(result.features))\n",
"print('Number of columns in feature matrix: ', X_train.shape[1])"
]
@@ -230,4 +241,4 @@
},
"nbformat": 4,
"nbformat_minor": 4
}
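
In brief, the fields of that named tuple (a sketch following the markdown cell above):

    result = b.api.engineer_features(X_df, y_df)
    result.X, result.y   # transformed development dataset
    result.pipeline      # the feature engineering pipeline
    result.encoder       # the target encoder
    result.features      # the set of existing features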
24 changes: 24 additions & 0 deletions ballet/transformer.py
@@ -25,6 +25,12 @@
def make_robust_transformer(
transformer: OneOrMore[TransformerLike]
) -> RobustTransformer:
"""Convert to robust transformer or pipeline

Convert to either a single DelegatingRobustTransformer or a
TransformerPipeline where each transformer in the pipeline is a
DelegatingRobustTransformer.
"""
if is_seqcont(transformer):
transformer = cast(Collection[TransformerLike], transformer)
transformers = list(
@@ -304,3 +310,21 @@ def _replace_callable_or_none_with_transformer(
else:
transformer = cast(BaseTransformer, transformer)
return transformer


def get_transformer_primitives(
transformer: TransformerLike
) -> List[str]:
"""Get the primitives used in this transformer or pipeline

The primitives are just the class names underlying the transformer or
pipeline.
"""
if isinstance(transformer, DelegatingRobustTransformer):
return [transformer._tname]
else:
_transformer = cast(TransformerPipeline, transformer)
return [
t._tname
for _, t in _transformer.steps
]
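
Together, the two helpers compose like this (a sketch, assuming scikit-learn transformers are accepted as `TransformerLike`; the classes shown come from scikit-learn, not this diff):

    from sklearn.impute import SimpleImputer
    from sklearn.preprocessing import StandardScaler

    robust = make_robust_transformer([SimpleImputer(), StandardScaler()])
    get_transformer_primitives(robust)  # ['SimpleImputer', 'StandardScaler']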
4 changes: 3 additions & 1 deletion ballet/validation/feature_acceptance/validator.py
@@ -169,6 +169,7 @@ class MutualInformationAccepter(FeatureAccepter):
fail validation if NaN-valued targets are discovered or to drop
those rows in calculation of the mutual information score
"""

def __init__(self, *args, threshold=0.05, handle_nan_targets='fail'):
super().__init__(*args)
self.threshold = threshold
@@ -236,7 +237,8 @@ class CompoundAccepter(FeatureAccepter):
agg: one of ``'all'`` or ``'any'``; whether to accept if all
underlying accepters accept or if any accepter accepts.
specs: list of dicts of accepter specs
""" # noqa
""" # noqa

def __init__(self, *args, agg='all', specs: List[dict] = []):
super().__init__(*args)
self._agg = agg
24 changes: 24 additions & 0 deletions tests/conftest.py
@@ -5,6 +5,8 @@
from unittest.mock import patch

import git
import numpy as np
import pandas as pd
import pytest
import responses as _responses

@@ -97,3 +99,25 @@ def project_template_copy(tempdir):
def responses():
with _responses.RequestsMock() as rsps:
yield rsps


# assumes `from typing import NamedTuple` appears in the imports above (not shown in this hunk)
class SampleData(NamedTuple):
df: pd.DataFrame
X: pd.DataFrame
y: pd.DataFrame


@pytest.fixture
def sample_data():
df = pd.DataFrame(
data={
'country': ['USA', 'USA', 'Canada', 'Japan'],
'year': [2001, 2002, 2001, 2002],
'size': [np.nan, -11, 12, 0.0],
'strength': [18, 110, np.nan, 101],
'happy': [False, True, False, False]
}
).set_index(['country', 'year'])
X = df[['size', 'strength']]
y = df[['happy']]
return SampleData(df, X, y)
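
A test consuming this fixture might look like the following (a sketch):

    def test_sample_data_shapes(sample_data):
        assert sample_data.df.shape == (4, 3)
        assert sample_data.X.shape == (4, 2)
        assert sample_data.y.shape == (4, 1)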
16 changes: 16 additions & 0 deletions tests/test_client.py
@@ -0,0 +1,16 @@
import pytest


@pytest.mark.xfail
def test_validate_feature_api():
raise NotImplementedError


@pytest.mark.xfail
def test_validate_feature_acceptance():
raise NotImplementedError


@pytest.mark.xfail
def test_discover():
    raise NotImplementedError
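
The `test_discover` placeholder above might eventually be filled in along these lines (a sketch; it assumes a plain callable is accepted as a transformer, per `_replace_callable_or_none_with_transformer`, and reuses the `sample_data` fixture added in this PR):

    from ballet.discovery import discover
    from ballet.feature import Feature

    def test_discover(sample_data):
        feature = Feature(
            input='strength',
            transformer=lambda df: df.fillna(0),
            source='test.features.contrib.user_test.feature_strength',
        )
        df = discover(
            [feature], sample_data.X, sample_data.y, sample_data.y.values)
        assert len(df) == 1
        assert df.loc[0, 'author'] == 'test'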