Skip to content

Commit

Permalink
Adds util functions needed for dynamic preprocessing pipelines (#852)
Browse files Browse the repository at this point in the history
* first util

* add util fxn init

* add test

* changelog

* test fix for all_estimators and minimal dependencies

* add comment

* use static

* cleanup
  • Loading branch information
angela97lin committed Jun 18, 2020
1 parent d220dca commit 2a56753
Show file tree
Hide file tree
Showing 6 changed files with 336 additions and 7 deletions.
2 changes: 2 additions & 0 deletions docs/source/api_reference.rst
Expand Up @@ -134,6 +134,8 @@ Pipeline Utils

all_pipelines
get_pipelines
get_estimators
make_pipeline
list_model_families


Expand Down
1 change: 1 addition & 0 deletions docs/source/changelog.rst
Expand Up @@ -14,6 +14,7 @@ Changelog
* Added preprocessing component to handle DateTime columns featurization :pr:`838`
* Added ability to clone pipelines and components :pr:`842`
* Define getter method for component `parameters` :pr:`847`
* Added new utility functions necessary for generating dynamic preprocessing pipelines :pr:`852`
* Added kwargs to all components :pr:`863`
* Fixes
* Fixed bug where SimpleImputer cannot handle dropped columns :pr:`846`
Expand Down
5 changes: 4 additions & 1 deletion evalml/pipelines/__init__.py
Expand Up @@ -62,7 +62,10 @@
from .utils import (
all_pipelines,
get_pipelines,
list_model_families
list_model_families,
all_estimators,
get_estimators,
make_pipeline
)
from .graph_utils import (
precision_recall_curve,
Expand Down
163 changes: 161 additions & 2 deletions evalml/pipelines/utils.py
@@ -1,3 +1,7 @@
import numpy as np
import pandas as pd

from .binary_classification_pipeline import BinaryClassificationPipeline
from .classification import (
CatBoostBinaryClassificationPipeline,
CatBoostMulticlassClassificationPipeline,
Expand All @@ -8,16 +12,35 @@
XGBoostBinaryPipeline,
XGBoostMulticlassPipeline
)
from .multiclass_classification_pipeline import (
MulticlassClassificationPipeline
)
from .regression import (
CatBoostRegressionPipeline,
LinearRegressionPipeline,
RFRegressionPipeline,
XGBoostRegressionPipeline
)
from .regression_pipeline import RegressionPipeline

from evalml.exceptions import MissingComponentError
from evalml.model_family import handle_model_family
from evalml.problem_types import handle_problem_types
from evalml.pipelines.components import (
CatBoostClassifier,
CatBoostRegressor,
DateTimeFeaturization,
DropNullColumns,
LinearRegressor,
LogisticRegressionClassifier,
OneHotEncoder,
RandomForestClassifier,
RandomForestRegressor,
SimpleImputer,
StandardScaler,
XGBoostClassifier,
XGBoostRegressor
)
from evalml.problem_types import ProblemTypes, handle_problem_types
from evalml.utils import get_logger

logger = get_logger(__file__)
Expand All @@ -36,6 +59,16 @@
XGBoostRegressionPipeline]


# Candidate estimator classes considered by all_estimators(). Entries whose
# optional dependencies are missing are filtered out at call time (see
# all_estimators below), so this list may include estimators that cannot
# actually be instantiated in the current environment.
_ALL_ESTIMATORS = [CatBoostClassifier,
                   CatBoostRegressor,
                   LinearRegressor,
                   LogisticRegressionClassifier,
                   RandomForestClassifier,
                   RandomForestRegressor,
                   XGBoostClassifier,
                   XGBoostRegressor]


def all_pipelines():
"""Returns a complete list of all supported pipeline classes.
Expand All @@ -59,6 +92,8 @@ def get_pipelines(problem_type, model_families=None):
Can also optionally filter by a list of model types.
Arguments:
problem_type (ProblemTypes or str): problem type to filter for
model_families (list[ModelFamily] or list[str]): model families to filter for
Returns:
list[PipelineBase]: a list of pipeline classes
Expand Down Expand Up @@ -95,7 +130,7 @@ def get_pipelines(problem_type, model_families=None):
def list_model_families(problem_type):
"""List model type for a particular problem type
Args:
Arguments:
problem_types (ProblemTypes or str): binary, multiclass, or regression
Returns:
Expand All @@ -109,3 +144,127 @@ def list_model_families(problem_type):
problem_pipelines.append(p)

return list(set([p.model_family for p in problem_pipelines]))


def all_estimators():
    """Return every supported estimator class that can be used in this environment.

    Each candidate in _ALL_ESTIMATORS is instantiated once as a dependency
    check; candidates whose optional dependencies are missing are logged at
    debug level and excluded from the result.

    Returns:
        list[Estimator]: a list of estimator classes
    """
    available = []
    for candidate in _ALL_ESTIMATORS:
        try:
            candidate()
        except (MissingComponentError, ImportError):
            logger.debug('Estimator {} failed import, withholding from all_estimators'.format(candidate.name))
            continue
        available.append(candidate)
    return available


def get_estimators(problem_type, model_families=None):
    """Return the estimator classes allowed for a particular problem type.

    Optionally restricts the result to a list of model families.

    Arguments:
        problem_type (ProblemTypes or str): problem type to filter for
        model_families (list[ModelFamily] or list[str]): model families to filter for

    Returns:
        list[class]: a list of estimator subclasses

    Raises:
        TypeError: if model_families is provided but is not a list
        RuntimeError: if a requested model family is not valid for problem_type
    """
    if model_families is not None and not isinstance(model_families, list):
        raise TypeError("model_families parameter is not a list.")
    problem_type = handle_problem_types(problem_type)
    if model_families is None:
        model_families = list_model_families(problem_type)

    model_families = [handle_model_family(family) for family in model_families]
    valid_families = list_model_families(problem_type)
    for family in model_families:
        if family not in valid_families:
            raise RuntimeError("Unrecognized model type for problem type %s: %s" % (problem_type, family))

    def _matches(estimator_class):
        # Keep an estimator only if it supports this problem type and falls
        # into one of the requested model families.
        supported = [handle_problem_types(pt) for pt in estimator_class.supported_problem_types]
        return problem_type in supported and estimator_class.model_family in model_families

    return [estimator_class for estimator_class in all_estimators() if _matches(estimator_class)]


def _get_preprocessing_components(X, y, problem_type, estimator_class):
    """Build the recommended preprocessing chain for the given data and estimator.

    Arguments:
        X (pd.DataFrame): the input data of shape [n_samples, n_features]
        y (pd.Series): the target labels of length [n_samples]
        problem_type (ProblemTypes or str): problem type
        estimator_class (class): a subclass of Estimator to pair the preprocessing with

    Returns:
        list[Transformer]: a list of applicable preprocessing components to use with the estimator
    """
    if not isinstance(X, pd.DataFrame):
        X = pd.DataFrame(X)
    components = []

    # Entirely-null columns carry no information; drop them before imputation.
    null_cols = X.columns[X.isnull().all()]
    if len(null_cols) > 0:
        components.append(DropNullColumns)
        X = X.drop(null_cols, axis=1)
    components.append(SimpleImputer)

    has_datetime = len(X.select_dtypes(include=[np.datetime64]).columns) > 0
    if has_datetime:
        components.append(DateTimeFeaturization)

    # DateTimeFeaturization can create categorical columns, so encode whenever
    # it is present or the data already has categoricals -- except for CatBoost
    # estimators, which consume categorical features natively.
    has_categorical = len(X.select_dtypes(include=['category', 'object']).columns) > 0
    if (has_datetime or has_categorical) and estimator_class not in {CatBoostClassifier, CatBoostRegressor}:
        components.append(OneHotEncoder)

    # Linear models benefit from standardized features.
    if estimator_class in {LinearRegressor, LogisticRegressionClassifier}:
        components.append(StandardScaler)
    return components


def make_pipeline(X, y, estimator, problem_type):
    """Generate a pipeline class with a data-driven preprocessing chain.

    Given input data, target data, an estimator class and the problem type,
    returns a new pipeline class pairing the recommended preprocessing
    components with the estimator. The returned class subclasses the pipeline
    base class appropriate for problem_type.

    Arguments:
        X (pd.DataFrame): the input data of shape [n_samples, n_features]
        y (pd.Series): the target labels of length [n_samples]
        estimator (Estimator): estimator for pipeline
        problem_type (ProblemTypes or str): problem type for pipeline to generate

    Returns:
        class: PipelineBase subclass with dynamically generated preprocessing components and specified estimator

    Raises:
        ValueError: if the estimator is not valid for the given problem type
    """
    problem_type = handle_problem_types(problem_type)
    if estimator not in get_estimators(problem_type):
        raise ValueError(f"{estimator.name} is not a valid estimator for problem type")
    full_graph = _get_preprocessing_components(X, y, problem_type, estimator) + [estimator]

    base_class = {
        ProblemTypes.BINARY: BinaryClassificationPipeline,
        ProblemTypes.MULTICLASS: MulticlassClassificationPipeline,
        ProblemTypes.REGRESSION: RegressionPipeline,
    }.get(problem_type)

    class GeneratedPipeline(base_class):
        component_graph = full_graph
    return GeneratedPipeline
6 changes: 3 additions & 3 deletions evalml/tests/component_tests/test_components.py
Expand Up @@ -30,7 +30,7 @@
)


@pytest.fixture
@pytest.fixture(scope="module")
def test_classes():
class MockComponent(ComponentBase):
name = "Mock Component"
Expand Down Expand Up @@ -140,12 +140,12 @@ class MockComponentModelFamily(ComponentBase):
with pytest.raises(TypeError):
MockComponentModelFamily()

class MockEstimator(Estimator):
class MockEstimatorWithoutAttribute(Estimator):
name = "Mock Estimator"
model_family = ModelFamily.LINEAR_MODEL

with pytest.raises(TypeError):
MockEstimator()
MockEstimatorWithoutAttribute()


def test_missing_methods_on_components(X_y, test_classes):
Expand Down

0 comments on commit 2a56753

Please sign in to comment.