Skip to content

Commit

Permalink
Adds util functions needed for dynamic preprocessing pipelines (#852)
Browse files Browse the repository at this point in the history
* first util

* add util fxn init

* add test

* changelog

* test fix for all_estimators and minimal dependencies

* add comment

* use static

* cleanup
  • Loading branch information
angela97lin committed Jun 18, 2020
1 parent d220dca commit 2a56753
Show file tree
Hide file tree
Showing 6 changed files with 336 additions and 7 deletions.
2 changes: 2 additions & 0 deletions docs/source/api_reference.rst
Expand Up @@ -134,6 +134,8 @@ Pipeline Utils

all_pipelines
get_pipelines
get_estimators
make_pipeline
list_model_families


Expand Down
1 change: 1 addition & 0 deletions docs/source/changelog.rst
Expand Up @@ -14,6 +14,7 @@ Changelog
* Added preprocessing component to handle DateTime columns featurization :pr:`838`
* Added ability to clone pipelines and components :pr:`842`
* Define getter method for component `parameters` :pr:`847`
* Added new utility functions necessary for generating dynamic preprocessing pipelines :pr:`852`
* Added kwargs to all components :pr:`863`
* Fixes
* Fixed bug where SimpleImputer cannot handle dropped columns :pr:`846`
Expand Down
5 changes: 4 additions & 1 deletion evalml/pipelines/__init__.py
Expand Up @@ -62,7 +62,10 @@
from .utils import (
all_pipelines,
get_pipelines,
list_model_families
list_model_families,
all_estimators,
get_estimators,
make_pipeline
)
from .graph_utils import (
precision_recall_curve,
Expand Down
163 changes: 161 additions & 2 deletions evalml/pipelines/utils.py
@@ -1,3 +1,7 @@
import numpy as np
import pandas as pd

from .binary_classification_pipeline import BinaryClassificationPipeline
from .classification import (
CatBoostBinaryClassificationPipeline,
CatBoostMulticlassClassificationPipeline,
Expand All @@ -8,16 +12,35 @@
XGBoostBinaryPipeline,
XGBoostMulticlassPipeline
)
from .multiclass_classification_pipeline import (
MulticlassClassificationPipeline
)
from .regression import (
CatBoostRegressionPipeline,
LinearRegressionPipeline,
RFRegressionPipeline,
XGBoostRegressionPipeline
)
from .regression_pipeline import RegressionPipeline

from evalml.exceptions import MissingComponentError
from evalml.model_family import handle_model_family
from evalml.problem_types import handle_problem_types
from evalml.pipelines.components import (
CatBoostClassifier,
CatBoostRegressor,
DateTimeFeaturization,
DropNullColumns,
LinearRegressor,
LogisticRegressionClassifier,
OneHotEncoder,
RandomForestClassifier,
RandomForestRegressor,
SimpleImputer,
StandardScaler,
XGBoostClassifier,
XGBoostRegressor
)
from evalml.problem_types import ProblemTypes, handle_problem_types
from evalml.utils import get_logger

logger = get_logger(__file__)
Expand All @@ -36,6 +59,16 @@
XGBoostRegressionPipeline]


# Candidate estimator classes considered by all_estimators(). Entries whose
# optional dependencies are missing are filtered out at call time (see
# all_estimators below), so this list may include estimators that cannot
# actually be instantiated in the current environment.
_ALL_ESTIMATORS = [CatBoostClassifier,
                   CatBoostRegressor,
                   LinearRegressor,
                   LogisticRegressionClassifier,
                   RandomForestClassifier,
                   RandomForestRegressor,
                   XGBoostClassifier,
                   XGBoostRegressor]


def all_pipelines():
"""Returns a complete list of all supported pipeline classes.
Expand All @@ -59,6 +92,8 @@ def get_pipelines(problem_type, model_families=None):
Can also optionally filter by a list of model types.
Arguments:
problem_type (ProblemTypes or str): problem type to filter for
model_families (list[ModelFamily] or list[str]): model families to filter for
Returns:
list[PipelineBase]: a list of pipeline classes
Expand Down Expand Up @@ -95,7 +130,7 @@ def get_pipelines(problem_type, model_families=None):
def list_model_families(problem_type):
"""List model type for a particular problem type
Args:
Arguments:
problem_types (ProblemTypes or str): binary, multiclass, or regression
Returns:
Expand All @@ -109,3 +144,127 @@ def list_model_families(problem_type):
problem_pipelines.append(p)

return list(set([p.model_family for p in problem_pipelines]))


def all_estimators():
    """Return every supported estimator class that can be used in this environment.

    Each candidate in _ALL_ESTIMATORS is instantiated once as a dependency
    check; candidates whose optional dependencies are missing are logged at
    debug level and excluded from the result.

    Returns:
        list[Estimator]: a list of estimator classes
    """
    available = []
    for candidate in _ALL_ESTIMATORS:
        try:
            candidate()
        except (MissingComponentError, ImportError):
            logger.debug('Estimator {} failed import, withholding from all_estimators'.format(candidate.name))
            continue
        available.append(candidate)
    return available


def get_estimators(problem_type, model_families=None):
    """Return the estimator classes allowed for a particular problem type.

    Optionally restricts the result to a list of model families.

    Arguments:
        problem_type (ProblemTypes or str): problem type to filter for
        model_families (list[ModelFamily] or list[str]): model families to filter for

    Returns:
        list[class]: a list of estimator subclasses

    Raises:
        TypeError: if model_families is provided but is not a list
        RuntimeError: if a requested model family is not valid for problem_type
    """
    if model_families is not None and not isinstance(model_families, list):
        raise TypeError("model_families parameter is not a list.")
    problem_type = handle_problem_types(problem_type)
    if model_families is None:
        model_families = list_model_families(problem_type)

    model_families = [handle_model_family(family) for family in model_families]
    valid_families = list_model_families(problem_type)
    for family in model_families:
        if family not in valid_families:
            raise RuntimeError("Unrecognized model type for problem type %s: %s" % (problem_type, family))

    def _matches(estimator_class):
        # Keep an estimator only if it supports this problem type and falls
        # into one of the requested model families.
        supported = [handle_problem_types(pt) for pt in estimator_class.supported_problem_types]
        return problem_type in supported and estimator_class.model_family in model_families

    return [estimator_class for estimator_class in all_estimators() if _matches(estimator_class)]


def _get_preprocessing_components(X, y, problem_type, estimator_class):
    """Build the recommended preprocessing chain for the given data and estimator.

    Arguments:
        X (pd.DataFrame): the input data of shape [n_samples, n_features]
        y (pd.Series): the target labels of length [n_samples]
        problem_type (ProblemTypes or str): problem type
        estimator_class (class): a subclass of Estimator to pair the preprocessing with

    Returns:
        list[Transformer]: a list of applicable preprocessing components to use with the estimator
    """
    if not isinstance(X, pd.DataFrame):
        X = pd.DataFrame(X)
    components = []

    # Entirely-null columns carry no information; drop them before imputation.
    null_cols = X.columns[X.isnull().all()]
    if len(null_cols) > 0:
        components.append(DropNullColumns)
        X = X.drop(null_cols, axis=1)
    components.append(SimpleImputer)

    has_datetime = len(X.select_dtypes(include=[np.datetime64]).columns) > 0
    if has_datetime:
        components.append(DateTimeFeaturization)

    # DateTimeFeaturization can create categorical columns, so encode whenever
    # it is present or the data already has categoricals -- except for CatBoost
    # estimators, which consume categorical features natively.
    has_categorical = len(X.select_dtypes(include=['category', 'object']).columns) > 0
    if (has_datetime or has_categorical) and estimator_class not in {CatBoostClassifier, CatBoostRegressor}:
        components.append(OneHotEncoder)

    # Linear models benefit from standardized features.
    if estimator_class in {LinearRegressor, LogisticRegressionClassifier}:
        components.append(StandardScaler)
    return components


def make_pipeline(X, y, estimator, problem_type):
    """Generate a pipeline class with a data-driven preprocessing chain.

    Given input data, target data, an estimator class and the problem type,
    returns a new pipeline class pairing the recommended preprocessing
    components with the estimator. The returned class subclasses the pipeline
    base class appropriate for problem_type.

    Arguments:
        X (pd.DataFrame): the input data of shape [n_samples, n_features]
        y (pd.Series): the target labels of length [n_samples]
        estimator (Estimator): estimator for pipeline
        problem_type (ProblemTypes or str): problem type for pipeline to generate

    Returns:
        class: PipelineBase subclass with dynamically generated preprocessing components and specified estimator

    Raises:
        ValueError: if the estimator is not valid for the given problem type
    """
    problem_type = handle_problem_types(problem_type)
    if estimator not in get_estimators(problem_type):
        raise ValueError(f"{estimator.name} is not a valid estimator for problem type")
    full_graph = _get_preprocessing_components(X, y, problem_type, estimator) + [estimator]

    base_class = {
        ProblemTypes.BINARY: BinaryClassificationPipeline,
        ProblemTypes.MULTICLASS: MulticlassClassificationPipeline,
        ProblemTypes.REGRESSION: RegressionPipeline,
    }.get(problem_type)

    class GeneratedPipeline(base_class):
        component_graph = full_graph
    return GeneratedPipeline
6 changes: 3 additions & 3 deletions evalml/tests/component_tests/test_components.py
Expand Up @@ -30,7 +30,7 @@
)


@pytest.fixture
@pytest.fixture(scope="module")
def test_classes():
class MockComponent(ComponentBase):
name = "Mock Component"
Expand Down Expand Up @@ -140,12 +140,12 @@ class MockComponentModelFamily(ComponentBase):
with pytest.raises(TypeError):
MockComponentModelFamily()

class MockEstimator(Estimator):
class MockEstimatorWithoutAttribute(Estimator):
name = "Mock Estimator"
model_family = ModelFamily.LINEAR_MODEL

with pytest.raises(TypeError):
MockEstimator()
MockEstimatorWithoutAttribute()


def test_missing_methods_on_components(X_y, test_classes):
Expand Down

0 comments on commit 2a56753

Please sign in to comment.