Skip to content

Commit

Permalink
Fix pipeline structure for time series pipelines
Browse files Browse the repository at this point in the history
  • Loading branch information
freddyaboulton committed Nov 15, 2021
1 parent b799981 commit 8c345a8
Show file tree
Hide file tree
Showing 2 changed files with 175 additions and 59 deletions.
190 changes: 145 additions & 45 deletions evalml/pipelines/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,78 +59,91 @@
logger = logging.getLogger(__name__)


def _get_preprocessing_components(
X, y, problem_type, estimator_class, sampler_name=None
):
"""Given input data, target data and an estimator class, construct a recommended preprocessing chain to be combined with the estimator and trained on the provided data.
Args:
X (pd.DataFrame): The input data of shape [n_samples, n_features].
y (pd.Series): The target data of length [n_samples].
problem_type (ProblemTypes or str): Problem type.
estimator_class (class): A class which subclasses Estimator estimator for pipeline.
sampler_name (str): The name of the sampler component to add to the pipeline. Defaults to None.
Returns:
list[Transformer]: A list of applicable preprocessing components to use with the estimator.
"""
pp_components = []

def _get_label_encoder(X, y, problem_type, estimator_class, sampler_name=None):
    """Return a LabelEncoder component if the problem type is classification.

    Args:
        X (pd.DataFrame): The input data of shape [n_samples, n_features]. Unused.
        y (pd.Series): The target data of length [n_samples]. Unused.
        problem_type (ProblemTypes or str): Problem type.
        estimator_class (class): A class which subclasses Estimator. Unused.
        sampler_name (str): The name of the sampler component. Unused. Defaults to None.

    Returns:
        list[Transformer]: [LabelEncoder] for classification problems, otherwise [].
    """
    # Removed stale `pp_components.append(...)` left over from the old
    # monolithic implementation; `pp_components` is undefined here.
    components = []
    if is_classification(problem_type):
        components.append(LabelEncoder)
    return components


def _get_drop_all_null(X, y, problem_type, estimator_class, sampler_name=None):
    """Return a DropNullColumns component if X contains any fully-null columns.

    Args:
        X (pd.DataFrame): The input data of shape [n_samples, n_features].
        y (pd.Series): The target data of length [n_samples]. Unused.
        problem_type (ProblemTypes or str): Problem type. Unused.
        estimator_class (class): A class which subclasses Estimator. Unused.
        sampler_name (str): The name of the sampler component. Unused. Defaults to None.

    Returns:
        list[Transformer]: [DropNullColumns] if any column is entirely null, otherwise [].
    """
    # Removed stale `pp_components.append(...)` diff residue (undefined name).
    components = []
    all_null_cols = X.columns[X.isnull().all()]
    if len(all_null_cols) > 0:
        components.append(DropNullColumns)
    return components


def _get_drop_index_unknown(X, y, problem_type, estimator_class, sampler_name=None):
    """Return a DropColumns component if X has columns typed "index" or "unknown".

    Args:
        X (pd.DataFrame): The input data of shape [n_samples, n_features].
        y (pd.Series): The target data of length [n_samples]. Unused.
        problem_type (ProblemTypes or str): Problem type. Unused.
        estimator_class (class): A class which subclasses Estimator. Unused.
        sampler_name (str): The name of the sampler component. Unused. Defaults to None.

    Returns:
        list[Transformer]: [DropColumns] if any index/unknown columns exist, otherwise [].
    """
    # Removed stale `pp_components.append(...)` diff residue (undefined name).
    components = []
    index_and_unknown_columns = list(
        X.ww.select(["index", "unknown"], return_schema=True).columns
    )
    if len(index_and_unknown_columns) > 0:
        components.append(DropColumns)
    return components


def _get_url_email(X, y, problem_type, estimator_class, sampler_name=None):
    """Return featurizers for any EmailAddress and URL columns in X.

    Args:
        X (pd.DataFrame): The input data of shape [n_samples, n_features].
        y (pd.Series): The target data of length [n_samples]. Unused.
        problem_type (ProblemTypes or str): Problem type. Unused.
        estimator_class (class): A class which subclasses Estimator. Unused.
        sampler_name (str): The name of the sampler component. Unused. Defaults to None.

    Returns:
        list[Transformer]: EmailFeaturizer and/or URLFeaturizer, in that order,
            for whichever column types are present; [] if neither is present.
    """
    # Removed stale diff residue: duplicate `pp_components.append(...)` lines
    # and the time-series DelayedFeatureTransformer logic, which now lives in
    # _get_time_series_featurizer.
    components = []
    email_columns = list(X.ww.select("EmailAddress", return_schema=True).columns)
    if len(email_columns) > 0:
        components.append(EmailFeaturizer)

    url_columns = list(X.ww.select("URL", return_schema=True).columns)
    if len(url_columns) > 0:
        components.append(URLFeaturizer)
    return components

input_logical_types = {type(lt) for lt in X.ww.logical_types.values()}
types_imputer_handles = {
logical_types.Boolean,
logical_types.Categorical,
logical_types.Double,
logical_types.Integer,
logical_types.URL,
logical_types.EmailAddress,
logical_types.Datetime,
}

def _get_datetime(X, y, problem_type, estimator_class, sampler_name=None):
    """Return a DateTimeFeaturizer if X has Datetime columns and the estimator benefits.

    ARIMA and Prophet consume the datetime column directly, so no featurizer is
    added for those model families.

    Args:
        X (pd.DataFrame): The input data of shape [n_samples, n_features].
        y (pd.Series): The target data of length [n_samples]. Unused.
        problem_type (ProblemTypes or str): Problem type. Unused.
        estimator_class (class): A class which subclasses Estimator.
        sampler_name (str): The name of the sampler component. Unused. Defaults to None.

    Returns:
        list[Transformer]: [DateTimeFeaturizer] when applicable, otherwise [].
    """
    # Removed stale `pp_components.append(...)` diff residue (undefined name).
    components = []
    datetime_cols = list(X.ww.select(["Datetime"], return_schema=True).columns)

    add_datetime_featurizer = len(datetime_cols) > 0
    if add_datetime_featurizer and estimator_class.model_family not in [
        ModelFamily.ARIMA,
        ModelFamily.PROPHET,
    ]:
        components.append(DateTimeFeaturizer)
    return components


def _get_natural_language(X, y, problem_type, estimator_class, sampler_name=None):
    """Return a NaturalLanguageFeaturizer if X contains NaturalLanguage columns.

    Args:
        X (pd.DataFrame): The input data of shape [n_samples, n_features].
        y (pd.Series): The target data of length [n_samples]. Unused.
        problem_type (ProblemTypes or str): Problem type. Unused.
        estimator_class (class): A class which subclasses Estimator. Unused.
        sampler_name (str): The name of the sampler component. Unused. Defaults to None.

    Returns:
        list[Transformer]: [NaturalLanguageFeaturizer] if text columns exist, otherwise [].
    """
    # Removed stale `pp_components.append(...)` diff residue (undefined name).
    components = []
    text_columns = list(X.ww.select("NaturalLanguage", return_schema=True).columns)
    if len(text_columns) > 0:
        components.append(NaturalLanguageFeaturizer)
    return components


def _get_imputer(X, y, problem_type, estimator_class, sampler_name=None):
    """Return an Imputer if X contains logical types the Imputer can handle.

    Args:
        X (pd.DataFrame): The input data of shape [n_samples, n_features].
        y (pd.Series): The target data of length [n_samples]. Unused.
        problem_type (ProblemTypes or str): Problem type. Unused.
        estimator_class (class): A class which subclasses Estimator. Unused.
        sampler_name (str): The name of the sampler component. Unused. Defaults to None.

    Returns:
        list[Transformer]: [Imputer] if any column's logical type is one the
            Imputer handles (or any NaturalLanguage column exists), otherwise [].
    """
    # Removed stale `pp_components.append(...)` diff residue (undefined name).
    components = []

    input_logical_types = {type(lt) for lt in X.ww.logical_types.values()}
    text_columns = list(X.ww.select("NaturalLanguage", return_schema=True).columns)

    # Logical types the Imputer knows how to impute.
    types_imputer_handles = {
        logical_types.Boolean,
        logical_types.Categorical,
        logical_types.Double,
        logical_types.Integer,
        logical_types.URL,
        logical_types.EmailAddress,
        logical_types.Datetime,
    }

    if len(input_logical_types.intersection(types_imputer_handles)) or len(
        text_columns
    ):
        components.append(Imputer)

    return components


def _get_ohe(X, y, problem_type, estimator_class, sampler_name=None):
components = []

# The URL and EmailAddress Featurizers will create categorical columns
categorical_cols = list(
Expand All @@ -140,7 +153,12 @@ def _get_preprocessing_components(
CatBoostClassifier,
CatBoostRegressor,
}:
pp_components.append(OneHotEncoder)
components.append(OneHotEncoder)
return components


def _get_sampler(X, y, problem_type, estimator_class, sampler_name=None):
components = []

sampler_components = {
"Undersampler": Undersampler,
Expand All @@ -151,17 +169,99 @@ def _get_preprocessing_components(
import_or_raise(
"imblearn.over_sampling", error_msg="imbalanced-learn is not installed"
)
pp_components.append(sampler_components[sampler_name])
components.append(sampler_components[sampler_name])
except ImportError:
logger.warning(
"Could not import imblearn.over_sampling, so defaulting to use Undersampler"
)
pp_components.append(Undersampler)
components.append(Undersampler)
return components


def _get_standard_scaler(X, y, problem_type, estimator_class, sampler_name=None):
    """Return a StandardScaler component for linear-model estimators.

    Args:
        X (pd.DataFrame): The input data of shape [n_samples, n_features]. Unused.
        y (pd.Series): The target data of length [n_samples]. Unused.
        problem_type (ProblemTypes or str): Problem type. Unused.
        estimator_class (class): A class which subclasses Estimator.
        sampler_name (str): The name of the sampler component. Unused. Defaults to None.

    Returns:
        list[Transformer]: [StandardScaler] for LINEAR_MODEL estimators, otherwise [].
    """
    # Removed stale `pp_components.append(...)` diff residue (undefined name).
    components = []
    if estimator_class and estimator_class.model_family == ModelFamily.LINEAR_MODEL:
        components.append(StandardScaler)
    return components


def _get_time_series_featurizer(X, y, problem_type, estimator_class, sampler_name=None):
    """Return a DelayedFeatureTransformer for time series problems.

    ARIMA models their own lag structure, so no delayed features are added for
    that model family.

    Args:
        X (pd.DataFrame): The input data of shape [n_samples, n_features]. Unused.
        y (pd.Series): The target data of length [n_samples]. Unused.
        problem_type (ProblemTypes or str): Problem type.
        estimator_class (class): A class which subclasses Estimator.
        sampler_name (str): The name of the sampler component. Unused. Defaults to None.

    Returns:
        list[Transformer]: [DelayedFeatureTransformer] when applicable, otherwise [].
    """
    needs_delayed_features = is_time_series(problem_type) and (
        estimator_class.model_family != ModelFamily.ARIMA
    )
    return [DelayedFeatureTransformer] if needs_delayed_features else []

return pp_components

def _get_preprocessing_components(
    X, y, problem_type, estimator_class, sampler_name=None
):
    """Given input data, target data and an estimator class, construct a recommended preprocessing chain to be combined with the estimator and trained on the provided data.

    Args:
        X (pd.DataFrame): The input data of shape [n_samples, n_features].
        y (pd.Series): The target data of length [n_samples].
        problem_type (ProblemTypes or str): Problem type.
        estimator_class (class): A class which subclasses Estimator estimator for pipeline.
        sampler_name (str): The name of the sampler component to add to the pipeline. Defaults to None.

    Returns:
        list[Transformer]: A list of applicable preprocessing components to use with the estimator.
    """
    # Both orderings share the same prefix and suffix; only the middle of the
    # chain differs for time series problems (delayed features are inserted
    # between the imputer and the datetime featurizer).
    if is_time_series(problem_type):
        middle_getters = [
            _get_natural_language,
            _get_imputer,
            _get_time_series_featurizer,
            _get_datetime,
        ]
    else:
        middle_getters = [
            _get_datetime,
            _get_natural_language,
            _get_imputer,
        ]
    component_getters = (
        [
            _get_label_encoder,
            _get_drop_all_null,
            _get_drop_index_unknown,
            _get_url_email,
        ]
        + middle_getters
        + [_get_ohe, _get_sampler, _get_standard_scaler]
    )
    return [
        component
        for getter in component_getters
        for component in getter(X, y, problem_type, estimator_class, sampler_name)
    ]


def _get_time_series_components(X, y, problem_type, estimator_class, sampler_name=None):
    """Construct the recommended preprocessing chain for a time series pipeline.

    Same ordering as the time series branch of _get_preprocessing_components,
    except that no StandardScaler is appended.

    Args:
        X (pd.DataFrame): The input data of shape [n_samples, n_features].
        y (pd.Series): The target data of length [n_samples].
        problem_type (ProblemTypes or str): Problem type.
        estimator_class (class): A class which subclasses Estimator estimator for pipeline.
        sampler_name (str): The name of the sampler component to add to the pipeline. Defaults to None.

    Returns:
        list[Transformer]: A list of applicable preprocessing components to use with the estimator.
    """
    components = []
    for getter in (
        _get_label_encoder,
        _get_drop_all_null,
        _get_drop_index_unknown,
        _get_url_email,
        _get_natural_language,
        _get_imputer,
        _get_time_series_featurizer,
        _get_datetime,
        _get_ohe,
        _get_sampler,
    ):
        components += getter(X, y, problem_type, estimator_class, sampler_name)
    return components


def _get_pipeline_base_class(problem_type):
Expand Down
44 changes: 30 additions & 14 deletions evalml/tests/pipeline_tests/test_pipeline_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -227,20 +227,36 @@ def test_make_pipeline(
and input_type == "pd"
else []
)
expected_components = (
label_encoder
+ email_featurizer
+ url_featurizer
+ drop_null
+ drop_col
+ delayed_features
+ natural_language_featurizer
+ datetime
+ imputer
+ ohe
+ standard_scaler
+ [estimator_class]
)
if is_time_series(problem_type):
expected_components = (
label_encoder
+ email_featurizer
+ url_featurizer
+ drop_null
+ drop_col
+ natural_language_featurizer
+ imputer
+ delayed_features
+ datetime
+ ohe
+ standard_scaler
+ [estimator_class]
)
else:
expected_components = (
label_encoder
+ email_featurizer
+ url_featurizer
+ drop_null
+ drop_col
+ delayed_features
+ natural_language_featurizer
+ datetime
+ imputer
+ ohe
+ standard_scaler
+ [estimator_class]
)
assert pipeline.component_graph.compute_order == [
component.name for component in expected_components
], test_description
Expand Down

0 comments on commit 8c345a8

Please sign in to comment.