Skip to content

Commit

Permalink
Fix pipeline structure for time series pipelines
Browse files Browse the repository at this point in the history
  • Loading branch information
freddyaboulton committed Nov 15, 2021
1 parent b799981 commit 8c345a8
Show file tree
Hide file tree
Showing 2 changed files with 175 additions and 59 deletions.
190 changes: 145 additions & 45 deletions evalml/pipelines/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,78 +59,91 @@
logger = logging.getLogger(__name__)


def _get_preprocessing_components(
X, y, problem_type, estimator_class, sampler_name=None
):
"""Given input data, target data and an estimator class, construct a recommended preprocessing chain to be combined with the estimator and trained on the provided data.
Args:
X (pd.DataFrame): The input data of shape [n_samples, n_features].
y (pd.Series): The target data of length [n_samples].
problem_type (ProblemTypes or str): Problem type.
estimator_class (class): A class which subclasses Estimator estimator for pipeline.
sampler_name (str): The name of the sampler component to add to the pipeline. Defaults to None.
Returns:
list[Transformer]: A list of applicable preprocessing components to use with the estimator.
"""
pp_components = []

def _get_label_encoder(X, y, problem_type, estimator_class, sampler_name=None):
    """Return a LabelEncoder component if the problem type is classification.

    Args:
        X (pd.DataFrame): The input data of shape [n_samples, n_features]. Unused.
        y (pd.Series): The target data of length [n_samples]. Unused.
        problem_type (ProblemTypes or str): Problem type.
        estimator_class (class): A class which subclasses Estimator. Unused.
        sampler_name (str): The name of the sampler component. Unused. Defaults to None.

    Returns:
        list[Transformer]: [LabelEncoder] for classification problems, otherwise [].
    """
    # Removed stale `pp_components.append(...)` left over from the old
    # monolithic implementation; `pp_components` is undefined here.
    components = []
    if is_classification(problem_type):
        components.append(LabelEncoder)
    return components


def _get_drop_all_null(X, y, problem_type, estimator_class, sampler_name=None):
    """Return a DropNullColumns component if X contains any fully-null columns.

    Args:
        X (pd.DataFrame): The input data of shape [n_samples, n_features].
        y (pd.Series): The target data of length [n_samples]. Unused.
        problem_type (ProblemTypes or str): Problem type. Unused.
        estimator_class (class): A class which subclasses Estimator. Unused.
        sampler_name (str): The name of the sampler component. Unused. Defaults to None.

    Returns:
        list[Transformer]: [DropNullColumns] if any column is entirely null, otherwise [].
    """
    # Removed stale `pp_components.append(...)` diff residue (undefined name).
    components = []
    all_null_cols = X.columns[X.isnull().all()]
    if len(all_null_cols) > 0:
        components.append(DropNullColumns)
    return components


def _get_drop_index_unknown(X, y, problem_type, estimator_class, sampler_name=None):
    """Return a DropColumns component if X has columns typed "index" or "unknown".

    Args:
        X (pd.DataFrame): The input data of shape [n_samples, n_features].
        y (pd.Series): The target data of length [n_samples]. Unused.
        problem_type (ProblemTypes or str): Problem type. Unused.
        estimator_class (class): A class which subclasses Estimator. Unused.
        sampler_name (str): The name of the sampler component. Unused. Defaults to None.

    Returns:
        list[Transformer]: [DropColumns] if any index/unknown columns exist, otherwise [].
    """
    # Removed stale `pp_components.append(...)` diff residue (undefined name).
    components = []
    index_and_unknown_columns = list(
        X.ww.select(["index", "unknown"], return_schema=True).columns
    )
    if len(index_and_unknown_columns) > 0:
        components.append(DropColumns)
    return components


def _get_url_email(X, y, problem_type, estimator_class, sampler_name=None):
    """Return featurizers for any EmailAddress and URL columns in X.

    Args:
        X (pd.DataFrame): The input data of shape [n_samples, n_features].
        y (pd.Series): The target data of length [n_samples]. Unused.
        problem_type (ProblemTypes or str): Problem type. Unused.
        estimator_class (class): A class which subclasses Estimator. Unused.
        sampler_name (str): The name of the sampler component. Unused. Defaults to None.

    Returns:
        list[Transformer]: EmailFeaturizer and/or URLFeaturizer, in that order,
            for whichever column types are present; [] if neither is present.
    """
    # Removed stale diff residue: duplicate `pp_components.append(...)` lines
    # and the time-series DelayedFeatureTransformer logic, which now lives in
    # _get_time_series_featurizer.
    components = []
    email_columns = list(X.ww.select("EmailAddress", return_schema=True).columns)
    if len(email_columns) > 0:
        components.append(EmailFeaturizer)

    url_columns = list(X.ww.select("URL", return_schema=True).columns)
    if len(url_columns) > 0:
        components.append(URLFeaturizer)
    return components

input_logical_types = {type(lt) for lt in X.ww.logical_types.values()}
types_imputer_handles = {
logical_types.Boolean,
logical_types.Categorical,
logical_types.Double,
logical_types.Integer,
logical_types.URL,
logical_types.EmailAddress,
logical_types.Datetime,
}

def _get_datetime(X, y, problem_type, estimator_class, sampler_name=None):
    """Return a DateTimeFeaturizer if X has Datetime columns and the estimator benefits.

    ARIMA and Prophet consume the datetime column directly, so no featurizer is
    added for those model families.

    Args:
        X (pd.DataFrame): The input data of shape [n_samples, n_features].
        y (pd.Series): The target data of length [n_samples]. Unused.
        problem_type (ProblemTypes or str): Problem type. Unused.
        estimator_class (class): A class which subclasses Estimator.
        sampler_name (str): The name of the sampler component. Unused. Defaults to None.

    Returns:
        list[Transformer]: [DateTimeFeaturizer] when applicable, otherwise [].
    """
    # Removed stale `pp_components.append(...)` diff residue (undefined name).
    components = []
    datetime_cols = list(X.ww.select(["Datetime"], return_schema=True).columns)

    add_datetime_featurizer = len(datetime_cols) > 0
    if add_datetime_featurizer and estimator_class.model_family not in [
        ModelFamily.ARIMA,
        ModelFamily.PROPHET,
    ]:
        components.append(DateTimeFeaturizer)
    return components


def _get_natural_language(X, y, problem_type, estimator_class, sampler_name=None):
    """Return a NaturalLanguageFeaturizer if X contains NaturalLanguage columns.

    Args:
        X (pd.DataFrame): The input data of shape [n_samples, n_features].
        y (pd.Series): The target data of length [n_samples]. Unused.
        problem_type (ProblemTypes or str): Problem type. Unused.
        estimator_class (class): A class which subclasses Estimator. Unused.
        sampler_name (str): The name of the sampler component. Unused. Defaults to None.

    Returns:
        list[Transformer]: [NaturalLanguageFeaturizer] if text columns exist, otherwise [].
    """
    # Removed stale `pp_components.append(...)` diff residue (undefined name).
    components = []
    text_columns = list(X.ww.select("NaturalLanguage", return_schema=True).columns)
    if len(text_columns) > 0:
        components.append(NaturalLanguageFeaturizer)
    return components


def _get_imputer(X, y, problem_type, estimator_class, sampler_name=None):
    """Return an Imputer if X contains logical types the Imputer can handle.

    Args:
        X (pd.DataFrame): The input data of shape [n_samples, n_features].
        y (pd.Series): The target data of length [n_samples]. Unused.
        problem_type (ProblemTypes or str): Problem type. Unused.
        estimator_class (class): A class which subclasses Estimator. Unused.
        sampler_name (str): The name of the sampler component. Unused. Defaults to None.

    Returns:
        list[Transformer]: [Imputer] if any column's logical type is one the
            Imputer handles (or any NaturalLanguage column exists), otherwise [].
    """
    # Removed stale `pp_components.append(...)` diff residue (undefined name).
    components = []

    input_logical_types = {type(lt) for lt in X.ww.logical_types.values()}
    text_columns = list(X.ww.select("NaturalLanguage", return_schema=True).columns)

    # Logical types the Imputer knows how to impute.
    types_imputer_handles = {
        logical_types.Boolean,
        logical_types.Categorical,
        logical_types.Double,
        logical_types.Integer,
        logical_types.URL,
        logical_types.EmailAddress,
        logical_types.Datetime,
    }

    if len(input_logical_types.intersection(types_imputer_handles)) or len(
        text_columns
    ):
        components.append(Imputer)

    return components


def _get_ohe(X, y, problem_type, estimator_class, sampler_name=None):
components = []

# The URL and EmailAddress Featurizers will create categorical columns
categorical_cols = list(
Expand All @@ -140,7 +153,12 @@ def _get_preprocessing_components(
CatBoostClassifier,
CatBoostRegressor,
}:
pp_components.append(OneHotEncoder)
components.append(OneHotEncoder)
return components


def _get_sampler(X, y, problem_type, estimator_class, sampler_name=None):
components = []

sampler_components = {
"Undersampler": Undersampler,
Expand All @@ -151,17 +169,99 @@ def _get_preprocessing_components(
import_or_raise(
"imblearn.over_sampling", error_msg="imbalanced-learn is not installed"
)
pp_components.append(sampler_components[sampler_name])
components.append(sampler_components[sampler_name])
except ImportError:
logger.warning(
"Could not import imblearn.over_sampling, so defaulting to use Undersampler"
)
pp_components.append(Undersampler)
components.append(Undersampler)
return components


def _get_standard_scaler(X, y, problem_type, estimator_class, sampler_name=None):
    """Return a StandardScaler component for linear-model estimators.

    Args:
        X (pd.DataFrame): The input data of shape [n_samples, n_features]. Unused.
        y (pd.Series): The target data of length [n_samples]. Unused.
        problem_type (ProblemTypes or str): Problem type. Unused.
        estimator_class (class): A class which subclasses Estimator.
        sampler_name (str): The name of the sampler component. Unused. Defaults to None.

    Returns:
        list[Transformer]: [StandardScaler] for LINEAR_MODEL estimators, otherwise [].
    """
    # Removed stale `pp_components.append(...)` diff residue (undefined name).
    components = []
    if estimator_class and estimator_class.model_family == ModelFamily.LINEAR_MODEL:
        components.append(StandardScaler)
    return components


def _get_time_series_featurizer(X, y, problem_type, estimator_class, sampler_name=None):
    """Return a DelayedFeatureTransformer for time series problems.

    ARIMA models their own lag structure, so no delayed features are added for
    that model family.

    Args:
        X (pd.DataFrame): The input data of shape [n_samples, n_features]. Unused.
        y (pd.Series): The target data of length [n_samples]. Unused.
        problem_type (ProblemTypes or str): Problem type.
        estimator_class (class): A class which subclasses Estimator.
        sampler_name (str): The name of the sampler component. Unused. Defaults to None.

    Returns:
        list[Transformer]: [DelayedFeatureTransformer] when applicable, otherwise [].
    """
    needs_delayed_features = is_time_series(problem_type) and (
        estimator_class.model_family != ModelFamily.ARIMA
    )
    return [DelayedFeatureTransformer] if needs_delayed_features else []

return pp_components

def _get_preprocessing_components(
    X, y, problem_type, estimator_class, sampler_name=None
):
    """Given input data, target data and an estimator class, construct a recommended preprocessing chain to be combined with the estimator and trained on the provided data.

    Args:
        X (pd.DataFrame): The input data of shape [n_samples, n_features].
        y (pd.Series): The target data of length [n_samples].
        problem_type (ProblemTypes or str): Problem type.
        estimator_class (class): A class which subclasses Estimator estimator for pipeline.
        sampler_name (str): The name of the sampler component to add to the pipeline. Defaults to None.

    Returns:
        list[Transformer]: A list of applicable preprocessing components to use with the estimator.
    """
    # Both orderings share the same prefix and suffix; only the middle of the
    # chain differs for time series problems (delayed features are inserted
    # between the imputer and the datetime featurizer).
    if is_time_series(problem_type):
        middle_getters = [
            _get_natural_language,
            _get_imputer,
            _get_time_series_featurizer,
            _get_datetime,
        ]
    else:
        middle_getters = [
            _get_datetime,
            _get_natural_language,
            _get_imputer,
        ]
    component_getters = (
        [
            _get_label_encoder,
            _get_drop_all_null,
            _get_drop_index_unknown,
            _get_url_email,
        ]
        + middle_getters
        + [_get_ohe, _get_sampler, _get_standard_scaler]
    )
    return [
        component
        for getter in component_getters
        for component in getter(X, y, problem_type, estimator_class, sampler_name)
    ]


def _get_time_series_components(X, y, problem_type, estimator_class, sampler_name=None):
    """Construct the recommended preprocessing chain for a time series pipeline.

    Same ordering as the time series branch of _get_preprocessing_components,
    except that no StandardScaler is appended.

    Args:
        X (pd.DataFrame): The input data of shape [n_samples, n_features].
        y (pd.Series): The target data of length [n_samples].
        problem_type (ProblemTypes or str): Problem type.
        estimator_class (class): A class which subclasses Estimator estimator for pipeline.
        sampler_name (str): The name of the sampler component to add to the pipeline. Defaults to None.

    Returns:
        list[Transformer]: A list of applicable preprocessing components to use with the estimator.
    """
    components = []
    for getter in (
        _get_label_encoder,
        _get_drop_all_null,
        _get_drop_index_unknown,
        _get_url_email,
        _get_natural_language,
        _get_imputer,
        _get_time_series_featurizer,
        _get_datetime,
        _get_ohe,
        _get_sampler,
    ):
        components += getter(X, y, problem_type, estimator_class, sampler_name)
    return components


def _get_pipeline_base_class(problem_type):
Expand Down
44 changes: 30 additions & 14 deletions evalml/tests/pipeline_tests/test_pipeline_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -227,20 +227,36 @@ def test_make_pipeline(
and input_type == "pd"
else []
)
expected_components = (
label_encoder
+ email_featurizer
+ url_featurizer
+ drop_null
+ drop_col
+ delayed_features
+ natural_language_featurizer
+ datetime
+ imputer
+ ohe
+ standard_scaler
+ [estimator_class]
)
if is_time_series(problem_type):
expected_components = (
label_encoder
+ email_featurizer
+ url_featurizer
+ drop_null
+ drop_col
+ natural_language_featurizer
+ imputer
+ delayed_features
+ datetime
+ ohe
+ standard_scaler
+ [estimator_class]
)
else:
expected_components = (
label_encoder
+ email_featurizer
+ url_featurizer
+ drop_null
+ drop_col
+ delayed_features
+ natural_language_featurizer
+ datetime
+ imputer
+ ohe
+ standard_scaler
+ [estimator_class]
)
assert pipeline.component_graph.compute_order == [
component.name for component in expected_components
], test_description
Expand Down

0 comments on commit 8c345a8

Please sign in to comment.