Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[FIX] Remove redundant categorical imputation #375

Merged
merged 5 commits into from
Feb 9, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
16 changes: 0 additions & 16 deletions autoPyTorch/configs/greedy_portfolio.json
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
[{"data_loader:batch_size": 60,
"encoder:__choice__": "OneHotEncoder",
"feature_preprocessor:__choice__": "NoFeaturePreprocessor",
"imputer:categorical_strategy": "most_frequent",
"imputer:numerical_strategy": "mean",
"lr_scheduler:__choice__": "CosineAnnealingLR",
"network_backbone:__choice__": "ShapedMLPBackbone",
Expand Down Expand Up @@ -32,7 +31,6 @@
{"data_loader:batch_size": 255,
"encoder:__choice__": "OneHotEncoder",
"feature_preprocessor:__choice__": "NoFeaturePreprocessor",
"imputer:categorical_strategy": "most_frequent",
"imputer:numerical_strategy": "mean",
"lr_scheduler:__choice__": "CosineAnnealingLR",
"network_backbone:__choice__": "ShapedResNetBackbone",
Expand Down Expand Up @@ -66,7 +64,6 @@
{"data_loader:batch_size": 165,
"encoder:__choice__": "OneHotEncoder",
"feature_preprocessor:__choice__": "NoFeaturePreprocessor",
"imputer:categorical_strategy": "most_frequent",
"imputer:numerical_strategy": "mean",
"lr_scheduler:__choice__": "CosineAnnealingLR",
"network_backbone:__choice__": "ShapedResNetBackbone",
Expand Down Expand Up @@ -97,7 +94,6 @@
{"data_loader:batch_size": 299,
"encoder:__choice__": "OneHotEncoder",
"feature_preprocessor:__choice__": "NoFeaturePreprocessor",
"imputer:categorical_strategy": "most_frequent",
"imputer:numerical_strategy": "mean",
"lr_scheduler:__choice__": "CosineAnnealingLR",
"network_backbone:__choice__": "ShapedResNetBackbone",
Expand Down Expand Up @@ -129,7 +125,6 @@
{"data_loader:batch_size": 183,
"encoder:__choice__": "OneHotEncoder",
"feature_preprocessor:__choice__": "NoFeaturePreprocessor",
"imputer:categorical_strategy": "most_frequent",
"imputer:numerical_strategy": "mean",
"lr_scheduler:__choice__": "CosineAnnealingLR",
"network_backbone:__choice__": "ShapedResNetBackbone",
Expand Down Expand Up @@ -163,7 +158,6 @@
{"data_loader:batch_size": 21,
"encoder:__choice__": "OneHotEncoder",
"feature_preprocessor:__choice__": "NoFeaturePreprocessor",
"imputer:categorical_strategy": "most_frequent",
"imputer:numerical_strategy": "mean",
"lr_scheduler:__choice__": "CosineAnnealingLR",
"network_backbone:__choice__": "ShapedMLPBackbone",
Expand Down Expand Up @@ -192,7 +186,6 @@
{"data_loader:batch_size": 159,
"encoder:__choice__": "OneHotEncoder",
"feature_preprocessor:__choice__": "TruncatedSVD",
"imputer:categorical_strategy": "most_frequent",
"imputer:numerical_strategy": "mean",
"lr_scheduler:__choice__": "CosineAnnealingLR",
"network_backbone:__choice__": "ShapedMLPBackbone",
Expand Down Expand Up @@ -222,7 +215,6 @@
{"data_loader:batch_size": 442,
"encoder:__choice__": "OneHotEncoder",
"feature_preprocessor:__choice__": "TruncatedSVD",
"imputer:categorical_strategy": "most_frequent",
"imputer:numerical_strategy": "mean",
"lr_scheduler:__choice__": "CosineAnnealingLR",
"network_backbone:__choice__": "ShapedResNetBackbone",
Expand Down Expand Up @@ -255,7 +247,6 @@
{"data_loader:batch_size": 140,
"encoder:__choice__": "OneHotEncoder",
"feature_preprocessor:__choice__": "TruncatedSVD",
"imputer:categorical_strategy": "most_frequent",
"imputer:numerical_strategy": "mean",
"lr_scheduler:__choice__": "CosineAnnealingLR",
"network_backbone:__choice__": "ShapedResNetBackbone",
Expand Down Expand Up @@ -288,7 +279,6 @@
{"data_loader:batch_size": 48,
"encoder:__choice__": "OneHotEncoder",
"feature_preprocessor:__choice__": "NoFeaturePreprocessor",
"imputer:categorical_strategy": "most_frequent",
"imputer:numerical_strategy": "mean",
"lr_scheduler:__choice__": "CosineAnnealingLR",
"network_backbone:__choice__": "ShapedMLPBackbone",
Expand Down Expand Up @@ -316,7 +306,6 @@
{"data_loader:batch_size": 168,
"encoder:__choice__": "OneHotEncoder",
"feature_preprocessor:__choice__": "NoFeaturePreprocessor",
"imputer:categorical_strategy": "most_frequent",
"imputer:numerical_strategy": "mean",
"lr_scheduler:__choice__": "CosineAnnealingLR",
"network_backbone:__choice__": "ShapedResNetBackbone",
Expand Down Expand Up @@ -349,7 +338,6 @@
{"data_loader:batch_size": 21,
"encoder:__choice__": "OneHotEncoder",
"feature_preprocessor:__choice__": "NoFeaturePreprocessor",
"imputer:categorical_strategy": "most_frequent",
"imputer:numerical_strategy": "mean",
"lr_scheduler:__choice__": "CosineAnnealingLR",
"network_backbone:__choice__": "ShapedMLPBackbone",
Expand Down Expand Up @@ -378,7 +366,6 @@
{"data_loader:batch_size": 163,
"encoder:__choice__": "OneHotEncoder",
"feature_preprocessor:__choice__": "NoFeaturePreprocessor",
"imputer:categorical_strategy": "most_frequent",
"imputer:numerical_strategy": "mean",
"lr_scheduler:__choice__": "CosineAnnealingLR",
"network_backbone:__choice__": "ShapedResNetBackbone",
Expand Down Expand Up @@ -411,7 +398,6 @@
{"data_loader:batch_size": 150,
"encoder:__choice__": "OneHotEncoder",
"feature_preprocessor:__choice__": "NoFeaturePreprocessor",
"imputer:categorical_strategy": "most_frequent",
"imputer:numerical_strategy": "mean",
"lr_scheduler:__choice__": "CosineAnnealingLR",
"network_backbone:__choice__": "ShapedResNetBackbone",
Expand Down Expand Up @@ -445,7 +431,6 @@
{"data_loader:batch_size": 151,
"encoder:__choice__": "OneHotEncoder",
"feature_preprocessor:__choice__": "TruncatedSVD",
"imputer:categorical_strategy": "most_frequent",
"imputer:numerical_strategy": "mean",
"lr_scheduler:__choice__": "CosineAnnealingLR",
"network_backbone:__choice__": "ShapedMLPBackbone",
Expand Down Expand Up @@ -475,7 +460,6 @@
{"data_loader:batch_size": 42,
"encoder:__choice__": "OneHotEncoder",
"feature_preprocessor:__choice__": "TruncatedSVD",
"imputer:categorical_strategy": "most_frequent",
"imputer:numerical_strategy": "mean",
"lr_scheduler:__choice__": "CosineAnnealingLR",
"network_backbone:__choice__": "ShapedResNetBackbone",
Expand Down
7 changes: 5 additions & 2 deletions autoPyTorch/optimizer/smbo.py
Original file line number Diff line number Diff line change
Expand Up @@ -246,8 +246,11 @@ def __init__(self,

self.initial_configurations: Optional[List[Configuration]] = None
if portfolio_selection is not None:
self.initial_configurations = read_return_initial_configurations(config_space=config_space,
portfolio_selection=portfolio_selection)
initial_configurations = read_return_initial_configurations(config_space=config_space,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This way makes more sense but it means there is no longer an attribute self.initial_configurations if portfolio_selection is None. While unlikely to matter, this means code like if self.initial_configurations: happening elsewhere will now break as it doesn't exist. I'd keep the default behaviour where it is None, and only give it a value if portfolio_selection is not None.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yeah, thanks for pointing it out. I'll fix this.

portfolio_selection=portfolio_selection)
# incase we dont have any valid configuration from the portfolio
self.initial_configurations = initial_configurations \
if len(initial_configurations) > 0 else None

def reset_data_manager(self) -> None:
if self.datamanager is not None:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
from typing import Any, Dict, List, Optional, Union
from typing import Any, Dict, List, Optional, Tuple, Union

import numpy as np

from sklearn.base import BaseEstimator
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline

Expand Down Expand Up @@ -48,18 +49,25 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> "TabularColumnTransformer":
"TabularColumnTransformer": an instance of self
"""
self.check_requirements(X, y)
numerical_pipeline = 'drop'
categorical_pipeline = 'drop'

preprocessors = get_tabular_preprocessers(X)
if len(X['dataset_properties']['numerical_columns']):
column_transformers: List[Tuple[str, BaseEstimator, List[int]]] = []
if len(preprocessors['numerical']) > 0:
numerical_pipeline = make_pipeline(*preprocessors['numerical'])
if len(X['dataset_properties']['categorical_columns']):
column_transformers.append(
('numerical_pipeline', numerical_pipeline, X['dataset_properties']['numerical_columns'])
)
if len(preprocessors['categorical']) > 0:
categorical_pipeline = make_pipeline(*preprocessors['categorical'])

self.preprocessor = ColumnTransformer([
('numerical_pipeline', numerical_pipeline, X['dataset_properties']['numerical_columns']),
('categorical_pipeline', categorical_pipeline, X['dataset_properties']['categorical_columns'])],
column_transformers.append(
('categorical_pipeline', categorical_pipeline, X['dataset_properties']['categorical_columns'])
)

# in case the preprocessing steps are disabled
# i.e, NoEncoder for categorical, we want to
# let the data in categorical columns pass through
self.preprocessor = ColumnTransformer(
column_transformers,
remainder='passthrough'
)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,70 +13,42 @@


class SimpleImputer(BaseImputer):
"""An imputer for categorical and numerical columns

Impute missing values for categorical columns with 'constant_!missing!'

Note:
In case of numpy data, the constant value is set to -1, under the assumption
that categorical data is fit with an Ordinal Scaler.
"""
An imputer for numerical columns

Attributes:
random_state (Optional[np.random.RandomState]):
The random state to use for the imputer.
numerical_strategy (str: default='mean'):
The strategy to use for imputing numerical columns.
Can be one of ['most_frequent', 'constant_!missing!']
categorical_strategy (str: default='most_frequent')
The strategy to use for imputing categorical columns.
Can be one of ['mean', 'median', 'most_frequent', 'constant_zero']
"""

def __init__(
self,
random_state: Optional[np.random.RandomState] = None,
numerical_strategy: str = 'mean',
categorical_strategy: str = 'most_frequent'
):
"""
Note:
'constant' as numerical_strategy uses 0 as the default fill_value while
'constant_!missing!' uses a fill_value of -1.
This behaviour should probably be fixed.
"""
super().__init__()
self.random_state = random_state
self.numerical_strategy = numerical_strategy
self.categorical_strategy = categorical_strategy

def fit(self, X: Dict[str, Any], y: Optional[Any] = None) -> BaseImputer:
""" Fits the underlying model and returns the transformed array.
"""
Builds the preprocessor based on the given fit dictionary 'X'.

Args:
X (np.ndarray):
The input features to fit on
y (Optional[np.ndarray]):
The labels for the input features `X`
X (Dict[str, Any]):
The fit dictionary
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Might be good to give a little example dict in the documentation.

y (Optional[Any]):
Not Used -- to comply with API
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I like this change to Any, seems more semantically clear (:

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yeah, there is outdated documentation throughout the pipeline components. I'll raise an issue and fix them with a different PR.


Returns:
SimpleImputer:
returns self
self:
returns an instance of self.
"""
self.check_requirements(X, y)

# Choose an imputer for any categorical columns
categorical_columns = X['dataset_properties']['categorical_columns']

if isinstance(categorical_columns, List) and len(categorical_columns) != 0:
if self.categorical_strategy == 'constant_!missing!':
# Train data is numpy as of this point, where an Ordinal Encoding is used
# for categoricals. Only Numbers are allowed for `fill_value`
imputer = SklearnSimpleImputer(strategy='constant', fill_value=-1, copy=False)
self.preprocessor['categorical'] = imputer
else:
imputer = SklearnSimpleImputer(strategy=self.categorical_strategy, copy=False)
self.preprocessor['categorical'] = imputer

# Choose an imputer for any numerical columns
numerical_columns = X['dataset_properties']['numerical_columns']

Expand All @@ -98,11 +70,6 @@ def get_hyperparameter_search_space(
value_range=("mean", "median", "most_frequent", "constant_zero"),
default_value="mean",
),
categorical_strategy: HyperparameterSearchSpace = HyperparameterSearchSpace(
hyperparameter='categorical_strategy',
value_range=("most_frequent", "constant_!missing!"),
default_value="most_frequent"
)
) -> ConfigurationSpace:
"""Get the hyperparameter search space for the SimpleImputer

Expand All @@ -112,8 +79,6 @@ def get_hyperparameter_search_space(
Note: Not actually Optional, just adhering to its supertype
numerical_strategy (HyperparameterSearchSpace: default = ...)
The strategy to use for numerical imputation
caterogical_strategy (HyperparameterSearchSpace: default = ...)
The strategy to use for categorical imputation

Returns:
ConfigurationSpace
Expand All @@ -132,12 +97,6 @@ def get_hyperparameter_search_space(
):
add_hyperparameter(cs, numerical_strategy, CategoricalHyperparameter)

if (
isinstance(dataset_properties['categorical_columns'], List)
and len(dataset_properties['categorical_columns'])
):
add_hyperparameter(cs, categorical_strategy, CategoricalHyperparameter)

return cs

@staticmethod
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,7 @@ class BaseImputer(autoPyTorchTabularPreprocessingComponent):
def __init__(self) -> None:
super().__init__()
self.add_fit_requirements([
FitRequirement('numerical_columns', (List,), user_defined=True, dataset_property=True),
FitRequirement('categorical_columns', (List,), user_defined=True, dataset_property=True)])
FitRequirement('numerical_columns', (List,), user_defined=True, dataset_property=True)])

def transform(self, X: Dict[str, Any]) -> Dict[str, Any]:
"""
Expand All @@ -26,7 +25,7 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]:
Returns:
(Dict[str, Any]): the updated 'X' dictionary
"""
if self.preprocessor['numerical'] is None and self.preprocessor['categorical'] is None:
if self.preprocessor['numerical'] is None and len(X["dataset_properties"]["numerical_columns"]) != 0:
raise ValueError("cant call transform on {} without fitting first."
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
raise ValueError("cant call transform on {} without fitting first."
raise ValueError("cannot call transform on {} without fitting first."

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

"can not" is two words :)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I did not know that "can not" exists, but it seems "cannot" is preferred in formal settings?

  • article1 --> "You’ll find cannot in formal writing and speech. Research studies, academic reports, and professional presentations are places where cannot is the most appropriate choice."
  • article 2 --> "While can not is an acceptable alternate spelling, cannot is generally preferred by most writers."
  • article 3 --> "Both cannot and can not are perfectly fine, but cannot is far more common and is therefore recommended"

.format(self.__class__.__name__))
X.update({'imputer': self.preprocessor})
Expand Down