Skip to content

Commit

Permalink
making data preprocessing step configurable with two options no prepr…
Browse files Browse the repository at this point in the history
…ocessing and feature type split (#977)

* making data preprocessing step configurable with two options no preprocessing and feature type split

* Fix: execution fails when data_preprocessor is no_preprocessing

* Incorporating review comments

* Fixing test cases; updating metalearning with updated hyperparameters

* Fixing examples

* Updating portfolios with new config

* Incorporated review comments and fix test case

* Test fixes

* Test fixes

* Fix metalearning config

* Remove unused imports

* Fix test cases

* Fix test cases and examples

* Adding more checks for include and exclude params

* Fix flake error

* Fix flake error

* Handling target_type in dataset_properties

* Fixes

* Fixes

* Fix error

* Fix test cases

* Adding datatype annotations

* Fix test cases

* Fix build

* Fix test case

* Update stale.yaml

* Fix annotation type

* Update portfolios with new config

Co-authored-by: Rohit Agarwal <rohit.agarwal4@aexp.com>
  • Loading branch information
2 people authored and eddiebergman committed Aug 18, 2021
1 parent 90658e5 commit e7b5daa
Show file tree
Hide file tree
Showing 158 changed files with 23,419 additions and 22,420 deletions.
6 changes: 2 additions & 4 deletions .github/workflows/stale.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ jobs:
steps:
- uses: actions/stale@v3
with:
days-before-stale: 30
days-before-stale: 60
days-before-close: 7
stale-issue-message: >
This issue has been automatically marked as stale because it has not had
Expand All @@ -18,7 +18,5 @@ jobs:
close-issue-message: >
This issue has been automatically closed due to inactivity.
stale-issue-label: 'stale'
# Only issues with ANY of these labels are checked.
# Separate multiple labels with commas (eg. "incomplete,waiting-feedback").
any-of-labels: 'Answered,Feedback-Required,invalid,wontfix'
only-issue-labels: 'Answered,Feedback-Required,invalid,wontfix'
exempt-all-milestones: true
60 changes: 19 additions & 41 deletions autosklearn/automl.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,6 @@
get_named_client_logger,
)
from autosklearn.util import pipeline, RE_PATTERN
from autosklearn.util.pipeline import parse_include_exclude_components
from autosklearn.util.parallel import preload_modules
from autosklearn.ensemble_builder import EnsembleBuilderManager
from autosklearn.ensembles.singlebest_ensemble import SingleBest
Expand Down Expand Up @@ -125,10 +124,8 @@ def __init__(self,
memory_limit=3072,
metadata_directory=None,
debug_mode=False,
include_estimators=None,
exclude_estimators=None,
include_preprocessors=None,
exclude_preprocessors=None,
include=None,
exclude=None,
resampling_strategy='holdout-iterative-fit',
resampling_strategy_arguments=None,
n_jobs=None,
Expand Down Expand Up @@ -157,10 +154,8 @@ def __init__(self,
self._memory_limit = memory_limit
self._data_memory_limit = None
self._metadata_directory = metadata_directory
self._include_estimators = include_estimators
self._exclude_estimators = exclude_estimators
self._include_preprocessors = include_preprocessors
self._exclude_preprocessors = exclude_preprocessors
self._include = include
self._exclude = exclude
self._resampling_strategy = resampling_strategy
self._scoring_functions = scoring_functions if scoring_functions is not None else []
self._resampling_strategy_arguments = resampling_strategy_arguments \
Expand Down Expand Up @@ -565,10 +560,8 @@ def fit(
self._logger.debug(' memory_limit: %s', str(self._memory_limit))
self._logger.debug(' metadata_directory: %s', self._metadata_directory)
self._logger.debug(' debug_mode: %s', self._debug_mode)
self._logger.debug(' include_estimators: %s', str(self._include_estimators))
self._logger.debug(' exclude_estimators: %s', str(self._exclude_estimators))
self._logger.debug(' include_preprocessors: %s', str(self._include_preprocessors))
self._logger.debug(' exclude_preprocessors: %s', str(self._exclude_preprocessors))
self._logger.debug(' include: %s', str(self._include))
self._logger.debug(' exclude: %s', str(self._exclude))
self._logger.debug(' resampling_strategy: %s', str(self._resampling_strategy))
self._logger.debug(' resampling_strategy_arguments: %s',
str(self._resampling_strategy_arguments))
Expand Down Expand Up @@ -629,10 +622,9 @@ def fit(
self._backend.temporary_directory,
self._backend,
datamanager,
include_estimators=self._include_estimators,
exclude_estimators=self._exclude_estimators,
include_preprocessors=self._include_preprocessors,
exclude_preprocessors=self._exclude_preprocessors)
include=self._include,
exclude=self._exclude,
)
if only_return_configuration_space:
self._fit_cleanup()
return self.configuration_space
Expand Down Expand Up @@ -748,10 +740,8 @@ def fit(
metric=self._metric,
resampling_strategy=self._resampling_strategy,
resampling_strategy_args=self._resampling_strategy_arguments,
include_estimators=self._include_estimators,
exclude_estimators=self._exclude_estimators,
include_preprocessors=self._include_preprocessors,
exclude_preprocessors=self._exclude_preprocessors,
include=self._include,
exclude=self._exclude,
disable_file_output=self._disable_evaluator_output,
get_smac_object_callback=self._get_smac_object_callback,
smac_scenario_args=self._smac_scenario_args,
Expand Down Expand Up @@ -1088,21 +1078,11 @@ def fit_pipeline(
config = Configuration(self.configuration_space, config)
config.config_id = self.num_run

# Get the components to include and exclude on the configuration space
# from the estimator attributes
include, exclude = parse_include_exclude_components(
task=self._task,
include_estimators=self._include_estimators,
exclude_estimators=self._exclude_estimators,
include_preprocessors=self._include_preprocessors,
exclude_preprocessors=self._exclude_preprocessors,
)

# Prepare missing components to the TAE function call
if 'include' not in kwargs:
kwargs['include'] = include
kwargs['include'] = self._include
if 'exclude' not in kwargs:
kwargs['exclude'] = exclude
kwargs['exclude'] = self._exclude
if 'memory_limit' not in kwargs:
kwargs['memory_limit'] = self._memory_limit
if 'resampling_strategy' not in kwargs:
Expand Down Expand Up @@ -1575,20 +1555,18 @@ def show_models(self):
return sio.getvalue()

def _create_search_space(self, tmp_dir, backend, datamanager,
include_estimators=None,
exclude_estimators=None,
include_preprocessors=None,
exclude_preprocessors=None):
include=None,
exclude=None,
):
task_name = 'CreateConfigSpace'

self._stopwatch.start_task(task_name)
configspace_path = os.path.join(tmp_dir, 'space.json')
configuration_space = pipeline.get_configuration_space(
datamanager.info,
include_estimators=include_estimators,
exclude_estimators=exclude_estimators,
include_preprocessors=include_preprocessors,
exclude_preprocessors=exclude_preprocessors)
include=include,
exclude=exclude,
)
configuration_space = self.configuration_space_created_hook(
datamanager, configuration_space)
backend.write_txt_file(
Expand Down
8 changes: 4 additions & 4 deletions autosklearn/data/abstract_data_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@

import scipy.sparse

from autosklearn.pipeline.components.data_preprocessing.data_preprocessing \
import DataPreprocessor
from autosklearn.pipeline.components.data_preprocessing.feature_type \
import FeatTypeSplit


class AbstractDataManager():
Expand Down Expand Up @@ -39,11 +39,11 @@ def feat_type(self, value: Dict[Union[str, int], str]) -> None:
self._feat_type = value

@property
def encoder(self) -> DataPreprocessor:
def encoder(self) -> FeatTypeSplit:
return self._encoder

@encoder.setter
def encoder(self, value: DataPreprocessor) -> DataPreprocessor:
def encoder(self, value: FeatTypeSplit) -> FeatTypeSplit:
self._encoder = value

def __repr__(self) -> str:
Expand Down
44 changes: 16 additions & 28 deletions autosklearn/estimators.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,10 +33,8 @@ def __init__(
max_models_on_disc=50,
seed=1,
memory_limit=3072,
include_estimators=None,
exclude_estimators=None,
include_preprocessors=None,
exclude_preprocessors=None,
include=None,
exclude=None,
resampling_strategy='holdout',
resampling_strategy_arguments=None,
tmp_folder=None,
Expand Down Expand Up @@ -102,22 +100,16 @@ def __init__(
In case of multi-processing, `memory_limit` will be per job.
This memory limit also applies to the ensemble creation process.
include_estimators : list, optional (None)
If None, all possible estimators are used. Otherwise specifies
set of estimators to use.
include : dict, optional (None)
If None, all possible algorithms are used. Otherwise specifies
set of algorithms for each added component is used. Include and
exclude are incompatible if used together on the same component
exclude_estimators : list, optional (None)
If None, all possible estimators are used. Otherwise specifies
set of estimators not to use. Incompatible with include_estimators.
include_preprocessors : list, optional (None)
If None all possible preprocessors are used. Otherwise specifies set
of preprocessors to use.
exclude_preprocessors : list, optional (None)
If None all possible preprocessors are used. Otherwise specifies set
of preprocessors not to use. Incompatible with
include_preprocessors.
exclude : dict, optional (None)
If None, all possible algorithms are used. Otherwise specifies
set of algorithms for each added component is not used.
Incompatible with include. Include and exclude are incompatible
if used together on the same component
resampling_strategy : string or object, optional ('holdout')
how to to handle overfitting, might need 'resampling_strategy_arguments'
Expand Down Expand Up @@ -161,7 +153,7 @@ def __init__(
folder to store configuration output and log files, if ``None``
automatically use ``/tmp/autosklearn_tmp_$pid_$random_number``
delete_tmp_folder_after_terminate: string, optional (True)
delete_tmp_folder_after_terminate: bool, optional (True)
remove tmp_folder, when finished. If tmp_folder is None
tmp_dir will always be deleted
Expand Down Expand Up @@ -254,10 +246,8 @@ def __init__(
self.max_models_on_disc = max_models_on_disc
self.seed = seed
self.memory_limit = memory_limit
self.include_estimators = include_estimators
self.exclude_estimators = exclude_estimators
self.include_preprocessors = include_preprocessors
self.exclude_preprocessors = exclude_preprocessors
self.include = include
self.exclude = exclude
self.resampling_strategy = resampling_strategy
self.resampling_strategy_arguments = resampling_strategy_arguments
self.tmp_folder = tmp_folder
Expand Down Expand Up @@ -309,10 +299,8 @@ def build_automl(self):
max_models_on_disc=self.max_models_on_disc,
seed=self.seed,
memory_limit=self.memory_limit,
include_estimators=self.include_estimators,
exclude_estimators=self.exclude_estimators,
include_preprocessors=self.include_preprocessors,
exclude_preprocessors=self.exclude_preprocessors,
include=self.include,
exclude=self.exclude,
resampling_strategy=self.resampling_strategy,
resampling_strategy_arguments=self.resampling_strategy_arguments,
n_jobs=self._n_jobs,
Expand Down
3 changes: 1 addition & 2 deletions autosklearn/evaluation/abstract_evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -241,8 +241,7 @@ def __init__(
self.predict_function = self._predict_proba

self._init_params = {
'data_preprocessing:feat_type':
self.datamanager.feat_type
'data_preprocessor:feat_type': self.datamanager.feat_type
}

if init_params is not None:
Expand Down
8 changes: 4 additions & 4 deletions autosklearn/experimental/askl2.py
Original file line number Diff line number Diff line change
Expand Up @@ -305,6 +305,8 @@ def __init__(
'extra_trees', 'passive_aggressive', 'random_forest', 'sgd', 'gradient_boosting', 'mlp',
]
include_preprocessors = ["no_preprocessing"]
include = {'classifier': include_estimators,
'feature_preprocessor': include_preprocessors}
super().__init__(
time_left_for_this_task=time_left_for_this_task,
per_run_time_limit=per_run_time_limit,
Expand All @@ -314,10 +316,8 @@ def __init__(
max_models_on_disc=max_models_on_disc,
seed=seed,
memory_limit=memory_limit,
include_estimators=include_estimators,
exclude_estimators=None,
include_preprocessors=include_preprocessors,
exclude_preprocessors=None,
include=include,
exclude=None,
resampling_strategy=None,
resampling_strategy_arguments=None,
tmp_folder=tmp_folder,
Expand Down

0 comments on commit e7b5daa

Please sign in to comment.