Enabled pipeline fit #1096

Merged
merged 5 commits on Apr 13, 2021
Changes from 1 commit
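This PR makes `fit_pipeline` a usable public entry point for fitting a single pipeline configuration outside the full SMAC loop, and widens the accepted input types from bare `np.ndarray` to the `SUPPORTED_FEAT_TYPES` / `SUPPORTED_TARGET_TYPES` aliases used by the input validator. A minimal usage sketch of the new API, assuming an `AutoSklearnClassifier` on the iris dataset (dataset, estimator settings, and variable names are illustrative, not taken from this diff):

import sklearn.datasets

from autosklearn.classification import AutoSklearnClassifier

X_train, y_train = sklearn.datasets.load_iris(return_X_y=True)

automl = AutoSklearnClassifier(time_left_for_this_task=120)

# Sample one configuration from the search space and fit only that pipeline;
# alongside the fitted pipeline, SMAC's RunInfo and RunValue describe the run.
config = automl.get_configuration_space(
    X_train, y_train, dataset_name='iris',
).sample_configuration()
pipeline, run_info, run_value = automl.fit_pipeline(
    X=X_train, y=y_train, config=config, dataset_name='iris',
)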
157 changes: 82 additions & 75 deletions autosklearn/automl.py
@@ -8,7 +8,7 @@
import os
import sys
import time
-from typing import Any, Dict, Optional, List, Tuple, Union
+from typing import Any, Dict, Optional, List, Tuple
import uuid
import unittest.mock
import warnings
@@ -38,7 +38,11 @@

from autosklearn.metrics import Scorer
from autosklearn.data.xy_data_manager import XYDataManager
-from autosklearn.data.validation import InputValidator
+from autosklearn.data.validation import (
+InputValidator,
+SUPPORTED_FEAT_TYPES,
+SUPPORTED_TARGET_TYPES,
+)
from autosklearn.evaluation import ExecuteTaFuncWithQueue, get_cost_of_crash
from autosklearn.evaluation.abstract_evaluator import _fit_and_suppress_warnings
from autosklearn.evaluation.train_evaluator import _fit_with_budget
@@ -208,6 +212,7 @@ def __init__(self,

self._datamanager = None
self._dataset_name = None
+self._feat_type = None
self._stopwatch = StopWatch()
self._logger = None
self._task = None
@@ -450,11 +455,11 @@ def _do_dummy_prediction(self, datamanager: XYDataManager, num_run: int) -> int:

def fit(
self,
-X: np.ndarray,
-y: np.ndarray,
+X: SUPPORTED_FEAT_TYPES,
+y: SUPPORTED_TARGET_TYPES,
task: int,
-X_test: Optional[np.ndarray] = None,
-y_test: Optional[np.ndarray] = None,
+X_test: Optional[SUPPORTED_FEAT_TYPES] = None,
+y_test: Optional[SUPPORTED_TARGET_TYPES] = None,
feat_type: Optional[List[str]] = None,
dataset_name: Optional[str] = None,
only_return_configuration_space: Optional[bool] = False,
@@ -530,7 +535,9 @@ def fit(
self._stopwatch.start_task(self._dataset_name)

if feat_type is None and self.InputValidator.feature_validator.feat_type:
-feat_type = self.InputValidator.feature_validator.feat_type
+self._feat_type = self.InputValidator.feature_validator.feat_type
+elif feat_type is not None:
+self._feat_type = feat_type

# Produce debug information to the logfile
self._logger.debug('Starting to print environment information')
@@ -612,7 +619,7 @@ def fit(
X_test=X_test,
y_test=y_test,
task=task,
-feat_type=feat_type,
+feat_type=self._feat_type,
dataset_name=dataset_name,
)

@@ -938,13 +945,14 @@ def refit(self, X, y):

def fit_pipeline(
self,
-X: np.ndarray,
-y: np.ndarray,
+X: SUPPORTED_FEAT_TYPES,
+y: SUPPORTED_TARGET_TYPES,
task: int,
+is_classification: bool,
config: Configuration,
dataset_name: Optional[str] = None,
-X_test: Optional[np.ndarray] = None,
-y_test: Optional[np.ndarray] = None,
+X_test: Optional[SUPPORTED_FEAT_TYPES] = None,
+y_test: Optional[SUPPORTED_TARGET_TYPES] = None,
**kwargs: Dict,
) -> Tuple[Optional[BasePipeline], RunInfo, RunValue]:
""" Fits and individual pipeline configuration and returns
@@ -991,46 +999,19 @@ def fit_pipeline(
if self.configuration_space is None:
self.configuration_space = self.fit(
X=X, y=y, task=task,
-dataset_name=dataset_name,
-X_test=kwargs.pop('X_test', None),
-y_test=kwargs.pop('y_test', None),
-feat_type=kwargs.pop('feat_type', None),
+dataset_name=dataset_name if dataset_name is not None else self._dataset_name,
+X_test=X_test,
+y_test=y_test,
+feat_type=kwargs.pop('feat_type', self._feat_type),
only_return_configuration_space=True)

-# Get a configuration from the user or sample from the CS
-config = kwargs.pop('config', self.configuration_space.sample_configuration())

# We do not want to overwrite existing runs
self.num_run += 1
config.config_id = self.num_run

-exclude = dict()
-include = dict()
-if self._include_preprocessors is not None and self._exclude_preprocessors is not None:
-raise ValueError('Cannot specify include_preprocessors and '
-'exclude_preprocessors.')
-elif self._include_preprocessors is not None:
-include['feature_preprocessor'] = self._include_preprocessors
-elif self._exclude_preprocessors is not None:
-exclude['feature_preprocessor'] = self._exclude_preprocessors
-
-if self._include_estimators is not None and self._exclude_estimators is not None:
-raise ValueError('Cannot specify include_estimators and '
-'exclude_estimators.')
-elif self._include_estimators is not None:
-if task in CLASSIFICATION_TASKS:
-include['classifier'] = self._include_estimators
-elif task in REGRESSION_TASKS:
-include['regressor'] = self._include_estimators
-else:
-raise ValueError(task)
-elif self._exclude_estimators is not None:
-if task in CLASSIFICATION_TASKS:
-exclude['classifier'] = self._exclude_estimators
-elif task in REGRESSION_TASKS:
-exclude['regressor'] = self._exclude_estimators
-else:
-raise ValueError(task)
+# Get the components to include and exclude on the configuration space
+# from the estimator attributes
+include, exclude = self._get_include_exclude_pipeline_dicts()

# Prepare missing components to the TAE function call
if 'include' not in kwargs:
@@ -1053,9 +1034,6 @@
kwargs['stats'] = Stats(scenario_mock)
kwargs['stats'].start_timing()

-# Allow to pass the cutoff also as an argument
-cutoff = kwargs.pop('cutoff', self._per_run_time_limit)
-
# Fit a pipeline, which will be stored on disk
# which we can later load via the backend
ta = ExecuteTaFuncWithQueue(
@@ -1074,30 +1052,59 @@
instance=None,
instance_specific=None,
seed=self._seed,
-cutoff=cutoff,
+cutoff=kwargs.pop('cutoff', self._per_run_time_limit),
capped=False,
)
)

pipeline = None
-if run_value.status == StatusType.SUCCESS:
+if kwargs['disable_file_output'] or kwargs['resampling_strategy'] == 'test':
+self._logger.warning("File output is disabled. No pipeline can be returned")
+elif run_value.status == StatusType.SUCCESS:
if kwargs['resampling_strategy'] in ('cv', 'cv-iterative-fit'):
load_function = self._backend.load_cv_model_by_seed_and_id_and_budget
else:
load_function = self._backend.load_model_by_seed_and_id_and_budget
-try:
-pipeline = load_function(
-seed=self._seed,
-idx=run_info.config.config_id + 1,
-budget=run_info.budget,
-)
-except Exception as e:
-self._logger.warning(f"Cannot load pipeline because of {e}")
+pipeline = load_function(
+seed=self._seed,
+idx=run_info.config.config_id + 1,
+budget=run_info.budget,
+)

self._clean_logger()

return pipeline, run_info, run_value

+def _get_include_exclude_pipeline_dicts(self):
+exclude = dict()
+include = dict()
+if self._include_preprocessors is not None and self._exclude_preprocessors is not None:
+raise ValueError('Cannot specify include_preprocessors and '
+'exclude_preprocessors.')
+elif self._include_preprocessors is not None:
+include['feature_preprocessor'] = self._include_preprocessors
+elif self._exclude_preprocessors is not None:
+exclude['feature_preprocessor'] = self._exclude_preprocessors
+
+if self._include_estimators is not None and self._exclude_estimators is not None:
+raise ValueError('Cannot specify include_estimators and '
+'exclude_estimators.')
+elif self._include_estimators is not None:
+if self._task in CLASSIFICATION_TASKS:
+include['classifier'] = self._include_estimators
+elif self._task in REGRESSION_TASKS:
+include['regressor'] = self._include_estimators
+else:
+raise ValueError(self._task)
+elif self._exclude_estimators is not None:
+if self._task in CLASSIFICATION_TASKS:
+exclude['classifier'] = self._exclude_estimators
+elif self._task in REGRESSION_TASKS:
+exclude['regressor'] = self._exclude_estimators
+else:
+raise ValueError(self._task)
+return include, exclude

def predict(self, X, batch_size=None, n_jobs=1):
"""predict.

@@ -1571,10 +1578,10 @@ def __init__(self, *args, **kwargs):

def fit(
self,
-X: Union[np.ndarray, pd.DataFrame],
-y: Union[np.ndarray, pd.DataFrame],
-X_test: Optional[Union[np.ndarray, pd.DataFrame]] = None,
-y_test: Optional[Union[np.ndarray, pd.DataFrame]] = None,
+X: SUPPORTED_FEAT_TYPES,
+y: SUPPORTED_TARGET_TYPES,
+X_test: Optional[SUPPORTED_FEAT_TYPES] = None,
+y_test: Optional[SUPPORTED_TARGET_TYPES] = None,
feat_type: Optional[List[bool]] = None,
dataset_name: Optional[str] = None,
only_return_configuration_space: bool = False,
@@ -1607,12 +1614,12 @@ def fit(

def fit_pipeline(
self,
-X: np.ndarray,
-y: np.ndarray,
+X: SUPPORTED_FEAT_TYPES,
+y: SUPPORTED_TARGET_TYPES,
config: Configuration,
dataset_name: Optional[str] = None,
-X_test: Optional[np.ndarray] = None,
-y_test: Optional[np.ndarray] = None,
+X_test: Optional[SUPPORTED_FEAT_TYPES] = None,
+y_test: Optional[SUPPORTED_TARGET_TYPES] = None,
**kwargs,
) -> Tuple[Optional[BasePipeline], RunInfo, RunValue]:
y_task = type_of_target(y)
@@ -1663,10 +1670,10 @@ def __init__(self, *args, **kwargs):

def fit(
self,
-X: Union[np.ndarray, pd.DataFrame],
-y: Union[np.ndarray, pd.DataFrame],
-X_test: Optional[Union[np.ndarray, pd.DataFrame]] = None,
-y_test: Optional[Union[np.ndarray, pd.DataFrame]] = None,
+X: SUPPORTED_FEAT_TYPES,
+y: SUPPORTED_TARGET_TYPES,
+X_test: Optional[SUPPORTED_FEAT_TYPES] = None,
+y_test: Optional[SUPPORTED_TARGET_TYPES] = None,
feat_type: Optional[List[bool]] = None,
dataset_name: Optional[str] = None,
only_return_configuration_space: bool = False,
@@ -1699,12 +1706,12 @@ def fit(

def fit_pipeline(
self,
-X: np.ndarray,
-y: np.ndarray,
+X: SUPPORTED_FEAT_TYPES,
+y: SUPPORTED_TARGET_TYPES,
config: Configuration,
dataset_name: Optional[str] = None,
-X_test: Optional[np.ndarray] = None,
-y_test: Optional[np.ndarray] = None,
+X_test: Optional[SUPPORTED_FEAT_TYPES] = None,
+y_test: Optional[SUPPORTED_TARGET_TYPES] = None,
**kwargs: Dict,
) -> Tuple[Optional[BasePipeline], RunInfo, RunValue]:

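The include/exclude handling was moved out of `fit_pipeline` into the new `_get_include_exclude_pipeline_dicts` helper above, which now reads the task from `self._task` instead of taking a `task` argument. A standalone sketch of the helper's estimator branch, simplified and outside the class (the function name and signature are illustrative assumptions, not part of this diff):

from typing import Dict, List, Optional, Tuple

def build_include_exclude(
        include_estimators: Optional[List[str]],
        exclude_estimators: Optional[List[str]],
        is_classification: bool,
) -> Tuple[Dict[str, List[str]], Dict[str, List[str]]]:
    # At most one of include/exclude may constrain the estimator component.
    include: Dict[str, List[str]] = {}
    exclude: Dict[str, List[str]] = {}
    if include_estimators is not None and exclude_estimators is not None:
        raise ValueError('Cannot specify include_estimators and exclude_estimators.')
    component = 'classifier' if is_classification else 'regressor'
    if include_estimators is not None:
        include[component] = include_estimators
    elif exclude_estimators is not None:
        exclude[component] = exclude_estimators
    return include, exclude

# build_include_exclude(['random_forest'], None, True)
# -> ({'classifier': ['random_forest']}, {})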
27 changes: 19 additions & 8 deletions autosklearn/estimators.py
@@ -10,6 +10,10 @@
from sklearn.utils.multiclass import type_of_target
from smac.runhistory.runhistory import RunInfo, RunValue

+from autosklearn.data.validation import (
+SUPPORTED_FEAT_TYPES,
+SUPPORTED_TARGET_TYPES,
+)
from autosklearn.pipeline.base import BasePipeline
from autosklearn.automl import AutoMLClassifier, AutoMLRegressor, AutoML
from autosklearn.metrics import Scorer
@@ -343,12 +347,12 @@ def fit(self, **kwargs):

def fit_pipeline(
self,
-X: np.ndarray,
-y: np.ndarray,
+X: SUPPORTED_FEAT_TYPES,
+y: SUPPORTED_TARGET_TYPES,
config: Configuration,
dataset_name: Optional[str] = None,
-X_test: Optional[np.ndarray] = None,
-y_test: Optional[np.ndarray] = None,
+X_test: Optional[SUPPORTED_FEAT_TYPES] = None,
+y_test: Optional[SUPPORTED_TARGET_TYPES] = None,
*args,
**kwargs: Dict,
) -> Tuple[Optional[BasePipeline], RunInfo, RunValue]:
@@ -546,8 +550,10 @@ def _get_automl_class(self):

def get_configuration_space(
self,
-X: np.ndarray,
-y: np.ndarray,
+X: SUPPORTED_FEAT_TYPES,
+y: SUPPORTED_TARGET_TYPES,
+X_test: Optional[SUPPORTED_FEAT_TYPES] = None,
+y_test: Optional[SUPPORTED_TARGET_TYPES] = None,
dataset_name: Optional[str] = None,
):
"""
Expand All @@ -556,18 +562,23 @@ def get_configuration_space(

Parameters
----------
-X: (np.ndarray)
+X : array-like or sparse matrix of shape = [n_samples, n_features]
Array with the training features, used to get characteristics like
data sparsity
-y: (np.ndarray)
+y : array-like, shape = [n_samples] or [n_samples, n_outputs]
Array with the problem labels
+X_test : array-like or sparse matrix of shape = [n_samples, n_features]
+Array with features used for performance estimation
+y_test : array-like, shape = [n_samples] or [n_samples, n_outputs]
+Array with the problem labels for the testing split
dataset_name: Optional[str]
A string to tag the Auto-Sklearn run
"""
if self.automl_ is None:
self.automl_ = self.build_automl()
return self.automl_.fit(
X, y,
+X_test=X_test, y_test=y_test,
dataset_name=dataset_name,
only_return_configuration_space=True,
) if self.automl_.configuration_space is None else self.automl_.configuration_space
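A usage sketch for the extended `get_configuration_space` signature (the dataset and estimator settings are assumptions; only the `X_test`/`y_test` keywords are new in this diff):

import sklearn.datasets
import sklearn.model_selection

from autosklearn.classification import AutoSklearnClassifier

X, y = sklearn.datasets.load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    X, y, random_state=1)

automl = AutoSklearnClassifier(time_left_for_this_task=120)

# The test split is forwarded to the internal
# fit(..., only_return_configuration_space=True) call, so the space is built
# from the same data later used for performance estimation.
cs = automl.get_configuration_space(
    X_train, y_train,
    X_test=X_test, y_test=y_test,
    dataset_name='breast_cancer',
)
print(cs)  # a ConfigSpace object describing the search space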
4 changes: 2 additions & 2 deletions scripts/run_auto-sklearn_for_metadata_generation.py
@@ -141,8 +141,8 @@
memory_lim = memory_limit_factor * automl_arguments['memory_limit']

pipeline, run_info, run_value = automl.fit_pipeline(
-X=X_train, y=y_train, dataset_name=dataset_name,
-feat_type=cat, X_test=X_test, y_test=y_test,
+X=X_train, y=y_train,
+X_test=X_test, y_test=y_test,
resampling_strategy='test',
memory_limit=memory_lim,
disable_file_output=True,
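The script can drop `feat_type` and `dataset_name` here because of the fallback added in `automl.py` above: values captured during `fit` are stored on the instance (`self._feat_type`, `self._dataset_name`) and reused by `fit_pipeline` unless explicitly overridden. A minimal standalone sketch of that pattern (class and names are illustrative, not auto-sklearn code):

from typing import Optional

class RemembersFitArguments:
    # Toy model of the fallback introduced in AutoML.fit_pipeline.
    def __init__(self) -> None:
        self._dataset_name: Optional[str] = None

    def fit(self, dataset_name: str) -> None:
        self._dataset_name = dataset_name

    def fit_pipeline(self, dataset_name: Optional[str] = None) -> Optional[str]:
        # An explicit argument wins; otherwise fall back to the stored value.
        return dataset_name if dataset_name is not None else self._dataset_name

automl = RemembersFitArguments()
automl.fit('adult')
assert automl.fit_pipeline() == 'adult'         # falls back to the stored name
assert automl.fit_pipeline('other') == 'other'  # explicit override wins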