Enabled pipeline fit #1096

Merged
merged 5 commits on Apr 13, 2021
Changes from 1 commit
100 changes: 69 additions & 31 deletions autosklearn/automl.py
@@ -14,6 +14,7 @@
import warnings
import tempfile

from ConfigSpace.configuration_space import Configuration
from ConfigSpace.read_and_write import json as cs_json
import dask
import dask.distributed
@@ -250,6 +251,11 @@ def __init__(self,
# After assigning and checking variables...
# self._backend = Backend(self._output_dir, self._tmp_dir)

# num_run tells us how many runs have been launched.
# It can be seen as an identifier for each configuration
# saved to disk.
self.num_run = 0

def _create_dask_client(self):
self._is_dask_client_internally_created = True
self._dask_client = dask.distributed.Client(
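
To make the bookkeeping easier to follow, here is a rough trace of how the num_run counter introduced in this hunk is meant to evolve over one fit, based only on the changes in this diff (the exact value returned by the dummy prediction is illustrative):

    # Sketch: lifecycle of self.num_run across this PR
    self.num_run = 0                                                     # set in __init__
    self.num_run += self._do_dummy_prediction(datamanager, num_run=1)    # typically 0 -> 1
    # SMAC is then launched with start_num_run=self.num_run, so the runs it
    # creates never collide with the dummy run, and fit_pipeline() keeps
    # incrementing the counter for every user-supplied configuration.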
@@ -374,7 +380,7 @@ def _print_load_time(basename, time_left_for_this_task,
(basename, time_left_after_reading))
return time_for_load_data

def _do_dummy_prediction(self, datamanager, num_run):
def _do_dummy_prediction(self, datamanager: XYDataManager, num_run: int) -> int:

# When using partial-cv it makes no sense to do dummy predictions
if self._resampling_strategy in ['partial-cv',
@@ -440,6 +446,7 @@ def _do_dummy_prediction(self, datamanager, num_run):
"Dummy prediction failed with run state %s and additional output: %s."
% (str(status), str(additional_info))
)
return num_run

def fit(
self,
@@ -464,7 +471,12 @@
# The first thing we have to do is create the logger to update the backend
self._backend.setup_logger(self._logger_port)

self._backend.save_start_time(self._seed)
if not only_return_configuration_space:
# If we are only querying the configuration space, do not save the start time.
# Saving the start time is also the internal check that fit() is executed only
# once, and that check should not be armed when only the space is requested.
self._backend.save_start_time(self._seed)

self._stopwatch = StopWatch()

# Make sure that input is valid
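
A minimal sketch of the call pattern this guard is meant to support; the AutoML instance and data are assumed to exist already, and the argument names are taken from the signatures in this file, so treat it as illustrative:

    # Querying only the search space: no start time is written, so the
    # "fit() may only be executed once" check is not armed yet.
    cs = automl.fit(X, y, task=task, only_return_configuration_space=True)

    # The real fit afterwards saves the start time exactly once.
    automl.fit(X, y, task=task)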
@@ -621,10 +633,6 @@ def fit(
time_for_load_data,
self._logger)

# == Perform dummy predictions
num_run = 1
self._do_dummy_prediction(datamanager, num_run)

# = Create a searchspace
# Do this before One Hot Encoding to make sure that it creates a
# search space for a dense classifier even if one hot encoding would
@@ -641,9 +649,13 @@
include_preprocessors=self._include_preprocessors,
exclude_preprocessors=self._exclude_preprocessors)
if only_return_configuration_space:
self._close_dask_client()
self._fit_cleanup()
return self.configuration_space

# == Perform dummy predictions
# Dummy predictions always have num_run set to 1
self.num_run += self._do_dummy_prediction(datamanager, num_run=1)

# == RUN ensemble builder
# Do this before calculating the meta-features to make sure that the
# dummy predictions are actually included in the ensemble even if
@@ -743,7 +755,7 @@
watcher=self._stopwatch,
n_jobs=self._n_jobs,
dask_client=self._dask_client,
start_num_run=num_run,
start_num_run=self.num_run,
num_metalearning_cfgs=self._initial_configurations_via_metalearning,
config_file=configspace_path,
seed=self._seed,
@@ -805,15 +817,19 @@ def fit(
self._load_models()
self._logger.info("Finished loading models...")

self._fit_cleanup()

return self

def _fit_cleanup(self):
self._logger.info("Closing the dask infrastructure")
self._close_dask_client()
self._logger.info("Finished closing the dask infrastructure")

# Clean up the logger
self._logger.info("Starting to clean up the logger")
self._clean_logger()

return self
return

@staticmethod
def subsample_if_too_large(X, y, logger, seed, memory_limit, task):
@@ -926,6 +942,9 @@ def fit_pipeline(
y: np.ndarray,
task: int,
is_classification: bool,
dataset_name: Optional[str] = None,
X_test: Optional[np.ndarray] = None,
y_test: Optional[np.ndarray] = None,
**kwargs: Dict,
) -> Tuple[Optional[BasePipeline], RunInfo, RunValue]:
""" Fits and individual pipeline configuration and returns
@@ -939,25 +958,31 @@

Parameters
----------
X: array-like, shape = (n_samples, n_features)
The features used for training
y: array-like
The labels used for training
task: int
The type of task, taken from autosklearn.constants
is_classification: bool
Whether the task is for classification or regression. This affects
how the targets are treated
X: array-like, shape = (n_samples, n_features)
The features used for training
y: array-like
The labels used for training
X_test: Optional array-like, shape = (n_samples, n_features)
If provided, the test performance will be tracked on these features.
y_test: Optional array-like
If provided, the test performance will be tracked on these labels.
config: Configuration
A configuration object used to define the pipeline steps
dataset_name: Optional[str]
A string to tag and identify the Auto-Sklearn run
is_classification: bool
Whether the task is for classification or regression. This affects
how the targets are treated

Returns
-------
pipeline: Optional[BasePipeline]
The fitted pipeline. In case of failure while fitting the pipeline,
None is returned.
run_info: RunInfo
A named tuple that contains the configuration launched
run_value: RunValue
A named tuple that contains the result of the run
"""

# Get the configuration space
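
For orientation, a minimal usage sketch of the new entry point. The public estimator wrapper and the toy dataset used here are assumptions for illustration; the keyword arguments follow the docstring above:

    from sklearn.datasets import load_breast_cancer
    from sklearn.model_selection import train_test_split
    from autosklearn.classification import AutoSklearnClassifier  # assumed public wrapper

    X, y = load_breast_cancer(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

    automl = AutoSklearnClassifier(time_left_for_this_task=60)
    # No config is passed, so a configuration is sampled from the search space.
    pipeline, run_info, run_value = automl.fit_pipeline(
        X=X_train, y=y_train,
        X_test=X_test, y_test=y_test,
        dataset_name='breast_cancer',
    )
    if pipeline is not None:
        print(run_value.cost)  # validation loss of the fitted pipeline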
@@ -966,19 +991,18 @@
if self.configuration_space is None:
self.configuration_space = self.fit(
X=X, y=y, task=task,
dataset_name=dataset_name,
X_test=kwargs.pop('X_test', None),
y_test=kwargs.pop('y_test', None),
feat_type=kwargs.pop('feat_type', None),
dataset_name=kwargs.pop('dataset_name', None),
only_return_configuration_space=True)

# Get a configuration from the user or sample from the CS
config = kwargs.pop('config', self.configuration_space.sample_configuration())

# A config id is expected in the TAE evaluation
if config.config_id is None:
# We do not want to overwrite existing
config.config_id = self._backend.get_highest_num_run() + 1
# We do not want to overwrite existing runs
self.num_run += 1
config.config_id = self.num_run

exclude = dict()
include = dict()
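
The config_id handling above can also be exercised with an explicit configuration; a hedged sketch against the internal AutoML class shown in this file (variable names are illustrative):

    # Build the space without triggering a full fit, then evaluate one sample of it.
    cs = automl.fit(X, y, task=task, only_return_configuration_space=True)
    config = cs.sample_configuration()    # a ConfigSpace Configuration, config_id is None
    pipeline, run_info, run_value = automl.fit_pipeline(
        X=X, y=y, task=task, is_classification=True, config=config,
    )
    # Internally, self.num_run is bumped and stored as config.config_id, so this
    # run is written to disk under an id that cannot collide with earlier runs.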
@@ -1585,6 +1609,10 @@ def fit_pipeline(
self,
X: np.ndarray,
y: np.ndarray,
config: Configuration,
dataset_name: Optional[str] = None,
X_test: Optional[np.ndarray] = None,
y_test: Optional[np.ndarray] = None,
**kwargs,
) -> Tuple[Optional[BasePipeline], RunInfo, RunValue]:
y_task = type_of_target(y)
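
The classifier override infers the task from the targets before delegating. As a reminder of what sklearn's type_of_target reports (the mapping to autosklearn.constants happens in the lines that follow this hunk):

    from sklearn.utils.multiclass import type_of_target

    type_of_target([0, 1, 1, 0])        # 'binary'
    type_of_target([0, 2, 1, 3])        # 'multiclass'
    type_of_target([[1, 0], [0, 1]])    # 'multilabel-indicator'
    # Each string is mapped to a task constant such as BINARY_CLASSIFICATION
    # before super().fit_pipeline(..., is_classification=True) is called.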
@@ -1600,6 +1628,9 @@

return super().fit_pipeline(
X=X, y=y,
X_test=X_test, y_test=y_test,
dataset_name=dataset_name,
config=config,
task=task,
is_classification=True,
**kwargs,
@@ -1670,6 +1701,10 @@ def fit_pipeline(
self,
X: np.ndarray,
y: np.ndarray,
config: Configuration,
dataset_name: Optional[str] = None,
X_test: Optional[np.ndarray] = None,
y_test: Optional[np.ndarray] = None,
**kwargs: Dict,
) -> Tuple[Optional[BasePipeline], RunInfo, RunValue]:

@@ -1685,7 +1720,10 @@

return super().fit_pipeline(
X=X, y=y,
X_test=X_test, y_test=y_test,
config=config,
task=task,
dataset_name=dataset_name,
is_classification=False,
**kwargs,
)
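
And the regression counterpart, again as a hedged sketch; the public wrapper name and the toy dataset are assumptions:

    from sklearn.datasets import load_diabetes
    from autosklearn.regression import AutoSklearnRegressor  # assumed public wrapper

    X, y = load_diabetes(return_X_y=True)
    automl = AutoSklearnRegressor(time_left_for_this_task=60)
    pipeline, run_info, run_value = automl.fit_pipeline(X=X, y=y, dataset_name='diabetes')
    # type_of_target(y) reports 'continuous', so the override selects the regression
    # task and passes is_classification=False down to AutoML.fit_pipeline.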