From 5c4d66a0e58fb8f1daa3865063203a7b9592e35d Mon Sep 17 00:00:00 2001 From: Yassine Morakakam Date: Fri, 6 Apr 2018 15:01:28 +0200 Subject: [PATCH 1/2] save AutoML models --- autosklearn/automl.py | 131 ++++++++++-------------------------------- 1 file changed, 30 insertions(+), 101 deletions(-) diff --git a/autosklearn/automl.py b/autosklearn/automl.py index 85eb067ddf..f4c92d2432 100644 --- a/autosklearn/automl.py +++ b/autosklearn/automl.py @@ -10,8 +10,6 @@ import numpy.ma as ma import scipy.stats from sklearn.base import BaseEstimator -from sklearn.model_selection._split import _RepeatedSplits, \ - BaseShuffleSplit, BaseCrossValidator from smac.tae.execute_ta_run import StatusType from smac.stats.stats import Stats from sklearn.externals import joblib @@ -135,16 +133,11 @@ def __init__(self, # After assignging and checking variables... #self._backend = Backend(self._output_dir, self._tmp_dir) - def fit( - self, X, y, - task, - metric, - X_test=None, - y_test=None, - feat_type=None, - dataset_name=None, - only_return_configuration_space=False, - ): + def fit(self, X, y, + task=MULTICLASS_CLASSIFICATION, + metric=None, + feat_type=None, + dataset_name=None): if not self._shared_mode: self._backend.context.delete_directories() else: @@ -188,22 +181,13 @@ def fit( 'valid feature types, you passed `%s`' % ft) self._data_memory_limit = None - loaded_data_manager = XYDataManager( - X, y, - X_test=X_test, - y_test=y_test, - task=task, - feat_type=feat_type, - dataset_name=dataset_name, - ) + loaded_data_manager = XYDataManager(X, y, + task=task, + feat_type=feat_type, + dataset_name=dataset_name) - return self._fit( - loaded_data_manager, - metric, - only_return_configuration_space, - ) + return self._fit(loaded_data_manager, metric) - # TODO this is very old code which can be dropped! def fit_automl_dataset(self, dataset, metric): self._stopwatch = StopWatch() self._backend.save_start_time(self._seed) @@ -296,7 +280,7 @@ def _do_dummy_prediction(self, datamanager, num_run): return ta.num_run - def _fit(self, datamanager, metric, only_return_configuration_space=False): + def _fit(self, datamanager, metric): # Reset learnt stuff self.models_ = None self.ensemble_ = None @@ -312,13 +296,9 @@ def _fit(self, datamanager, metric, only_return_configuration_space=False): raise ValueError("List member '%s' for argument " "'disable_evaluator_output' must be one " "of " + str(allowed_elements)) - if self._resampling_strategy not in [ - 'holdout', 'holdout-iterative-fit', - 'cv', 'partial-cv', - 'partial-cv-iterative-fit'] \ - and not issubclass(self._resampling_strategy, BaseCrossValidator)\ - and not issubclass(self._resampling_strategy, _RepeatedSplits)\ - and not issubclass(self._resampling_strategy, BaseShuffleSplit): + if self._resampling_strategy not in ['holdout', 'holdout-iterative-fit', + 'cv', 'partial-cv', + 'partial-cv-iterative-fit']: raise ValueError('Illegal resampling strategy: %s' % self._resampling_strategy) if self._resampling_strategy in ['partial-cv', 'partial-cv-iterative-fit'] \ @@ -374,8 +354,6 @@ def _fit(self, datamanager, metric, only_return_configuration_space=False): exclude_estimators=self._exclude_estimators, include_preprocessors=self._include_preprocessors, exclude_preprocessors=self._exclude_preprocessors) - if only_return_configuration_space: - return self.configuration_space # == RUN ensemble builder # Do this before calculating the meta-features to make sure that the @@ -406,7 +384,7 @@ def _fit(self, datamanager, metric, only_return_configuration_space=False): del self._datamanager except Exception: pass - + # => RUN SMAC smac_task_name = 'runSMAC' self._stopwatch.start_task(smac_task_name) @@ -554,7 +532,7 @@ def predict(self, X, batch_size=None, n_jobs=1): # Each process computes predictions in chunks of batch_size rows. all_predictions = joblib.Parallel(n_jobs=n_jobs)( joblib.delayed(_model_predict)(self, X, batch_size, identifier) - for identifier in self.ensemble_.get_model_identifiers()) + for identifier in self.ensemble_.get_model_identifiers(self.models_)) if len(all_predictions) == 0: raise ValueError('Something went wrong generating the predictions. ' @@ -581,6 +559,7 @@ def fit_ensemble(self, y, task=None, metric=None, precision='32', ensemble_nbest=ensemble_nbest, ensemble_size=ensemble_size) self._proc_ensemble.main() self._proc_ensemble = None + self._load_models() return self def _get_ensemble_process(self, time_left_for_ensembles, @@ -628,8 +607,7 @@ def _get_ensemble_process(self, time_left_for_ensembles, seed=self._seed, shared_mode=self._shared_mode, precision=precision, - max_iterations=max_iterations, - read_at_most=np.inf) + max_iterations=max_iterations) def _load_models(self): if self._shared_mode: @@ -833,8 +811,7 @@ def __init__(self, *args, **kwargs): def _perform_input_checks(self, X, y): X = self._check_X(X) - if y is not None: - y = self._check_y(y) + y = self._check_y(y) return X, y def _check_X(self, X): @@ -888,21 +865,12 @@ def __init__(self, *args, **kwargs): 'multiclass': MULTICLASS_CLASSIFICATION, 'binary': BINARY_CLASSIFICATION} - def fit( - self, X, y, - X_test=None, - y_test=None, - metric=None, - feat_type=None, - dataset_name=None, - only_return_configuration_space=False, - ): + def fit(self, X, y, + metric=None, + loss=None, + feat_type=None, + dataset_name=None): X, y = self._perform_input_checks(X, y) - if X_test is not None: - X_test, y_test = self._perform_input_checks(X_test, y_test) - if len(y.shape) != len(y_test.shape): - raise ValueError('Target value shapes do not match: %s vs %s' - % (y.shape, y_test.shape)) y_task = type_of_target(y) task = self._task_mapping.get(y_task) @@ -916,31 +884,8 @@ def fit( metric = accuracy y, self._classes, self._n_classes = self._process_target_classes(y) - if y_test is not None: - # Map test values to actual values - TODO: copy to all kinds of - # other parts in this code and test it!!! - y_test_new = [] - for output_idx in range(len(self._classes)): - mapping = {self._classes[output_idx][idx]: idx - for idx in range(len(self._classes[output_idx]))} - enumeration = y_test if len(self._classes) == 1 else y_test[output_idx] - y_test_new.append( - np.array([mapping[value] for value in enumeration]) - ) - y_test = np.array(y_test_new) - if self._n_outputs == 1: - y_test = y_test.flatten() - - return super().fit( - X, y, - X_test=X_test, - y_test=y_test, - task=task, - metric=metric, - feat_type=feat_type, - dataset_name=dataset_name, - only_return_configuration_space=only_return_configuration_space, - ) + + return super().fit(X, y, task, metric, feat_type, dataset_name) def fit_ensemble(self, y, task=None, metric=None, precision='32', dataset_name=None, ensemble_nbest=None, @@ -973,7 +918,7 @@ def _process_target_classes(self, y): _classes.append(classes_k) _n_classes.append(classes_k.shape[0]) - _n_classes = np.array(_n_classes, dtype=np.int) + self._n_classes = np.array(_n_classes, dtype=np.int) return y, _classes, _n_classes @@ -1003,15 +948,7 @@ def predict_proba(self, X, batch_size=None, n_jobs=1): class AutoMLRegressor(BaseAutoML): - def fit( - self, X, y, - X_test=None, - y_test=None, - metric=None, - feat_type=None, - dataset_name=None, - only_return_configuration_space=False, - ): + def fit(self, X, y, metric=None, feat_type=None, dataset_name=None): X, y = super()._perform_input_checks(X, y) _n_outputs = 1 if len(y.shape) == 1 else y.shape[1] if _n_outputs > 1: @@ -1019,16 +956,8 @@ def fit( 'Multi-output regression is not implemented.') if metric is None: metric = r2 - return super().fit( - X, y, - X_test=X_test, - y_test=y_test, - task=REGRESSION, - metric=metric, - feat_type=feat_type, - dataset_name=dataset_name, - only_return_configuration_space=only_return_configuration_space, - ) + return super().fit(X, y, task=REGRESSION, metric=metric, + feat_type=feat_type, dataset_name=dataset_name) def fit_ensemble(self, y, task=None, metric=None, precision='32', dataset_name=None, ensemble_nbest=None, From 4d2e21c4bdf15c95f5e9dcf3561b69249def209c Mon Sep 17 00:00:00 2001 From: Yassine Morakakam Date: Fri, 6 Apr 2018 15:10:50 +0200 Subject: [PATCH 2/2] save AutoML models --- autosklearn/automl.py | 128 +++++++++++++++++++++++++++++++++--------- 1 file changed, 100 insertions(+), 28 deletions(-) diff --git a/autosklearn/automl.py b/autosklearn/automl.py index f4c92d2432..6a56950394 100644 --- a/autosklearn/automl.py +++ b/autosklearn/automl.py @@ -10,6 +10,8 @@ import numpy.ma as ma import scipy.stats from sklearn.base import BaseEstimator +from sklearn.model_selection._split import _RepeatedSplits, \ + BaseShuffleSplit, BaseCrossValidator from smac.tae.execute_ta_run import StatusType from smac.stats.stats import Stats from sklearn.externals import joblib @@ -133,11 +135,16 @@ def __init__(self, # After assignging and checking variables... #self._backend = Backend(self._output_dir, self._tmp_dir) - def fit(self, X, y, - task=MULTICLASS_CLASSIFICATION, - metric=None, - feat_type=None, - dataset_name=None): + def fit( + self, X, y, + task, + metric, + X_test=None, + y_test=None, + feat_type=None, + dataset_name=None, + only_return_configuration_space=False, + ): if not self._shared_mode: self._backend.context.delete_directories() else: @@ -181,13 +188,22 @@ def fit(self, X, y, 'valid feature types, you passed `%s`' % ft) self._data_memory_limit = None - loaded_data_manager = XYDataManager(X, y, - task=task, - feat_type=feat_type, - dataset_name=dataset_name) + loaded_data_manager = XYDataManager( + X, y, + X_test=X_test, + y_test=y_test, + task=task, + feat_type=feat_type, + dataset_name=dataset_name, + ) - return self._fit(loaded_data_manager, metric) + return self._fit( + loaded_data_manager, + metric, + only_return_configuration_space, + ) + # TODO this is very old code which can be dropped! def fit_automl_dataset(self, dataset, metric): self._stopwatch = StopWatch() self._backend.save_start_time(self._seed) @@ -280,7 +296,7 @@ def _do_dummy_prediction(self, datamanager, num_run): return ta.num_run - def _fit(self, datamanager, metric): + def _fit(self, datamanager, metric, only_return_configuration_space=False): # Reset learnt stuff self.models_ = None self.ensemble_ = None @@ -296,9 +312,13 @@ def _fit(self, datamanager, metric): raise ValueError("List member '%s' for argument " "'disable_evaluator_output' must be one " "of " + str(allowed_elements)) - if self._resampling_strategy not in ['holdout', 'holdout-iterative-fit', - 'cv', 'partial-cv', - 'partial-cv-iterative-fit']: + if self._resampling_strategy not in [ + 'holdout', 'holdout-iterative-fit', + 'cv', 'partial-cv', + 'partial-cv-iterative-fit'] \ + and not issubclass(self._resampling_strategy, BaseCrossValidator)\ + and not issubclass(self._resampling_strategy, _RepeatedSplits)\ + and not issubclass(self._resampling_strategy, BaseShuffleSplit): raise ValueError('Illegal resampling strategy: %s' % self._resampling_strategy) if self._resampling_strategy in ['partial-cv', 'partial-cv-iterative-fit'] \ @@ -354,6 +374,8 @@ def _fit(self, datamanager, metric): exclude_estimators=self._exclude_estimators, include_preprocessors=self._include_preprocessors, exclude_preprocessors=self._exclude_preprocessors) + if only_return_configuration_space: + return self.configuration_space # == RUN ensemble builder # Do this before calculating the meta-features to make sure that the @@ -532,7 +554,7 @@ def predict(self, X, batch_size=None, n_jobs=1): # Each process computes predictions in chunks of batch_size rows. all_predictions = joblib.Parallel(n_jobs=n_jobs)( joblib.delayed(_model_predict)(self, X, batch_size, identifier) - for identifier in self.ensemble_.get_model_identifiers(self.models_)) + for identifier in self.ensemble_.get_model_identifiers()) if len(all_predictions) == 0: raise ValueError('Something went wrong generating the predictions. ' @@ -607,7 +629,8 @@ def _get_ensemble_process(self, time_left_for_ensembles, seed=self._seed, shared_mode=self._shared_mode, precision=precision, - max_iterations=max_iterations) + max_iterations=max_iterations, + read_at_most=np.inf) def _load_models(self): if self._shared_mode: @@ -811,7 +834,8 @@ def __init__(self, *args, **kwargs): def _perform_input_checks(self, X, y): X = self._check_X(X) - y = self._check_y(y) + if y is not None: + y = self._check_y(y) return X, y def _check_X(self, X): @@ -865,12 +889,21 @@ def __init__(self, *args, **kwargs): 'multiclass': MULTICLASS_CLASSIFICATION, 'binary': BINARY_CLASSIFICATION} - def fit(self, X, y, - metric=None, - loss=None, - feat_type=None, - dataset_name=None): + def fit( + self, X, y, + X_test=None, + y_test=None, + metric=None, + feat_type=None, + dataset_name=None, + only_return_configuration_space=False, + ): X, y = self._perform_input_checks(X, y) + if X_test is not None: + X_test, y_test = self._perform_input_checks(X_test, y_test) + if len(y.shape) != len(y_test.shape): + raise ValueError('Target value shapes do not match: %s vs %s' + % (y.shape, y_test.shape)) y_task = type_of_target(y) task = self._task_mapping.get(y_task) @@ -884,8 +917,31 @@ def fit(self, X, y, metric = accuracy y, self._classes, self._n_classes = self._process_target_classes(y) - - return super().fit(X, y, task, metric, feat_type, dataset_name) + if y_test is not None: + # Map test values to actual values - TODO: copy to all kinds of + # other parts in this code and test it!!! + y_test_new = [] + for output_idx in range(len(self._classes)): + mapping = {self._classes[output_idx][idx]: idx + for idx in range(len(self._classes[output_idx]))} + enumeration = y_test if len(self._classes) == 1 else y_test[output_idx] + y_test_new.append( + np.array([mapping[value] for value in enumeration]) + ) + y_test = np.array(y_test_new) + if self._n_outputs == 1: + y_test = y_test.flatten() + + return super().fit( + X, y, + X_test=X_test, + y_test=y_test, + task=task, + metric=metric, + feat_type=feat_type, + dataset_name=dataset_name, + only_return_configuration_space=only_return_configuration_space, + ) def fit_ensemble(self, y, task=None, metric=None, precision='32', dataset_name=None, ensemble_nbest=None, @@ -918,7 +974,7 @@ def _process_target_classes(self, y): _classes.append(classes_k) _n_classes.append(classes_k.shape[0]) - self._n_classes = np.array(_n_classes, dtype=np.int) + _n_classes = np.array(_n_classes, dtype=np.int) return y, _classes, _n_classes @@ -948,7 +1004,15 @@ def predict_proba(self, X, batch_size=None, n_jobs=1): class AutoMLRegressor(BaseAutoML): - def fit(self, X, y, metric=None, feat_type=None, dataset_name=None): + def fit( + self, X, y, + X_test=None, + y_test=None, + metric=None, + feat_type=None, + dataset_name=None, + only_return_configuration_space=False, + ): X, y = super()._perform_input_checks(X, y) _n_outputs = 1 if len(y.shape) == 1 else y.shape[1] if _n_outputs > 1: @@ -956,8 +1020,16 @@ def fit(self, X, y, metric=None, feat_type=None, dataset_name=None): 'Multi-output regression is not implemented.') if metric is None: metric = r2 - return super().fit(X, y, task=REGRESSION, metric=metric, - feat_type=feat_type, dataset_name=dataset_name) + return super().fit( + X, y, + X_test=X_test, + y_test=y_test, + task=REGRESSION, + metric=metric, + feat_type=feat_type, + dataset_name=dataset_name, + only_return_configuration_space=only_return_configuration_space, + ) def fit_ensemble(self, y, task=None, metric=None, precision='32', dataset_name=None, ensemble_nbest=None,