From 659c21d479da196f6cdb072347a155b2e0776527 Mon Sep 17 00:00:00 2001
From: Matthias Feurer
Date: Sat, 11 Feb 2017 17:14:58 +0100
Subject: [PATCH] MAINT re-add subsampling to evaluators

---
 autosklearn/evaluation/__init__.py           | 14 ++++-
 autosklearn/evaluation/train_evaluator.py    | 34 ++++++---
 test/test_evaluation/test_evaluation.py      | 17 +++++
 test/test_evaluation/test_train_evaluator.py | 71 ++++++++++++++++++++
 4 files changed, 124 insertions(+), 12 deletions(-)

diff --git a/autosklearn/evaluation/__init__.py b/autosklearn/evaluation/__init__.py
index 4c29a11103..00db262854 100644
--- a/autosklearn/evaluation/__init__.py
+++ b/autosklearn/evaluation/__init__.py
@@ -74,7 +74,7 @@ def __init__(self, backend, autosklearn_seed, resampling_strategy,
     def start(self, config, instance,
               cutoff=None,
               seed=12345,
-              instance_specific="0"):
+              instance_specific=None):
         # Overwrite the start function here. This allows us to abort target
         # algorithm runs if the time is over without having the start method
         # of the parent class adding the run to the runhistory
@@ -96,11 +96,19 @@
 
     def run(self, config, instance=None,
             cutoff=None,
             seed=12345,
-            instance_specific="0"):
+            instance_specific=None):
         D = self.backend.load_datamanager()
         queue = multiprocessing.Queue()
 
+        if instance_specific is None or instance_specific == '0':
+            instance_specific = {}
+        else:
+            instance_specific = [specific.split('=') for specific in instance_specific.split(',')]
+            instance_specific = {specific[0]: specific[1] for specific in instance_specific}
+        subsample = instance_specific.get('subsample')
+        subsample = int(subsample) if subsample is not None else None
+
         arguments = dict(logger=logging.getLogger("pynisher"),
                          wall_time_in_s=cutoff,
                          mem_in_mb=self.memory_limit,
@@ -114,7 +122,7 @@
                        with_predictions=self.with_predictions,
                        all_scoring_functions=self.all_scoring_functions,
                        output_y_test=self.output_y_test,
-                       subsample=None,
+                       subsample=subsample,
                        include=self.include,
                        exclude=self.exclude,
                        disable_file_output=self.disable_file_output)
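ExecuteTaFuncWithQueue.run now decodes SMAC's instance_specific string, a comma-separated list of key=value pairs, of which only the subsample key is consumed so far. The sketch below mirrors that logic outside the class; the helper name parse_instance_specific is illustrative only and not part of the patch.

    def parse_instance_specific(instance_specific):
        # Mirror of the parsing in ExecuteTaFuncWithQueue.run: None or the
        # literal '0' both mean "no instance-specific options".
        if instance_specific is None or instance_specific == '0':
            return {}
        pairs = [specific.split('=') for specific in instance_specific.split(',')]
        return {key: value for key, value in pairs}

    options = parse_instance_specific('subsample=30')
    subsample = options.get('subsample')
    subsample = int(subsample) if subsample is not None else None
    assert subsample == 30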
diff --git a/autosklearn/evaluation/train_evaluator.py b/autosklearn/evaluation/train_evaluator.py
index 62aad050de..09fb965f0a 100644
--- a/autosklearn/evaluation/train_evaluator.py
+++ b/autosklearn/evaluation/train_evaluator.py
@@ -1,7 +1,8 @@
 import numpy as np
-from smac.tae.execute_ta_run import StatusType
+import sklearn.cross_validation
 
 from autosklearn.evaluation.abstract_evaluator import AbstractEvaluator
+from autosklearn.constants import *
 
 __all__ = ['TrainEvaluator', 'eval_holdout', 'eval_iterative_holdout',
@@ -155,14 +156,7 @@
     def _partial_fit_and_predict(self, fold, train_indices, test_indices,
                                  iterative=False):
         model = self._get_model()
 
-        # if self.subsample is not None:
-        #     n_data_subsample = min(self.subsample, len(train_indices))
-        #     indices = np.array(([True] * n_data_subsample) + \
-        #                        ([False] * (len(train_indices) - n_data_subsample)),
-        #                        dtype=np.bool)
-        #     rs = np.random.RandomState(self.seed)
-        #     rs.shuffle(indices)
-        #     train_indices = train_indices[indices]
+        train_indices = self.subsample_indices(train_indices)
 
         self.indices[fold] = ((train_indices, test_indices))
@@ -224,6 +218,28 @@
             return self._predict(model=model, train_indices=train_indices,
                                  test_indices=test_indices)
 
+    def subsample_indices(self, train_indices):
+        if self.subsample is not None:
+            # Only subsample if there are more indices given to this method than
+            # required to subsample because otherwise scikit-learn will complain
+
+            if self.task_type in CLASSIFICATION_TASKS and \
+                    self.task_type != MULTILABEL_CLASSIFICATION:
+                stratify = self.Y_train[train_indices]
+            else:
+                stratify = None
+
+            if len(train_indices) > self.subsample:
+                indices = np.arange(len(train_indices))
+                cv_indices_train, _ = sklearn.cross_validation.train_test_split(
+                    indices, stratify=stratify,
+                    train_size=self.subsample, random_state=1)
+                train_indices = train_indices[cv_indices_train]
+                return train_indices
+
+        return train_indices
+
+
     def _predict(self, model, test_indices, train_indices):
         opt_pred = self.predict_function(self.X_train[test_indices],
                                          model, self.task_type,
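subsample_indices draws a stratified subsample for classification tasks (except multilabel) and a plain one otherwise. It splits an array of positions and maps back through train_indices, so the result stays a subset of the fold even when the fold's indices are non-contiguous; splitting train_indices directly and then indexing with the result would double-index. A minimal sketch of the same call with made-up data, assuming the legacy sklearn.cross_validation module imported above (modern scikit-learn provides train_test_split in sklearn.model_selection):

    import numpy as np
    import sklearn.cross_validation

    # A non-contiguous fold of eight training indices and their labels.
    train_indices = np.array([3, 7, 12, 25, 31, 44, 50, 61])
    stratify = np.array([0, 1, 0, 1, 0, 1, 0, 1])

    # Split positions, not the index values, then map back so the result
    # is a stratified subset of train_indices.
    positions = np.arange(len(train_indices))
    pos_train, _ = sklearn.cross_validation.train_test_split(
        positions, stratify=stratify, train_size=4, random_state=1)
    subsampled = train_indices[pos_train]
    assert set(subsampled) <= set(train_indices)
    assert len(subsampled) == 4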
diff --git a/test/test_evaluation/test_evaluation.py b/test/test_evaluation/test_evaluation.py
index d36df379f7..8e86f1d5a0 100644
--- a/test/test_evaluation/test_evaluation.py
+++ b/test/test_evaluation/test_evaluation.py
@@ -172,3 +172,20 @@ def side_effect(**kwargs):
         self.assertEqual(info[0], StatusType.SUCCESS)
         self.assertEqual(info[1], 0.5)
         self.assertIsInstance(info[2], float)
+
+    @unittest.mock.patch('autosklearn.evaluation.eval_holdout')
+    def test_eval_with_limits_holdout_subsample(self, eval_holdout_mock):
+        def side_effect(*args, **kwargs):
+            queue = kwargs['queue']
+            queue.put((StatusType.SUCCESS, 0.5, 0.12345, kwargs['subsample']))
+        eval_holdout_mock.side_effect = side_effect
+        ta = ExecuteTaFuncWithQueue(backend=BackendMock(), autosklearn_seed=1,
+                                    resampling_strategy='holdout',
+                                    logger=self.logger,
+                                    stats=self.stats,
+                                    memory_limit=3072)
+        self.scenario.wallclock_limit = 180
+        info = ta.start(None, cutoff=30, instance=None,
+                        instance_specific='subsample=30')
+        self.assertEqual(info[0], StatusType.SUCCESS)
+        self.assertEqual(info[-1], 30)
diff --git a/test/test_evaluation/test_train_evaluator.py b/test/test_evaluation/test_train_evaluator.py
index b55eea3045..f753f9bde7 100644
--- a/test/test_evaluation/test_train_evaluator.py
+++ b/test/test_evaluation/test_train_evaluator.py
@@ -436,6 +436,77 @@ def test_file_output(self, makedirs_mock, backend_mock):
                          D.data['Y_test'])
         self.assertEqual(rval, (1.0, 'Model predictions for optimization set '
                                      'contains NaNs.'))
 
+    @unittest.mock.patch('autosklearn.util.backend.Backend')
+    @unittest.mock.patch('autosklearn.pipeline.classification.SimpleClassificationPipeline')
+    def test_subsample_indices_classification(self, mock, backend_mock):
+        D = get_binary_classification_datamanager()
+
+        configuration = unittest.mock.Mock(spec=Configuration)
+        queue_ = multiprocessing.Queue()
+        kfold = ShuffleSplit(n=len(D.data['Y_train']), random_state=1, n_iter=1)
+        evaluator = TrainEvaluator(D, backend_mock, queue_,
+                                   configuration=configuration,
+                                   cv=kfold, subsample=10)
+        train_indices = np.arange(69, dtype=int)
+        train_indices1 = evaluator.subsample_indices(train_indices)
+        evaluator.subsample = 20
+        train_indices2 = evaluator.subsample_indices(train_indices)
+        evaluator.subsample = 30
+        train_indices3 = evaluator.subsample_indices(train_indices)
+        evaluator.subsample = 67
+        train_indices4 = evaluator.subsample_indices(train_indices)
+        # Common cases
+        for ti in train_indices1:
+            self.assertIn(ti, train_indices2)
+        for ti in train_indices2:
+            self.assertIn(ti, train_indices3)
+        for ti in train_indices3:
+            self.assertIn(ti, train_indices4)
+
+        # Corner cases
+        evaluator.subsample = 0
+        self.assertRaisesRegex(ValueError, 'The train_size = 0 should be '
+                                           'greater or equal to the number '
+                                           'of classes = 2',
+                               evaluator.subsample_indices, train_indices)
+        # With equal or greater subsample it should return a non-shuffled
+        # array of indices
+        evaluator.subsample = 69
+        train_indices5 = evaluator.subsample_indices(train_indices)
+        self.assertTrue(np.all(train_indices5 == train_indices))
+        evaluator.subsample = 68
+        self.assertRaisesRegex(ValueError, 'The test_size = 1 should be greater'
+                                           ' or equal to the number of '
+                                           'classes = 2',
+                               evaluator.subsample_indices, train_indices)
+
+    @unittest.mock.patch('autosklearn.util.backend.Backend')
+    @unittest.mock.patch('autosklearn.pipeline.classification.SimpleClassificationPipeline')
+    def test_subsample_indices_regression(self, mock, backend_mock):
+        D = get_regression_datamanager()
+
+        configuration = unittest.mock.Mock(spec=Configuration)
+        queue_ = multiprocessing.Queue()
+        kfold = ShuffleSplit(n=len(D.data['Y_train']), random_state=1, n_iter=1)
+        evaluator = TrainEvaluator(D, backend_mock, queue_,
+                                   configuration=configuration,
+                                   cv=kfold, subsample=30)
+        train_indices = np.arange(69, dtype=int)
+        train_indices3 = evaluator.subsample_indices(train_indices)
+        evaluator.subsample = 67
+        train_indices4 = evaluator.subsample_indices(train_indices)
+        # Common cases
+        for ti in train_indices3:
+            self.assertIn(ti, train_indices4)
+
+        # Corner cases
+        evaluator.subsample = 0
+        train_indices5 = evaluator.subsample_indices(train_indices)
+        np.testing.assert_allclose(train_indices5, np.array([]))
+        # With equal or greater subsample it should return a non-shuffled
+        # array of indices
+        evaluator.subsample = 69
+        train_indices6 = evaluator.subsample_indices(train_indices)
+        np.testing.assert_allclose(train_indices6, train_indices)
+
     @unittest.mock.patch('autosklearn.util.backend.Backend')
     @unittest.mock.patch('autosklearn.pipeline.classification.SimpleClassificationPipeline')
     def test_predict_proba_binary_classification(self, mock, backend_mock):
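The "Common cases" loops in both tests rest on a nesting property of train_test_split under a fixed random_state: the rows chosen at a smaller train_size are a subset of those chosen at a larger one. This held for the scikit-learn versions contemporary with this patch, but it is a library property rather than a documented guarantee, so the sketch below (same legacy-module assumption as above) spells out the check the tests rely on:

    import numpy as np
    import sklearn.cross_validation

    y = np.array([0, 1] * 34 + [0])  # 69 roughly balanced binary labels
    indices = np.arange(69)
    previous = set()
    for train_size in (10, 20, 30, 67):
        chosen, _ = sklearn.cross_validation.train_test_split(
            indices, stratify=y, train_size=train_size, random_state=1)
        assert previous <= set(chosen)  # smaller subsamples are nested
        previous = set(chosen)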