diff --git a/autosklearn/evaluation/train_evaluator.py b/autosklearn/evaluation/train_evaluator.py
index 465f3b3a8b..013af9ade5 100644
--- a/autosklearn/evaluation/train_evaluator.py
+++ b/autosklearn/evaluation/train_evaluator.py
@@ -540,30 +540,32 @@ def get_splitter(self, D):
             return cv
 
-        y = D.data['Y_train'].ravel()
+        y = D.data['Y_train']
         shuffle = self.resampling_strategy_args.get('shuffle', True)
         train_size = 0.67
         if self.resampling_strategy_args:
             train_size = self.resampling_strategy_args.get('train_size',
                                                            train_size)
-        test_size = 1 - train_size
+        test_size = float("%.4f" % (1 - train_size))
+
         if D.info['task'] in CLASSIFICATION_TASKS and \
                 D.info['task'] != MULTILABEL_CLASSIFICATION:
+            y = y.ravel()
             if self.resampling_strategy in ['holdout',
                                             'holdout-iterative-fit']:
+
                 if shuffle:
                     try:
                         cv = StratifiedShuffleSplit(n_splits=1,
-                                                    train_size=train_size,
                                                     test_size=test_size,
                                                     random_state=1)
                         test_cv = copy.deepcopy(cv)
                         next(test_cv.split(y, y))
                     except ValueError as e:
                         if 'The least populated class in y has only' in e.args[0]:
-                            cv = ShuffleSplit(n_splits=1, train_size=train_size,
-                                              test_size=test_size, random_state=1)
+                            cv = ShuffleSplit(n_splits=1, test_size=test_size,
+                                              random_state=1)
                         else:
                             raise e
                 else:
@@ -588,8 +590,8 @@ def get_splitter(self, D):
                                             'holdout-iterative-fit']:
                 # TODO shuffle not taken into account for this
                 if shuffle:
-                    cv = ShuffleSplit(n_splits=1, train_size=train_size,
-                                      test_size=test_size, random_state=1)
+                    cv = ShuffleSplit(n_splits=1, test_size=test_size,
+                                      random_state=1)
                 else:
                     tmp_train_size = int(np.floor(train_size * y.shape[0]))
                     test_fold = np.zeros(y.shape[0])
diff --git a/test/test_evaluation/test_train_evaluator.py b/test/test_evaluation/test_train_evaluator.py
index d5b402f839..577cb2809a 100644
--- a/test/test_evaluation/test_train_evaluator.py
+++ b/test/test_evaluation/test_train_evaluator.py
@@ -23,7 +23,10 @@
     eval_holdout, eval_iterative_holdout, eval_cv, eval_partial_cv
 from autosklearn.util import backend
 from autosklearn.util.pipeline import get_configuration_space
-from autosklearn.constants import *
+from autosklearn.constants import BINARY_CLASSIFICATION, \
+    MULTILABEL_CLASSIFICATION,\
+    MULTICLASS_CLASSIFICATION,\
+    REGRESSION
 from autosklearn.metrics import accuracy, r2, f1_macro
 
 this_directory = os.path.dirname(__file__)
@@ -1226,6 +1229,112 @@ def test_get_splitter_cv_object(self, te_mock):
         next(cv.split(D.data['Y_train'], D.data['Y_train']
                       , groups=evaluator.resampling_strategy_args['groups']))
 
+    @unittest.mock.patch.object(TrainEvaluator, "__init__")
+    def test_holdout_split_size(self, te_mock):
+        te_mock.return_value = None
+        D = unittest.mock.Mock(spec=AbstractDataManager)
+        D.feat_type = []
+
+        evaluator = TrainEvaluator()
+        evaluator.resampling_strategy = 'holdout'
+
+        # Exact Ratio
+        D.data = dict(Y_train=np.array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1]))
+        D.info = dict(task=BINARY_CLASSIFICATION)
+        evaluator.resampling_strategy_args = {'shuffle': True,
+                                              'train_size': 0.7}
+        cv = evaluator.get_splitter(D)
+
+        self.assertEqual(cv.get_n_splits(), 1)
+        train_samples, test_samples = next(cv.split(D.data['Y_train'],
+                                                    D.data['Y_train']))
+        self.assertEqual(len(train_samples), 7)
+        self.assertEqual(len(test_samples), 3)
+
+        # No Shuffle
+        evaluator.resampling_strategy_args = {'shuffle': False,
+                                              'train_size': 0.7}
+        cv = evaluator.get_splitter(D)
+
+        self.assertEqual(cv.get_n_splits(), 1)
+        train_samples, test_samples = next(cv.split(D.data['Y_train'],
+                                                    D.data['Y_train']))
+        self.assertEqual(len(train_samples), 7)
+        self.assertEqual(len(test_samples), 3)
+
+        # Rounded Ratio
+        D.data = dict(Y_train=np.array([0, 0, 0, 0, 0, 1, 1, 1, 1]))
+
+        evaluator.resampling_strategy_args = {'shuffle': True,
+                                              'train_size': 0.7}
+        cv = evaluator.get_splitter(D)
+
+        self.assertEqual(cv.get_n_splits(), 1)
+        train_samples, test_samples = next(cv.split(D.data['Y_train'],
+                                                    D.data['Y_train']))
+        self.assertEqual(len(train_samples), 6)
+        self.assertEqual(len(test_samples), 3)
+
+        # Rounded Ratio No Shuffle
+        evaluator.resampling_strategy_args = {'shuffle': False,
+                                              'train_size': 0.7}
+        cv = evaluator.get_splitter(D)
+
+        self.assertEqual(cv.get_n_splits(), 1)
+        train_samples, test_samples = next(cv.split(D.data['Y_train'],
+                                                    D.data['Y_train']))
+        self.assertEqual(len(train_samples), 6)
+        self.assertEqual(len(test_samples), 3)
+
+        # More data
+        evaluator.resampling_strategy_args = {'shuffle': True,
+                                              'train_size': 0.7}
+
+        D.data = dict(Y_train=np.zeros((900, 1)))
+        cv = evaluator.get_splitter(D)
+        self.assertEqual(cv.get_n_splits(), 1)
+        train_samples, test_samples = next(cv.split(D.data['Y_train'],
+                                                    D.data['Y_train']))
+        self.assertEqual(len(train_samples), 630)
+        self.assertEqual(len(test_samples), 270)
+
+        evaluator.resampling_strategy_args = {'train_size': 0.752}
+        D.data = dict(Y_train=np.zeros((900, 1)))
+        cv = evaluator.get_splitter(D)
+        self.assertEqual(cv.get_n_splits(), 1)
+        train_samples, test_samples = next(cv.split(D.data['Y_train'],
+                                                    D.data['Y_train']))
+        self.assertEqual(len(train_samples), 676)
+        self.assertEqual(len(test_samples), 224)
+
+        # Multilabel Exact Ratio
+        D.data = dict(Y_train=np.array([[0, 0], [0, 1], [1, 1], [1, 0], [1, 1],
+                                        [1, 1], [1, 1], [1, 0], [1, 1], [1, 1]]
+                                       ))
+        D.info = dict(task=MULTILABEL_CLASSIFICATION)
+        evaluator.resampling_strategy_args = {'shuffle': True,
+                                              'train_size': 0.7}
+        cv = evaluator.get_splitter(D)
+
+        self.assertEqual(cv.get_n_splits(), 1)
+        train_samples, test_samples = next(cv.split(D.data['Y_train'],
+                                                    D.data['Y_train']))
+        self.assertEqual(len(train_samples), 7)
+        self.assertEqual(len(test_samples), 3)
+
+        # Multilabel No Shuffle
+        D.data = dict(Y_train=np.array([[0, 0], [0, 1], [1, 1], [1, 0], [1, 1],
+                                        [1, 1], [1, 1], [1, 0], [1, 1]]))
+        evaluator.resampling_strategy_args = {'shuffle': False,
+                                              'train_size': 0.7}
+        cv = evaluator.get_splitter(D)
+
+        self.assertEqual(cv.get_n_splits(), 1)
+        train_samples, test_samples = next(cv.split(D.data['Y_train'],
+                                                    D.data['Y_train']))
+        self.assertEqual(len(train_samples), 6)
+        self.assertEqual(len(test_samples), 3)
+
 
 class FunctionsTest(unittest.TestCase):
     def setUp(self):
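Note on the test_size change above (illustrative, not part of the patch): 1 - 0.7 evaluates to 0.30000000000000004 in binary floating point, and scikit-learn's ShuffleSplit/StratifiedShuffleSplit size the test fold with ceil(test_size * n_samples), so the unrounded value produces one test sample too many and, combined with an explicit train_size, can trip scikit-learn's check that the two fractions sum to at most 1. Rounding test_size to four decimals and dropping train_size lets scikit-learn derive the train fold as the remainder, which is what test_holdout_split_size asserts (7/3, 6/3, 630/270, 676/224). A minimal sketch of the effect, using standard library and scikit-learn calls only:

    # Illustrative sketch; not part of the patch.
    import math

    import numpy as np
    from sklearn.model_selection import ShuffleSplit

    train_size = 0.7
    raw = 1 - train_size                   # 0.30000000000000004, not 0.3
    rounded = float("%.4f" % raw)          # 0.3, as computed in the patched get_splitter

    n_samples = 10
    print(math.ceil(raw * n_samples))      # 4 -> one test sample too many
    print(math.ceil(rounded * n_samples))  # 3 -> the intended 30% fold

    # With only test_size given, the train fold is simply the remainder.
    y = np.arange(n_samples)
    cv = ShuffleSplit(n_splits=1, test_size=rounded, random_state=1)
    train_idx, test_idx = next(cv.split(y))
    print(len(train_idx), len(test_idx))   # 7 3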