Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 9 additions & 7 deletions autosklearn/evaluation/train_evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -540,30 +540,32 @@ def get_splitter(self, D):

return cv

y = D.data['Y_train'].ravel()
y = D.data['Y_train']
shuffle = self.resampling_strategy_args.get('shuffle', True)
train_size = 0.67
if self.resampling_strategy_args:
train_size = self.resampling_strategy_args.get('train_size',
train_size)
test_size = 1 - train_size
test_size = float("%.4f" % (1 - train_size))

if D.info['task'] in CLASSIFICATION_TASKS and \
D.info['task'] != MULTILABEL_CLASSIFICATION:

y = y.ravel()
if self.resampling_strategy in ['holdout',
'holdout-iterative-fit']:

if shuffle:
try:
cv = StratifiedShuffleSplit(n_splits=1,
train_size=train_size,
test_size=test_size,
random_state=1)
test_cv = copy.deepcopy(cv)
next(test_cv.split(y, y))
except ValueError as e:
if 'The least populated class in y has only' in e.args[0]:
cv = ShuffleSplit(n_splits=1, train_size=train_size,
test_size=test_size, random_state=1)
cv = ShuffleSplit(n_splits=1, test_size=test_size,
random_state=1)
else:
raise e
else:
Expand All @@ -588,8 +590,8 @@ def get_splitter(self, D):
'holdout-iterative-fit']:
# TODO shuffle not taken into account for this
if shuffle:
cv = ShuffleSplit(n_splits=1, train_size=train_size,
test_size=test_size, random_state=1)
cv = ShuffleSplit(n_splits=1, test_size=test_size,
random_state=1)
else:
tmp_train_size = int(np.floor(train_size * y.shape[0]))
test_fold = np.zeros(y.shape[0])
Expand Down
111 changes: 110 additions & 1 deletion test/test_evaluation/test_train_evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,10 @@
eval_holdout, eval_iterative_holdout, eval_cv, eval_partial_cv
from autosklearn.util import backend
from autosklearn.util.pipeline import get_configuration_space
from autosklearn.constants import *
from autosklearn.constants import BINARY_CLASSIFICATION, \
MULTILABEL_CLASSIFICATION,\
MULTICLASS_CLASSIFICATION,\
REGRESSION
from autosklearn.metrics import accuracy, r2, f1_macro

this_directory = os.path.dirname(__file__)
Expand Down Expand Up @@ -1226,6 +1229,112 @@ def test_get_splitter_cv_object(self, te_mock):
next(cv.split(D.data['Y_train'], D.data['Y_train']
, groups=evaluator.resampling_strategy_args['groups']))

@unittest.mock.patch.object(TrainEvaluator, "__init__")
def test_holdout_split_size(self, te_mock):
    """Verify that the holdout splitter honours the requested train/test sizes.

    Covers binary and multilabel classification, shuffled and unshuffled
    holdout, ratios that divide the sample count exactly and ones that
    require rounding, and larger arrays where rounding matters more.
    """
    te_mock.return_value = None
    data_manager = unittest.mock.Mock(spec=AbstractDataManager)
    data_manager.feat_type = []

    evaluator = TrainEvaluator()
    evaluator.resampling_strategy = 'holdout'

    def check_split(n_train, n_test):
        # Build the splitter from the current data_manager/evaluator state
        # and assert it produces a single split of the expected sizes.
        splitter = evaluator.get_splitter(data_manager)
        self.assertEqual(splitter.get_n_splits(), 1)
        y = data_manager.data['Y_train']
        train_idx, test_idx = next(splitter.split(y, y))
        self.assertEqual(len(train_idx), n_train)
        self.assertEqual(len(test_idx), n_test)

    # Binary classification, ratio divides the sample count exactly
    data_manager.data = dict(Y_train=np.array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1]))
    data_manager.info = dict(task=BINARY_CLASSIFICATION)
    evaluator.resampling_strategy_args = {'shuffle': True,
                                          'train_size': 0.7}
    check_split(7, 3)

    # Same data, shuffling disabled
    evaluator.resampling_strategy_args = {'shuffle': False,
                                          'train_size': 0.7}
    check_split(7, 3)

    # Ratio that does not divide the sample count evenly (9 samples)
    data_manager.data = dict(Y_train=np.array([0, 0, 0, 0, 0, 1, 1, 1, 1]))
    evaluator.resampling_strategy_args = {'shuffle': True,
                                          'train_size': 0.7}
    check_split(6, 3)

    # Rounded ratio, shuffling disabled
    evaluator.resampling_strategy_args = {'shuffle': False,
                                          'train_size': 0.7}
    check_split(6, 3)

    # Larger arrays to exercise the rounding behaviour at scale
    evaluator.resampling_strategy_args = {'shuffle': True,
                                          'train_size': 0.7}
    data_manager.data = dict(Y_train=np.zeros((900, 1)))
    check_split(630, 270)

    evaluator.resampling_strategy_args = {'train_size': 0.752}
    data_manager.data = dict(Y_train=np.zeros((900, 1)))
    check_split(676, 224)

    # Multilabel classification, ratio divides the sample count exactly
    data_manager.data = dict(Y_train=np.array(
        [[0, 0], [0, 1], [1, 1], [1, 0], [1, 1],
         [1, 1], [1, 1], [1, 0], [1, 1], [1, 1]]))
    data_manager.info = dict(task=MULTILABEL_CLASSIFICATION)
    evaluator.resampling_strategy_args = {'shuffle': True,
                                          'train_size': 0.7}
    check_split(7, 3)

    # Multilabel, shuffling disabled, rounded ratio (9 samples)
    data_manager.data = dict(Y_train=np.array(
        [[0, 0], [0, 1], [1, 1], [1, 0], [1, 1],
         [1, 1], [1, 1], [1, 0], [1, 1]]))
    evaluator.resampling_strategy_args = {'shuffle': False,
                                          'train_size': 0.7}
    check_split(6, 3)


class FunctionsTest(unittest.TestCase):
def setUp(self):
Expand Down