Skip to content

Commit

Permalink
FIX holdout with only a single instance for a class
Browse files Browse the repository at this point in the history
  • Loading branch information
mfeurer committed Feb 24, 2017
1 parent 91017ed commit 4530fc6
Show file tree
Hide file tree
Showing 2 changed files with 93 additions and 29 deletions.
71 changes: 42 additions & 29 deletions autosklearn/evaluation/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,36 +126,9 @@ def run(self, config, instance=None,
include=self.include,
exclude=self.exclude,
disable_file_output=self.disable_file_output)
if self.resampling_strategy != 'test':
if D.info['task'] in CLASSIFICATION_TASKS and \
D.info['task'] != MULTILABEL_CLASSIFICATION:
y = D.data['Y_train'].ravel()
if self.resampling_strategy in ['holdout',
'holdout-iterative-fit']:
cv = StratifiedShuffleSplit(y=y, n_iter=1, train_size=0.67,
test_size=0.33, random_state=1)
elif self.resampling_strategy in ['cv', 'partial-cv',
'partial-cv-iterative-fit']:
cv = StratifiedKFold(y=y,
n_folds=self.resampling_strategy_args[
'folds'],
shuffle=True, random_state=1)
else:
raise ValueError(self.resampling_strategy)
else:
n = D.data['Y_train'].shape[0]
if self.resampling_strategy in ['holdout',
'holdout-iterative-fit']:
cv = ShuffleSplit(n=n, n_iter=1, train_size=0.67,
test_size=0.33, random_state=1)
elif self.resampling_strategy in ['cv', 'partial-cv',
'partial-cv-iterative-fit']:
cv = KFold(n=n,
n_folds=self.resampling_strategy_args['folds'],
shuffle=True, random_state=1)
else:
raise ValueError(self.resampling_strategy)

if self.resampling_strategy != 'test':
cv = self.get_splitter(D)
obj_kwargs['cv'] = cv
if instance is not None:
obj_kwargs['instance'] = instance
Expand Down Expand Up @@ -208,3 +181,43 @@ def run(self, config, instance=None,
self.num_run += 1
return status, cost, runtime, additional_run_info

def get_splitter(self, D):
    """Return the splitter matching ``self.resampling_strategy`` for ``D``.

    Single-label classification tasks (``D.info['task']`` is in
    ``CLASSIFICATION_TASKS`` and is not ``MULTILABEL_CLASSIFICATION``) get a
    stratified splitter; every other task gets the plain variant.

    For the holdout strategies, if ``StratifiedShuffleSplit`` rejects the
    targets because the least populated class has too few members, a plain
    ``ShuffleSplit`` is returned instead. No such fallback is applied for
    the cross-validation strategies.

    Parameters
    ----------
    D : data manager
        Object exposing ``D.data['Y_train']`` (array of training targets)
        and ``D.info['task']`` (task constant).

    Returns
    -------
    StratifiedShuffleSplit, ShuffleSplit, StratifiedKFold or KFold

    Raises
    ------
    ValueError
        If ``self.resampling_strategy`` is neither a holdout nor a
        cross-validation strategy (the strategy is the exception argument),
        or if ``StratifiedShuffleSplit`` fails for any reason other than an
        under-populated class.
    """
    strategy = self.resampling_strategy
    is_holdout = strategy in ('holdout', 'holdout-iterative-fit')
    is_cv = strategy in ('cv', 'partial-cv', 'partial-cv-iterative-fit')
    # Reject unsupported strategies once, before touching the data manager.
    if not (is_holdout or is_cv):
        raise ValueError(strategy)

    y_train = D.data['Y_train']
    n = y_train.shape[0]
    stratify = (D.info['task'] in CLASSIFICATION_TASKS and
                D.info['task'] != MULTILABEL_CLASSIFICATION)

    if is_holdout:
        holdout_args = dict(n_iter=1, train_size=0.67, test_size=0.33,
                            random_state=1)
        if stratify:
            try:
                return StratifiedShuffleSplit(y=y_train.ravel(),
                                              **holdout_args)
            except ValueError as e:
                # Match on str(e) rather than e.args[0] so that an exception
                # carrying no (or non-string) args is re-raised untouched
                # instead of failing inside this handler.
                if 'The least populated class in y has only' not in str(e):
                    raise
                # A class too small to stratify: fall through to the
                # unstratified split below.
        return ShuffleSplit(n=n, **holdout_args)

    cv_args = dict(n_folds=self.resampling_strategy_args['folds'],
                   shuffle=True, random_state=1)
    if stratify:
        return StratifiedKFold(y=y_train.ravel(), **cv_args)
    return KFold(n=n, **cv_args)

51 changes: 51 additions & 0 deletions test/test_evaluation/test_evaluation.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,12 @@
import pynisher
from smac.tae.execute_ta_run import StatusType
from smac.stats.stats import Stats
import sklearn.cross_validation

from evaluation_util import get_multiclass_classification_datamanager
from autosklearn.constants import *
from autosklearn.evaluation import ExecuteTaFuncWithQueue
from autosklearn.data.abstract_data_manager import AbstractDataManager


def safe_eval_success_mock(*args, **kwargs):
Expand Down Expand Up @@ -189,3 +192,51 @@ def side_effect(*args, **kwargs):
instance_specific='subsample=30')
self.assertEqual(info[0], StatusType.SUCCESS)
self.assertEqual(info[-1], 30)

def test_get_splitter(self):
    """Check the splitter class ``get_splitter`` picks per task and strategy."""
    common_kwargs = dict(backend=BackendMock(), autosklearn_seed=1,
                         logger=self.logger, stats=self.stats,
                         memory_limit=3072)

    # Target vectors shared by the scenarios below.
    balanced = [0, 0, 0, 1, 1, 1]
    # Class 2 occurs only once, so a stratified holdout split is impossible.
    with_singleton = [0, 0, 0, 1, 1, 1, 2]
    continuous = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5]

    holdout = dict(resampling_strategy='holdout')
    five_fold = dict(resampling_strategy='cv', folds=5)
    splitters = sklearn.cross_validation

    # (task, targets, strategy kwargs, expected splitter class)
    scenarios = [
        # holdout on a classification task -> stratified
        (BINARY_CLASSIFICATION, balanced, holdout,
         splitters.StratifiedShuffleSplit),
        # holdout with a single-instance class -> falls back to ShuffleSplit
        (BINARY_CLASSIFICATION, with_singleton, holdout,
         splitters.ShuffleSplit),
        # cv on a classification task -> stratified
        (BINARY_CLASSIFICATION, balanced, five_fold,
         splitters.StratifiedKFold),
        # cv with a single-instance class -> no fallback anticipated
        (BINARY_CLASSIFICATION, with_singleton, five_fold,
         splitters.StratifiedKFold),
        # regression holdout -> plain shuffle split
        (REGRESSION, continuous, holdout, splitters.ShuffleSplit),
        # regression cv -> plain k-fold
        (REGRESSION, continuous, five_fold, splitters.KFold),
    ]

    datamanager = unittest.mock.Mock(spec=AbstractDataManager)
    for task, targets, strategy_kwargs, expected_cls in scenarios:
        datamanager.data = dict(Y_train=np.array(targets))
        datamanager.info = dict(task=task)

        ta_kwargs = dict(common_kwargs)
        ta_kwargs.update(strategy_kwargs)
        ta = ExecuteTaFuncWithQueue(**ta_kwargs)

        self.assertIsInstance(ta.get_splitter(datamanager), expected_cls)

0 comments on commit 4530fc6

Please sign in to comment.