MAINT re-add subsampling to evaluators

mfeurer committed Feb 11, 2017
1 parent ae8468d commit 659c21d
Showing 4 changed files with 124 additions and 12 deletions.
15 changes: 12 additions & 3 deletions autosklearn/evaluation/__init__.py
@@ -74,7 +74,7 @@ def __init__(self, backend, autosklearn_seed, resampling_strategy,
     def start(self, config, instance,
               cutoff=None,
               seed=12345,
-              instance_specific="0"):
+              instance_specific=None):
         # Overwrite the start function here. This allows us to abort target
         # algorithm runs if the time is over without having the start method
         # of the parent class adding the run to the runhistory
@@ -96,11 +96,20 @@ def start(self, config, instance,
     def run(self, config, instance=None,
             cutoff=None,
             seed=12345,
-            instance_specific="0"):
+            instance_specific=None):
 
         D = self.backend.load_datamanager()
         queue = multiprocessing.Queue()
 
+        if instance_specific is None or instance_specific == '0':
+            instance_specific = {}
+        else:
+            print(instance_specific)
+            instance_specific = [specific.split('=') for specific in instance_specific.split(',')]
+            instance_specific = {specific[0]: specific[1] for specific in instance_specific}
+        subsample = instance_specific.get('subsample')
+        subsample = int(subsample) if subsample is not None else None
+
         arguments = dict(logger=logging.getLogger("pynisher"),
                          wall_time_in_s=cutoff,
                          mem_in_mb=self.memory_limit,
@@ -114,7 +123,7 @@ def run(self, config, instance=None,
                          with_predictions=self.with_predictions,
                          all_scoring_functions=self.all_scoring_functions,
                          output_y_test=self.output_y_test,
-                         subsample=None,
+                         subsample=subsample,
                          include=self.include,
                          exclude=self.exclude,
                          disable_file_output=self.disable_file_output)
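run() now decodes SMAC's instance_specific string, which arrives either as None/"0" (no instance information) or as comma-separated key=value pairs such as 'subsample=30'. A minimal standalone sketch of that parsing logic; the helper name is hypothetical and not part of auto-sklearn's API:

def parse_instance_specific(instance_specific):
    # SMAC passes "0" (or nothing at all) when no instance information is set.
    if instance_specific is None or instance_specific == '0':
        return {}
    # 'subsample=30,foo=bar' -> {'subsample': '30', 'foo': 'bar'}
    pairs = [item.split('=') for item in instance_specific.split(',')]
    return {key: value for key, value in pairs}

specifics = parse_instance_specific('subsample=30')
subsample = int(specifics['subsample']) if 'subsample' in specifics else None
assert subsample == 30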
33 changes: 24 additions & 9 deletions autosklearn/evaluation/train_evaluator.py
@@ -1,7 +1,8 @@
 import numpy as np
 from smac.tae.execute_ta_run import StatusType
+import sklearn.cross_validation
 
 from autosklearn.evaluation.abstract_evaluator import AbstractEvaluator
 from autosklearn.constants import *
 
 
 __all__ = ['TrainEvaluator', 'eval_holdout', 'eval_iterative_holdout',
@@ -155,14 +156,7 @@ def _partial_fit_and_predict(self, fold, train_indices, test_indices,
                                  iterative=False):
         model = self._get_model()
 
-        # if self.subsample is not None:
-        #     n_data_subsample = min(self.subsample, len(train_indices))
-        #     indices = np.array(([True] * n_data_subsample) + \
-        #                        ([False] * (len(train_indices) - n_data_subsample)),
-        #                        dtype=np.bool)
-        #     rs = np.random.RandomState(self.seed)
-        #     rs.shuffle(indices)
-        #     train_indices = train_indices[indices]
+        train_indices = self.subsample_indices(train_indices)
 
         self.indices[fold] = ((train_indices, test_indices))
Expand Down Expand Up @@ -224,6 +218,27 @@ def _partial_fit_and_predict(self, fold, train_indices, test_indices,
return self._predict(model=model, train_indices=train_indices,
test_indices=test_indices)

def subsample_indices(self, train_indices):
if self.subsample is not None:
# Only subsample if there are more indices given to this method than
# required to subsample because otherwise scikit-learn will complain

if self.task_type in CLASSIFICATION_TASKS and \
self.task_type != MULTILABEL_CLASSIFICATION:
stratify = self.Y_train[train_indices]
else:
stratify = None

if len(train_indices) > self.subsample:
cv_indices_train, _ = sklearn.cross_validation.train_test_split(
train_indices, stratify=stratify,
train_size=self.subsample, random_state=1)
train_indices = train_indices[cv_indices_train]
return train_indices

return train_indices


def _predict(self, model, test_indices, train_indices):
opt_pred = self.predict_function(self.X_train[test_indices],
model, self.task_type,
Expand Down
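For classification tasks other than multilabel, subsample_indices stratifies on the training labels so the subsample keeps the class distribution of the full training fold. sklearn.cross_validation is the pre-0.18 home of train_test_split (it now lives in sklearn.model_selection); below is a minimal standalone sketch of the same idea against the modern module, using toy labels rather than anything from auto-sklearn:

import numpy as np
from sklearn.model_selection import train_test_split  # post-0.18 location

rng = np.random.RandomState(1)
y_train = rng.randint(0, 2, size=69)   # toy binary labels
train_indices = np.arange(69)          # indices handed over by the CV split
subsample = 10

# Only subsample when more indices are available than requested,
# mirroring the guard in subsample_indices above.
if len(train_indices) > subsample:
    train_indices, _ = train_test_split(train_indices,
                                        stratify=y_train[train_indices],
                                        train_size=subsample,
                                        random_state=1)

assert len(train_indices) == subsample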
17 changes: 17 additions & 0 deletions test/test_evaluation/test_evaluation.py
@@ -172,3 +172,20 @@ def side_effect(**kwargs):
         self.assertEqual(info[0], StatusType.SUCCESS)
         self.assertEqual(info[1], 0.5)
         self.assertIsInstance(info[2], float)
+
+    @unittest.mock.patch('autosklearn.evaluation.eval_holdout')
+    def test_eval_with_limits_holdout(self, eval_holdout_mock):
+        def side_effect(*args, **kwargs):
+            queue = kwargs['queue']
+            queue.put((StatusType.SUCCESS, 0.5, 0.12345, kwargs['subsample']))
+        eval_holdout_mock.side_effect = side_effect
+        ta = ExecuteTaFuncWithQueue(backend=BackendMock(), autosklearn_seed=1,
+                                    resampling_strategy='holdout',
+                                    logger=self.logger,
+                                    stats=self.stats,
+                                    memory_limit=3072)
+        self.scenario.wallclock_limit = 180
+        info = ta.start(None, cutoff=30, instance=None,
+                        instance_specific='subsample=30')
+        self.assertEqual(info[0], StatusType.SUCCESS)
+        self.assertEqual(info[-1], 30)
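The new test checks the instance_specific round trip end to end: the mocked eval_holdout echoes the subsample keyword it received back through the queue, so asserting info[-1] == 30 shows that the string 'subsample=30' was parsed and forwarded correctly. A minimal sketch of this echo-through-the-queue mock pattern, independent of auto-sklearn:

import multiprocessing
import unittest.mock

target = unittest.mock.Mock()

def side_effect(*args, **kwargs):
    # Echo a received keyword back through the queue so the caller
    # can assert on what the target function was actually given.
    kwargs['queue'].put(kwargs['subsample'])

target.side_effect = side_effect

queue = multiprocessing.Queue()
target(queue=queue, subsample=30)
assert queue.get() == 30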
71 changes: 71 additions & 0 deletions test/test_evaluation/test_train_evaluator.py
@@ -436,6 +436,77 @@ def test_file_output(self, makedirs_mock, backend_mock):
                                      D.data['Y_test'])
         self.assertEqual(rval, (1.0, 'Model predictions for optimization set contains NaNs.'))
 
+    @unittest.mock.patch('autosklearn.util.backend.Backend')
+    @unittest.mock.patch('autosklearn.pipeline.classification.SimpleClassificationPipeline')
+    def test_subsample_indices_classification(self, mock, backend_mock):
+        D = get_binary_classification_datamanager()
+
+        configuration = unittest.mock.Mock(spec=Configuration)
+        queue_ = multiprocessing.Queue()
+        kfold = ShuffleSplit(n=len(D.data['Y_train']), random_state=1, n_iter=1)
+        evaluator = TrainEvaluator(D, backend_mock, queue_,
+                                   configuration=configuration,
+                                   cv=kfold, subsample=10)
+        train_indices = np.arange(69, dtype=int)
+        train_indices1 = evaluator.subsample_indices(train_indices)
+        evaluator.subsample = 20
+        train_indices2 = evaluator.subsample_indices(train_indices)
+        evaluator.subsample = 30
+        train_indices3 = evaluator.subsample_indices(train_indices)
+        evaluator.subsample = 67
+        train_indices4 = evaluator.subsample_indices(train_indices)
+        # Common cases
+        for ti in train_indices1:
+            self.assertIn(ti, train_indices2)
+        for ti in train_indices2:
+            self.assertIn(ti, train_indices3)
+        for ti in train_indices3:
+            self.assertIn(ti, train_indices4)
+
+        # Corner cases
+        evaluator.subsample = 0
+        self.assertRaisesRegex(ValueError, 'The train_size = 0 should be '
+                                           'greater or equal to the number '
+                                           'of classes = 2',
+                               evaluator.subsample_indices, train_indices)
+        # With equal or greater it should return a non-shuffled array of indices
+        evaluator.subsample = 69
+        train_indices5 = evaluator.subsample_indices(train_indices)
+        self.assertTrue(np.all(train_indices5 == train_indices))
+        evaluator.subsample = 68
+        self.assertRaisesRegex(ValueError, 'The test_size = 1 should be '
+                                           'greater or equal to the number '
+                                           'of classes = 2',
+                               evaluator.subsample_indices, train_indices)
+
+    @unittest.mock.patch('autosklearn.util.backend.Backend')
+    @unittest.mock.patch('autosklearn.pipeline.classification.SimpleClassificationPipeline')
+    def test_subsample_indices_regression(self, mock, backend_mock):
+        D = get_regression_datamanager()
+
+        configuration = unittest.mock.Mock(spec=Configuration)
+        queue_ = multiprocessing.Queue()
+        kfold = ShuffleSplit(n=len(D.data['Y_train']), random_state=1, n_iter=1)
+        evaluator = TrainEvaluator(D, backend_mock, queue_,
+                                   configuration=configuration,
+                                   cv=kfold, subsample=30)
+        train_indices = np.arange(69, dtype=int)
+        train_indices3 = evaluator.subsample_indices(train_indices)
+        evaluator.subsample = 67
+        train_indices4 = evaluator.subsample_indices(train_indices)
+        # Common cases
+        for ti in train_indices3:
+            self.assertIn(ti, train_indices4)
+
+        # Corner cases
+        evaluator.subsample = 0
+        train_indices5 = evaluator.subsample_indices(train_indices)
+        np.testing.assert_allclose(train_indices5, np.array([]))
+        # With equal or greater it should return a non-shuffled array of indices
+        evaluator.subsample = 69
+        train_indices6 = evaluator.subsample_indices(train_indices)
+        np.testing.assert_allclose(train_indices6, train_indices)
+
     @unittest.mock.patch('autosklearn.util.backend.Backend')
     @unittest.mock.patch('autosklearn.pipeline.classification.SimpleClassificationPipeline')
     def test_predict_proba_binary_classification(self, mock, backend_mock):
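The classification corner cases follow from stratified splitting itself: a stratified split needs at least one example per class on each side, so a train_size (or test_size) smaller than the number of classes is rejected with the ValueError the tests match against. A small sketch of the same check, assuming the modern sklearn.model_selection module:

import numpy as np
from sklearn.model_selection import train_test_split

y = np.array([0, 1] * 10)    # two balanced classes
indices = np.arange(len(y))
try:
    train_test_split(indices, stratify=y, train_size=1, random_state=1)
except ValueError as err:
    # One retained sample cannot cover two classes, so the split is rejected.
    print(err)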
