diff --git a/autosklearn/automl.py b/autosklearn/automl.py index bc014878ec..02c241363a 100644 --- a/autosklearn/automl.py +++ b/autosklearn/automl.py @@ -187,7 +187,8 @@ def __init__(self, self._metric = None self._label_num = None self.models_ = None - self.ensemble_indices_ = None + self.ensemble_ = None + self._can_predict = False self._debug_mode = debug_mode self._backend = Backend(self._output_dir, self._tmp_dir) @@ -242,9 +243,14 @@ def fit(self, X, y, raise ValueError('Array feat_type does not have same number of ' 'variables as X has features. %d vs %d.' % (len(feat_type), X.shape[1])) - if feat_type is not None and not all([isinstance(f, bool) + if feat_type is not None and not all([isinstance(f, str) for f in feat_type]): - raise ValueError('Array feat_type must only contain bools.') + raise ValueError('Array feat_type must only contain strings.') + if feat_type is not None: + for ft in feat_type: + if ft.lower() not in ['categorical', 'numerical']: + raise ValueError('Only `Categorical` and `Numerical` are ' + 'valid feature types, you passed `%s`' % ft) loaded_data_manager = XYDataManager(X, y, task=task, @@ -298,16 +304,19 @@ def _print_load_time(basename, time_left_for_this_task, return time_for_load_data def _do_dummy_prediction(self, datamanager): + self._logger.info("Starting to create dummy predictions.") autosklearn.cli.base_interface.main(datamanager, self._resampling_strategy, None, None, - mode_args=self._resampling_strategy_arguments) + mode_args=self._resampling_strategy_arguments, + output_dir=self._tmp_dir) + self._logger.info("Finished creating dummy predictions.") def _fit(self, datamanager): # Reset learnt stuff self.models_ = None - self.ensemble_indices_ = None + self.ensemble_ = None # Check arguments prior to doing anything! if self._resampling_strategy not in ['holdout', 'holdout-iterative-fit', @@ -352,7 +361,8 @@ def _fit(self, datamanager): self._logger) # == Perform dummy predictions - self._do_dummy_prediction(datamanager) + if self._resampling_strategy in ['holdout', 'holdout-iterative-fit']: + self._do_dummy_prediction(datamanager) # = Create a searchspace # Do this before One Hot Encoding to make sure that it creates a @@ -371,6 +381,12 @@ def _fit(self, datamanager): self._include_preprocessors) self.configuration_space_created_hook(datamanager) + # == RUN ensemble builder + # Do this before calculating the meta-features to make sure that the + # dummy predictions are actually included in the ensemble even if + # calculating the meta-features takes very long + proc_ensembles = self.run_ensemble_builder() + # == Calculate metafeatures meta_features = _calculate_metafeatures( data_feat_type=datamanager.feat_type, @@ -481,9 +497,6 @@ def _fit(self, datamanager): resampling_strategy_arguments=self._resampling_strategy_arguments, shared_mode=self._shared_mode) - # == RUN ensemble builder - proc_ensembles = self.run_ensemble_builder() - procs = [] if proc_smac is not None: @@ -554,26 +567,43 @@ def run_ensemble_builder(self, 'size 0.') return None + def refit(self, X, y): + if self._keep_models is not True: + raise ValueError( + "Predict can only be called if 'keep_models==True'") + if self.models_ is None or len(self.models_) == 0 or \ + self.ensemble_ is None: + self._load_models() + + for identifier in self.models_: + if identifier in self.ensemble_.get_model_identifiers(): + model = self.models_[identifier] + # this updates the model inplace, it can then later be used in + # predict method + model.fit(X.copy(), y.copy()) + + self._can_predict = True + def predict(self, X): + return np.argmax(self.predict_proba(X), axis=1) + + def predict_proba(self, X): if self._keep_models is not True: raise ValueError( "Predict can only be called if 'keep_models==True'") - if self._resampling_strategy not in ['holdout', - 'holdout-iterative-fit']: + if not self._can_predict and \ + self._resampling_strategy not in \ + ['holdout', 'holdout-iterative-fit']: raise NotImplementedError( 'Predict is currently only implemented for resampling ' 'strategy holdout.') - if self.models_ is None or len(self.models_) == 0 or len( - self.ensemble_indices_) == 0: + if self.models_ is None or len(self.models_) == 0 or \ + self.ensemble_ is None: self._load_models() - predictions = [] - for identifier in self.models_: - if identifier not in self.ensemble_indices_: - continue - - weight = self.ensemble_indices_[identifier] + all_predictions = [] + for identifier in self.ensemble_.get_model_identifiers(): model = self.models_[identifier] X_ = X.copy() @@ -588,16 +618,16 @@ def predict(self, X): "while X_.shape is %s" % (model, str(prediction.shape), str(X_.shape))) - predictions.append(prediction * weight) + all_predictions.append(prediction) - if len(predictions) == 0: + if len(all_predictions) == 0: raise ValueError('Something went wrong generating the predictions. ' 'The ensemble should consist of the following ' 'models: %s, the following models were loaded: ' '%s' % (str(list(self.ensemble_indices_.keys())), str(list(self.models_.keys())))) - predictions = np.sum(np.array(predictions), axis=0) + predictions = self.ensemble_.predict(all_predictions) return predictions def _load_models(self): @@ -610,42 +640,23 @@ def _load_models(self): if len(self.models_) == 0: raise ValueError('No models fitted!') - self.ensemble_indices_ = self._backend.load_ensemble_indices_weights( - seed) + self.ensemble_ = self._backend.load_ensemble(seed) def score(self, X, y): # fix: Consider only index 1 of second dimension # Don't know if the reshaping should be done there or in calculate_score - prediction = self.predict(X) - if self._task == BINARY_CLASSIFICATION: - prediction = prediction[:, 1].reshape((-1, 1)) + prediction = self.predict_proba(X) return calculate_score(y, prediction, self._task, self._metric, self._label_num, logger=self._logger) def show_models(self): - if self.models_ is None or len(self.models_) == 0 or len( - self.ensemble_indices_) == 0: - self._load_models() - output = [] - sio = six.StringIO() - for identifier in self.models_: - if identifier not in self.ensemble_indices_: - continue - - weight = self.ensemble_indices_[identifier] - model = self.models_[identifier] - output.append((weight, model)) - - output.sort(reverse=True) - - sio.write("[") - for weight, model in output: - sio.write("(%f, %s),\n" % (weight, model)) - sio.write("]") + if self.models_ is None or len(self.models_) == 0 or \ + self.ensemble_ is None: + self._load_models() - return sio.getvalue() + return self.ensemble_.pprint_ensemble_string(self.models_) def _save_ensemble_data(self, X, y): """Split dataset and store Data for the ensemble script. diff --git a/autosklearn/cli/HPOlib_interface.py b/autosklearn/cli/HPOlib_interface.py index d8932bc70b..420e2495f6 100755 --- a/autosklearn/cli/HPOlib_interface.py +++ b/autosklearn/cli/HPOlib_interface.py @@ -82,7 +82,7 @@ def parse_cli(): return args, parameters -def parse_args(dataset, mode, seed, params, fold, folds): +def parse_args(dataset, mode, seed, params, fold, folds, output_dir=None): if seed is None: seed = 1 @@ -107,10 +107,11 @@ def parse_args(dataset, mode, seed, params, fold, folds): mode_args = None else: raise ValueError(mode) - base_interface.main(dataset, mode, seed, params, mode_args=mode_args) + base_interface.main(dataset, mode, seed, params, mode_args=mode_args, + output_dir=output_dir) -def main(): +def main(output_dir=None): args, params = parse_cli() assert 'dataset' in args assert 'mode' in args @@ -124,6 +125,7 @@ def main(): params, int(args['fold']), int(args['folds']), + output_dir=output_dir ) diff --git a/autosklearn/cli/SMAC_interface.py b/autosklearn/cli/SMAC_interface.py index fbd57e0a46..1a3c23c2eb 100644 --- a/autosklearn/cli/SMAC_interface.py +++ b/autosklearn/cli/SMAC_interface.py @@ -3,7 +3,8 @@ from autosklearn.cli import base_interface -def main(): + +def main(output_dir=None): instance_name = sys.argv[1] instance_specific_information = sys.argv[2] cutoff_time = float(sys.argv[3]) @@ -45,7 +46,7 @@ def main(): raise ValueError(mode) base_interface.main(instance_specific_information, mode, - seed, params, mode_args=mode_args) + seed, params, mode_args=mode_args, output_dir=output_dir) if __name__ == '__main__': diff --git a/autosklearn/cli/base_interface.py b/autosklearn/cli/base_interface.py index a4f8bb831e..ad2732d8b4 100644 --- a/autosklearn/cli/base_interface.py +++ b/autosklearn/cli/base_interface.py @@ -54,44 +54,46 @@ def empty_signal_handler(signum, frame): def _get_base_dict(): return { 'with_predictions': True, - 'all_scoring_functions': True, 'output_y_test': True, } -def make_mode_holdout(data, seed, configuration, num_run): +def make_mode_holdout(data, seed, configuration, num_run, output_dir): global evaluator - evaluator = HoldoutEvaluator(data, configuration, + evaluator = HoldoutEvaluator(data, output_dir, configuration, seed=seed, num_run=num_run, + all_scoring_functions=False, **_get_base_dict()) evaluator.fit() signal.signal(15, empty_signal_handler) evaluator.finish_up() - backend = Backend(None, os.getcwd()) + backend = Backend(None, output_dir) if os.path.exists(backend.get_model_dir()): backend.save_model(evaluator.model, num_run, seed) -def make_mode_holdout_iterative_fit(data, seed, configuration, num_run): +def make_mode_holdout_iterative_fit(data, seed, configuration, num_run, + output_dir): global evaluator - evaluator = HoldoutEvaluator(data, configuration, + evaluator = HoldoutEvaluator(data, output_dir, configuration, seed=seed, num_run=num_run, + all_scoring_functions=False, **_get_base_dict()) evaluator.iterative_fit() signal.signal(15, empty_signal_handler) evaluator.finish_up() - backend = Backend(None, os.getcwd()) + backend = Backend(None, output_dir) if os.path.exists(backend.get_model_dir()): backend.save_model(evaluator.model, num_run, seed) -def make_mode_test(data, seed, configuration, metric): +def make_mode_test(data, seed, configuration, metric, output_dir): global evaluator - evaluator = TestEvaluator(data, + evaluator = TestEvaluator(data, output_dir, configuration, seed=seed, all_scoring_functions=True, @@ -112,12 +114,13 @@ def make_mode_test(data, seed, configuration, metric): additional_run_info)) -def make_mode_cv(data, seed, configuration, num_run, folds): +def make_mode_cv(data, seed, configuration, num_run, folds, output_dir): global evaluator - evaluator = CVEvaluator(data, configuration, + evaluator = CVEvaluator(data, output_dir, configuration, cv_folds=folds, seed=seed, num_run=num_run, + all_scoring_functions=False, **_get_base_dict()) evaluator.fit() signal.signal(15, empty_signal_handler) @@ -125,36 +128,35 @@ def make_mode_cv(data, seed, configuration, num_run, folds): def make_mode_partial_cv(data, seed, configuration, num_run, metric, fold, - folds): + folds, output_dir): global evaluator - evaluator = CVEvaluator(data, configuration, + evaluator = CVEvaluator(data, output_dir, configuration, cv_folds=folds, seed=seed, num_run=num_run, + all_scoring_functions=False, **_get_base_dict()) evaluator.partial_fit(fold) signal.signal(15, empty_signal_handler) - scores, _, _, _ = evaluator.predict() + loss, _, _, _ = evaluator.loss_and_predict() duration = time.time() - evaluator.starttime - score = scores[metric] - additional_run_info = ';'.join(['%s: %s' % (m_, value) - for m_, value in scores.items()]) - additional_run_info += ';' + 'duration: ' + str(duration) + additional_run_info = 'duration: ' + str(duration) - print(metric, score, additional_run_info) + print(metric, loss, additional_run_info) print('Result for ParamILS: %s, %f, 1, %f, %d, %s' % - ('SAT', abs(duration), score, evaluator.seed, + ('SAT', abs(duration), loss, evaluator.seed, additional_run_info)) def make_mode_nested_cv(data, seed, configuration, num_run, inner_folds, - outer_folds): + outer_folds, output_dir): global evaluator - evaluator = NestedCVEvaluator(data, configuration, + evaluator = NestedCVEvaluator(data, output_dir, configuration, inner_cv_folds=inner_folds, outer_cv_folds=outer_folds, seed=seed, + all_scoring_functions=False, num_run=num_run, **_get_base_dict()) evaluator.fit() @@ -162,7 +164,8 @@ def make_mode_nested_cv(data, seed, configuration, num_run, inner_folds, evaluator.finish_up() -def main(dataset_info, mode, seed, params, mode_args=None): +def main(dataset_info, mode, seed, params, + mode_args=None, output_dir=None): """This command line interface has three different operation modes: * CV: useful for the Tweakathon @@ -175,10 +178,12 @@ def main(dataset_info, mode, seed, params, mode_args=None): if mode_args is None: mode_args = {} - output_dir = os.getcwd() + if output_dir is None: + output_dir = os.getcwd() if not isinstance(dataset_info, AbstractDataManager): - D = store_and_or_load_data(dataset_info=dataset_info, outputdir=output_dir) + D = store_and_or_load_data(dataset_info=dataset_info, + outputdir=output_dir) else: D = dataset_info metric = D.info['metric'] @@ -210,18 +215,22 @@ def main(dataset_info, mode, seed, params, mode_args=None): global evaluator if mode == 'holdout': - make_mode_holdout(D, seed, configuration, num_run) + make_mode_holdout(D, seed, configuration, num_run, output_dir) elif mode == 'holdout-iterative-fit': - make_mode_holdout_iterative_fit(D, seed, configuration, num_run) + make_mode_holdout_iterative_fit(D, seed, configuration, num_run, + output_dir) elif mode == 'test': - make_mode_test(D, seed, configuration, metric) + make_mode_test(D, seed, configuration, metric, output_dir) elif mode == 'cv': - make_mode_cv(D, seed, configuration, num_run, mode_args['folds']) + make_mode_cv(D, seed, configuration, num_run, mode_args['folds'], + output_dir) elif mode == 'partial-cv': make_mode_partial_cv(D, seed, configuration, num_run, - metric, mode_args['fold'], mode_args['folds']) + metric, mode_args['fold'], mode_args['folds'], + output_dir) elif mode == 'nested-cv': make_mode_nested_cv(D, seed, configuration, num_run, - mode_args['inner_folds'], mode_args['outer_folds']) + mode_args['inner_folds'], mode_args['outer_folds'], + output_dir) else: raise ValueError('Must choose a legal mode.') diff --git a/autosklearn/ensemble_selection_script.py b/autosklearn/ensemble_selection_script.py index 1488729967..702953d592 100644 --- a/autosklearn/ensemble_selection_script.py +++ b/autosklearn/ensemble_selection_script.py @@ -4,7 +4,6 @@ import glob import logging import os -import random import re import sys import time @@ -15,6 +14,7 @@ from autosklearn.constants import STRING_TO_TASK_TYPES, STRING_TO_METRIC from autosklearn.evaluation.util import calculate_score from autosklearn.util import StopWatch, Backend +from autosklearn.ensembles.ensemble_selection import EnsembleSelection logging.basicConfig(format='[%(levelname)s] [%(asctime)s:%(name)s] %(' @@ -23,34 +23,6 @@ logger.setLevel(logging.DEBUG) -def build_ensemble(predictions_train, predictions_valid, predictions_test, - true_labels, ensemble_size, task_type, metric): - indices, trajectory = ensemble_selection(predictions_train, true_labels, - ensemble_size, task_type, metric) - ensemble_predictions_valid = np.mean( - predictions_valid[indices.astype(int)], - axis=0) - ensemble_predictions_test = np.mean(predictions_test[indices.astype(int)], - axis=0) - - logger.info('Trajectory and indices!') - logger.info(trajectory) - logger.info(indices) - - return ensemble_predictions_valid, ensemble_predictions_test, \ - trajectory[-1], indices - - -def pruning(predictions, labels, n_best, task_type, metric): - perf = np.zeros([predictions.shape[0]]) - for i, p in enumerate(predictions): - perf[i] = calculate_score(labels, predictions, task_type, - metric, predictions.shape[1]) - - indcies = np.argsort(perf)[perf.shape[0] - n_best:] - return indcies - - def get_predictions(dir_path, dir_path_list, include_num_runs, model_and_automl_re, precision="32"): result = [] @@ -76,113 +48,8 @@ def get_predictions(dir_path, dir_path_list, include_num_runs, return result -def original_ensemble_selection(predictions, labels, ensemble_size, task_type, - metric, do_pruning=False): - """Rich Caruana's ensemble selection method.""" - - ensemble = [] - trajectory = [] - order = [] - - if do_pruning: - n_best = 20 - indices = pruning(predictions, labels, n_best, task_type, metric) - for idx in indices: - ensemble.append(predictions[idx]) - order.append(idx) - ensemble_ = np.array(ensemble).mean(axis=0) - ensemble_performance = calculate_score( - labels, ensemble_, task_type, metric, ensemble_.shape[1]) - trajectory.append(ensemble_performance) - ensemble_size -= n_best - - for i in range(ensemble_size): - scores = np.zeros([predictions.shape[0]]) - for j, pred in enumerate(predictions): - ensemble.append(pred) - ensemble_prediction = np.mean(np.array(ensemble), axis=0) - scores[j] = calculate_score(labels, ensemble_prediction, - task_type, metric, - ensemble_prediction.shape[1]) - ensemble.pop() - best = np.nanargmax(scores) - ensemble.append(predictions[best]) - trajectory.append(scores[best]) - order.append(best) - - return np.array(order), np.array(trajectory) - - -def ensemble_selection(predictions, labels, ensemble_size, task_type, metric, - do_pruning=False): - """Fast version of Rich Caruana's ensemble selection method.""" - - ensemble = [] - trajectory = [] - order = [] - - if do_pruning: - n_best = 20 - indices = pruning(predictions, labels, n_best, task_type, metric) - for idx in indices: - ensemble.append(predictions[idx]) - order.append(idx) - ensemble_ = np.array(ensemble).mean(axis=0) - ensemble_performance = calculate_score( - labels, ensemble_, task_type, metric, ensemble_.shape[1]) - trajectory.append(ensemble_performance) - ensemble_size -= n_best - - for i in range(ensemble_size): - scores = np.zeros([predictions.shape[0]]) - s = len(ensemble) - if s == 0: - weighted_ensemble_prediction = np.zeros(predictions[0].shape) - else: - ensemble_prediction = np.mean(np.array(ensemble), axis=0) - weighted_ensemble_prediction = (s / float(s + 1) - ) * ensemble_prediction - for j, pred in enumerate(predictions): - # ensemble.append(pred) - # ensemble_prediction = np.mean(np.array(ensemble), axis=0) - fant_ensemble_prediction = weighted_ensemble_prediction + ( - 1. / float(s + 1)) * pred - - scores[j] = calculate_score( - labels, fant_ensemble_prediction, task_type, metric, - fant_ensemble_prediction.shape[1]) - # ensemble.pop() - best = np.nanargmax(scores) - ensemble.append(predictions[best]) - trajectory.append(scores[best]) - order.append(best) - - return np.array(order), np.array(trajectory) - - -def ensemble_selection_bagging(predictions, labels, ensemble_size, task_type, - metric, - fraction=0.5, - n_bags=20, - do_pruning=False): - """Rich Caruana's ensemble selection method with bagging.""" - n_models = predictions.shape[0] - bag_size = int(n_models * fraction) - - order_of_each_bag = [] - for j in range(n_bags): - # Bagging a set of models - indices = sorted(random.sample(range(0, n_models), bag_size)) - bag = predictions[indices, :, :] - order, _ = ensemble_selection(bag, labels, ensemble_size, task_type, - metric, do_pruning) - order_of_each_bag.append(order) - - return np.array(order_of_each_bag) - - def main(autosklearn_tmp_dir, - basename, + dataset_name, task_type, metric, limit, @@ -212,8 +79,6 @@ def main(autosklearn_tmp_dir, 'predictions_test') paths_ = [dir_ensemble, dir_valid, dir_test] - targets_ensemble = backend.load_targets_ensemble() - dir_ensemble_list_mtimes = [] while used_time < limit or (max_iterations > 0 and max_iterations >= num_iteration): @@ -221,6 +86,11 @@ def main(autosklearn_tmp_dir, logger.debug('Time left: %f', limit - used_time) logger.debug('Time last iteration: %f', time_iter) + # Reload the ensemble targets every iteration, important, because cv may + # update the ensemble targets in the cause of running auto-sklearn + # TODO update cv in order to not need this any more! + targets_ensemble = backend.load_targets_ensemble() + # Load the predictions from the models exists = [os.path.isdir(dir_) for dir_ in paths_] if not exists[0]: # all(exists): @@ -305,9 +175,14 @@ def main(autosklearn_tmp_dir, predictions = np.load(os.path.join(dir_ensemble, basename)).astype(dtype=np.float64) else: predictions = np.load(os.path.join(dir_ensemble, basename)) - score = calculate_score(targets_ensemble, predictions, - task_type, metric, - predictions.shape[1]) + + try: + score = calculate_score(targets_ensemble, predictions, + task_type, metric, + predictions.shape[1]) + except: + score = -1 + model_names_to_scores[model_name] = score match = model_and_automl_re.search(model_name) automl_seed = int(match.group(1)) @@ -315,10 +190,9 @@ def main(autosklearn_tmp_dir, if ensemble_nbest is not None: if score <= 0.001: - # include_num_runs.append(True) logger.error('Model only predicts at random: ' + model_name + ' has score: ' + str(score)) - backup_num_runs.append(num_run) + backup_num_runs.append((automl_seed, num_run)) # If we have less models in our ensemble than ensemble_nbest add # the current model if it is better than random elif len(scores_nbest) < ensemble_nbest: @@ -380,43 +254,37 @@ def main(autosklearn_tmp_dir, indices_to_model_names[num_indices] = model_name indices_to_run_num[num_indices] = (automl_seed, num_run) - # logging.info("Indices to model names:") - # logging.info(indices_to_model_names) - - # for i, item in enumerate(sorted(model_names_to_scores.items(), - # key=lambda t: t[1])): - # logging.info("%d: %s", i, item) - - include_num_runs = set(include_num_runs) - all_predictions_train = get_predictions(dir_ensemble, dir_ensemble_list, include_num_runs, model_and_automl_re, precision) -# if len(all_predictions_train) == len(all_predictions_test) == len( -# all_predictions_valid) == 0: if len(include_num_runs) == 0: logger.error('All models do just random guessing') time.sleep(2) continue else: - try: - indices, trajectory = ensemble_selection( - np.array(all_predictions_train), targets_ensemble, - ensemble_size, task_type, metric) + ensemble = EnsembleSelection(ensemble_size=ensemble_size, + task_type=task_type, + metric=metric) - logger.info('Trajectory and indices!') - logger.info(trajectory) - logger.info(indices) + try: + ensemble.fit(all_predictions_train, targets_ensemble, + include_num_runs) + logger.info(ensemble) except ValueError as e: logger.error('Caught ValueError: ' + str(e)) used_time = watch.wall_elapsed('ensemble_builder') time.sleep(2) continue + except IndexError as e: + logger.error('Caught IndexError: ' + str(e)) + used_time = watch.wall_elapsed('ensemble_builder') + time.sleep(2) + continue except Exception as e: logger.error('Caught error! %s', e.message) used_time = watch.wall_elapsed('ensemble_builder') @@ -424,30 +292,10 @@ def main(autosklearn_tmp_dir, continue # Output the score - logger.info('Training performance: %f' % trajectory[-1]) - - # Print the ensemble members: - ensemble_members_run_numbers = dict() - ensemble_members = Counter(indices).most_common() - ensemble_members_string = 'Ensemble members:\n' - logger.info(ensemble_members) - for ensemble_member in ensemble_members: - weight = float(ensemble_member[1]) / len(indices) - ensemble_members_string += \ - (' %s; weight: %10f; performance: %10f\n' % - (indices_to_model_names[ensemble_member[0]], - weight, - model_names_to_scores[ - indices_to_model_names[ensemble_member[0]]])) - - ensemble_members_run_numbers[ - indices_to_run_num[ - ensemble_member[0]]] = weight - logger.info(ensemble_members_string) - - # Save the ensemble indices for later use! - backend.save_ensemble_indices_weights(ensemble_members_run_numbers, - index_run, seed) + logger.info('Training performance: %f' % ensemble.train_score_) + + # Save the ensemble for later use in the main auto-sklearn module! + backend.save_ensemble(ensemble, index_run, seed) all_predictions_valid = get_predictions(dir_valid, dir_valid_list, @@ -458,10 +306,9 @@ def main(autosklearn_tmp_dir, # Save predictions for valid and test data set if len(dir_valid_list) == len(dir_ensemble_list): all_predictions_valid = np.array(all_predictions_valid) - ensemble_predictions_valid = np.mean( - all_predictions_valid[indices.astype(int)], axis=0) + ensemble_predictions_valid = ensemble.predict(all_predictions_valid) backend.save_predictions_as_txt(ensemble_predictions_valid, - 'valid', index_run, prefix=basename) + 'valid', index_run, prefix=dataset_name) else: logger.info('Could not find as many validation set predictions (%d)' 'as ensemble predictions (%d)!.', @@ -476,10 +323,9 @@ def main(autosklearn_tmp_dir, if len(dir_test_list) == len(dir_ensemble_list): all_predictions_test = np.array(all_predictions_test) - ensemble_predictions_test = np.mean( - all_predictions_test[indices.astype(int)], axis=0) + ensemble_predictions_test = ensemble.predict(all_predictions_test) backend.save_predictions_as_txt(ensemble_predictions_test, - 'test', index_run, prefix=basename) + 'test', index_run, prefix=dataset_name) else: logger.info('Could not find as many test set predictions (%d) as ' 'ensemble predictions (%d)!', @@ -501,7 +347,7 @@ def main(autosklearn_tmp_dir, help='TMP directory of auto-sklearn. Predictions to ' 'build the ensemble will be read from here and ' 'the ensemble indices will be saved here.') - parser.add_argument('--basename', required=True, + parser.add_argument('--dataset_name', required=True, help='Name of the dataset. Used to prefix prediction ' 'output files.') parser.add_argument('--task', required=True, @@ -539,7 +385,7 @@ def main(autosklearn_tmp_dir, task = STRING_TO_TASK_TYPES[args.task] metric = STRING_TO_METRIC[args.metric] main(autosklearn_tmp_dir=args.auto_sklearn_tmp_directory, - basename=args.basename, + dataset_name=args.dataset_name, task_type=task, metric=metric, limit=args.limit, diff --git a/autosklearn/ensembles/__init__.py b/autosklearn/ensembles/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/autosklearn/ensembles/abstract_ensemble.py b/autosklearn/ensembles/abstract_ensemble.py new file mode 100644 index 0000000000..8b8e8e1d91 --- /dev/null +++ b/autosklearn/ensembles/abstract_ensemble.py @@ -0,0 +1,68 @@ +from abc import ABCMeta, abstractmethod + + +class AbstractEnsemble(object): + __metaclass__ = ABCMeta + + @abstractmethod + def fit(self, base_models_predictions, true_targets, model_identifiers): + """Fit an ensemble given predictions of base models and targets. + + Parameters + ---------- + base_models_predictions : array of shape = [n_base_models, n_data_points, n_targets] + n_targets is the number of classes in case of classification, + n_targets is 0 or 1 in case of regression + + true_targets : array of shape [n_targets] + + model_identifiers : identifier for each base model. + Can be used for practical text output of the ensemble. + + Returns + ------- + self + + """ + pass + + @abstractmethod + def predict(self, base_models_predictions): + """Create ensemble predictions from the base model predictions. + + Parameters + ---------- + base_models_predictions : array of shape = [n_base_models, n_data_points, n_targets] + Same as in the fit method. + + Returns + ------- + array : [n_data_points] + """ + self + + @abstractmethod + def pprint_ensemble_string(self, models): + """Return a nicely-readable representation of the ensmble. + + Parameters + ---------- + models : dict {identifier : model object} + The identifiers are the same as the one presented to the fit() + method. Models can be used for nice printing. + + Returns + ------- + str + """ + + @abstractmethod + def get_model_identifiers(self): + """Return identifiers of models in the ensemble. + + This includes models which have a weight of zero! + + Returns + ------- + list + """ diff --git a/autosklearn/ensembles/ensemble_selection.py b/autosklearn/ensembles/ensemble_selection.py new file mode 100644 index 0000000000..74bb87431d --- /dev/null +++ b/autosklearn/ensembles/ensemble_selection.py @@ -0,0 +1,213 @@ +from collections import Counter +import random + +import numpy as np +import six + +from autosklearn.constants import * +from autosklearn.ensembles.abstract_ensemble import AbstractEnsemble +from autosklearn.evaluation.util import calculate_score + + +class EnsembleSelection(AbstractEnsemble): + def __init__(self, ensemble_size, task_type, metric, + sorted_initialization=False, bagging=False, mode='fast'): + self.ensemble_size = ensemble_size + self.task_type = task_type + self.metric = metric + self.sorted_initialization = sorted_initialization + self.bagging = bagging + self.mode = mode + + def fit(self, predictions, labels, identifiers): + self.ensemble_size = int(self.ensemble_size) + if self.ensemble_size < 1: + raise ValueError('Ensemble size cannot be less than one!') + if not self.task_type in TASK_TYPES: + raise ValueError('Unknown task type %s.' % self.task_type) + if not self.metric in METRIC: + raise ValueError('Unknown metric %s.' % self.metric) + if self.mode not in ('fast', 'slow'): + raise ValueError('Unknown mode %s' % self.mode) + + if self.bagging: + self._bagging(predictions, labels) + else: + self._fit(predictions, labels) + self._calculate_weights() + self.identifiers_ = identifiers + return self + + def _fit(self, predictions, labels): + if self.mode == 'fast': + self._fast(predictions, labels) + else: + self._slow(predictions, labels) + return self + + def _fast(self, predictions, labels): + """Fast version of Rich Caruana's ensemble selection method.""" + self.num_input_models_ = len(predictions) + + ensemble = [] + trajectory = [] + order = [] + + ensemble_size = self.ensemble_size + + if self.sorted_initialization: + n_best = 20 + indices = self._sorted_initialization(predictions, labels, n_best) + for idx in indices: + ensemble.append(predictions[idx]) + order.append(idx) + ensemble_ = np.array(ensemble).mean(axis=0) + ensemble_performance = calculate_score( + labels, ensemble_, self.task_type, self.metric, + ensemble_.shape[1]) + trajectory.append(ensemble_performance) + ensemble_size -= n_best + + for i in range(ensemble_size): + scores = np.zeros((len(predictions))) + s = len(ensemble) + if s == 0: + weighted_ensemble_prediction = np.zeros(predictions[0].shape) + else: + ensemble_prediction = np.mean(np.array(ensemble), axis=0) + weighted_ensemble_prediction = (s / float(s + 1)) * \ + ensemble_prediction + for j, pred in enumerate(predictions): + fant_ensemble_prediction = weighted_ensemble_prediction + \ + (1. / float(s + 1)) * pred + scores[j] = calculate_score( + labels, fant_ensemble_prediction, self.task_type, + self.metric, fant_ensemble_prediction.shape[1]) + best = np.nanargmax(scores) + ensemble.append(predictions[best]) + trajectory.append(scores[best]) + order.append(best) + + # Handle special case + if len(predictions) == 1: + break + + self.indices_ = order + self.trajectory_ = trajectory + self.train_score_ = trajectory[-1] + + def _slow(self, predictions, labels): + """Rich Caruana's ensemble selection method.""" + self.num_input_models_ = len(predictions) + + ensemble = [] + trajectory = [] + order = [] + + ensemble_size = self.ensemble_size + + if self.sorted_initialization: + n_best = 20 + indices = self._sorted_initialization(predictions, labels, n_best) + for idx in indices: + ensemble.append(predictions[idx]) + order.append(idx) + ensemble_ = np.array(ensemble).mean(axis=0) + ensemble_performance = calculate_score( + labels, ensemble_, self.task_type, self.metric, + ensemble_.shape[1]) + trajectory.append(ensemble_performance) + ensemble_size -= n_best + + for i in range(ensemble_size): + scores = np.zeros([predictions.shape[0]]) + for j, pred in enumerate(predictions): + ensemble.append(pred) + ensemble_prediction = np.mean(np.array(ensemble), axis=0) + scores[j] = calculate_score(labels, ensemble_prediction, + self.task_type, self.metric, + ensemble_prediction.shape[1]) + ensemble.pop() + best = np.nanargmax(scores) + ensemble.append(predictions[best]) + trajectory.append(scores[best]) + order.append(best) + + # Handle special case + if len(predictions) == 1: + break + + self.indices_ = np.array(order) + self.trajectory_ = np.array(trajectory) + self.train_score_ = trajectory[-1] + + def _calculate_weights(self): + ensemble_members = Counter(self.indices_).most_common() + weights = np.zeros((self.num_input_models_,), dtype=float) + for ensemble_member in ensemble_members: + weight = float(ensemble_member[1]) / self.ensemble_size + weights[ensemble_member[0]] = weight + + self.weights_ = weights + + def _sorted_initialization(self, predictions, labels, n_best): + perf = np.zeros([predictions.shape[0]]) + + for i, p in enumerate(predictions): + perf[i] = calculate_score(labels, predictions, self.task_type, + self.metric, predictions.shape[1]) + + indices = np.argsort(perf)[perf.shape[0] - n_best:] + return indices + + def _bagging(self, predictions, labels, fraction=0.5, n_bags=20): + """Rich Caruana's ensemble selection method with bagging.""" + raise ValueError('Bagging might not work with class-based interface!') + n_models = predictions.shape[0] + bag_size = int(n_models * fraction) + + order_of_each_bag = [] + for j in range(n_bags): + # Bagging a set of models + indices = sorted(random.sample(range(0, n_models), bag_size)) + bag = predictions[indices, :, :] + order, _ = self._fit(bag, labels) + order_of_each_bag.append(order) + + return np.array(order_of_each_bag) + + def predict(self, predictions): + for i, weight in enumerate(self.weights_): + predictions[i] *= weight + return np.sum(predictions, axis=0) + + def __str__(self): + return 'Ensemble Selection:\n\tTrajectory: %s\n\tMembers: %s' \ + '\n\tWeights: %s\n\tIdentifiers: %s' % \ + (' '.join(['%d: %5f' % (idx, performance) + for idx, performance in enumerate(self.trajectory_)]), + self.indices_, self.weights_, + ' '.join([str(identifier) for idx, identifier in + enumerate(self.identifiers_) + if self.weights_[idx] > 0])) + + def pprint_ensemble_string(self, models): + output = [] + sio = six.StringIO() + for i, weight in enumerate(self.weights_): + identifier = self.identifiers_[i] + model = models[identifier] + if weight > 0.0: + output.append((weight, model)) + + output.sort(reverse=True, key=lambda t: t[0]) + + sio.write("[") + for weight, model in output: + sio.write("(%f, %s),\n" % (weight, model)) + sio.write("]") + + return sio.getvalue() + + def get_model_identifiers(self): + return self.identifiers_ diff --git a/autosklearn/estimators.py b/autosklearn/estimators.py index 1cd7086c2a..629f7964c3 100644 --- a/autosklearn/estimators.py +++ b/autosklearn/estimators.py @@ -1,17 +1,15 @@ # -*- encoding: utf-8 -*- import os import random -import shutil import numpy as np -from os import stat import six -from autosklearn.automl import AutoML +import autosklearn.automl from autosklearn.constants import * -class AutoSklearnClassifier(AutoML): +class AutoSklearnClassifier(autosklearn.automl.AutoML): """This class implements the classification task. It must not be pickled! Parameters @@ -122,9 +120,7 @@ def __init__(self, # to superinit self._tmp_dir, self._output_dir = self._prepare_create_folders( tmp_dir=tmp_folder, - output_dir=output_folder, - shared_mode=shared_mode - ) + output_dir=output_folder) self._classes = [] self._n_classes = [] @@ -152,7 +148,7 @@ def __init__(self, shared_mode=shared_mode) @staticmethod - def _prepare_create_folders(tmp_dir, output_dir, shared_mode): + def _prepare_create_folders(tmp_dir, output_dir): random_number = random.randint(0, 10000) pid = os.getpid() @@ -161,22 +157,29 @@ def _prepare_create_folders(tmp_dir, output_dir, shared_mode): if output_dir is None: output_dir = '/tmp/autosklearn_output_%d_%d' % (pid, random_number) - if not os.path.exists(tmp_dir): + # Totally weird, this has to be created here, will be deleted in the + # first lines of fit(). If not there, creating the Backend object in the + # superclass will fail + try: os.makedirs(tmp_dir) - if not os.path.exists(output_dir): + except OSError: + pass + try: os.makedirs(output_dir) + except OSError: + pass return tmp_dir, output_dir def _create_output_directories(self): + try: + os.makedirs(self._tmp_dir) + except OSError: + pass try: os.makedirs(self._output_dir) - if self._output_dir != self._tmp_dir: - os.makedirs(self._tmp_dir) except OSError: - print("Did not create tmp/output_dir, already exists") - if not self._shared_mode: - raise + pass def fit(self, X, y, metric='acc_metric', @@ -202,9 +205,9 @@ def fit(self, X, y, `_. feat_type : list, optional (default=None) - List of Bools of `len(X.shape[1])` describing if an attribute is - continuous or categorical. Categorical attributes will - automatically 1Hot encoded. + List of str of `len(X.shape[1])` describing the attribute type. + Possible types are `Categorical` and `Numerical`. `Categorical` + attributes will be automatically One-Hot encoded. dataset_name : str, optional (default=None) Create nicer output. If None, a string will be determined by the @@ -268,7 +271,7 @@ def fit(self, X, y, feat_type, dataset_name) def predict(self, X): - """Predict class for X. + """Predict classes for X. Parameters ---------- @@ -276,14 +279,28 @@ def predict(self, X): Returns ------- - y : array of shape = [n_samples] or [n_samples, n_outputs] + y : array of shape = [n_samples] or [n_samples, n_labels] The predicted classes. """ return super(AutoSklearnClassifier, self).predict(X) + def predict_proba(self, X): + """Predict probabilities of classes for all samples X. + + Parameters + ---------- + X : array-like or sparse matrix of shape = [n_samples, n_features] + + Returns + ------- + y : array of shape = [n_samples, n_classes] or [n_samples, n_labels] + The predicted class probabilities. + """ + return super(AutoSklearnClassifier, self).predict_proba(X) + -class AutoSklearnRegressor(AutoML): +class AutoSklearnRegressor(autosklearn.automl.AutoML): def __init__(self, **kwargs): raise NotImplementedError() diff --git a/autosklearn/evaluation/abstract_evaluator.py b/autosklearn/evaluation/abstract_evaluator.py index 833c3bf14c..619a9595ea 100644 --- a/autosklearn/evaluation/abstract_evaluator.py +++ b/autosklearn/evaluation/abstract_evaluator.py @@ -6,21 +6,21 @@ import traceback import numpy as np -import lockfile -from autosklearn.pipeline.classification import SimpleClassificationPipeline -from autosklearn.pipeline.regression import SimpleRegressionPipeline +import autosklearn.pipeline.classification +import autosklearn.pipeline.regression from sklearn.dummy import DummyClassifier, DummyRegressor from autosklearn.constants import * from autosklearn.evaluation.util import get_new_run_num from autosklearn.util import Backend +from autosklearn.pipeline.implementations.util import convert_multioutput_multiclass_to_multilabel +from autosklearn.evaluation.util import calculate_score __all__ = [ 'AbstractEvaluator' ] - class MyDummyClassifier(DummyClassifier): def __init__(self, configuration, random_states): super(MyDummyClassifier, self).__init__(strategy="most_frequent") @@ -39,7 +39,10 @@ def fit_estimator(self, X, y, fit_params=None): def predict_proba(self, X, batch_size=1000): new_X = np.ones((X.shape[0], 1)) - return super(MyDummyClassifier, self).predict_proba(new_X) + probas = super(MyDummyClassifier, self).predict_proba(new_X) + probas = convert_multioutput_multiclass_to_multilabel(probas).astype( + np.float32) + return probas def estimator_supports_iterative_fit(self): return False @@ -63,7 +66,7 @@ def fit_estimator(self, X, y, fit_params=None): def predict(self, X, batch_size=1000): new_X = np.ones((X.shape[0], 1)) - return super(MyDummyRegressor, self).predict(new_X) + return super(MyDummyRegressor, self).predict(new_X).astype(np.float32) def estimator_supports_iterative_fit(self): return False @@ -73,16 +76,16 @@ class AbstractEvaluator(object): __metaclass__ = abc.ABCMeta @abc.abstractmethod - def __init__(self, Datamanager, configuration=None, + def __init__(self, Datamanager, output_dir, configuration=None, with_predictions=False, all_scoring_functions=False, seed=1, - output_dir=None, output_y_test=False, num_run=None): self.starttime = time.time() + self.output_dir = output_dir self.configuration = configuration self.D = Datamanager @@ -93,11 +96,6 @@ def __init__(self, Datamanager, configuration=None, self.task_type = Datamanager.info['task'] self.seed = seed - if output_dir is None: - self.output_dir = os.getcwd() - else: - self.output_dir = output_dir - self.output_y_test = output_y_test self.with_predictions = with_predictions self.all_scoring_functions = all_scoring_functions @@ -106,13 +104,15 @@ def __init__(self, Datamanager, configuration=None, if self.configuration is None: self.model_class = MyDummyRegressor else: - self.model_class = SimpleRegressionPipeline + self.model_class = \ + autosklearn.pipeline.regression.SimpleRegressionPipeline self.predict_function = self.predict_regression else: if self.configuration is None: self.model_class = MyDummyClassifier else: - self.model_class = SimpleClassificationPipeline + self.model_class = \ + autosklearn.pipeline.classification.SimpleClassificationPipeline self.predict_function = self.predict_proba if num_run is None: @@ -130,6 +130,24 @@ def fit(self): def predict(self): pass + def loss_and_predict(self): + Y_optimization_pred, Y_valid_pred, Y_test_pred = self.predict() + err = self.loss(self.Y_optimization, Y_optimization_pred) + return err, Y_optimization_pred, Y_valid_pred, Y_test_pred + + def loss(self, y_true, y_hat): + score = calculate_score( + y_true, y_hat, self.task_type, + self.metric, self.D.info['label_num'], + all_scoring_functions=self.all_scoring_functions) + + if hasattr(score, '__len__'): + err = {key: 1 - score[key] for key in score} + else: + err = 1 - score + + return err + # This function does everything necessary after the fitting is done: # predicting # saving the files for the ensembles_statistics @@ -149,13 +167,19 @@ def finish_up(self): print(traceback.format_exc()) print('Result for ParamILS: %s, %f, 1, %f, %d, %s' % - ('TIMEOUT', abs(self.duration), 1.0, self.seed, + ('TIMEOUT', abs(self.duration), 2.0, self.seed, 'No results were produced! Error is %s' % str(e))) def file_output(self): seed = os.environ.get('AUTOSKLEARN_SEED') - errs, Y_optimization_pred, Y_valid_pred, Y_test_pred = self.predict() + if self.configuration is None: + # Do not calculate the score when creating dummy predictions! + Y_optimization_pred, Y_valid_pred, Y_test_pred = self.predict() + errs = {self.D.info['metric']: 2.0} + else: + errs, Y_optimization_pred, Y_valid_pred, Y_test_pred = \ + self.loss_and_predict() if self.Y_optimization.shape[0] != Y_optimization_pred.shape[0]: return 2, "Targets %s and prediction %s don't have the same " \ @@ -186,7 +210,11 @@ def file_output(self): seed, num_run) self.duration = time.time() - self.starttime - err = errs[self.D.info['metric']] + if isinstance(errs, dict): + err = errs[self.D.info['metric']] + else: + err = errs + errs = {} additional_run_info = ';'.join(['%s: %s' % (METRIC_TO_STRING[metric] if metric in METRIC_TO_STRING else metric, value) @@ -195,20 +223,8 @@ def file_output(self): additional_run_info += ';' + 'num_run:' + num_run return err, additional_run_info - def predict_proba(self, X, model, task_type, Y_train=None): + def predict_proba(self, X, model, task_type, Y_train): Y_pred = model.predict_proba(X, batch_size=1000) - - if task_type == MULTILABEL_CLASSIFICATION: - Y_pred = np.hstack([Y_pred[i][:, -1].reshape((-1, 1)) - for i in range(len(Y_pred))]) - - elif task_type == BINARY_CLASSIFICATION: - if len(Y_pred.shape) != 1: - Y_pred = Y_pred[:, 1].reshape(-1, 1) - - elif task_type == MULTICLASS_CLASSIFICATION: - pass - Y_pred = self._ensure_prediction_array_sizes(Y_pred, Y_train) return Y_pred @@ -225,19 +241,18 @@ def _ensure_prediction_array_sizes(self, prediction, Y_train): if self.task_type == MULTICLASS_CLASSIFICATION and \ prediction.shape[1] < num_classes: - classes = list(np.unique(self.D.data['Y_train'])) - if num_classes == prediction.shape[1]: - return prediction - - if Y_train is not None: - classes = list(np.unique(Y_train)) + if Y_train is None: + raise ValueError('Y_train must not be None!') + classes = list(np.unique(Y_train)) mapping = dict() for class_number in range(num_classes): if class_number in classes: index = classes.index(class_number) mapping[index] = class_number - new_predictions = np.zeros((prediction.shape[0], num_classes)) + new_predictions = np.zeros((prediction.shape[0], num_classes), + dtype=np.float32) + for index in mapping: class_index = mapping[index] new_predictions[:, class_index] = prediction[:, index] diff --git a/autosklearn/evaluation/cv_evaluator.py b/autosklearn/evaluation/cv_evaluator.py index f060ed23d4..c2e1f5ddd3 100644 --- a/autosklearn/evaluation/cv_evaluator.py +++ b/autosklearn/evaluation/cv_evaluator.py @@ -3,7 +3,6 @@ from autosklearn.evaluation.resampling import get_CV_fold from autosklearn.evaluation.abstract_evaluator import AbstractEvaluator -from autosklearn.evaluation.util import calculate_score __all__ = [ @@ -12,21 +11,19 @@ class CVEvaluator(AbstractEvaluator): - - def __init__(self, Datamanager, configuration=None, + def __init__(self, Datamanager, output_dir, + configuration=None, with_predictions=False, all_scoring_functions=False, seed=1, - output_dir=None, output_y_test=False, cv_folds=10, num_run=None): super(CVEvaluator, self).__init__( - Datamanager, configuration, + Datamanager, output_dir, configuration, with_predictions=with_predictions, all_scoring_functions=all_scoring_functions, seed=seed, - output_dir=output_dir, output_y_test=output_y_test, num_run=num_run) @@ -115,6 +112,8 @@ def predict(self): # Average the predictions of several models if len(Y_valid_pred.shape) == 3: Y_valid_pred = np.nanmean(Y_valid_pred, axis=0) + else: + Y_valid_pred = None if self.X_test is not None: Y_test_pred = np.array([Y_test_pred[i] @@ -123,18 +122,9 @@ def predict(self): # Average the predictions of several models if len(Y_test_pred.shape) == 3: Y_test_pred = np.nanmean(Y_test_pred, axis=0) + else: + Y_test_pred = None self.Y_optimization = Y_targets - score = calculate_score( - Y_targets, Y_optimization_pred, self.task_type, self.metric, - self.D.info['label_num'], - all_scoring_functions=self.all_scoring_functions) - - if hasattr(score, '__len__'): - err = {key: 1 - score[key] for key in score} - else: - err = 1 - score - if self.with_predictions: - return err, Y_optimization_pred, Y_valid_pred, Y_test_pred - return err + return Y_optimization_pred, Y_valid_pred, Y_test_pred diff --git a/autosklearn/evaluation/holdout_evaluator.py b/autosklearn/evaluation/holdout_evaluator.py index 00c9599c4b..b111f5f743 100644 --- a/autosklearn/evaluation/holdout_evaluator.py +++ b/autosklearn/evaluation/holdout_evaluator.py @@ -4,7 +4,6 @@ from autosklearn.constants import * from autosklearn.evaluation.resampling import split_data from autosklearn.evaluation.abstract_evaluator import AbstractEvaluator -from autosklearn.evaluation.util import calculate_score __all__ = [ @@ -14,19 +13,18 @@ class HoldoutEvaluator(AbstractEvaluator): - def __init__(self, datamanager, configuration=None, + def __init__(self, datamanager, output_dir, + configuration=None, with_predictions=False, all_scoring_functions=False, seed=1, - output_dir=None, output_y_test=False, num_run=None): super(HoldoutEvaluator, self).__init__( - datamanager, configuration, + datamanager, output_dir, configuration, with_predictions=with_predictions, all_scoring_functions=all_scoring_functions, seed=seed, - output_dir=output_dir, output_y_test=output_y_test, num_run=num_run) @@ -36,7 +34,6 @@ def __init__(self, datamanager, configuration=None, datamanager.data['Y_train'], classification=classification) - def fit(self): self.model.fit(self.X_train, self.Y_train) @@ -56,31 +53,22 @@ def iterative_fit(self): self.file_output() n_iter += 2 - def predict(self): Y_optimization_pred = self.predict_function(self.X_optimization, - self.model, self.task_type) + self.model, self.task_type, + self.Y_train) if self.X_valid is not None: Y_valid_pred = self.predict_function(self.X_valid, self.model, - self.task_type) + self.task_type, + self.Y_train) else: Y_valid_pred = None if self.X_test is not None: Y_test_pred = self.predict_function(self.X_test, self.model, - self.task_type) + self.task_type, + self.Y_train) else: Y_test_pred = None - score = calculate_score( - self.Y_optimization, Y_optimization_pred, self.task_type, - self.metric, self.D.info['label_num'], - all_scoring_functions=self.all_scoring_functions) - - if hasattr(score, '__len__'): - err = {key: 1 - score[key] for key in score} - else: - err = 1 - score + return Y_optimization_pred, Y_valid_pred, Y_test_pred - if self.with_predictions: - return err, Y_optimization_pred, Y_valid_pred, Y_test_pred - return err diff --git a/autosklearn/evaluation/nested_cv_evaluator.py b/autosklearn/evaluation/nested_cv_evaluator.py index 1e03c1c694..17cfb51643 100644 --- a/autosklearn/evaluation/nested_cv_evaluator.py +++ b/autosklearn/evaluation/nested_cv_evaluator.py @@ -17,21 +17,20 @@ class NestedCVEvaluator(AbstractEvaluator): - def __init__(self, Datamanager, configuration=None, + def __init__(self, Datamanager, output_dir, + configuration=None, with_predictions=False, all_scoring_functions=False, seed=1, - output_dir=None, output_y_test=False, inner_cv_folds=5, outer_cv_folds=5, num_run=None): super(NestedCVEvaluator, self).__init__( - Datamanager, configuration, + Datamanager, output_dir, configuration, with_predictions=with_predictions, all_scoring_functions=all_scoring_functions, seed=seed, - output_dir=output_dir, output_y_test=output_y_test, num_run=num_run) @@ -92,8 +91,7 @@ def fit(self): def predict(self): # First, obtain the predictions for the ensembles, the validation and # the test set! - outer_scores = defaultdict(list) - inner_scores = defaultdict(list) + self.outer_scores_ = defaultdict(list) Y_optimization_pred = [None] * self.outer_cv_folds Y_targets = [None] * self.outer_cv_folds Y_valid_pred = [None] * self.outer_cv_folds @@ -131,9 +129,9 @@ def predict(self): all_scoring_functions=self.all_scoring_functions) if self.all_scoring_functions: for score_name in scores: - outer_scores[score_name].append(scores[score_name]) + self.outer_scores_[score_name].append(scores[score_name]) else: - outer_scores[self.metric].append(scores) + self.outer_scores_[self.metric].append(scores) Y_optimization_pred = np.concatenate( [Y_optimization_pred[i] for i in range(self.outer_cv_folds) @@ -160,7 +158,12 @@ def predict(self): self.Y_optimization = Y_targets - # Second, calculate the inner score + return Y_optimization_pred, Y_valid_pred, Y_test_pred + + def loss_and_predict(self): + Y_optimization_pred, Y_valid_pred, Y_test_pred = self.predict() + inner_scores = defaultdict(list) + for outer_fold in range(self.outer_cv_folds): for inner_fold in range(self.inner_cv_folds): inner_train_indices, inner_test_indices = self.inner_indices[ @@ -168,6 +171,7 @@ def predict(self): Y_test = self.Y_train[inner_test_indices] X_test = self.X_train[inner_test_indices] model = self.inner_models[outer_fold][inner_fold] + Y_hat = self.predict_function( X_test, model, self.task_type, Y_train=self.Y_train[inner_train_indices]) @@ -175,6 +179,7 @@ def predict(self): Y_test, Y_hat, self.task_type, self.metric, self.D.info['label_num'], all_scoring_functions=self.all_scoring_functions) + if self.all_scoring_functions: for score_name in scores: inner_scores[score_name].append(scores[score_name]) @@ -184,17 +189,15 @@ def predict(self): # Average the scores! if self.all_scoring_functions: inner_err = { - key: 1 - np.mean(inner_scores[key]) - for key in inner_scores - } + key: 1 - np.mean(inner_scores[key]) for key in inner_scores} outer_err = { - 'outer:%s' % METRIC_TO_STRING[key]: 1 - np.mean(outer_scores[ - key]) for key in outer_scores - } + 'outer:%s' % METRIC_TO_STRING[key]: + 1 - np.mean(self.outer_scores_[key]) + for key in self.outer_scores_ + } inner_err.update(outer_err) else: inner_err = 1 - np.mean(inner_scores[self.metric]) - if self.with_predictions: - return inner_err, Y_optimization_pred, Y_valid_pred, Y_test_pred - return inner_err + return inner_err, Y_optimization_pred, Y_valid_pred, Y_test_pred + diff --git a/autosklearn/evaluation/resampling.py b/autosklearn/evaluation/resampling.py index 7849191bbb..e7de273cad 100644 --- a/autosklearn/evaluation/resampling.py +++ b/autosklearn/evaluation/resampling.py @@ -93,10 +93,15 @@ def get_CV_fold(X, Y, fold, folds, shuffle=True, random_state=None): raise ValueError('The first dimension of the X and Y array must ' 'be equal.') - kf = sklearn.cross_validation.StratifiedKFold(Y, - n_folds=folds, - shuffle=shuffle, - random_state=random_state) + if len(Y.shape) > 1: + kf = sklearn.cross_validation.KFold(n=Y.shape[0], n_folds=folds, + shuffle=shuffle, + random_state=random_state) + else: + kf = sklearn.cross_validation.StratifiedKFold(Y, + n_folds=folds, + shuffle=shuffle, + random_state=random_state) for idx, split in enumerate(kf): if idx == fold: break diff --git a/autosklearn/evaluation/test_evaluator.py b/autosklearn/evaluation/test_evaluator.py index f3b5d52971..f5085fa76d 100644 --- a/autosklearn/evaluation/test_evaluator.py +++ b/autosklearn/evaluation/test_evaluator.py @@ -10,16 +10,16 @@ class TestEvaluator(AbstractEvaluator): - def __init__(self, Datamanager, configuration=None, + def __init__(self, Datamanager, output_dir, + configuration=None, with_predictions=False, all_scoring_functions=False, seed=1): super(TestEvaluator, self).__init__( - Datamanager, configuration, + Datamanager, output_dir, configuration, with_predictions=with_predictions, all_scoring_functions=all_scoring_functions, seed=seed, - output_dir=None, output_y_test=False, num_run='dummy') self.configuration = configuration diff --git a/autosklearn/evaluation/util.py b/autosklearn/evaluation/util.py index 1bc73a616b..c3628fd868 100644 --- a/autosklearn/evaluation/util.py +++ b/autosklearn/evaluation/util.py @@ -5,7 +5,7 @@ from autosklearn.constants import * from autosklearn.metrics import sanitize_array, \ - normalize_array, regression_metrics, classification_metrics + regression_metrics, classification_metrics, create_multiclass_solution __all__ = [ @@ -16,59 +16,31 @@ def calculate_score(solution, prediction, task_type, metric, num_classes, all_scoring_functions=False, logger=None): - if task_type == MULTICLASS_CLASSIFICATION: - # This used to crash on travis-ci; special treatment to find out why - # it crashed! - try: - solution_binary = np.zeros((prediction.shape[0], num_classes)) - except IndexError as e: - if logger is not None: - logger.error("Prediction shape: %s, solution " - "shape %s", prediction.shape, solution.shape) - raise e - - for i in range(solution_binary.shape[0]): - label = solution[i] - solution_binary[i, label] = 1 - solution = solution_binary - - elif task_type in [BINARY_CLASSIFICATION, REGRESSION]: - if len(solution.shape) == 1: - solution = solution.reshape((-1, 1)) - if task_type not in TASK_TYPES: raise NotImplementedError(task_type) - if solution.shape != prediction.shape: - raise ValueError('Solution shape %s != prediction shape %s' % - (solution.shape, prediction.shape)) - if all_scoring_functions: score = dict() if task_type in REGRESSION_TASKS: + # TODO put this into the regression metric itself cprediction = sanitize_array(prediction) for metric_ in REGRESSION_METRICS: - score[metric_] = regression_metrics.calculate_score(metric_, - solution, - cprediction) + score[metric_] = regression_metrics.calculate_score( + metric_, solution, cprediction) else: - csolution, cprediction = normalize_array(solution, prediction) for metric_ in CLASSIFICATION_METRICS: score[metric_] = classification_metrics.calculate_score( - metric_, csolution, cprediction, task_type) + metric_, solution, prediction, task_type) else: if task_type in REGRESSION_TASKS: + # TODO put this into the regression metric itself cprediction = sanitize_array(prediction) - score = regression_metrics.calculate_score(metric, - solution, - cprediction) + score = regression_metrics.calculate_score( + metric, solution, cprediction) else: - csolution, cprediction = normalize_array(solution, prediction) - score = classification_metrics.calculate_score(metric, - csolution, - cprediction, - task=task_type) + score = classification_metrics.calculate_score( + metric, solution, prediction, task=task_type) return score diff --git a/autosklearn/metalearning/metafeatures/metafeatures.py b/autosklearn/metalearning/metafeatures/metafeatures.py index f1b0a02a93..a4506792a9 100644 --- a/autosklearn/metalearning/metafeatures/metafeatures.py +++ b/autosklearn/metalearning/metafeatures/metafeatures.py @@ -189,14 +189,10 @@ def _calculate(self, X, y, categorical): def _calculate_sparse(self, X, y, categorical): missing = helper_functions.get_value("MissingValues") - num_missing = [] - if scipy.sparse.isspmatrix_csr(missing): - num_missing = [ - np.sum(missing.data[missing.indptr[i]:missing.indptr[i + 1]]) - for i in range(missing.shape[0])] - elif scipy.sparse.isspmatrix_csc(missing): - num_missing = [np.sum(missing.data[missing.indices == i]) - for i in range(missing.shape[0])] + new_missing = missing.tocsr() + num_missing = [ + np.sum(new_missing.data[new_missing.indptr[i]:new_missing.indptr[i + 1]]) + for i in range(new_missing.shape[0])] return float(np.sum([1 if num > 0 else 0 for num in num_missing])) @@ -217,13 +213,11 @@ def _calculate(self, X, y, categorical): def _calculate_sparse(self, X, y, categorical): missing = helper_functions.get_value("MissingValues") - num_missing = [] - if scipy.sparse.isspmatrix_csr(missing): - num_missing = [np.sum(missing.data[missing.indices == i]) - for i in range(missing.shape[1])] - elif scipy.sparse.isspmatrix_csc(missing): - num_missing = [np.sum(missing.data[missing.indptr[i]:missing.indptr[i+1]]) - for i in range(missing.shape[1])] + new_missing = missing.tocsc() + num_missing = [np.sum( + new_missing.data[new_missing.indptr[i]:new_missing.indptr[i+1]]) + for i in range(missing.shape[1])] + return float(np.sum([1 if num > 0 else 0 for num in num_missing])) @metafeatures.define("PercentageOfFeaturesWithMissingValues", @@ -406,9 +400,10 @@ def _calculate(self, X, y, categorical): def _calculate_sparse(self, X, y, categorical): symbols_per_column = [] - for i in range(X.shape[1]): + new_X = X.tocsc() + for i in range(new_X.shape[1]): if categorical[i]: - unique_values = np.unique(X.getcol(i).data) + unique_values = np.unique(new_X.getcol(i).data) num_unique = np.sum(np.isfinite(unique_values)) symbols_per_column.append(num_unique) return symbols_per_column diff --git a/autosklearn/metrics/__init__.py b/autosklearn/metrics/__init__.py index 43404a97a5..1d6300f796 100644 --- a/autosklearn/metrics/__init__.py +++ b/autosklearn/metrics/__init__.py @@ -22,7 +22,6 @@ # CONNECTION WITH THE USE OR PERFORMANCE OF SOFTWARE, DOCUMENTS, MATERIALS, # PUBLICATIONS, OR INFORMATION MADE AVAILABLE FOR THE CHALLENGE. -from .common import * from .classification_metrics import * from .util import * from .regression_metrics import * diff --git a/autosklearn/metrics/classification_metrics.py b/autosklearn/metrics/classification_metrics.py index cab122f16d..4e33ea6e0d 100644 --- a/autosklearn/metrics/classification_metrics.py +++ b/autosklearn/metrics/classification_metrics.py @@ -5,18 +5,21 @@ # normalize_array from __future__ import print_function - import numpy as np import scipy as sp - +import scipy.stats from autosklearn.constants import MULTICLASS_CLASSIFICATION, \ - BINARY_CLASSIFICATION, METRIC_TO_STRING -from autosklearn.metrics.common import binarize_predictions, \ - acc_stat, tied_rank -from autosklearn.metrics.util import log_loss, prior_log_loss + BINARY_CLASSIFICATION, METRIC_TO_STRING, MULTILABEL_CLASSIFICATION +from autosklearn.metrics.util import log_loss, prior_log_loss, \ + binarize_predictions, normalize_array, create_multiclass_solution def calculate_score(metric, solution, prediction, task): + if solution.shape[0] != prediction.shape[0]: + raise ValueError('Solution and prediction have different number of ' + 'samples: %d and %d' % (solution.shape[0], + prediction.shape[0])) + metric = METRIC_TO_STRING[metric] return globals()[metric](solution, prediction, task) @@ -34,28 +37,78 @@ def acc_metric(solution, prediction, task=BINARY_CLASSIFICATION): :param task: :return: """ + if task == BINARY_CLASSIFICATION: + if len(solution.shape) == 1: + # Solution won't be touched - no copy + solution = solution.reshape((-1, 1)) + elif len(solution.shape) == 2: + if solution.shape[1] > 1: + raise ValueError('Solution array must only contain one class ' + 'label, but contains %d' % solution.shape[1]) + else: + solution = solution.reshape((-1, 1)) + else: + raise ValueError('Solution.shape %s' % solution.shape) + + if len(prediction.shape) == 2: + if prediction.shape[1] > 2: + raise ValueError('A prediction array with probability values ' + 'for %d classes is not a binary ' + 'classification problem' % prediction.shape[1]) + # Prediction will be copied into a new binary array - no copy + prediction = prediction[:, 1].reshape((-1, 1)) + else: + raise ValueError('Invalid prediction shape %s' % prediction.shape) + + elif task == MULTICLASS_CLASSIFICATION: + if len(solution.shape) == 1: + solution = create_multiclass_solution(solution, prediction) + elif len(solution.shape ) == 2: + if solution.shape[1] > 1: + raise ValueError('Solution array must only contain one class ' + 'label, but contains %d' % solution.shape[1]) + else: + solution = create_multiclass_solution(solution.reshape((-1, 1)), + prediction) + else: + raise ValueError('Solution.shape %s' % solution.shape) + + elif task == MULTILABEL_CLASSIFICATION: + pass + else: + raise NotImplementedError('acc_metric does not support task type %s' + % task) - label_num = solution.shape[1] bin_predictions = binarize_predictions(prediction, task) - tn, fp, tp, fn = acc_stat(solution, bin_predictions) - # Bounding to avoid division by 0 - eps = np.float(1e-15) + + tn = np.sum(np.multiply((1 - solution), (1 - bin_predictions)), axis=0, + dtype=float) + fn = np.sum(np.multiply(solution, (1 - bin_predictions)), axis=0, + dtype=float) + tp = np.sum(np.multiply(solution, bin_predictions), axis=0, + dtype=float) + fp = np.sum(np.multiply((1 - solution), bin_predictions), axis=0, + dtype=float) + # Bounding to avoid division by 0, 1e-7 because of float32 + eps = np.float(1e-7) tp = np.sum(tp) fp = np.sum(fp) tn = np.sum(tn) fn = np.sum(fn) - if (task != MULTICLASS_CLASSIFICATION) or (label_num == 1): + if task in (BINARY_CLASSIFICATION, MULTILABEL_CLASSIFICATION): accuracy = (np.sum(tp) + np.sum(tn)) / ( np.sum(tp) + np.sum(fp) + np.sum(tn) + np.sum(fn) ) - else: + elif task == MULTICLASS_CLASSIFICATION: accuracy = np.sum(tp) / (np.sum(tp) + np.sum(fp)) - if (task != MULTICLASS_CLASSIFICATION) or (label_num == 1): + if task in (BINARY_CLASSIFICATION, MULTILABEL_CLASSIFICATION): base_accuracy = 0.5 # random predictions for binary case - else: + elif task == MULTICLASS_CLASSIFICATION: + label_num = solution.shape[1] base_accuracy = 1. / label_num + # Normalize: 0 for random, 1 for perfect score = (accuracy - base_accuracy) / sp.maximum(eps, (1 - base_accuracy)) return score @@ -72,24 +125,73 @@ def bac_metric(solution, prediction, task=BINARY_CLASSIFICATION): :param task: :return: """ - label_num = solution.shape[1] - score = np.zeros(label_num) + if task == BINARY_CLASSIFICATION: + if len(solution.shape) == 1: + # Solution won't be touched - no copy + solution = solution.reshape((-1, 1)) + elif len(solution.shape) == 2: + if solution.shape[1] > 1: + raise ValueError('Solution array must only contain one class ' + 'label, but contains %d' % solution.shape[1]) + else: + solution = solution.reshape((-1, 1)) + else: + raise ValueError('Solution.shape %s' % solution.shape) + + if len(prediction.shape) == 2: + if prediction.shape[1] > 2: + raise ValueError('A prediction array with probability values ' + 'for %d classes is not a binary ' + 'classification problem' % prediction.shape[1]) + # Prediction will be copied into a new binary array - no copy + prediction = prediction[:, 1].reshape((-1, 1)) + else: + raise ValueError('Invalid prediction shape %s' % prediction.shape) + + elif task == MULTICLASS_CLASSIFICATION: + if len(solution.shape) == 1: + solution = create_multiclass_solution(solution, prediction) + elif len(solution.shape) == 2: + if solution.shape[1] > 1: + raise ValueError('Solution array must only contain one class ' + 'label, but contains %d' % solution.shape[1]) + else: + solution = create_multiclass_solution(solution.reshape((-1, 1)), + prediction) + else: + raise ValueError('Solution.shape %s' % solution.shape) + elif task == MULTILABEL_CLASSIFICATION: + pass + else: + raise NotImplementedError('bac_metric does not support task type %s' + % task) bin_prediction = binarize_predictions(prediction, task) - [tn, fp, tp, fn] = acc_stat(solution, bin_prediction) + + + fn = np.sum(np.multiply(solution, (1 - bin_prediction)), axis=0, + dtype=float) + tp = np.sum(np.multiply(solution, bin_prediction), axis=0, dtype=float) # Bounding to avoid division by 0 eps = 1e-15 tp = sp.maximum(eps, tp) pos_num = sp.maximum(eps, tp + fn) tpr = tp / pos_num # true positive rate (sensitivity) - if (task != MULTICLASS_CLASSIFICATION) or (label_num == 1): + + if task in (BINARY_CLASSIFICATION, MULTILABEL_CLASSIFICATION): + tn = np.sum(np.multiply((1 - solution), (1 - bin_prediction)), + axis=0, dtype=float) + fp = np.sum(np.multiply((1 - solution), bin_prediction), axis=0, + dtype=float) tn = sp.maximum(eps, tn) neg_num = sp.maximum(eps, tn + fp) tnr = tn / neg_num # true negative rate (specificity) bac = 0.5 * (tpr + tnr) base_bac = 0.5 # random predictions for binary case - else: + elif task == MULTICLASS_CLASSIFICATION: + label_num = solution.shape[1] bac = tpr base_bac = 1. / label_num # random predictions for multiclass case + bac = np.mean(bac) # average over all classes # Normalize: 0 for random, 1 for perfect score = (bac - base_bac) / sp.maximum(eps, (1 - base_bac)) @@ -107,29 +209,59 @@ def pac_metric(solution, prediction, task=BINARY_CLASSIFICATION): :param task: :return: """ - debug_flag = False + if task == BINARY_CLASSIFICATION: + if len(solution.shape) == 1: + # Solution won't be touched - no copy + solution = solution.reshape((-1, 1)) + elif len(solution.shape) == 2: + if solution.shape[1] > 1: + raise ValueError('Solution array must only contain one class ' + 'label, but contains %d' % solution.shape[1]) + else: + solution = solution[:, 1] + else: + raise ValueError('Solution.shape %s' % solution.shape) + solution = solution.copy() + + if len(prediction.shape) == 2: + if prediction.shape[1] > 2: + raise ValueError('A prediction array with probability values ' + 'for %d classes is not a binary ' + 'classification problem' % prediction.shape[1]) + # Prediction will be copied into a new binary array - no copy + prediction = prediction[:, 1].reshape((-1, 1)) + else: + raise ValueError('Invalid prediction shape %s' % prediction.shape) + + elif task == MULTICLASS_CLASSIFICATION: + if len(solution.shape) == 1: + solution = create_multiclass_solution(solution, prediction) + elif len(solution.shape) == 2: + if solution.shape[1] > 1: + raise ValueError('Solution array must only contain one class ' + 'label, but contains %d' % solution.shape[1]) + else: + solution = create_multiclass_solution(solution.reshape((-1, 1)), + prediction) + else: + raise ValueError('Solution.shape %s' % solution.shape) + elif task == MULTILABEL_CLASSIFICATION: + solution = solution.copy() + else: + raise NotImplementedError('auc_metric does not support task type %s' + % task) + solution, prediction = normalize_array(solution, prediction.copy()) + [sample_num, label_num] = solution.shape if label_num == 1: task = BINARY_CLASSIFICATION - eps = 1e-15 - the_log_loss = log_loss(solution, prediction, task) + eps = 1e-7 # Compute the base log loss (using the prior probabilities) - pos_num = 1. * sum(solution) # float conversion! + pos_num = 1. * np.sum(solution, axis=0, dtype=float) # float conversion! frac_pos = pos_num / sample_num # prior proba of positive class the_base_log_loss = prior_log_loss(frac_pos, task) - # Alternative computation of the same thing (slower) - # Should always return the same thing except in the multi-label case - # For which the analytic solution makes more sense - if debug_flag: - base_prediction = np.empty(prediction.shape) - for k in range(sample_num): - base_prediction[k, :] = frac_pos - base_log_loss = log_loss(solution, base_prediction, task) - diff = np.array(abs(the_base_log_loss - base_log_loss)) - if len(diff.shape) > 0: - diff = max(diff) - if (diff) > 1e-10: - print('Arrggh {} != {}'.format(the_base_log_loss, base_log_loss)) + the_log_loss = log_loss(solution, prediction, task) + # Exponentiate to turn into an accuracy-like score. # In the multi-label case, we need to average AFTER taking the exp # because it is an NL operation @@ -153,12 +285,53 @@ def f1_metric(solution, prediction, task=BINARY_CLASSIFICATION): :param task: :return: """ - label_num = solution.shape[1] - score = np.zeros(label_num) + if task == BINARY_CLASSIFICATION: + if len(solution.shape) == 1: + # Solution won't be touched - no copy + solution = solution.reshape((-1, 1)) + elif len(solution.shape) == 2: + if solution.shape[1] > 1: + raise ValueError('Solution array must only contain one class ' + 'label, but contains %d' % solution.shape[1]) + else: + solution = solution.reshape((-1, 1)) + else: + raise ValueError('Solution.shape %s' % solution.shape) + + if len(prediction.shape) == 2: + if prediction.shape[1] > 2: + raise ValueError('A prediction array with probability values ' + 'for %d classes is not a binary ' + 'classification problem' % prediction.shape[1]) + # Prediction will be copied into a new binary array - no copy + prediction = prediction[:, 1].reshape((-1, 1)) + else: + raise ValueError('Invalid prediction shape %s' % prediction.shape) + + elif task == MULTICLASS_CLASSIFICATION: + if len(solution.shape) == 1: + solution = create_multiclass_solution(solution, prediction) + elif len(solution.shape) == 2: + if solution.shape[1] > 1: + raise ValueError('Solution array must only contain one class ' + 'label, but contains %d' % solution.shape[1]) + else: + solution = create_multiclass_solution(solution.reshape((-1, 1)), + prediction) + else: + raise ValueError('Solution.shape %s' % solution.shape) + elif task == MULTILABEL_CLASSIFICATION: + pass + else: + raise NotImplementedError('f1_metric does not support task type %s' + % task) bin_prediction = binarize_predictions(prediction, task) - [tn, fp, tp, fn] = acc_stat(solution, bin_prediction) + # Bounding to avoid division by 0 eps = 1e-15 + fn = np.sum(np.multiply(solution, (1 - bin_prediction)), axis=0, dtype=float) + tp = np.sum(np.multiply(solution, bin_prediction), axis=0, dtype=float) + fp = np.sum(np.multiply((1 - solution), bin_prediction), axis=0, dtype=float) true_pos_num = sp.maximum(eps, tp + fn) found_pos_num = sp.maximum(eps, tp + fp) tp = sp.maximum(eps, tp) @@ -170,7 +343,7 @@ def f1_metric(solution, prediction, task=BINARY_CLASSIFICATION): # Average over all classes f1 = np.mean(f1) # Normalize: 0 for random, 1 for perfect - if (task != MULTICLASS_CLASSIFICATION) or (label_num == 1): + if task in (BINARY_CLASSIFICATION, MULTILABEL_CLASSIFICATION): # How to choose the "base_f1"? # For the binary/multilabel classification case, one may want to predict all 1. # In that case tpr = 1 and ppv = frac_pos. f1 = 2 * frac_pos / (1+frac_pos) @@ -187,7 +360,8 @@ def f1_metric(solution, prediction, task=BINARY_CLASSIFICATION): # For the multiclass case, this is not possible (though it does not make much sense to # use f1 for multiclass problems), so the best would be to assign values at random to get # tpr=ppv=frac_pos, where frac_pos=1/label_num - else: + elif task == MULTICLASS_CLASSIFICATION: + label_num = solution.shape[1] base_f1 = 1. / label_num score = (f1 - base_f1) / sp.maximum(eps, (1 - base_f1)) return score @@ -208,20 +382,61 @@ def auc_metric(solution, prediction, task=BINARY_CLASSIFICATION): :param task: :return: """ - # auc = metrics.roc_auc_score(solution, prediction, average=None) - # There is a bug in metrics.roc_auc_score: auc([1,0,0],[1e-10,0,0]) - # incorrect + if task == BINARY_CLASSIFICATION: + if len(solution.shape) == 1: + # Solution won't be touched - no copy + solution = solution.reshape((-1, 1)) + elif len(solution.shape) == 2: + if solution.shape[1] > 1: + raise ValueError('Solution array must only contain one class ' + 'label, but contains %d' % solution.shape[1]) + else: + solution = solution[:, 1] + else: + raise ValueError('Solution.shape %s' % solution.shape) + solution = solution.copy() + + if len(prediction.shape) == 2: + if prediction.shape[1] > 2: + raise ValueError('A prediction array with probability values ' + 'for %d classes is not a binary ' + 'classification problem' % prediction.shape[1]) + # Prediction will be copied into a new binary array - no copy + prediction = prediction[:, 1].reshape((-1, 1)) + else: + raise ValueError('Invalid prediction shape %s' % prediction.shape) + + elif task == MULTICLASS_CLASSIFICATION: + if len(solution.shape) == 1: + solution = create_multiclass_solution(solution, prediction) + elif len(solution.shape) == 2: + if solution.shape[1] > 1: + raise ValueError('Solution array must only contain one class ' + 'label, but contains %d' % solution.shape[1]) + else: + solution = create_multiclass_solution(solution.reshape((-1, 1)), + prediction) + else: + raise ValueError('Solution.shape %s' % solution.shape) + elif task == MULTILABEL_CLASSIFICATION: + solution = solution.copy() + else: + raise NotImplementedError('auc_metric does not support task type %s' + % task) + + solution, prediction = normalize_array(solution, prediction.copy()) + label_num = solution.shape[1] auc = np.empty(label_num) for k in range(label_num): - r_ = tied_rank(prediction[:, k]) + r_ = scipy.stats.rankdata(prediction[:, k]) s_ = solution[:, k] if sum(s_) == 0: print( 'WARNING: no positive class example in class {}'.format(k + 1)) - npos = sum(s_ == 1) - nneg = sum(s_ < 1) - auc[k] = (sum(r_[s_ == 1]) - npos * (npos + 1) / 2) / (nneg * npos) + npos = np.sum(s_ == 1) + nneg = np.sum(s_ < 1) + auc[k] = (np.sum(r_[s_ == 1]) - npos * (npos + 1) / 2) / (nneg * npos) + auc[~np.isfinite(auc)] = 0 return 2 * np.mean(auc) - 1 -# END CLASSIFICATION METRICS diff --git a/autosklearn/metrics/common.py b/autosklearn/metrics/common.py deleted file mode 100644 index 25a7f83ed9..0000000000 --- a/autosklearn/metrics/common.py +++ /dev/null @@ -1,83 +0,0 @@ -# -*- encoding: utf-8 -*- -from __future__ import print_function - -import numpy as np - -from autosklearn.constants import * - - -def binarize_predictions(array, task=BINARY_CLASSIFICATION): - """ - Turn predictions into decisions {0,1} by selecting the class with largest - score for multi class problems and thresh holding at 0.5 for other cases. - - :param array: - :param task: - :return: - """ - # add a very small random value as tie breaker (a bit bad because - # this changes the score every time) - # so to make sure we get the same result every time, we seed it - # eps = 1e-15 - # np.random.seed(sum(array.shape)) - # array = array + eps*np.random.rand(array.shape[0],array.shape[1]) - bin_array = np.zeros(array.shape) - if (task != MULTICLASS_CLASSIFICATION) or (array.shape[1] == 1): - bin_array[array >= 0.5] = 1 - else: - sample_num = array.shape[0] - for i in range(sample_num): - j = np.argmax(array[i, :]) - bin_array[i, j] = 1 - return bin_array - - -def acc_stat(solution, prediction): - """ - Return accuracy statistics TN, FP, TP, FN Assumes that solution and - prediction are binary 0/1 vectors. - :param solution: - :param prediction: - :return: - """ - # This uses floats so the results are floats - tn_value = sum(np.multiply((1 - solution), (1 - prediction))) - fn_value = sum(np.multiply(solution, (1 - prediction))) - tp_value = sum(np.multiply(solution, prediction)) - fp_value = sum(np.multiply((1 - solution), prediction)) - return tn_value, fp_value, tp_value, fn_value - - -def tied_rank(a): - """Return the ranks (with base 1) of a list resolving ties by averaging. - - This works for numpy arrays. - - """ - m = len(a) - # Sort a in ascending order (sa=sorted vals, i=indices) - i = a.argsort() - sa = a[i] - # Find unique values - uval = np.unique(a) - # Test whether there are ties - R = np.arange(m, dtype=float) + 1 # Ranks with base 1 - if len(uval) != m: - # Average the ranks for the ties - oldval = sa[0] - newval = sa[0] - k0 = 0 - for k in range(1, m): - newval = sa[k] - if newval == oldval: - # moving average - R[k0:k + 1] = R[k - 1] * (k - k0) / (k - k0 + - 1) + R[k] / (k - k0 + 1) - else: - k0 = k - oldval = newval - # Invert the index - S = np.empty(m) - S[i] = R - return S - diff --git a/autosklearn/metrics/regression_metrics.py b/autosklearn/metrics/regression_metrics.py index 4e60fbeca6..c5e92d6e2d 100644 --- a/autosklearn/metrics/regression_metrics.py +++ b/autosklearn/metrics/regression_metrics.py @@ -9,12 +9,19 @@ from autosklearn.constants import REGRESSION, METRIC_TO_STRING -def calculate_score(metric, solution, prediction): +def calculate_score(metric, solution, prediction, copy=True): + if solution.shape[0] != prediction.shape[0]: + raise ValueError('Solution and prediction have different number of ' + 'samples: %d and %d' % (solution.shape[0], + prediction.shape[0])) + + if len(solution.shape) == 1: + solution = solution.reshape((-1, 1)) metric = METRIC_TO_STRING[metric] - return globals()[metric](solution, prediction) + return globals()[metric](solution, prediction, copy) -def r2_metric(solution, prediction, task=REGRESSION): +def r2_metric(solution, prediction, task=REGRESSION, copy=True): """ 1 - Mean squared error divided by variance :param solution: @@ -23,12 +30,12 @@ def r2_metric(solution, prediction, task=REGRESSION): :return: """ mse = np.mean((solution - prediction) ** 2, axis=0) - var = np.mean((solution - np.mean(solution)) ** 2, axis=0) + var = np.mean((solution - np.mean(solution, axis=0)) ** 2, axis=0) score = 1 - mse / var return np.mean(score) -def a_metric(solution, prediction, task=REGRESSION): +def a_metric(solution, prediction, task=REGRESSION, copy=True): """ 1 - Mean absolute error divided by mean absolute deviation :param solution: @@ -36,8 +43,9 @@ def a_metric(solution, prediction, task=REGRESSION): :param task: :return: """ - mae = np.mean(np.abs(solution - prediction)) # mean absolute error + mae = np.mean(np.abs(solution - prediction), axis=0) # mean absolute error mad = np.mean( - np.abs(solution - np.mean(solution))) # mean absolute deviation + np.abs(solution - np.mean(solution, axis=0)), axis=0) # mean absolute + # deviation score = 1 - mae / mad return np.mean(score) diff --git a/autosklearn/metrics/util.py b/autosklearn/metrics/util.py index a627776b66..4638e5a8eb 100644 --- a/autosklearn/metrics/util.py +++ b/autosklearn/metrics/util.py @@ -1,12 +1,9 @@ # -*- encoding: utf-8 -*- from __future__ import print_function - import numpy as np import scipy as sp - from autosklearn.constants import MULTICLASS_CLASSIFICATION, \ BINARY_CLASSIFICATION -from autosklearn.metrics.common import binarize_predictions def sanitize_array(array): @@ -16,10 +13,6 @@ def sanitize_array(array): :return: """ a = np.ravel(array) - #maxi = np.nanmax((filter(lambda x: x != float('inf'), a)) - # ) # Max except NaN and Inf - #mini = np.nanmin((filter(lambda x: x != float('-inf'), a)) - # ) # Mini except NaN and Inf maxi = np.nanmax(a[np.isfinite(a)]) mini = np.nanmin(a[np.isfinite(a)]) array[array == float('inf')] = maxi @@ -44,10 +37,6 @@ def normalize_array(solution, prediction): """ # Binarize solution sol = np.ravel(solution) # convert to 1-d array - #maxi = np.nanmax((filter(lambda x: x != float('inf'), sol)) - # ) # Max except NaN and Inf - #mini = np.nanmin((filter(lambda x: x != float('-inf'), sol)) - # ) # Mini except NaN and Inf maxi = np.nanmax(sol[np.isfinite(sol)]) mini = np.nanmin(sol[np.isfinite(sol)]) if maxi == mini: @@ -55,47 +44,53 @@ def normalize_array(solution, prediction): return [solution, prediction] diff = maxi - mini mid = (maxi + mini) / 2. - new_solution = np.copy(solution) - new_solution[solution >= mid] = 1 - new_solution[solution < mid] = 0 + + solution[solution >= mid] = 1 + solution[solution < mid] = 0 # Normalize and threshold predictions (takes effect only if solution not # in {0, 1}) - new_prediction = (np.copy(prediction) - float(mini)) / float(diff) + + prediction -= float(mini) + prediction /= float(diff) + # and if predictions exceed the bounds [0, 1] - new_prediction[new_prediction > 1] = 1 - new_prediction[new_prediction < 0] = 0 + prediction[prediction > 1] = 1 + prediction[prediction < 0] = 0 # Make probabilities smoother # new_prediction = np.power(new_prediction, (1./10)) - return [new_solution, new_prediction] + return [solution, prediction] def log_loss(solution, prediction, task=BINARY_CLASSIFICATION): """Log loss for binary and multiclass.""" [sample_num, label_num] = solution.shape - eps = 1e-15 + # Lower gives problems with float32! + eps = 0.00000003 - pred = np.copy(prediction - ) # beware: changes in prediction occur through this - sol = np.copy(solution) if (task == MULTICLASS_CLASSIFICATION) and (label_num > 1): # Make sure the lines add up to one for multi-class classification norma = np.sum(prediction, axis=1) for k in range(sample_num): - pred[k, :] /= sp.maximum(norma[k], eps) - # Make sure there is a single label active per line for multi-class - # classification - sol = binarize_predictions(solution, task=MULTICLASS_CLASSIFICATION) + prediction[k, :] /= sp.maximum(norma[k], eps) + + sample_num = solution.shape[0] + for i in range(sample_num): + j = np.argmax(solution[i, :]) + solution[i, :] = 0 + solution[i, j] = 1 + + solution = solution.astype(np.int32, copy=False) # For the base prediction, this solution is ridiculous in the # multi-label case # Bounding of predictions to avoid log(0),1/0,... - pred = sp.minimum(1 - eps, sp.maximum(eps, pred)) + prediction = sp.minimum(1 - eps, sp.maximum(eps, prediction)) # Compute the log loss - pos_class_log_loss = -np.mean(sol * np.log(pred), axis=0) + pos_class_log_loss = -np.mean(solution * np.log(prediction), axis=0) if (task != MULTICLASS_CLASSIFICATION) or (label_num == 1): # The multi-label case is a bunch of binary problems. # The second class is the negative class for each column. - neg_class_log_loss = -np.mean((1 - sol) * np.log(1 - pred), axis=0) + neg_class_log_loss = -np.mean((1 - solution) * np.log(1 - prediction), axis=0) log_loss = pos_class_log_loss + neg_class_log_loss # Each column is an independent problem, so we average. # The probabilities in one line do not add up to one. @@ -139,3 +134,41 @@ def prior_log_loss(frac_pos, task=BINARY_CLASSIFICATION): base_log_loss = np.sum(pos_class_log_loss_) return base_log_loss + +def binarize_predictions(array, task=BINARY_CLASSIFICATION): + """ + Turn predictions into decisions {0,1} by selecting the class with largest + score for multi class problems and thresh holding at 0.5 for other cases. + + :param array: + :param task: + :return: + """ + # add a very small random value as tie breaker (a bit bad because + # this changes the score every time) + # so to make sure we get the same result every time, we seed it + # eps = 1e-15 + # np.random.seed(sum(array.shape)) + # array = array + eps*np.random.rand(array.shape[0],array.shape[1]) + bin_array = np.zeros(array.shape, dtype=np.int32) + if (task != MULTICLASS_CLASSIFICATION) or (array.shape[1] == 1): + bin_array[array >= 0.5] = 1 + else: + sample_num = array.shape[0] + for i in range(sample_num): + j = np.argmax(array[i, :]) + bin_array[i, j] = 1 + return bin_array + + +def create_multiclass_solution(solution, prediction): + solution_binary = np.zeros((prediction.shape), dtype=np.int32) + + for i in range(solution_binary.shape[0]): + try: + solution_binary[i, solution[i]] = 1 + except IndexError as e: + raise IndexError('too many indices to array. array has shape %s, ' + 'indices are "%s %s"' % + (solution_binary.shape, str(i), solution[i])) + return solution_binary \ No newline at end of file diff --git a/autosklearn/pipeline/base.py b/autosklearn/pipeline/base.py index 1aa94770b6..64c55c2bb1 100644 --- a/autosklearn/pipeline/base.py +++ b/autosklearn/pipeline/base.py @@ -75,11 +75,8 @@ def pre_transform(self, X, y, fit_params=None, init_params=None): method, param = init_param.split(":") init_params_per_method[method][param] = value - # List of preprocessing steps (and their order) - preprocessors_names = [preprocessor[0] for - preprocessor in self._get_pipeline()[:-1]] - - for preproc_name in preprocessors_names: + # Instantiate preprocessor objects + for preproc_name, preproc_class in self._get_pipeline()[:-1]: preproc_params = {} for instantiated_hyperparameter in self.configuration: if not instantiated_hyperparameter.startswith( @@ -92,20 +89,11 @@ def pre_transform(self, X, y, fit_params=None, init_params=None): preproc_params[name_] = self.configuration[ instantiated_hyperparameter] - if preproc_name in \ - components.feature_preprocessing_components._preprocessors: - _preprocessors = components.feature_preprocessing_components._preprocessors - elif preproc_name in \ - components.data_preprocessing_components._preprocessors: - _preprocessors = components.data_preprocessing_components._preprocessors - else: - raise ValueError(preproc_name) - - preprocessor_object = _preprocessors[preproc_name]( + preprocessor_object = preproc_class( random_state=self.random_state, **preproc_params) # Ducktyping... - if hasattr(preprocessor_object, 'get_components'): + if hasattr(preproc_class, 'get_components'): preprocessor_object = preprocessor_object.choice steps.append((preproc_name, preprocessor_object)) @@ -183,16 +171,17 @@ def predict(self, X, batch_size=None): # TODO check if fit() was called before... if batch_size is None: - return self.pipeline_.predict(X) + return self.pipeline_.predict(X).astype(self._output_dtype) else: if type(batch_size) is not int or batch_size <= 0: raise Exception("batch_size must be a positive integer") else: if self.num_targets == 1: - y = np.zeros((X.shape[0],)) + y = np.zeros((X.shape[0],), dtype=self._output_dtype) else: - y = np.zeros((X.shape[0], self.num_targets)) + y = np.zeros((X.shape[0], self.num_targets), + dtype=self._output_dtype) # Copied and adapted from the scikit-learn GP code for k in range(max(1, int(np.ceil(float(X.shape[0]) / diff --git a/autosklearn/pipeline/classification.py b/autosklearn/pipeline/classification.py index a41cc49125..df28224676 100644 --- a/autosklearn/pipeline/classification.py +++ b/autosklearn/pipeline/classification.py @@ -8,7 +8,12 @@ from HPOlibConfigSpace.configuration_space import ConfigurationSpace from HPOlibConfigSpace.forbidden import ForbiddenEqualsClause, ForbiddenAndConjunction -from autosklearn.pipeline import components as components +from autosklearn.pipeline.components import classification as \ + classification_components +from autosklearn.pipeline.components import data_preprocessing as \ + data_preprocessing_components +from autosklearn.pipeline.components import feature_preprocessing as \ + feature_preprocessing_components from autosklearn.pipeline.base import BasePipeline from autosklearn.pipeline.constants import SPARSE from autosklearn.pipeline.components.data_preprocessing.balancing import Balancing @@ -62,6 +67,11 @@ class SimpleClassificationPipeline(ClassifierMixin, BasePipeline): """ + def __init__(self, configuration, random_state=None): + self._output_dtype = np.int32 + super(SimpleClassificationPipeline, self).__init__(configuration, + random_state) + def pre_transform(self, X, y, fit_params=None, init_params=None): self.num_targets = 1 if len(y.shape) == 1 else y.shape[1] @@ -111,7 +121,8 @@ def predict_proba(self, X, batch_size=None): # Binary or Multiclass if len(target) == 1: - y = np.zeros((X.shape[0], target.shape[1])) + y = np.zeros((X.shape[0], target.shape[1]), + dtype=np.float32) for k in range(max(1, int(np.ceil(float(X.shape[0]) / batch_size)))): @@ -119,10 +130,12 @@ def predict_proba(self, X, batch_size=None): batch_to = min([(k + 1) * batch_size, X.shape[0]]) y[batch_from:batch_to] = \ self.predict_proba(X[batch_from:batch_to], - batch_size=None) + batch_size=None).\ + astype(np.float32) elif len(target) > 1: - y = [np.zeros((X.shape[0], target[i].shape[1])) + y = [np.zeros((X.shape[0], target[i].shape[1]), + dtype=np.float32) for i in range(len(target))] for k in range(max(1, int(np.ceil(float(X.shape[0]) / @@ -131,7 +144,8 @@ def predict_proba(self, X, batch_size=None): batch_to = min([(k + 1) * batch_size, X.shape[0]]) predictions = \ self.predict_proba(X[batch_from:batch_to], - batch_size=None) + batch_size=None).\ + astype(np.float32) for i in range(len(target)): y[i][batch_from:batch_to] = predictions[i] @@ -275,21 +289,21 @@ def _get_pipeline(cls): # Add the always active preprocessing components steps.extend( [["one_hot_encoding", - components.data_preprocessing._preprocessors['one_hot_encoding']], + data_preprocessing_components._preprocessors['one_hot_encoding']], ["imputation", - components.data_preprocessing._preprocessors['imputation']], + data_preprocessing_components._preprocessors['imputation']], ["rescaling", - components.data_preprocessing._preprocessors['rescaling']], + data_preprocessing_components._preprocessors['rescaling']], ["balancing", - components.data_preprocessing._preprocessors['balancing']]]) + data_preprocessing_components._preprocessors['balancing']]]) # Add the preprocessing component steps.append(['preprocessor', - components.feature_preprocessing._preprocessors['preprocessor']]) + feature_preprocessing_components.FeaturePreprocessorChoice]) # Add the classification component steps.append(['classifier', - components.classification_components._classifiers['classifier']]) + classification_components.ClassifierChoice]) return steps def _get_estimator_hyperparameter_name(self): diff --git a/autosklearn/pipeline/components/__init__.py b/autosklearn/pipeline/components/__init__.py index 3312b4d12a..e69de29bb2 100644 --- a/autosklearn/pipeline/components/__init__.py +++ b/autosklearn/pipeline/components/__init__.py @@ -1,46 +0,0 @@ -"""auto-sklearn can be easily extended with new classification and -preprocessing methods. At import time, auto-sklearn checks the directory -``autosklearn/pipeline/components/classification`` for classification -algorithms and ``autosklearn/pipeline/components/preprocessing`` for -preprocessing algorithms. To be found, the algorithm must be provide a class -implementing one of the given -interfaces. - -Coding Guidelines -================= -Please try to adhere to the `scikit-learn coding guidelines `_. - -Own Implementation of Algorithms -================================ -When adding new algorithms, it is possible to implement it directly in the -fit/predict/transform method of a component. We do not recommend this, -but rather recommend to implement an algorithm in a scikit-learn compatible -way (`see here `_). -Such an implementation should then be put into the `implementation` directory. -and can then be easily wrapped with to become a component in auto-sklearn. - -Classification -============== - -The SimpleClassificationPipeline provides an interface for -Classification Algorithms inside auto-sklearn. It provides four important -functions. Two of them, -:meth:`get_hyperparameter_search_space() ` -and -:meth:`get_properties() ` -are used to -automatically create a valid configuration space. The other two, -:meth:`fit() ` and -:meth:`predict() ` -are an implementation of the `scikit-learn predictor API `_. - -Preprocessing -=============""" - -from . import classification as classification_components -from . import regression as regression_components -from . import feature_preprocessing as feature_preprocessing_components -from . import data_preprocessing as data_preprocessing_components - - - diff --git a/autosklearn/pipeline/components/base.py b/autosklearn/pipeline/components/base.py index ea1df4b719..f4de3c8aa7 100644 --- a/autosklearn/pipeline/components/base.py +++ b/autosklearn/pipeline/components/base.py @@ -1,9 +1,72 @@ +from collections import OrderedDict +import importlib +import inspect +import pkgutil +import sys + + +def find_components(package, directory, base_class): + components = OrderedDict() + + for module_loader, module_name, ispkg in pkgutil.iter_modules( + [directory]): + full_module_name = "%s.%s" % (package, module_name) + if full_module_name not in sys.modules and not ispkg: + module = importlib.import_module(full_module_name) + + for member_name, obj in inspect.getmembers(module): + if inspect.isclass( + obj) and base_class in obj.__bases__: + # TODO test if the obj implements the interface + # Keep in mind that this only instantiates the ensemble_wrapper, + # but not the real target classifier + classifier = obj + components[module_name] = classifier + + return components + + +class ThirdPartyComponents(object): + def __init__(self, base_class): + self.base_class = base_class + self.components = OrderedDict() + + def add_component(self, obj): + if inspect.isclass(obj) and self.base_class in obj.__bases__: + name = obj.__name__ + classifier = obj + else: + raise TypeError('add_component works only with a subclass of %s' % + str(self.base_class)) + + properties = set(classifier.get_properties()) + should_be_there = set(['shortname', + 'name', + 'handles_regression', + 'handles_classification', + 'handles_multiclass', + 'handles_multilabel', + 'is_deterministic', + 'input', 'output']) + for property in properties: + if property not in should_be_there: + raise ValueError('Property %s must not be specified for ' + 'algorithm %s. Only the following properties ' + 'can be specified: %s' % + (property, name, str(should_be_there))) + for property in should_be_there: + if property not in properties: + raise ValueError('Property %s not specified for algorithm %s') + + self.components[name] = classifier + print(name, classifier) + + class AutoSklearnClassificationAlgorithm(object): """Provide an abstract interface for classification algorithms in auto-sklearn. - Make a subclass of this and put it into the directory - `autosklearn/pipeline/components/classification` to make it available.""" + See :ref:`extending` for more information.""" def __init__(self): self.estimator = None @@ -11,30 +74,14 @@ def __init__(self): @staticmethod def get_properties(dataset_properties=None): - """Get the properties of the underlying algorithm. These are: - - * Short name - * Full name - * Can the algorithm handle missing values? - (handles_missing_values : {True, False}) - * Can the algorithm handle nominal features? - (handles_nominal_features : {True, False}) - * Can the algorithm handle numerical features? - (handles_numerical_features : {True, False}) - * Does the algorithm prefer data scaled in [0,1]? - (prefers_data_scaled : {True, False} - * Does the algorithm prefer data normalized to 0-mean, 1std? - (prefers_data_normalized : {True, False} - * Can the algorithm handle multiclass-classification problems? - (handles_multiclass : {True, False}) - * Can the algorithm handle multilabel-classification problems? - (handles_multilabel : {True, False} - * Is the algorithm deterministic for a given seed? - (is_deterministic : {True, False) - * Can the algorithm handle sparse data? - (handles_sparse : {True, False} - * What are the preferred types of the data array? - (preferred_dtype : list of tuples) + """Get the properties of the underlying algorithm. + + Find more information at :ref:`get_properties` + + Parameters + ---------- + + dataset_properties : dict, optional (default=None) Returns ------- @@ -46,6 +93,11 @@ def get_properties(dataset_properties=None): def get_hyperparameter_search_space(dataset_properties=None): """Return the configuration space of this classification algorithm. + Parameters + ---------- + + dataset_properties : dict, optional (default=None) + Returns ------- HPOlibConfigspace.configuration_space.ConfigurationSpace @@ -62,7 +114,7 @@ def fit(self, X, y): X : array-like, shape = (n_samples, n_features) Training data - y : array-like, shape = [n_samples] + y : array-like, shape = (n_samples,) or shape = (n_sample, n_labels) Returns ------- @@ -86,7 +138,7 @@ def predict(self, X): Returns ------- - array, shape = (n_samples,) + array, shape = (n_samples,) or shape = (n_samples, n_labels) Returns the predicted values Notes @@ -127,42 +179,21 @@ class AutoSklearnPreprocessingAlgorithm(object): """Provide an abstract interface for preprocessing algorithms in auto-sklearn. - Make a subclass of this and put it into the directory - `autosklearn/pipeline/components/preprocessing` to make it available.""" + See :ref:`extending` for more information.""" def __init__(self): self.preprocessor = None @staticmethod def get_properties(dataset_properties=None): - """Get the properties of the underlying algorithm. These are: - - * Short name - * Full name - * Can the algorithm handle missing values? - (handles_missing_values : {True, False}) - * Can the algorithm handle nominal features? - (handles_nominal_features : {True, False}) - * Can the algorithm handle numerical features? - (handles_numerical_features : {True, False}) - * Does the algorithm prefer data scaled in [0,1]? - (prefers_data_scaled : {True, False} - * Does the algorithm prefer data normalized to 0-mean, 1std? - (prefers_data_normalized : {True, False} - * Can preprocess regression data? - (handles_regression : {True, False} - * Can preprocess classification data? - (handles_classification : {True, False} - * Can the algorithm handle multiclass-classification problems? - (handles_multiclass : {True, False}) - * Can the algorithm handle multilabel-classification problems? - (handles_multilabel : {True, False} - * Is the algorithm deterministic for a given seed? - (is_deterministic : {True, False) - * Can the algorithm handle sparse data? - (handles_sparse : {True, False} - * What are the preferred types of the data array? - (preferred_dtype : list of tuples) + """Get the properties of the underlying algorithm. + + Find more information at :ref:`get_properties` + + Parameters + ---------- + + dataset_properties : dict, optional (default=None) Returns ------- @@ -174,6 +205,11 @@ def get_properties(dataset_properties=None): def get_hyperparameter_search_space(dataset_properties=None): """Return the configuration space of this preprocessing algorithm. + Parameters + ---------- + + dataset_properties : dict, optional (default=None) + Returns ------- HPOlibConfigspace.configuration_space.ConfigurationSpace @@ -190,7 +226,7 @@ def fit(self, X, Y): X : array-like, shape = (n_samples, n_features) Training data - y : array-like, shape = [n_samples] + y : array-like, shape = (n_samples,) or shape = (n_sample, n_labels) Returns ------- @@ -234,7 +270,7 @@ def get_preprocessor(self): def __str__(self): name = self.get_properties()['name'] - return "autosklearn.pipeline %" % name + return "autosklearn.pipeline %s" % name class AutoSklearnRegressionAlgorithm(object): @@ -248,28 +284,15 @@ def __init__(self): self.estimator = None self.properties = None - @staticmethod def get_properties(dataset_properties=None): - """Get the properties of the underlying algorithm. These are: - - * Short name - * Full name - * Can the algorithm handle missing values? - (handles_missing_values : {True, False}) - * Can the algorithm handle nominal features? - (handles_nominal_features : {True, False}) - * Can the algorithm handle numerical features? - (handles_numerical_features : {True, False}) - * Does the algorithm prefer data scaled in [0,1]? - (prefers_data_scaled : {True, False} - * Does the algorithm prefer data normalized to 0-mean, 1std? - (prefers_data_normalized : {True, False} - * Is the algorithm deterministic for a given seed? - (is_deterministic : {True, False) - * Can the algorithm handle sparse data? - (handles_sparse : {True, False} - * What are the preferred types of the data array? - (preferred_dtype : list of tuples) + """Get the properties of the underlying algorithm. + + Find more information at :ref:`get_properties` + + Parameters + ---------- + + dataset_properties : dict, optional (default=None) Returns ------- @@ -281,6 +304,11 @@ def get_properties(dataset_properties=None): def get_hyperparameter_search_space(dataset_properties=None): """Return the configuration space of this regression algorithm. + Parameters + ---------- + + dataset_properties : dict, optional (default=None) + Returns ------- HPOlibConfigspace.configuration_space.ConfigurationSpace @@ -331,19 +359,6 @@ def predict(self, X): -learn-objects>`_ for further information.""" raise NotImplementedError() - def predict_proba(self, X): - """Predict probabilities. - - Parameters - ---------- - X : array-like, shape = (n_samples, n_features) - - Returns - ------- - array, shape=(n_samples,) if n_classes == 2 else (n_samples, n_classes) - """ - raise NotImplementedError() - def get_estimator(self): """Return the underlying estimator object. @@ -355,6 +370,5 @@ def get_estimator(self): def __str__(self): name = self.get_properties()['name'] - return "autosklearn.pipeline %" % name - + return "autosklearn.pipeline %s" % name diff --git a/autosklearn/pipeline/components/classification/__init__.py b/autosklearn/pipeline/components/classification/__init__.py index 6b62ed19b9..e4d65a5035 100644 --- a/autosklearn/pipeline/components/classification/__init__.py +++ b/autosklearn/pipeline/components/classification/__init__.py @@ -2,33 +2,23 @@ from collections import OrderedDict import copy -import importlib -import inspect import os -import pkgutil -import sys -from ..base import AutoSklearnClassificationAlgorithm +from ..base import AutoSklearnClassificationAlgorithm, find_components, \ + ThirdPartyComponents from HPOlibConfigSpace.configuration_space import ConfigurationSpace from HPOlibConfigSpace.hyperparameters import CategoricalHyperparameter from HPOlibConfigSpace.conditions import EqualsCondition classifier_directory = os.path.split(__file__)[0] -_classifiers = OrderedDict() +_classifiers = find_components(__package__, + classifier_directory, + AutoSklearnClassificationAlgorithm) +_addons = ThirdPartyComponents(AutoSklearnClassificationAlgorithm) -for module_loader, module_name, ispkg in pkgutil.iter_modules([classifier_directory]): - full_module_name = "%s.%s" % (__package__, module_name) - if full_module_name not in sys.modules and not ispkg: - module = importlib.import_module(full_module_name) - - for member_name, obj in inspect.getmembers(module): - if inspect.isclass(obj) and AutoSklearnClassificationAlgorithm in obj.__bases__: - # TODO test if the obj implements the interface - # Keep in mind that this only instantiates the ensemble_wrapper, - # but not the real target classifier - classifier = obj - _classifiers[module_name] = classifier +def add_classifier(classifier): + _addons.add_component(classifier) class ClassifierChoice(object): @@ -39,7 +29,10 @@ def __init__(self, **params): @classmethod def get_components(cls): - return _classifiers + components = OrderedDict() + components.update(_classifiers) + components.update(_addons.components) + return components @classmethod def get_available_components(cls, data_prop, @@ -164,6 +157,3 @@ def get_hyperparameter_search_space(cls, dataset_properties, cs.add_forbidden_clause(forbidden_clause) return cs - - -_classifiers['classifier'] = ClassifierChoice \ No newline at end of file diff --git a/autosklearn/pipeline/components/classification/adaboost.py b/autosklearn/pipeline/components/classification/adaboost.py index abcaf1bc61..92427a75c7 100644 --- a/autosklearn/pipeline/components/classification/adaboost.py +++ b/autosklearn/pipeline/components/classification/adaboost.py @@ -63,22 +63,13 @@ def predict_proba(self, X): def get_properties(dataset_properties=None): return {'shortname': 'AB', 'name': 'AdaBoost Classifier', - 'handles_missing_values': False, - 'handles_nominal_values': False, - 'handles_numerical_features': True, - 'prefers_data_scaled': False, - 'prefers_data_normalized': False, 'handles_regression': False, 'handles_classification': True, 'handles_multiclass': True, 'handles_multilabel': True, 'is_deterministic': True, - 'handles_sparse': False, 'input': (DENSE, SPARSE, UNSIGNED_DATA), - 'output': (PREDICTIONS,), - # TODO find out what is best used here! - # But rather fortran or C-contiguous? - 'preferred_dtype': np.float32} + 'output': (PREDICTIONS,)} @staticmethod def get_hyperparameter_search_space(dataset_properties=None): diff --git a/autosklearn/pipeline/components/classification/bernoulli_nb.py b/autosklearn/pipeline/components/classification/bernoulli_nb.py index fc4e34f3a7..c3d740e54e 100644 --- a/autosklearn/pipeline/components/classification/bernoulli_nb.py +++ b/autosklearn/pipeline/components/classification/bernoulli_nb.py @@ -76,22 +76,13 @@ def predict_proba(self, X): def get_properties(dataset_properties=None): return {'shortname': 'BernoulliNB', 'name': 'Bernoulli Naive Bayes classifier', - 'handles_missing_values': False, - 'handles_nominal_values': False, - # sklearn website says: ... BernoulliNB is designed for - # binary/boolean features. - 'handles_numerical_features': False, - 'prefers_data_scaled': False, - 'prefers_data_normalized': False, 'handles_regression': False, 'handles_classification': True, - 'handles_multiclass': False, + 'handles_multiclass': True, 'handles_multilabel': False, 'is_deterministic': True, - 'handles_sparse': False, 'input': (DENSE, SPARSE, UNSIGNED_DATA), - 'output': (PREDICTIONS,), - 'preferred_dtype': np.bool} + 'output': (PREDICTIONS,)} @staticmethod def get_hyperparameter_search_space(dataset_properties=None): diff --git a/autosklearn/pipeline/components/classification/decision_tree.py b/autosklearn/pipeline/components/classification/decision_tree.py index e0804d555b..b42834fc81 100644 --- a/autosklearn/pipeline/components/classification/decision_tree.py +++ b/autosklearn/pipeline/components/classification/decision_tree.py @@ -8,6 +8,7 @@ from autosklearn.pipeline.components.base import \ AutoSklearnClassificationAlgorithm from autosklearn.pipeline.constants import * +from autosklearn.pipeline.implementations.util import convert_multioutput_multiclass_to_multilabel class DecisionTree(AutoSklearnClassificationAlgorithm): @@ -62,29 +63,21 @@ def predict(self, X): def predict_proba(self, X): if self.estimator is None: raise NotImplementedError() - return self.estimator.predict_proba(X) + probas = self.estimator.predict_proba(X) + probas = convert_multioutput_multiclass_to_multilabel(probas) + return probas @staticmethod def get_properties(dataset_properties=None): return {'shortname': 'DT', 'name': 'Decision Tree Classifier', - 'handles_missing_values': False, - 'handles_nominal_values': False, - 'handles_numerical_features': True, - 'prefers_data_scaled': False, - # TODO find out if this is good because of sparcity... - 'prefers_data_normalized': False, 'handles_regression': False, 'handles_classification': True, 'handles_multiclass': True, 'handles_multilabel': True, 'is_deterministic': True, - 'handles_sparse': True, 'input': (DENSE, SPARSE, UNSIGNED_DATA), - 'output': (PREDICTIONS,), - # TODO find out what is best used here! - # But rather fortran or C-contiguous? - 'preferred_dtype': np.float32} + 'output': (PREDICTIONS,)} @staticmethod def get_hyperparameter_search_space(dataset_properties=None): diff --git a/autosklearn/pipeline/components/classification/extra_trees.py b/autosklearn/pipeline/components/classification/extra_trees.py index e4276a50df..d0fb7cc9b7 100644 --- a/autosklearn/pipeline/components/classification/extra_trees.py +++ b/autosklearn/pipeline/components/classification/extra_trees.py @@ -7,6 +7,7 @@ from autosklearn.pipeline.components.base import AutoSklearnClassificationAlgorithm from autosklearn.pipeline.constants import * +from autosklearn.pipeline.implementations.util import convert_multioutput_multiclass_to_multilabel class ExtraTreesClassifier(AutoSklearnClassificationAlgorithm): @@ -110,29 +111,21 @@ def predict(self, X): def predict_proba(self, X): if self.estimator is None: raise NotImplementedError() - return self.estimator.predict_proba(X) + probas = self.estimator.predict_proba(X) + probas = convert_multioutput_multiclass_to_multilabel(probas) + return probas @staticmethod def get_properties(dataset_properties=None): return {'shortname': 'ET', 'name': 'Extra Trees Classifier', - 'handles_missing_values': False, - 'handles_nominal_values': False, - 'handles_numerical_features': True, - 'prefers_data_scaled': False, - # TODO find out if this is good because of sparcity... - 'prefers_data_normalized': False, 'handles_regression': False, 'handles_classification': True, 'handles_multiclass': True, 'handles_multilabel': True, 'is_deterministic': True, - 'handles_sparse': True, 'input': (DENSE, SPARSE, UNSIGNED_DATA), - 'output': (PREDICTIONS,), - # TODO find out what is best used here! - # But rather fortran or C-contiguous? - 'preferred_dtype': np.float32} + 'output': (PREDICTIONS,)} @staticmethod def get_hyperparameter_search_space(dataset_properties=None): diff --git a/autosklearn/pipeline/components/classification/gaussian_nb.py b/autosklearn/pipeline/components/classification/gaussian_nb.py index 2c53d158de..334d4f658b 100644 --- a/autosklearn/pipeline/components/classification/gaussian_nb.py +++ b/autosklearn/pipeline/components/classification/gaussian_nb.py @@ -66,20 +66,13 @@ def predict_proba(self, X): def get_properties(dataset_properties=None): return {'shortname': 'GaussianNB', 'name': 'Gaussian Naive Bayes classifier', - 'handles_missing_values': False, - 'handles_nominal_values': False, - 'handles_numerical_features': True, - 'prefers_data_scaled': False, - 'prefers_data_normalized': False, 'handles_regression': False, 'handles_classification': True, 'handles_multiclass': True, 'handles_multilabel': False, 'is_deterministic': True, - 'handles_sparse': False, 'input': (DENSE, UNSIGNED_DATA), - 'output': (PREDICTIONS,), - 'preferred_dtype': np.float32} + 'output': (PREDICTIONS,)} @staticmethod def get_hyperparameter_search_space(dataset_properties=None): diff --git a/autosklearn/pipeline/components/classification/gradient_boosting.py b/autosklearn/pipeline/components/classification/gradient_boosting.py index cc95870f24..4a83b8fdf7 100644 --- a/autosklearn/pipeline/components/classification/gradient_boosting.py +++ b/autosklearn/pipeline/components/classification/gradient_boosting.py @@ -113,23 +113,13 @@ def predict_proba(self, X): def get_properties(dataset_properties=None): return {'shortname': 'GB', 'name': 'Gradient Boosting Classifier', - 'handles_missing_values': False, - 'handles_nominal_values': False, - 'handles_numerical_features': True, - 'prefers_data_scaled': False, - # TODO find out if this is good because of sparcity... - 'prefers_data_normalized': False, 'handles_regression': False, 'handles_classification': True, 'handles_multiclass': True, 'handles_multilabel': False, 'is_deterministic': True, - 'handles_sparse': False, 'input': (DENSE, UNSIGNED_DATA), - 'output': (PREDICTIONS,), - # TODO find out what is best used here! - # But rather fortran or C-contiguous? - 'preferred_dtype': np.float32} + 'output': (PREDICTIONS,)} @staticmethod def get_hyperparameter_search_space(dataset_properties=None): diff --git a/autosklearn/pipeline/components/classification/k_nearest_neighbors.py b/autosklearn/pipeline/components/classification/k_nearest_neighbors.py index f0631b9eb4..bf4d8872bd 100644 --- a/autosklearn/pipeline/components/classification/k_nearest_neighbors.py +++ b/autosklearn/pipeline/components/classification/k_nearest_neighbors.py @@ -45,22 +45,13 @@ def predict_proba(self, X): def get_properties(dataset_properties=None): return {'shortname': 'KNN', 'name': 'K-Nearest Neighbor Classification', - 'handles_missing_values': False, - 'handles_nominal_values': False, - 'handles_numerical_features': True, - 'prefers_data_scaled': True, - # Find out if this is good because of sparsity - 'prefers_data_normalized': False, 'handles_regression': False, 'handles_classification': True, 'handles_multiclass': True, 'handles_multilabel': True, 'is_deterministic': True, - 'handles_sparse': True, 'input': (DENSE, SPARSE, UNSIGNED_DATA), - 'output': (PREDICTIONS,), - # TODO find out what is best used here! - 'preferred_dtype' : None} + 'output': (PREDICTIONS,)} @staticmethod def get_hyperparameter_search_space(dataset_properties=None): diff --git a/autosklearn/pipeline/components/classification/lda.py b/autosklearn/pipeline/components/classification/lda.py index 1802e642bf..1df49668d0 100644 --- a/autosklearn/pipeline/components/classification/lda.py +++ b/autosklearn/pipeline/components/classification/lda.py @@ -65,22 +65,13 @@ def predict_proba(self, X): def get_properties(dataset_properties=None): return {'shortname': 'LDA', 'name': 'Linear Discriminant Analysis', - 'handles_missing_values': False, - 'handles_nominal_values': False, - 'handles_numerical_features': True, - 'prefers_data_scaled': True, - # Find out if this is good because of sparsity - 'prefers_data_normalized': False, 'handles_regression': False, 'handles_classification': True, 'handles_multiclass': True, 'handles_multilabel': True, 'is_deterministic': True, - 'handles_sparse': False, 'input': (DENSE, UNSIGNED_DATA), - 'output': (PREDICTIONS,), - # TODO find out what is best used here! - 'preferred_dtype': None} + 'output': (PREDICTIONS,)} @staticmethod def get_hyperparameter_search_space(dataset_properties=None): diff --git a/autosklearn/pipeline/components/classification/liblinear_svc.py b/autosklearn/pipeline/components/classification/liblinear_svc.py index 3b66ccde59..a31e61e210 100644 --- a/autosklearn/pipeline/components/classification/liblinear_svc.py +++ b/autosklearn/pipeline/components/classification/liblinear_svc.py @@ -75,21 +75,13 @@ def predict_proba(self, X): def get_properties(dataset_properties=None): return {'shortname': 'Liblinear-SVC', 'name': 'Liblinear Support Vector Classification', - 'handles_missing_values': False, - 'handles_nominal_values': False, - 'handles_numerical_features': True, - 'prefers_data_scaled': True, - # Find out if this is good because of sparsity - 'prefers_data_normalized': False, 'handles_regression': False, 'handles_classification': True, 'handles_multiclass': True, 'handles_multilabel': True, 'is_deterministic': False, - 'handles_sparse': True, 'input': (SPARSE, DENSE, UNSIGNED_DATA), - 'output': (PREDICTIONS,), - 'preferred_dtype': None} + 'output': (PREDICTIONS,)} @staticmethod def get_hyperparameter_search_space(dataset_properties=None): diff --git a/autosklearn/pipeline/components/classification/libsvm_svc.py b/autosklearn/pipeline/components/classification/libsvm_svc.py index 67d5058348..32c1082ed8 100644 --- a/autosklearn/pipeline/components/classification/libsvm_svc.py +++ b/autosklearn/pipeline/components/classification/libsvm_svc.py @@ -142,25 +142,13 @@ def predict_proba(self, X): def get_properties(dataset_properties=None): return {'shortname': 'LibSVM-SVC', 'name': 'LibSVM Support Vector Classification', - 'handles_missing_values': False, - 'handles_nominal_values': False, - 'handles_numerical_features': True, - 'prefers_data_scaled': True, - # TODO find out if this is good because of sparsity... - 'prefers_data_normalized': False, 'handles_regression': False, 'handles_classification': True, 'handles_multiclass': True, 'handles_multilabel': False, 'is_deterministic': True, - # TODO find out of this is right! - # this here suggests so http://scikit-learn.org/stable/modules/svm.html#tips-on-practical-use - 'handles_sparse': True, 'input': (DENSE, SPARSE, UNSIGNED_DATA), - 'output': (PREDICTIONS,), - # TODO find out what is best used here! - # C-continouos and double precision... - 'preferred_dtype': None} + 'output': (PREDICTIONS,)} @staticmethod def get_hyperparameter_search_space(dataset_properties=None): diff --git a/autosklearn/pipeline/components/classification/multinomial_nb.py b/autosklearn/pipeline/components/classification/multinomial_nb.py index bc144676b4..929a335dff 100644 --- a/autosklearn/pipeline/components/classification/multinomial_nb.py +++ b/autosklearn/pipeline/components/classification/multinomial_nb.py @@ -84,23 +84,13 @@ def predict_proba(self, X): def get_properties(dataset_properties=None): return {'shortname': 'MultinomialNB', 'name': 'Multinomial Naive Bayes classifier', - 'handles_missing_values': False, - 'handles_nominal_values': False, - # sklearn website says: The multinomial distribution normally - # requires integer feature counts. However, in practice, - # fractional counts such as tf-idf may also work. - 'handles_numerical_features': True, - 'prefers_data_scaled': False, - 'prefers_data_normalized': False, 'handles_regression': False, 'handles_classification': True, 'handles_multiclass': True, 'handles_multilabel': False, 'is_deterministic': True, - 'handles_sparse': False, 'input': (DENSE, SPARSE, SIGNED_DATA), - 'output': (PREDICTIONS,), - 'preferred_dtype': np.float32} + 'output': (PREDICTIONS,)} @staticmethod def get_hyperparameter_search_space(dataset_properties=None): diff --git a/autosklearn/pipeline/components/classification/passive_aggressive.py b/autosklearn/pipeline/components/classification/passive_aggressive.py index 9b9da05d2c..231004e76e 100644 --- a/autosklearn/pipeline/components/classification/passive_aggressive.py +++ b/autosklearn/pipeline/components/classification/passive_aggressive.py @@ -65,23 +65,14 @@ def predict_proba(self, X): @staticmethod def get_properties(dataset_properties=None): return {'shortname': 'PassiveAggressive Classifier', - 'name': 'Passive Aggressive Stochastic Gradient Descent ' - 'Classifier', - 'handles_missing_values': False, - 'handles_nominal_values': False, - 'handles_numerical_features': True, - 'prefers_data_scaled': True, - 'prefers_data_normalized': True, + 'name': 'Passive Aggressive Classifier', 'handles_regression': False, 'handles_classification': True, 'handles_multiclass': True, 'handles_multilabel': False, 'is_deterministic': True, - 'handles_sparse': True, 'input': (DENSE, SPARSE, UNSIGNED_DATA), - 'output': (PREDICTIONS,), - # TODO find out what is best used here! - 'preferred_dtype': None} + 'output': (PREDICTIONS,)} @staticmethod def get_hyperparameter_search_space(dataset_properties=None): diff --git a/autosklearn/pipeline/components/classification/proj_logit.py b/autosklearn/pipeline/components/classification/proj_logit.py index c9c4d1b4be..2452284001 100644 --- a/autosklearn/pipeline/components/classification/proj_logit.py +++ b/autosklearn/pipeline/components/classification/proj_logit.py @@ -35,20 +35,13 @@ def predict_proba(self, X): def get_properties(dataset_properties=None): return {'shortname': 'PLogit', 'name': 'Logistic Regresion using Least Squares', - 'handles_missing_values': False, - 'handles_nominal_values': False, - 'handles_numerical_features': True, - 'prefers_data_scaled': False, - 'prefers_data_normalized': True, 'handles_regression': False, 'handles_classification': True, 'handles_multiclass': True, 'handles_multilabel': False, 'is_deterministic': True, - 'handles_sparse': False, 'input': (DENSE, UNSIGNED_DATA), - 'output': (PREDICTIONS,), - 'preferred_dtype': np.float32} + 'output': (PREDICTIONS,)} diff --git a/autosklearn/pipeline/components/classification/qda.py b/autosklearn/pipeline/components/classification/qda.py index ed9a99326b..987b1bc113 100644 --- a/autosklearn/pipeline/components/classification/qda.py +++ b/autosklearn/pipeline/components/classification/qda.py @@ -6,6 +6,8 @@ from autosklearn.pipeline.constants import * from autosklearn.pipeline.implementations.util import softmax +import numpy as np + class QDA(AutoSklearnClassificationAlgorithm): @@ -24,6 +26,20 @@ def fit(self, X, Y): self.estimator = estimator self.estimator.fit(X, Y) + + if len(Y.shape) == 2 and Y.shape[1] > 1: + problems = [] + for est in self.estimator.estimators_: + problem = np.any(np.any([np.any(s <= 0.0) for s in + est.scalings_])) + problems.append(problem) + problem = np.any(problems) + else: + problem = np.any(np.any([np.any(s <= 0.0) for s in + self.estimator.scalings_])) + if problem: + raise ValueError('Numerical problems in QDA. QDA.scalings_ ' + 'contains values <= 0.0') return self def predict(self, X): @@ -42,22 +58,13 @@ def predict_proba(self, X): def get_properties(dataset_properties=None): return {'shortname': 'QDA', 'name': 'Quadratic Discriminant Analysis', - 'handles_missing_values': False, - 'handles_nominal_values': False, - 'handles_numerical_features': True, - 'prefers_data_scaled': True, - # Find out if this is good because of sparsity - 'prefers_data_normalized': False, 'handles_regression': False, 'handles_classification': True, 'handles_multiclass': True, 'handles_multilabel': True, 'is_deterministic': True, - 'handles_sparse': False, 'input': (DENSE, UNSIGNED_DATA), - 'output': (PREDICTIONS,), - # TODO find out what is best used here! - 'preferred_dtype': None} + 'output': (PREDICTIONS,)} @staticmethod def get_hyperparameter_search_space(dataset_properties=None): diff --git a/autosklearn/pipeline/components/classification/random_forest.py b/autosklearn/pipeline/components/classification/random_forest.py index 9a0ad37eb6..e1a1ebf5d8 100644 --- a/autosklearn/pipeline/components/classification/random_forest.py +++ b/autosklearn/pipeline/components/classification/random_forest.py @@ -7,6 +7,7 @@ from autosklearn.pipeline.components.base import AutoSklearnClassificationAlgorithm from autosklearn.pipeline.constants import * +from autosklearn.pipeline.implementations.util import convert_multioutput_multiclass_to_multilabel class RandomForest(AutoSklearnClassificationAlgorithm): @@ -103,28 +104,21 @@ def predict(self, X): def predict_proba(self, X): if self.estimator is None: raise NotImplementedError() - return self.estimator.predict_proba(X) + probas = self.estimator.predict_proba(X) + probas = convert_multioutput_multiclass_to_multilabel(probas) + return probas @staticmethod def get_properties(dataset_properties=None): return {'shortname': 'RF', 'name': 'Random Forest Classifier', - 'handles_missing_values': False, - 'handles_nominal_values': False, - 'handles_numerical_features': True, - 'prefers_data_scaled': False, - 'prefers_data_normalized': False, 'handles_regression': False, 'handles_classification': True, 'handles_multiclass': True, 'handles_multilabel': True, 'is_deterministic': True, - 'handles_sparse': True, 'input': (DENSE, SPARSE, UNSIGNED_DATA), - 'output': (PREDICTIONS,), - # TODO find out what is best used here! - # But rather fortran or C-contiguous? - 'preferred_dtype': np.float32} + 'output': (PREDICTIONS,)} @staticmethod def get_hyperparameter_search_space(dataset_properties=None): diff --git a/autosklearn/pipeline/components/classification/sgd.py b/autosklearn/pipeline/components/classification/sgd.py index 217f2dccc5..fc04d39e9d 100644 --- a/autosklearn/pipeline/components/classification/sgd.py +++ b/autosklearn/pipeline/components/classification/sgd.py @@ -94,21 +94,13 @@ def predict_proba(self, X): def get_properties(dataset_properties=None): return {'shortname': 'SGD Classifier', 'name': 'Stochastic Gradient Descent Classifier', - 'handles_missing_values': False, - 'handles_nominal_values': False, - 'handles_numerical_features': True, - 'prefers_data_scaled': True, - 'prefers_data_normalized': True, 'handles_regression': False, 'handles_classification': True, 'handles_multiclass': True, 'handles_multilabel': False, 'is_deterministic': True, - 'handles_sparse': True, 'input': (DENSE, SPARSE, UNSIGNED_DATA), - 'output': (PREDICTIONS,), - # TODO find out what is best used here! - 'preferred_dtype' : None} + 'output': (PREDICTIONS,)} @staticmethod def get_hyperparameter_search_space(dataset_properties=None): diff --git a/autosklearn/pipeline/components/feature_preprocessing/__init__.py b/autosklearn/pipeline/components/feature_preprocessing/__init__.py index a4ce03c5af..9b51dc45e0 100644 --- a/autosklearn/pipeline/components/feature_preprocessing/__init__.py +++ b/autosklearn/pipeline/components/feature_preprocessing/__init__.py @@ -6,28 +6,21 @@ import pkgutil import sys -from ..base import AutoSklearnPreprocessingAlgorithm +from ..base import AutoSklearnPreprocessingAlgorithm, find_components, \ + ThirdPartyComponents from HPOlibConfigSpace.configuration_space import ConfigurationSpace from HPOlibConfigSpace.hyperparameters import CategoricalHyperparameter from HPOlibConfigSpace.conditions import EqualsCondition, AbstractConjunction +classifier_directory = os.path.split(__file__)[0] +_preprocessors = find_components(__package__, + classifier_directory, + AutoSklearnPreprocessingAlgorithm) +_addons = ThirdPartyComponents(AutoSklearnPreprocessingAlgorithm) -preprocessors_directory = os.path.split(__file__)[0] -_preprocessors = OrderedDict() - -for module_loader, module_name, ispkg in pkgutil.iter_modules([preprocessors_directory]): - full_module_name = "%s.%s" % (__package__, module_name) - if full_module_name not in sys.modules and not ispkg: - module = importlib.import_module(full_module_name) - - for member_name, obj in inspect.getmembers(module): - if inspect.isclass(obj) and AutoSklearnPreprocessingAlgorithm in obj.__bases__: - # TODO test if the obj implements the interface - # Keep in mind that this only instantiates the ensemble_wrapper, - # but not the real target classifier - preprocessor = obj - _preprocessors[module_name] = preprocessor +def add_preprocessor(preprocessor): + _addons.add_component(preprocessor) class FeaturePreprocessorChoice(object): @@ -38,7 +31,10 @@ def __init__(self, **params): @classmethod def get_components(cls): - return _preprocessors + components = OrderedDict() + components.update(_preprocessors) + components.update(_addons.components) + return components @classmethod def get_available_components(cls, data_prop, @@ -162,6 +158,3 @@ def get_hyperparameter_search_space(cls, dataset_properties, cs.add_forbidden_clause(forbidden_clause) return cs - - -_preprocessors['preprocessor'] = FeaturePreprocessorChoice \ No newline at end of file diff --git a/autosklearn/pipeline/components/feature_preprocessing/densifier.py b/autosklearn/pipeline/components/feature_preprocessing/densifier.py index 893c768ee9..76342ce9a8 100644 --- a/autosklearn/pipeline/components/feature_preprocessing/densifier.py +++ b/autosklearn/pipeline/components/feature_preprocessing/densifier.py @@ -23,21 +23,13 @@ def transform(self, X): def get_properties(dataset_properties=None): return {'shortname': 'RandomTreesEmbedding', 'name': 'Random Trees Embedding', - 'handles_missing_values': True, - 'handles_nominal_values': True, - 'handles_numerical_features': True, - 'prefers_data_scaled': False, - 'prefers_data_normalized': False, 'handles_regression': True, 'handles_classification': True, 'handles_multiclass': True, 'handles_multilabel': True, 'is_deterministic': True, - 'handles_sparse': True, - 'handles_dense': False, 'input': (SPARSE, UNSIGNED_DATA), - 'output': (DENSE, INPUT), - 'preferred_dtype': None} + 'output': (DENSE, INPUT)} @staticmethod def get_hyperparameter_search_space(dataset_properties=None): diff --git a/autosklearn/pipeline/components/feature_preprocessing/extra_trees_preproc_for_classification.py b/autosklearn/pipeline/components/feature_preprocessing/extra_trees_preproc_for_classification.py index 6bed2c257c..844359da74 100644 --- a/autosklearn/pipeline/components/feature_preprocessing/extra_trees_preproc_for_classification.py +++ b/autosklearn/pipeline/components/feature_preprocessing/extra_trees_preproc_for_classification.py @@ -10,7 +10,8 @@ from autosklearn.pipeline.constants import * -class ExtraTreesPreprocessor(AutoSklearnPreprocessingAlgorithm): +class ExtraTreesPreprocessorClassification(AutoSklearnPreprocessingAlgorithm): + def __init__(self, n_estimators, criterion, min_samples_leaf, min_samples_split, max_features, max_leaf_nodes_or_max_depth="max_depth", @@ -69,21 +70,14 @@ def fit(self, X, Y, sample_weight=None): # Use at most half of the features max_features = max(1, min(int(X.shape[1] / 2), max_features)) self.preprocessor = ExtraTreesClassifier( - n_estimators=0, criterion=self.criterion, + n_estimators=self.n_estimators, criterion=self.criterion, max_depth=self.max_depth, min_samples_split=self.min_samples_split, min_samples_leaf=self.min_samples_leaf, bootstrap=self.bootstrap, max_features=max_features, max_leaf_nodes=self.max_leaf_nodes, oob_score=self.oob_score, n_jobs=self.n_jobs, verbose=self.verbose, - random_state=self.random_state, class_weight=self.class_weight, - warm_start=True + random_state=self.random_state, class_weight=self.class_weight ) - # JTS TODO: I think we might have to copy here if we want self.estimator - # to always be consistent on sigabort - while len(self.preprocessor.estimators_) < self.n_estimators: - tmp = self.preprocessor # TODO copy ? - tmp.n_estimators += self.estimator_increment - tmp.fit(X, Y, sample_weight=sample_weight) - self.preprocessor = tmp + self.preprocessor.fit(X, Y, sample_weight=sample_weight) return self def transform(self, X): @@ -93,25 +87,15 @@ def transform(self, X): @staticmethod def get_properties(dataset_properties=None): - return {'shortname': 'ET', + return {'shortname': 'ETC', 'name': 'Extra Trees Classifier Preprocessing', - 'handles_missing_values': False, - 'handles_nominal_values': False, - 'handles_numerical_features': True, - 'prefers_data_scaled': False, - # TODO find out if this is good because of sparcity... - 'prefers_data_normalized': False, 'handles_regression': False, 'handles_classification': True, 'handles_multiclass': True, 'handles_multilabel': True, 'is_deterministic': True, - 'handles_sparse': False, 'input': (DENSE, SPARSE, UNSIGNED_DATA), - 'output': (INPUT,), - # TODO find out what is best used here! - # But rather fortran or C-contiguous? - 'preferred_dtype': np.float32} + 'output': (INPUT,)} @staticmethod def get_hyperparameter_search_space(dataset_properties=None): diff --git a/autosklearn/pipeline/components/feature_preprocessing/extra_trees_preproc_for_regression.py b/autosklearn/pipeline/components/feature_preprocessing/extra_trees_preproc_for_regression.py new file mode 100644 index 0000000000..9efb94cbb1 --- /dev/null +++ b/autosklearn/pipeline/components/feature_preprocessing/extra_trees_preproc_for_regression.py @@ -0,0 +1,120 @@ +import numpy as np + +from HPOlibConfigSpace.configuration_space import ConfigurationSpace +from HPOlibConfigSpace.hyperparameters import UniformFloatHyperparameter, \ + UniformIntegerHyperparameter, CategoricalHyperparameter, \ + UnParametrizedHyperparameter, Constant + +from autosklearn.pipeline.components.base import \ + AutoSklearnPreprocessingAlgorithm +from autosklearn.pipeline.constants import * + + +class ExtraTreesPreprocessorRegression(AutoSklearnPreprocessingAlgorithm): + + def __init__(self, n_estimators, criterion, min_samples_leaf, + min_samples_split, max_features, + max_leaf_nodes_or_max_depth="max_depth", + bootstrap=False, max_leaf_nodes=None, max_depth="None", + min_weight_fraction_leaf=0.0, + oob_score=False, n_jobs=1, random_state=None, verbose=0): + + self.n_estimators = int(n_estimators) + self.estimator_increment = 10 + if criterion not in ("mse", ): + raise ValueError("'criterion' is not in ('mse', ): " + "%s" % criterion) + self.criterion = criterion + + if max_leaf_nodes_or_max_depth == "max_depth": + self.max_leaf_nodes = None + if max_depth == "None": + self.max_depth = None + else: + self.max_depth = int(max_depth) + # if use_max_depth == "True": + # self.max_depth = int(max_depth) + #elif use_max_depth == "False": + # self.max_depth = None + else: + if max_leaf_nodes == "None": + self.max_leaf_nodes = None + else: + self.max_leaf_nodes = int(max_leaf_nodes) + self.max_depth = None + + self.min_samples_leaf = int(min_samples_leaf) + self.min_samples_split = int(min_samples_split) + + self.max_features = float(max_features) + + if bootstrap == "True": + self.bootstrap = True + elif bootstrap == "False": + self.bootstrap = False + + self.oob_score = oob_score + self.n_jobs = int(n_jobs) + self.random_state = random_state + self.verbose = int(verbose) + self.preprocessor = None + + def fit(self, X, Y): + from sklearn.ensemble import ExtraTreesRegressor + + num_features = X.shape[1] + max_features = int( + float(self.max_features) * (np.log(num_features) + 1)) + # Use at most half of the features + max_features = max(1, min(int(X.shape[1] / 2), max_features)) + self.preprocessor = ExtraTreesRegressor( + n_estimators=self.n_estimators, criterion=self.criterion, + max_depth=self.max_depth, min_samples_split=self.min_samples_split, + min_samples_leaf=self.min_samples_leaf, bootstrap=self.bootstrap, + max_features=max_features, max_leaf_nodes=self.max_leaf_nodes, + oob_score=self.oob_score, n_jobs=self.n_jobs, verbose=self.verbose, + random_state=self.random_state) + self.preprocessor.fit(X, Y) + + return self + + def transform(self, X): + if self.preprocessor is None: + raise NotImplementedError + return self.preprocessor.transform(X) + + @staticmethod + def get_properties(dataset_properties=None): + return {'shortname': 'ETR', + 'name': 'Extra Trees Regressor Preprocessing', + 'handles_regression': True, + 'handles_classification': False, + 'handles_multiclass': False, + 'handles_multilabel': False, + 'is_deterministic': True, + 'input': (DENSE, SPARSE, UNSIGNED_DATA), + 'output': (INPUT,)} + + @staticmethod + def get_hyperparameter_search_space(dataset_properties=None): + cs = ConfigurationSpace() + + n_estimators = cs.add_hyperparameter(Constant("n_estimators", 100)) + criterion = cs.add_hyperparameter(Constant("criterion", "mse")) + max_features = cs.add_hyperparameter(UniformFloatHyperparameter( + "max_features", 0.5, 5, default=1)) + + max_depth = cs.add_hyperparameter( + UnParametrizedHyperparameter(name="max_depth", value="None")) + + min_samples_split = cs.add_hyperparameter(UniformIntegerHyperparameter( + "min_samples_split", 2, 20, default=2)) + min_samples_leaf = cs.add_hyperparameter(UniformIntegerHyperparameter( + "min_samples_leaf", 1, 20, default=1)) + min_weight_fraction_leaf = cs.add_hyperparameter(Constant( + 'min_weight_fraction_leaf', 0.)) + + bootstrap = cs.add_hyperparameter(CategoricalHyperparameter( + "bootstrap", ["True", "False"], default="False")) + + return cs diff --git a/autosklearn/pipeline/components/feature_preprocessing/fast_ica.py b/autosklearn/pipeline/components/feature_preprocessing/fast_ica.py index 01009dd5c9..3a9f3f7265 100644 --- a/autosklearn/pipeline/components/feature_preprocessing/fast_ica.py +++ b/autosklearn/pipeline/components/feature_preprocessing/fast_ica.py @@ -32,7 +32,7 @@ def fit(self, X, Y=None): try: self.preprocessor.fit(X) except ValueError as e: - if e.message == 'array must not contain infs or NaNs': + if 'array must not contain infs or NaNs' in e.args[0]: raise ValueError("Bug in scikit-learn: https://github.com/scikit-learn/scikit-learn/pull/2738") else: import traceback @@ -50,21 +50,13 @@ def transform(self, X): def get_properties(dataset_properties=None): return {'shortname': 'FastICA', 'name': 'Fast Independent Component Analysis', - 'handles_missing_values': False, - 'handles_nominal_values': False, - 'handles_numerical_features': True, - 'prefers_data_scaled': True, - 'prefers_data_normalized': True, 'handles_regression': True, 'handles_classification': True, 'handles_multiclass': True, 'handles_multilabel': True, 'is_deterministic': False, - 'handles_sparse': True, - 'handles_dense': True, 'input': (DENSE, UNSIGNED_DATA), - 'output': (INPUT, UNSIGNED_DATA), - 'preferred_dtype': None} + 'output': (INPUT, UNSIGNED_DATA)} @staticmethod def get_hyperparameter_search_space(dataset_properties=None): diff --git a/autosklearn/pipeline/components/feature_preprocessing/feature_agglomeration.py b/autosklearn/pipeline/components/feature_preprocessing/feature_agglomeration.py index 92ff1f0c75..acaa20d494 100644 --- a/autosklearn/pipeline/components/feature_preprocessing/feature_agglomeration.py +++ b/autosklearn/pipeline/components/feature_preprocessing/feature_agglomeration.py @@ -46,21 +46,13 @@ def transform(self, X): def get_properties(dataset_properties=None): return {'shortname': 'Feature Agglomeration', 'name': 'Feature Agglomeration', - 'handles_missing_values': False, - 'handles_nominal_values': False, - 'handles_numerical_features': True, - 'prefers_data_scaled': True, - 'prefers_data_normalized': True, 'handles_regression': True, 'handles_classification': True, 'handles_multiclass': True, 'handles_multilabel': True, 'is_deterministic': True, - 'handles_sparse': True, - 'handles_dense': True, 'input': (DENSE, UNSIGNED_DATA), - 'output': (INPUT,), - 'preferred_dtype': None} + 'output': (INPUT,)} @staticmethod def get_hyperparameter_search_space(dataset_properties=None): diff --git a/autosklearn/pipeline/components/feature_preprocessing/gem.py b/autosklearn/pipeline/components/feature_preprocessing/gem.py index e3cbdff135..f5bd6ae2c1 100644 --- a/autosklearn/pipeline/components/feature_preprocessing/gem.py +++ b/autosklearn/pipeline/components/feature_preprocessing/gem.py @@ -25,21 +25,13 @@ def transform(self, X): def get_properties(dataset_properties=None): return {'shortname': 'GEM', 'name': 'Generalized Eigenvector extraction', - 'handles_missing_values': False, - 'handles_nominal_values': False, - 'handles_numerical_features': True, - 'prefers_data_scaled': True, - 'prefers_data_normalized': True, 'handles_regression': False, 'handles_classification': True, 'handles_multiclass': True, 'handles_multilabel': False, 'is_deterministic': True, - 'handles_sparse': False, - 'handles_dense': True, 'input': (DENSE, UNSIGNED_DATA), - 'output': (INPUT, UNSIGNED_DATA), - 'preferred_dtype': None} + 'output': (INPUT, UNSIGNED_DATA)} @staticmethod diff --git a/autosklearn/pipeline/components/feature_preprocessing/kernel_pca.py b/autosklearn/pipeline/components/feature_preprocessing/kernel_pca.py index d7eddf86d6..5ba1d842fb 100644 --- a/autosklearn/pipeline/components/feature_preprocessing/kernel_pca.py +++ b/autosklearn/pipeline/components/feature_preprocessing/kernel_pca.py @@ -30,12 +30,15 @@ def fit(self, X, Y=None): n_components=self.n_components, kernel=self.kernel, degree=self.degree, gamma=self.gamma, coef0=self.coef0, remove_zero_eig=True) - # Make the RuntimeWarning an Exception! if scipy.sparse.issparse(X): X = X.astype(np.float64) with warnings.catch_warnings(): warnings.filterwarnings("error") self.preprocessor.fit(X) + # Raise an informative error message, equation is based ~line 249 in + # kernel_pca.py in scikit-learn + if len(self.preprocessor.alphas_ / self.preprocessor.lambdas_) == 0: + raise ValueError('KernelPCA removed all features!') return self def transform(self, X): @@ -50,21 +53,13 @@ def transform(self, X): def get_properties(dataset_properties=None): return {'shortname': 'KernelPCA', 'name': 'Kernel Principal Component Analysis', - 'handles_missing_values': False, - 'handles_nominal_values': False, - 'handles_numerical_features': True, - 'prefers_data_scaled': True, - 'prefers_data_normalized': True, 'handles_regression': True, 'handles_classification': True, 'handles_multiclass': True, 'handles_multilabel': True, 'is_deterministic': False, - 'handles_sparse': True, - 'handles_dense': True, 'input': (DENSE, SPARSE, UNSIGNED_DATA), - 'output': (DENSE, UNSIGNED_DATA), - 'preferred_dtype': None} + 'output': (DENSE, UNSIGNED_DATA)} @staticmethod def get_hyperparameter_search_space(dataset_properties=None): diff --git a/autosklearn/pipeline/components/feature_preprocessing/kitchen_sinks.py b/autosklearn/pipeline/components/feature_preprocessing/kitchen_sinks.py index d95568ddea..55dfdd7ea1 100644 --- a/autosklearn/pipeline/components/feature_preprocessing/kitchen_sinks.py +++ b/autosklearn/pipeline/components/feature_preprocessing/kitchen_sinks.py @@ -36,21 +36,13 @@ def transform(self, X): def get_properties(dataset_properties=None): return {'shortname': 'KitchenSink', 'name': 'Random Kitchen Sinks', - 'handles_missing_values': False, - 'handles_nominal_values': False, - 'handles_numerical_features': True, - 'prefers_data_scaled': True, - 'prefers_data_normalized': True, 'handles_regression': True, 'handles_classification': True, 'handles_multiclass': True, 'handles_multilabel': True, 'is_deterministic': True, - 'handles_sparse': True, - 'handles_dense': True, 'input': (SPARSE, DENSE, UNSIGNED_DATA), - 'output': (INPUT, UNSIGNED_DATA), - 'preferred_dtype': None} + 'output': (INPUT, UNSIGNED_DATA)} @staticmethod def get_hyperparameter_search_space(dataset_properties=None): diff --git a/autosklearn/pipeline/components/feature_preprocessing/liblinear_svc_preprocessor.py b/autosklearn/pipeline/components/feature_preprocessing/liblinear_svc_preprocessor.py index 61071f1727..5358ac5d9d 100644 --- a/autosklearn/pipeline/components/feature_preprocessing/liblinear_svc_preprocessor.py +++ b/autosklearn/pipeline/components/feature_preprocessing/liblinear_svc_preprocessor.py @@ -59,24 +59,14 @@ def transform(self, X): @staticmethod def get_properties(dataset_properties=None): - return {'shortname': 'Liblinear-Preprocessor', - 'name': 'Liblinear Support Vector Preprocessing', - 'handles_missing_values': False, - 'handles_nominal_values': False, - 'handles_numerical_features': True, - 'prefers_data_scaled': True, - # Find out if this is good because of sparsity - 'prefers_data_normalized': False, + return {'shortname': 'LinearSVC Preprocessor', + 'name': 'Liblinear Support Vector Classification Preprocessing', 'handles_regression': False, 'handles_classification': True, 'handles_multiclass': True, 'handles_multilabel': False, - 'is_deterministic': False, - 'handles_sparse': True, 'input': (SPARSE, DENSE, UNSIGNED_DATA), - 'output': (INPUT,), - # TODO find out what is best used here! - 'preferred_dtype': None} + 'output': (INPUT,)} @staticmethod def get_hyperparameter_search_space(dataset_properties=None): diff --git a/autosklearn/pipeline/components/feature_preprocessing/no_preprocessing.py b/autosklearn/pipeline/components/feature_preprocessing/no_preprocessing.py index 0caeb4e6ca..185098708a 100644 --- a/autosklearn/pipeline/components/feature_preprocessing/no_preprocessing.py +++ b/autosklearn/pipeline/components/feature_preprocessing/no_preprocessing.py @@ -23,21 +23,13 @@ def transform(self, X): def get_properties(dataset_properties=None): return {'shortname': 'no', 'name': 'NoPreprocessing', - 'handles_missing_values': True, - 'handles_nominal_values': True, - 'handles_numerical_features': True, - 'prefers_data_scaled': True, - 'prefers_data_normalized': True, 'handles_regression': True, 'handles_classification': True, 'handles_multiclass': True, 'handles_multilabel': True, 'is_deterministic': True, - 'handles_sparse': True, - 'handles_dense': True, 'input': (SPARSE, DENSE, UNSIGNED_DATA), - 'output': (INPUT,), - 'preferred_dtype': None} + 'output': (INPUT,)} @staticmethod def get_hyperparameter_search_space(dataset_properties=None): diff --git a/autosklearn/pipeline/components/feature_preprocessing/nystroem_sampler.py b/autosklearn/pipeline/components/feature_preprocessing/nystroem_sampler.py index 216017b362..9440ed0f5a 100644 --- a/autosklearn/pipeline/components/feature_preprocessing/nystroem_sampler.py +++ b/autosklearn/pipeline/components/feature_preprocessing/nystroem_sampler.py @@ -65,21 +65,13 @@ def get_properties(dataset_properties=None): data_type = SIGNED_DATA if signed is True else UNSIGNED_DATA return {'shortname': 'Nystroem', 'name': 'Nystroem kernel approximation', - 'handles_missing_values': False, - 'handles_nominal_values': False, - 'handles_numerical_features': True, - 'prefers_data_scaled': True, - 'prefers_data_normalized': True, 'handles_regression': True, 'handles_classification': True, 'handles_multiclass': True, 'handles_multilabel': True, 'is_deterministic': True, - 'handles_sparse': True, - 'handles_dense': True, 'input': (SPARSE, DENSE, data_type), - 'output': (INPUT, UNSIGNED_DATA), - 'preferred_dtype': None} + 'output': (INPUT, UNSIGNED_DATA)} @staticmethod def get_hyperparameter_search_space(dataset_properties=None): diff --git a/autosklearn/pipeline/components/feature_preprocessing/pca.py b/autosklearn/pipeline/components/feature_preprocessing/pca.py index 26362ffc29..4827f959fb 100644 --- a/autosklearn/pipeline/components/feature_preprocessing/pca.py +++ b/autosklearn/pipeline/components/feature_preprocessing/pca.py @@ -36,25 +36,14 @@ def transform(self, X): def get_properties(dataset_properties=None): return {'shortname': 'PCA', 'name': 'Principle Component Analysis', - 'handles_missing_values': False, - 'handles_nominal_values': False, - 'handles_numerical_features': True, - # TODO write a test to make sure that the PCA scales data itself - 'prefers_data_scaled': False, - # TODO find out if this is good because of sparsity... - 'prefers_data_normalized': False, 'handles_regression': True, 'handles_classification': True, 'handles_multiclass': True, 'handles_multilabel': True, # TODO document that we have to be very careful 'is_deterministic': False, - 'handles_sparse': False, - 'handles_dense': True, 'input': (DENSE, UNSIGNED_DATA), - 'output': (DENSE, UNSIGNED_DATA), - # TODO find out what is best used here! - 'preferred_dtype': None} + 'output': (DENSE, UNSIGNED_DATA)} @staticmethod def get_hyperparameter_search_space(dataset_properties=None): diff --git a/autosklearn/pipeline/components/feature_preprocessing/polynomial.py b/autosklearn/pipeline/components/feature_preprocessing/polynomial.py index 9596427801..2e00af2204 100644 --- a/autosklearn/pipeline/components/feature_preprocessing/polynomial.py +++ b/autosklearn/pipeline/components/feature_preprocessing/polynomial.py @@ -33,24 +33,13 @@ def transform(self, X): def get_properties(dataset_properties=None): return {'shortname': 'PolynomialFeatures', 'name': 'PolynomialFeatures', - 'handles_missing_values': False, - 'handles_nominal_values': False, - 'handles_numerical_features': True, - 'prefers_data_scaled': True, - # Find out if this is good because of sparsity - 'prefers_data_normalized': False, 'handles_regression': True, 'handles_classification': True, 'handles_multiclass': True, 'handles_multilabel': True, 'is_deterministic': True, - # TODO find out of this is right! - # this here suggests so http://scikit-learn.org/stable/modules/svm.html#tips-on-practical-use - 'handles_sparse': True, 'input': (DENSE, UNSIGNED_DATA), - 'output': (INPUT,), - # TODO find out what is best used here! - 'preferred_dtype': None} + 'output': (INPUT,)} @staticmethod def get_hyperparameter_search_space(dataset_properties=None): diff --git a/autosklearn/pipeline/components/feature_preprocessing/random_trees_embedding.py b/autosklearn/pipeline/components/feature_preprocessing/random_trees_embedding.py index 9fe95e577b..1a7bce918e 100644 --- a/autosklearn/pipeline/components/feature_preprocessing/random_trees_embedding.py +++ b/autosklearn/pipeline/components/feature_preprocessing/random_trees_embedding.py @@ -55,21 +55,13 @@ def transform(self, X): def get_properties(dataset_properties=None): return {'shortname': 'RandomTreesEmbedding', 'name': 'Random Trees Embedding', - 'handles_missing_values': False, - 'handles_nominal_values': False, - 'handles_numerical_features': True, - 'prefers_data_scaled': False, - 'prefers_data_normalized': False, 'handles_regression': True, 'handles_classification': True, 'handles_multiclass': True, 'handles_multilabel': True, 'is_deterministic': True, - 'handles_sparse': False, - 'handles_dense': True, 'input': (DENSE, SPARSE, UNSIGNED_DATA), - 'output': (SPARSE, SIGNED_DATA), - 'preferred_dtype': None} + 'output': (SPARSE, SIGNED_DATA)} @staticmethod def get_hyperparameter_search_space(dataset_properties=None): diff --git a/autosklearn/pipeline/components/feature_preprocessing/select_percentile_classification.py b/autosklearn/pipeline/components/feature_preprocessing/select_percentile_classification.py index a5548c102b..20f3001417 100644 --- a/autosklearn/pipeline/components/feature_preprocessing/select_percentile_classification.py +++ b/autosklearn/pipeline/components/feature_preprocessing/select_percentile_classification.py @@ -78,21 +78,13 @@ def get_properties(dataset_properties=None): return {'shortname': 'SPC', 'name': 'Select Percentile Classification', - 'handles_missing_values': False, - 'handles_nominal_values': False, - 'handles_numerical_features': True, - 'prefers_data_scaled': False, - 'prefers_data_normalized': False, 'handles_regression': False, 'handles_classification': True, 'handles_multiclass': True, 'handles_multilabel': False, 'is_deterministic': True, - 'handles_sparse': True, - 'handles_dense': True, 'input': (SPARSE, DENSE, data_type), - 'output': (INPUT,), - 'preferred_dtype': None} + 'output': (INPUT,)} @staticmethod def get_hyperparameter_search_space(dataset_properties=None): diff --git a/autosklearn/pipeline/components/feature_preprocessing/select_percentile_regression.py b/autosklearn/pipeline/components/feature_preprocessing/select_percentile_regression.py index ba96074889..5566f79352 100644 --- a/autosklearn/pipeline/components/feature_preprocessing/select_percentile_regression.py +++ b/autosklearn/pipeline/components/feature_preprocessing/select_percentile_regression.py @@ -29,21 +29,13 @@ def __init__(self, percentile, score_func="f_classif", random_state=None): def get_properties(dataset_properties=None): return {'shortname': 'SPR', 'name': 'Select Percentile Regression', - 'handles_missing_values': False, - 'handles_nominal_values': False, - 'handles_numerical_features': True, - 'prefers_data_scaled': False, - 'prefers_data_normalized': False, 'handles_regression': True, 'handles_classification': False, 'handles_multiclass': False, 'handles_multilabel': False, 'is_deterministic': True, - 'handles_sparse': False, - 'handles_dense': True, 'input': (DENSE, UNSIGNED_DATA), - 'output': (INPUT,), - 'preferred_dtype': None} + 'output': (INPUT,)} @staticmethod def get_hyperparameter_search_space(dataset_properties=None): diff --git a/autosklearn/pipeline/components/feature_preprocessing/select_rates.py b/autosklearn/pipeline/components/feature_preprocessing/select_rates.py index 243fa88e8b..4ac9d2e522 100644 --- a/autosklearn/pipeline/components/feature_preprocessing/select_rates.py +++ b/autosklearn/pipeline/components/feature_preprocessing/select_rates.py @@ -83,21 +83,13 @@ def get_properties(dataset_properties=None): return {'shortname': 'SR', 'name': 'Univariate Feature Selection based on rates', - 'handles_missing_values': False, - 'handles_nominal_values': False, - 'handles_numerical_features': True, - 'prefers_data_scaled': False, - 'prefers_data_normalized': False, 'handles_regression': False, 'handles_classification': True, 'handles_multiclass': True, 'handles_multilabel': False, 'is_deterministic': True, - 'handles_sparse': True, - 'handles_dense': True, 'input': (SPARSE, DENSE, data_type), - 'output': (INPUT,), - 'preferred_dtype': None} + 'output': (INPUT,)} @staticmethod def get_hyperparameter_search_space(dataset_properties=None): diff --git a/autosklearn/pipeline/components/feature_preprocessing/truncatedSVD.py b/autosklearn/pipeline/components/feature_preprocessing/truncatedSVD.py index 9108eee2c3..7093a73fbb 100644 --- a/autosklearn/pipeline/components/feature_preprocessing/truncatedSVD.py +++ b/autosklearn/pipeline/components/feature_preprocessing/truncatedSVD.py @@ -36,21 +36,13 @@ def transform(self, X): def get_properties(dataset_properties=None): return {'shortname': 'TSVD', 'name': 'Truncated Singular Value Decomposition', - 'handles_missing_values': False, - 'handles_nominal_values': False, - 'handles_numerical_features': True, - 'prefers_data_scaled': False, - 'prefers_data_normalized': False, 'handles_regression': True, 'handles_classification': True, 'handles_multiclass': True, 'handles_multilabel': True, 'is_deterministic': True, - 'handles_sparse': True, - 'handles_dense': False, 'input': (SPARSE, UNSIGNED_DATA), - 'output': (DENSE, INPUT), - 'preferred_dtype': np.float32} + 'output': (DENSE, INPUT)} @staticmethod def get_hyperparameter_search_space(dataset_properties=None): diff --git a/autosklearn/pipeline/components/regression/__init__.py b/autosklearn/pipeline/components/regression/__init__.py index b1c488acb1..517af9848a 100644 --- a/autosklearn/pipeline/components/regression/__init__.py +++ b/autosklearn/pipeline/components/regression/__init__.py @@ -6,27 +6,21 @@ import pkgutil import sys -from ..base import AutoSklearnRegressionAlgorithm +from ..base import AutoSklearnRegressionAlgorithm, find_components, \ + ThirdPartyComponents from HPOlibConfigSpace.configuration_space import ConfigurationSpace from HPOlibConfigSpace.hyperparameters import CategoricalHyperparameter from HPOlibConfigSpace.conditions import EqualsCondition regressor_directory = os.path.split(__file__)[0] -_regressors = OrderedDict() +_regressors = find_components(__package__, + regressor_directory, + AutoSklearnRegressionAlgorithm) +_addons = ThirdPartyComponents(AutoSklearnRegressionAlgorithm) -for module_loader, module_name, ispkg in pkgutil.iter_modules([regressor_directory]): - full_module_name = "%s.%s" % (__package__, module_name) - if full_module_name not in sys.modules and not ispkg: - module = importlib.import_module(full_module_name) - - for member_name, obj in inspect.getmembers(module): - if inspect.isclass(obj) and AutoSklearnRegressionAlgorithm in obj.__bases__: - # TODO test if the obj implements the interface - # Keep in mind that this only instantiates the ensemble_wrapper, - # but not the real target classifier - classifier = obj - _regressors[module_name] = classifier +def add_regressor(regressor): + _addons.add_component(regressor) class RegressorChoice(object): @@ -37,7 +31,10 @@ def __init__(self, **params): @classmethod def get_components(cls): - return _regressors + components = OrderedDict() + components.update(_regressors) + components.update(_addons.components) + return components @classmethod def get_available_components(cls, data_prop, @@ -157,6 +154,3 @@ def get_hyperparameter_search_space(cls, dataset_properties, cs.add_forbidden_clause(forbidden_clause) return cs - - -_regressors['regressor'] = RegressorChoice \ No newline at end of file diff --git a/autosklearn/pipeline/components/regression/adaboost.py b/autosklearn/pipeline/components/regression/adaboost.py index c6b06e99c8..d50321f6a9 100644 --- a/autosklearn/pipeline/components/regression/adaboost.py +++ b/autosklearn/pipeline/components/regression/adaboost.py @@ -47,23 +47,13 @@ def predict(self, X): def get_properties(dataset_properties=None): return {'shortname': 'AB', 'name': 'AdaBoost Regressor', - 'handles_missing_values': False, - 'handles_nominal_values': False, - 'handles_numerical_features': True, - 'prefers_data_scaled': False, - # TODO find out if this is good because of sparcity... - 'prefers_data_normalized': False, 'handles_regression': True, 'handles_classification': False, 'handles_multiclass': False, 'handles_multilabel': False, 'is_deterministic': True, - 'handles_sparse': False, 'input': (DENSE, SPARSE, UNSIGNED_DATA), - 'output': (PREDICTIONS, ), - # TODO find out what is best used here! - # But rather fortran or C-contiguous? - 'preferred_dtype': np.float32} + 'output': (PREDICTIONS, )} @staticmethod def get_hyperparameter_search_space(dataset_properties=None): diff --git a/autosklearn/pipeline/components/regression/ard_regression.py b/autosklearn/pipeline/components/regression/ard_regression.py new file mode 100644 index 0000000000..5469708549 --- /dev/null +++ b/autosklearn/pipeline/components/regression/ard_regression.py @@ -0,0 +1,94 @@ +import numpy as np + +from HPOlibConfigSpace.configuration_space import ConfigurationSpace +from HPOlibConfigSpace.hyperparameters import UniformFloatHyperparameter, \ + UnParametrizedHyperparameter + +from autosklearn.pipeline.components.base import AutoSklearnRegressionAlgorithm +from autosklearn.pipeline.constants import * + + +class ARDRegression(AutoSklearnRegressionAlgorithm): + def __init__(self, n_iter, tol, alpha_1, alpha_2, lambda_1, lambda_2, + threshold_lambda, fit_intercept, random_state=None): + self.random_state = random_state + self.estimator = None + + self.n_iter = int(n_iter) + self.tol = float(tol) + self.alpha_1 = float(alpha_1) + self.alpha_2 = float(alpha_2) + self.lambda_1 = float(lambda_1) + self.lambda_2 = float(lambda_2) + self.threshold_lambda = float(threshold_lambda) + self.fit_intercept = fit_intercept == True + + def fit(self, X, Y): + import sklearn.linear_model + self.estimator = sklearn.linear_model.\ + ARDRegression(n_iter=self.n_iter, + tol=self.tol, + alpha_1=self.alpha_1, + alpha_2=self.alpha_2, + lambda_1=self.lambda_1, + lambda_2=self.lambda_2, + compute_score=False, + threshold_lambda=self.threshold_lambda, + fit_intercept=True, + normalize=False, + copy_X=False, + verbose=False) + self.estimator.fit(X, Y) + return self + + def predict(self, X): + if self.estimator is None: + raise NotImplementedError + return self.estimator.predict(X) + + @staticmethod + def get_properties(dataset_properties=None): + return {'shortname': 'ARD', + 'name': 'ARD Regression', + 'handles_regression': True, + 'handles_classification': False, + 'handles_multiclass': False, + 'handles_multilabel': False, + 'prefers_data_normalized': True, + 'is_deterministic': True, + 'input': (DENSE, UNSIGNED_DATA), + 'output': (PREDICTIONS,)} + + @staticmethod + def get_hyperparameter_search_space(dataset_properties=None): + cs = ConfigurationSpace() + n_iter = cs.add_hyperparameter( + UnParametrizedHyperparameter("n_iter", value=300)) + tol = cs.add_hyperparameter( + UniformFloatHyperparameter("tol", 10 ** -5, 10 ** -1, + default=10 ** -4, log=True)) + alpha_1 = cs.add_hyperparameter( + UniformFloatHyperparameter(name="alpha_1", lower=10 ** -10, + upper=10 ** -3, default=10 ** -6)) + alpha_2 = cs.add_hyperparameter( + UniformFloatHyperparameter(name="alpha_2", log=True, + lower=10 ** -10, upper=10 ** -3, + default=10 ** -6)) + lambda_1 = cs.add_hyperparameter( + UniformFloatHyperparameter(name="lambda_1", log=True, + lower=10 ** -10, upper=10 ** -3, + default=10 ** -6)) + lambda_2 = cs.add_hyperparameter( + UniformFloatHyperparameter(name="lambda_2", log=True, + lower=10 ** -10, upper=10 ** -3, + default=10 ** -6)) + threshold_lambda = cs.add_hyperparameter( + UniformFloatHyperparameter(name="threshold_lambda", + log=True, + lower=10 ** 3, + upper=10 ** 5, + default=10 ** 4)) + fit_intercept = cs.add_hyperparameter(UnParametrizedHyperparameter( + "fit_intercept", "True")) + + return cs diff --git a/autosklearn/pipeline/components/regression/decision_tree.py b/autosklearn/pipeline/components/regression/decision_tree.py index 1fa5259aa8..d9c7f6be6f 100644 --- a/autosklearn/pipeline/components/regression/decision_tree.py +++ b/autosklearn/pipeline/components/regression/decision_tree.py @@ -61,23 +61,13 @@ def predict(self, X): def get_properties(dataset_properties=None): return {'shortname': 'DT', 'name': 'Decision Tree Classifier', - 'handles_missing_values': False, - 'handles_nominal_values': False, - 'handles_numerical_features': True, - 'prefers_data_scaled': False, - # TODO find out if this is good because of sparcity... - 'prefers_data_normalized': False, 'handles_regression': True, 'handles_classification': False, 'handles_multiclass': False, 'handles_multilabel': False, 'is_deterministic': False, - 'handles_sparse': True, 'input': (DENSE, SPARSE, UNSIGNED_DATA), - 'output': (PREDICTIONS,), - # TODO find out what is best used here! - # But rather fortran or C-contiguous? - 'preferred_dtype': np.float32} + 'output': (PREDICTIONS,)} @staticmethod def get_hyperparameter_search_space(dataset_properties=None): diff --git a/autosklearn/pipeline/components/regression/extra_trees.py b/autosklearn/pipeline/components/regression/extra_trees.py index f62ecb2143..dcae4271d3 100644 --- a/autosklearn/pipeline/components/regression/extra_trees.py +++ b/autosklearn/pipeline/components/regression/extra_trees.py @@ -113,23 +113,13 @@ def predict_proba(self, X): def get_properties(dataset_properties=None): return {'shortname': 'ET', 'name': 'Extra Trees Regressor', - 'handles_missing_values': False, - 'handles_nominal_values': False, - 'handles_numerical_features': True, - 'prefers_data_scaled': False, - # TODO find out if this is good because of sparcity... - 'prefers_data_normalized': False, 'handles_regression': True, 'handles_classification': False, 'handles_multiclass': False, 'handles_multilabel': False, 'is_deterministic': True, - 'handles_sparse': True, 'input': (DENSE, SPARSE, UNSIGNED_DATA), - 'output': (PREDICTIONS,), - # TODO find out what is best used here! - # But rather fortran or C-contiguous? - 'preferred_dtype': np.float32} + 'output': (PREDICTIONS,)} @staticmethod def get_hyperparameter_search_space(dataset_properties=None): diff --git a/autosklearn/pipeline/components/regression/gaussian_process.py b/autosklearn/pipeline/components/regression/gaussian_process.py index b74e1fdfcc..b293c2304e 100644 --- a/autosklearn/pipeline/components/regression/gaussian_process.py +++ b/autosklearn/pipeline/components/regression/gaussian_process.py @@ -51,23 +51,13 @@ def predict(self, X): def get_properties(dataset_properties=None): return {'shortname': 'GP', 'name': 'Gaussian Process', - 'handles_missing_values': False, - 'handles_nominal_values': False, - 'handles_numerical_features': True, - 'prefers_data_scaled': True, - # TODO find out if this is good because of sparcity... - 'prefers_data_normalized': True, 'handles_regression': True, 'handles_classification': False, 'handles_multiclass': False, 'handles_multilabel': False, 'is_deterministic': True, - 'handles_sparse': False, 'input': (DENSE, UNSIGNED_DATA), - 'output': (PREDICTIONS,), - # TODO find out what is best used here! - # But rather fortran or C-contiguous? - 'preferred_dtype': np.float32} + 'output': (PREDICTIONS,)} @staticmethod def get_hyperparameter_search_space(dataset_properties=None): diff --git a/autosklearn/pipeline/components/regression/gradient_boosting.py b/autosklearn/pipeline/components/regression/gradient_boosting.py index 370a535498..654d5f5338 100644 --- a/autosklearn/pipeline/components/regression/gradient_boosting.py +++ b/autosklearn/pipeline/components/regression/gradient_boosting.py @@ -113,23 +113,14 @@ def predict(self, X): def get_properties(dataset_properties=None): return {'shortname': 'GB', 'name': 'Gradient Boosting Regressor', - 'handles_missing_values': False, - 'handles_nominal_values': False, - 'handles_numerical_features': True, - 'prefers_data_scaled': False, - # TODO find out if this is good because of sparcity... 'handles_regression': True, 'handles_classification': False, 'handles_multiclass': False, 'handles_multilabel': False, 'prefers_data_normalized': False, 'is_deterministic': True, - 'handles_sparse': False, 'input': (DENSE, UNSIGNED_DATA), - 'output': (PREDICTIONS,), - # TODO find out what is best used here! - # But rather fortran or C-contiguous? - 'preferred_dtype': np.float32} + 'output': (PREDICTIONS,)} @staticmethod def get_hyperparameter_search_space(dataset_properties=None): diff --git a/autosklearn/pipeline/components/regression/k_nearest_neighbors.py b/autosklearn/pipeline/components/regression/k_nearest_neighbors.py index d73819c4e5..0e5b8f1b9d 100644 --- a/autosklearn/pipeline/components/regression/k_nearest_neighbors.py +++ b/autosklearn/pipeline/components/regression/k_nearest_neighbors.py @@ -33,22 +33,13 @@ def predict(self, X): def get_properties(dataset_properties=None): return {'shortname': 'KNN', 'name': 'K-Nearest Neighbor Classification', - 'handles_missing_values': False, - 'handles_nominal_values': False, - 'handles_numerical_features': True, - 'prefers_data_scaled': True, - # Find out if this is good because of sparsity - 'prefers_data_normalized': False, 'handles_regression': True, 'handles_classification': False, 'handles_multiclass': False, 'handles_multilabel': False, 'is_deterministic': True, - 'handles_sparse': True, 'input': (DENSE, SPARSE, UNSIGNED_DATA), - 'output': (PREDICTIONS,), - # TODO find out what is best used here! - 'preferred_dtype': None} + 'output': (PREDICTIONS,)} @staticmethod def get_hyperparameter_search_space(dataset_properties=None): diff --git a/autosklearn/pipeline/components/regression/liblinear_svr.py b/autosklearn/pipeline/components/regression/liblinear_svr.py index cf9766bbb3..e843055f61 100644 --- a/autosklearn/pipeline/components/regression/liblinear_svr.py +++ b/autosklearn/pipeline/components/regression/liblinear_svr.py @@ -53,21 +53,13 @@ def predict(self, X): def get_properties(dataset_properties=None): return {'shortname': 'Liblinear-SVR', 'name': 'Liblinear Support Vector Regression', - 'handles_missing_values': False, - 'handles_nominal_values': False, - 'handles_numerical_features': True, - 'prefers_data_scaled': True, - # Find out if this is good because of sparsity - 'prefers_data_normalized': False, 'handles_regression': True, 'handles_classification': False, 'handles_multiclass': False, 'handles_multilabel': False, 'is_deterministic': False, - 'handles_sparse': True, 'input': (SPARSE, DENSE, UNSIGNED_DATA), - 'output': (PREDICTIONS,), - 'preferred_dtype': None} + 'output': (PREDICTIONS,)} @staticmethod def get_hyperparameter_search_space(dataset_properties=None): diff --git a/autosklearn/pipeline/components/regression/libsvm_svr.py b/autosklearn/pipeline/components/regression/libsvm_svr.py index 977242d077..08b5c552ad 100644 --- a/autosklearn/pipeline/components/regression/libsvm_svr.py +++ b/autosklearn/pipeline/components/regression/libsvm_svr.py @@ -88,23 +88,14 @@ def predict(self, X): def get_properties(dataset_properties=None): return {'shortname': 'SVR', 'name': 'Support Vector Regression', - 'handles_missing_values': False, - 'handles_nominal_values': False, - 'handles_numerical_features': True, - 'prefers_data_scaled': True, - # TODO find out if this is good because of sparcity... 'handles_regression': True, 'handles_classification': False, 'handles_multiclass': False, 'handles_multilabel': False, 'prefers_data_normalized': True, 'is_deterministic': True, - 'handles_sparse': True, 'input': (SPARSE, DENSE, UNSIGNED_DATA), - 'output': (PREDICTIONS,), - # TODO find out what is best used here! - # But rather fortran or C-contiguous? - 'preferred_dtype': np.float32} + 'output': (PREDICTIONS,)} @staticmethod def get_hyperparameter_search_space(dataset_properties=None): diff --git a/autosklearn/pipeline/components/regression/random_forest.py b/autosklearn/pipeline/components/regression/random_forest.py index fb7ee082bc..ded45f73b2 100644 --- a/autosklearn/pipeline/components/regression/random_forest.py +++ b/autosklearn/pipeline/components/regression/random_forest.py @@ -100,22 +100,14 @@ def predict(self, X): def get_properties(dataset_properties=None): return {'shortname': 'RF', 'name': 'Random Forest Regressor', - 'handles_missing_values': False, - 'handles_nominal_values': False, - 'handles_numerical_features': True, - 'prefers_data_scaled': False, 'handles_regression': True, 'handles_classification': False, 'handles_multiclass': False, 'handles_multilabel': False, 'prefers_data_normalized': False, 'is_deterministic': True, - 'handles_sparse': True, 'input': (DENSE, SPARSE, UNSIGNED_DATA), - 'output': (PREDICTIONS,), - # TODO find out what is best used here! - # But rather fortran or C-contiguous? - 'preferred_dtype': np.float32} + 'output': (PREDICTIONS,)} @staticmethod def get_hyperparameter_search_space(dataset_properties=None): diff --git a/autosklearn/pipeline/components/regression/ridge_regression.py b/autosklearn/pipeline/components/regression/ridge_regression.py index 95b15918ed..de3bba4637 100644 --- a/autosklearn/pipeline/components/regression/ridge_regression.py +++ b/autosklearn/pipeline/components/regression/ridge_regression.py @@ -35,23 +35,14 @@ def predict(self, X): def get_properties(dataset_properties=None): return {'shortname': 'Rigde', 'name': 'Ridge Regression', - 'handles_missing_values': False, - 'handles_nominal_values': False, - 'handles_numerical_features': True, - 'prefers_data_scaled': True, - # TODO find out if this is good because of sparcity... 'handles_regression': True, 'handles_classification': False, 'handles_multiclass': False, 'handles_multilabel': False, 'prefers_data_normalized': True, 'is_deterministic': True, - 'handles_sparse': True, 'input': (SPARSE, DENSE, UNSIGNED_DATA), - 'output': (PREDICTIONS,), - # TODO find out what is best used here! - # But rather fortran or C-contiguous? - 'preferred_dtype': np.float32} + 'output': (PREDICTIONS,)} @staticmethod def get_hyperparameter_search_space(dataset_properties=None): diff --git a/autosklearn/pipeline/implementations/util.py b/autosklearn/pipeline/implementations/util.py index 555fe3d323..d0b2dbf47a 100644 --- a/autosklearn/pipeline/implementations/util.py +++ b/autosklearn/pipeline/implementations/util.py @@ -14,4 +14,19 @@ def softmax(df): # http://www.iro.umontreal.ca/~bengioy/dlbook/numerical.html tmp = df - np.max(df, axis=1).reshape((-1, 1)) tmp = np.exp(tmp) - return tmp / np.sum(tmp, axis=1).reshape((-1, 1)) \ No newline at end of file + return tmp / np.sum(tmp, axis=1).reshape((-1, 1)) + + +def convert_multioutput_multiclass_to_multilabel(probas): + if isinstance(probas, np.ndarray) and len(probas.shape) > 2: + raise ValueError('New unsupported sklearn output!') + if isinstance(probas, list): + multioutput_probas = np.ndarray((probas[0].shape[0], len(probas))) + for i, output in enumerate(probas): + # Only copy the probability of something having class 1 + multioutput_probas[:, i] = output[:, 1] + if output.shape[1] > 2: + raise ValueError('Multioutput-Multiclass supported by ' + 'scikit-learn, but not by auto-sklearn!') + probas = multioutput_probas + return probas \ No newline at end of file diff --git a/autosklearn/pipeline/regression.py b/autosklearn/pipeline/regression.py index 492a706629..542ced7c36 100644 --- a/autosklearn/pipeline/regression.py +++ b/autosklearn/pipeline/regression.py @@ -2,12 +2,17 @@ import copy from itertools import product +import numpy as np from sklearn.base import RegressorMixin from HPOlibConfigSpace.forbidden import ForbiddenEqualsClause, ForbiddenAndConjunction from HPOlibConfigSpace.configuration_space import ConfigurationSpace -from autosklearn.pipeline import components as components +from autosklearn.pipeline.components import regression as regression_components +from autosklearn.pipeline.components import data_preprocessing as \ + data_preprocessing_components +from autosklearn.pipeline.components import feature_preprocessing as \ + feature_preprocessing_components from autosklearn.pipeline.base import BasePipeline from autosklearn.pipeline.constants import SPARSE @@ -59,6 +64,10 @@ class SimpleRegressionPipeline(RegressorMixin, BasePipeline): -------- """ + def __init__(self, configuration, random_state=None): + self._output_dtype = np.float32 + super(SimpleRegressionPipeline, self).__init__(configuration, + random_state) def pre_transform(self, X, Y, fit_params=None, init_params=None): X, fit_params = super(SimpleRegressionPipeline, self).pre_transform( @@ -66,6 +75,28 @@ def pre_transform(self, X, Y, fit_params=None, init_params=None): self.num_targets = 1 if len(Y.shape) == 1 else Y.shape[1] return X, fit_params + def fit_estimator(self, X, y, fit_params=None): + self.y_max_ = np.nanmax(y) + self.y_min_ = np.nanmin(y) + return super(SimpleRegressionPipeline, self).fit_estimator( + X, y, fit_params=fit_params) + + def iterative_fit(self, X, y, fit_params=None, n_iter=1): + self.y_max_ = np.nanmax(y) + self.y_min_ = np.nanmin(y) + return super(SimpleRegressionPipeline, self).iterative_fit( + X, y, fit_params=fit_params, n_iter=n_iter) + + def predict(self, X, batch_size=None): + y = super(SimpleRegressionPipeline, self).\ + predict(X, batch_size=batch_size) + y[y > (2 * self.y_max_)] = 2 * self.y_max_ + if self.y_min_ < 0: + y[y < (2 * self.y_min_)] = 2 * self.y_min_ + elif self.y_min_ > 0: + y[y < (0.5 * self.y_min_)] = 0.5 * self.y_min_ + return y + @classmethod def get_available_components(cls, available_comp, data_prop, inc, exc): components_dict = OrderedDict() @@ -211,7 +242,7 @@ def get_hyperparameter_search_space(cls, include=None, exclude=None, @staticmethod def _get_estimator_components(): - return components.regression_components._regressors + return regression_components._regressors @classmethod def _get_pipeline(cls): @@ -220,20 +251,19 @@ def _get_pipeline(cls): # Add the always active preprocessing components steps.extend( [["one_hot_encoding", - components.data_preprocessing._preprocessors['one_hot_encoding']], + data_preprocessing_components._preprocessors['one_hot_encoding']], ["imputation", - components.data_preprocessing._preprocessors['imputation']], + data_preprocessing_components._preprocessors['imputation']], ["rescaling", - components.data_preprocessing._preprocessors['rescaling']]]) + data_preprocessing_components._preprocessors['rescaling']]]) # Add the preprocessing component steps.append(['preprocessor', - components.feature_preprocessing._preprocessors[ - 'preprocessor']]) + feature_preprocessing_components.FeaturePreprocessorChoice]) # Add the classification component steps.append(['regressor', - components.regression_components._regressors['regressor']]) + regression_components.RegressorChoice]) return steps def _get_estimator_hyperparameter_name(self): diff --git a/autosklearn/pipeline/util.py b/autosklearn/pipeline/util.py index fcc3e01ce9..0aa52b256d 100644 --- a/autosklearn/pipeline/util.py +++ b/autosklearn/pipeline/util.py @@ -45,7 +45,8 @@ def find_sklearn_classes(class_): def get_dataset(dataset='iris', make_sparse=False, add_NaNs=False, - train_size_maximum=150): + train_size_maximum=150, make_multilabel=False, + make_binary=False): iris = getattr(sklearn.datasets, "load_%s" % dataset)() X = iris.data.astype(np.float32) Y = iris.target @@ -74,14 +75,37 @@ def get_dataset(dataset='iris', make_sparse=False, add_NaNs=False, X_test = scipy.sparse.csc_matrix(X_test) X_test.eliminate_zeros() + if make_binary and make_multilabel: + raise ValueError('Can convert dataset only to one of the two ' + 'options binary or multilabel!') + + if make_binary: + Y_train[Y_train > 1] = 1 + Y_test[Y_test > 1] = 1 + + if make_multilabel: + num_classes = len(np.unique(Y)) + Y_train_ = np.zeros((Y_train.shape[0], num_classes)) + for i in range(Y_train.shape[0]): + Y_train_[i, Y_train[i]] = 1 + Y_train = Y_train_ + Y_test_ = np.zeros((Y_test.shape[0], num_classes)) + for i in range(Y_test.shape[0]): + Y_test_[i, Y_test[i]] = 1 + Y_test = Y_test_ + return X_train, Y_train, X_test, Y_test def _test_classifier(classifier, dataset='iris', sparse=False, - train_size_maximum=150): + train_size_maximum=150, make_multilabel=False, + make_binary=False): X_train, Y_train, X_test, Y_test = get_dataset(dataset=dataset, make_sparse=sparse, - train_size_maximum=train_size_maximum) + train_size_maximum=train_size_maximum, + make_multilabel=make_multilabel, + make_binary=make_binary) + configuration_space = classifier.get_hyperparameter_search_space( dataset_properties={'sparse': sparse}) default = configuration_space.get_default_configuration() @@ -109,10 +133,14 @@ def _test_classifier_iterative_fit(classifier, dataset='iris', sparse=False): def _test_classifier_predict_proba(classifier, dataset='iris', sparse=False, - train_size_maximum=150): + train_size_maximum=150, + make_multilabel=False, + make_binary=False): X_train, Y_train, X_test, Y_test = get_dataset(dataset=dataset, make_sparse=sparse, - train_size_maximum=train_size_maximum) + train_size_maximum=train_size_maximum, + make_multilabel=make_multilabel, + make_binary=make_binary) configuration_space = classifier.get_hyperparameter_search_space() default = configuration_space.get_default_configuration() classifier = classifier(random_state=1, diff --git a/autosklearn/util/backend.py b/autosklearn/util/backend.py index 585eb4385f..c8726dfabc 100644 --- a/autosklearn/util/backend.py +++ b/autosklearn/util/backend.py @@ -166,45 +166,41 @@ def load_all_models(self, seed): return models - def get_ensemble_indices_dir(self): - return os.path.join(self.internals_directory, 'ensemble_indices') + def get_ensemble_dir(self): + return os.path.join(self.internals_directory, 'ensembles') - def load_ensemble_indices_weights(self, seed): - indices_dir = self.get_ensemble_indices_dir() + def load_ensemble(self, seed): + ensemble_dir = self.get_ensemble_dir() - if not os.path.exists(indices_dir): - self.logger.warning('Directory %s does not exist' % indices_dir) - return {} + if not os.path.exists(ensemble_dir): + self.logger.warning('Directory %s does not exist' % ensemble_dir) + return None if seed >= 0: - indices_files = glob.glob(os.path.join(indices_dir, - '%s.*.indices' % seed)) + indices_files = glob.glob(os.path.join(ensemble_dir, + '%s.*.ensemble' % seed)) indices_files.sort() else: - indices_files = os.listdir(indices_dir) - indices_files = [os.path.join(indices_dir, f) for f in indices_files] + indices_files = os.listdir(ensemble_dir) + indices_files = [os.path.join(ensemble_dir, f) for f in indices_files] indices_files.sort(key=lambda f: time.ctime(os.path.getmtime(f))) with open(indices_files[-1], 'rb') as fh: ensemble_members_run_numbers = pickle.load(fh) - if len(ensemble_members_run_numbers) == 0: - self.logger.error('Ensemble indices file %s does not contain any ' - 'ensemble information.', indices_files[-1]) - return ensemble_members_run_numbers - def save_ensemble_indices_weights(self, indices, idx, seed): + def save_ensemble(self, ensemble, idx, seed): try: - os.makedirs(self.get_ensemble_indices_dir()) + os.makedirs(self.get_ensemble_dir()) except Exception: pass - filepath = os.path.join(self.get_ensemble_indices_dir(), - '%s.%s.indices' % (str(seed), str(idx).zfill( + filepath = os.path.join(self.get_ensemble_dir(), + '%s.%s.ensemble' % (str(seed), str(idx).zfill( 10))) with open(filepath, 'wb') as fh: - pickle.dump(indices, fh) + pickle.dump(ensemble, fh) def _get_prediction_output_dir(self, subset): return os.path.join(self.internals_directory, diff --git a/autosklearn/util/submit_process.py b/autosklearn/util/submit_process.py index dbffd7b1b8..6ef189272f 100644 --- a/autosklearn/util/submit_process.py +++ b/autosklearn/util/submit_process.py @@ -58,7 +58,7 @@ def run_ensemble_builder(tmp_dir, dataset_name, task_type, metric, limit, call = [ensemble_script, '--auto-sklearn-tmp-directory', tmp_dir, - '--basename', dataset_name, + '--dataset_name', dataset_name, '--task', task_type, '--metric', metric, '--limit', str(limit - 5), diff --git a/example/example_crossvalidation.py b/example/example_crossvalidation.py new file mode 100644 index 0000000000..c9aa12a425 --- /dev/null +++ b/example/example_crossvalidation.py @@ -0,0 +1,41 @@ +# -*- encoding: utf-8 -*- +from __future__ import print_function + +import sklearn.datasets +import numpy as np + +import autosklearn.classification + + +def main(): + digits = sklearn.datasets.load_digits() + X = digits.data + y = digits.target + indices = np.arange(X.shape[0]) + np.random.shuffle(indices) + X = X[indices] + y = y[indices] + X_train = X[:1000] + y_train = y[:1000] + X_test = X[1000:] + y_test = y[1000:] + automl = autosklearn.classification.AutoSklearnClassifier( + time_left_for_this_task=60, per_run_time_limit=30, + tmp_folder='/tmp/autoslearn_example_tmp', + output_folder='/tmp/autosklearn_example_out', + delete_tmp_folder_after_terminate=False, + resampling_strategy='cv', resampling_strategy_arguments={'folds': 5}) + + # fit() changes the data in place, but refit needs the original data. We + # therefore copy the data. In practice, one should reload the data + automl.fit(X_train.copy(), y_train.copy(), dataset_name='digits') + automl.refit(X_train.copy(), y_train.copy()) + + print(automl.show_models()) + + predictions = automl.predict(X_test) + print("Accuracy score", sklearn.metrics.accuracy_score(y_test, predictions)) + + +if __name__ == '__main__': + main() diff --git a/example/example1.py b/example/example_holdout.py similarity index 53% rename from example/example1.py rename to example/example_holdout.py index 5188bdd5a8..0370c40b25 100644 --- a/example/example1.py +++ b/example/example_holdout.py @@ -1,10 +1,11 @@ # -*- encoding: utf-8 -*- from __future__ import print_function -import sklearn.datasets import numpy as np +import sklearn.datasets +import sklearn.metrics -import autosklearn +import autosklearn.classification def main(): @@ -19,12 +20,15 @@ def main(): y_train = y[:1000] X_test = X[1000:] y_test = y[1000:] - automl = autosklearn.AutoSklearnClassifier(time_left_for_this_task=600, - per_run_time_limit=30, - tmp_folder='/tmp/autoslearn_example_tmp', - output_folder='/tmp/autosklearn_example_out') + automl = autosklearn.classification.AutoSklearnClassifier( + time_left_for_this_task=60, per_run_time_limit=30, + tmp_folder='/tmp/autoslearn_example_tmp', + output_folder='/tmp/autosklearn_example_out') automl.fit(X_train, y_train, dataset_name='digits') - print(automl.score(X_test, y_test)) + + print(automl.show_models()) + predictions = automl.predict(X_test) + print("Accuracy score", sklearn.metrics.accuracy_score(y_test, predictions)) if __name__ == '__main__': diff --git a/example/example_lib_score.py b/example/example_lib_score.py deleted file mode 100644 index 800f5236e3..0000000000 --- a/example/example_lib_score.py +++ /dev/null @@ -1,220 +0,0 @@ -# -*- encoding: utf-8 -*- - -from __future__ import print_function - -import os -from sys import stderr - -import numpy as np - -from autosklearn.metrics.libscores import show_all_scores - -swrite = stderr.write - -if (os.name == 'nt'): - filesep = '\\' -else: - filesep = '/' - - -def main(): - # This shows a bug in metrics.roc_auc_score - # print('\n\nBug in sklearn.metrics.roc_auc_score:') - # print('auc([1,0,0],[1e-10,0,0])=1') - # print('Correct (ours): ' +str(auc_metric(np.array([[1,0,0]]).transpose(),np.array([[1e-10,0,0]]).transpose()))) - # print('Incorrect (sklearn): ' +str(metrics.roc_auc_score(np.array([1,0,0]),np.array([1e-10,0,0])))) - - # This checks the binary and multi-class cases are well implemented - # In the 2-class case, all results should be identical, except for f1 because - # this is a score that is not symmetric in the 2 classes. - eps = 1e-15 - print('\n\nBinary score verification:') - print('\n\n==========================') - - sol0 = np.array([[1, 0], [1, 0], [0, 1], [0, 1]]) - - comment = ['PERFECT'] - Pred = [sol0] - Sol = [sol0] - - comment.append('ANTI-PERFECT, very bad for r2_score') - Pred.append(1 - sol0) - Sol.append(sol0) - - comment.append( - 'UNEVEN PROBA, BUT BINARIZED VERSION BALANCED (bac and auc=0.5)') - Pred.append(np.array([[0.7, 0.3], [0.4, 0.6], [0.49, 0.51], [0.2, 0.8]]) - ) # here is we have only 2, pac not 0 in uni-col - Sol.append(sol0) - - comment.append( - 'PROBA=0.5, TIES BROKEN WITH SMALL VALUE TO EVEN THE BINARIZED VERSION') - Pred.append(np.array([[0.5 + eps, 0.5 - eps], [0.5 - eps, 0.5 + eps], - [0.5 + eps, 0.5 - eps], [0.5 - eps, 0.5 + eps]])) - Sol.append(sol0) - - comment.append('PROBA=0.5, TIES NOT BROKEN (bad for f1 score)') - Pred.append(np.array([[0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5]])) - Sol.append(sol0) - - sol1 = np.array([[1, 0], [0, 1], [0, 1]]) - - comment.append( - 'EVEN PROBA, but wrong PAC prior because uneven number of samples') - Pred.append(np.array([[0.5, 0.5], [0.5, 0.5], [0.5, 0.5]])) - Sol.append(sol1) - - comment.append( - 'Correct PAC prior; score generally 0. But 100% error on positive class because of binarization so f1 (1 col) is at its worst.') - p = len(sol1) - Pred.append(np.array([sum(sol1) * 1. / p] * p)) - Sol.append(sol1) - - comment.append('All positive') - Pred.append(np.array([[1, 1], [1, 1], [1, 1]])) - Sol.append(sol1) - - comment.append('All negative') - Pred.append(np.array([[0, 0], [0, 0], [0, 0]])) - Sol.append(sol1) - - for k in range(len(Sol)): - sol = Sol[k] - pred = Pred[k] - print('****** ({}) {} ******'.format(k, comment[k])) - print('------ 2 columns ------') - show_all_scores(sol, pred) - print('------ 1 column ------') - sol = np.array([sol[:, 0]]).transpose() - pred = np.array([pred[:, 0]]).transpose() - show_all_scores(sol, pred) - - print('\n\nMulticlass score verification:') - print('\n\n==========================') - sol2 = np.array([[1, 0, 0], [0, 1, 0], [1, 0, 0], [1, 0, 0]]) - - comment = ['Three classes perfect'] - Pred = [sol2] - Sol = [sol2] - - comment.append('Three classes all wrong') - Pred.append(np.array([[0, 1, 0], [0, 0, 1], [0, 1, 0], [0, 0, 1]])) - Sol.append(sol2) - - comment.append('Three classes equi proba') - Pred.append(np.array([[1 / 3, 1 / 3, 1 / 3], [1 / 3, 1 / 3, 1 / 3], - [1 / 3, 1 / 3, 1 / 3], [1 / 3, 1 / 3, 1 / 3]])) - Sol.append(sol2) - - comment.append('Three classes some proba that do not add up') - Pred.append(np.array([[0.2, 0, 0.5], [0.8, 0.4, 0.1], [0.9, 0.1, 0.2], - [0.7, 0.3, 0.3]])) - Sol.append(sol2) - - comment.append('Three classes predict prior') - Pred.append(np.array([[0.75, 0.25, 0.], [0.75, 0.25, 0.], [0.75, 0.25, 0.], - [0.75, 0.25, 0.]])) - Sol.append(sol2) - - for k in range(len(Sol)): - sol = Sol[k] - pred = Pred[k] - print('****** ({}) {} ******'.format(k, comment[k])) - show_all_scores(sol, pred) - - print('\n\nMulti-label score verification: 1) all identical labels') - print('\n\n=======================================================') - print( - '\nIt is normal that for more then 2 labels the results are different for the multiclass scores.') - print('\nBut they should be indetical for the multilabel scores.') - num = 2 - - sol = np.array([[1, 1, 1], [0, 0, 0], [0, 0, 0], [0, 0, 0]]) - sol3 = sol[:, 0:num] - if num == 1: - sol3 = np.array([sol3[:, 0]]).transpose() - - comment = ['{} labels perfect'.format(num)] - Pred = [sol3] - Sol = [sol3] - - comment.append('All wrong, in the multi-label sense') - Pred.append(1 - sol3) - Sol.append(sol3) - - comment.append('All equi proba: 0.5') - sol = np.array([[0.5, 0.5, 0.5], [0.5, 0.5, 0.5], [0.5, 0.5, 0.5], - [0.5, 0.5, 0.5]]) - if num == 1: - Pred.append(np.array([sol[:, 0]]).transpose()) - else: - Pred.append(sol[:, 0:num]) - Sol.append(sol3) - - comment.append('All equi proba, prior: 0.25') - sol = np.array([[0.25, 0.25, 0.25], [0.25, 0.25, 0.25], [0.25, 0.25, 0.25], - [0.25, 0.25, 0.25]]) - if num == 1: - Pred.append(np.array([sol[:, 0]]).transpose()) - else: - Pred.append(sol[:, 0:num]) - Sol.append(sol3) - - comment.append('Some proba') - sol = np.array([[0.2, 0.2, 0.2], [0.8, 0.8, 0.8], [0.9, 0.9, 0.9], - [0.7, 0.7, 0.7]]) - if num == 1: - Pred.append(np.array([sol[:, 0]]).transpose()) - else: - Pred.append(sol[:, 0:num]) - Sol.append(sol3) - - comment.append('Invert both solution and prediction') - if num == 1: - Pred.append(np.array([sol[:, 0]]).transpose()) - else: - Pred.append(sol[:, 0:num]) - Sol.append(1 - sol3) - - for k in range(len(Sol)): - sol = Sol[k] - pred = Pred[k] - print('****** ({}) {} ******'.format(k, comment[k])) - show_all_scores(sol, pred) - - print('\n\nMulti-label score verification:') - print('\n\n==========================') - - sol4 = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1], [0, 0, 1]]) - - comment = ['Three labels perfect'] - Pred = [sol4] - Sol = [sol4] - - comment.append('Three classes all wrong, in the multi-label sense') - Pred.append(1 - sol4) - Sol.append(sol4) - - comment.append('Three classes equi proba') - Pred.append(np.array([[1 / 3, 1 / 3, 1 / 3], [1 / 3, 1 / 3, 1 / 3], - [1 / 3, 1 / 3, 1 / 3], [1 / 3, 1 / 3, 1 / 3]])) - Sol.append(sol4) - - comment.append('Three classes some proba that do not add up') - Pred.append(np.array([[0.2, 0, 0.5], [0.8, 0.4, 0.1], [0.9, 0.1, 0.2], - [0.7, 0.3, 0.3]])) - Sol.append(sol4) - - comment.append('Three classes predict prior') - Pred.append(np.array([[0.25, 0.25, 0.5], [0.25, 0.25, 0.5], - [0.25, 0.25, 0.5], [0.25, 0.25, 0.5]])) - Sol.append(sol4) - - for k in range(len(Sol)): - sol = Sol[k] - pred = Pred[k] - print('****** ({}) {} ******'.format(k, comment[k])) - show_all_scores(sol, pred) - -if __name__ == '__main__': - main() \ No newline at end of file diff --git a/misc/regressors.csv b/misc/regressors.csv index 83a162e65c..9be39cd8bf 100644 --- a/misc/regressors.csv +++ b/misc/regressors.csv @@ -16,7 +16,7 @@ class,added,comment ,False,Calibration instead of prediction method ,False,Add ,False,No -,False,Wait for Tobias' feedback +,True, ,False,Wait for Tobias' feedback ,False,Wait for Tobias' feedback ,False, diff --git a/requ.txt b/requ.txt index c8a3ddae5f..26d7be4080 100644 --- a/requ.txt +++ b/requ.txt @@ -1,3 +1,4 @@ +unittest2 setuptools mock nose diff --git a/source/api.rst b/source/api.rst index 23b72523e2..69e15c3f26 100644 --- a/source/api.rst +++ b/source/api.rst @@ -8,14 +8,13 @@ APIs Main modules ============ -.. autoclass:: ParamSklearn.classification.ParamSklearnClassifier - +.. autoclass:: autosklearn.classification.AutoSklearnClassifier Extension Interfaces ==================== -.. autoclass:: ParamSklearn.components.classification_base.ParamSklearnClassificationAlgorithm +.. autoclass:: autosklearn.pipeline.components.base.AutoSklearnClassificationAlgorithm + +.. autoclass:: autosklearn.pipeline.components.base.AutoSklearnRegressionAlgorithm -.. autoclass:: ParamSklearn.components.regression_base.ParamSklearnRegressionAlgorithm - -.. autoclass:: ParamSklearn.components.preprocessor_base.ParamSklearnPreprocessingAlgorithm +.. autoclass:: autosklearn.pipeline.components.base.AutoSklearnPreprocessingAlgorithm diff --git a/source/components.rst b/source/components.rst deleted file mode 100644 index 52b14bc0a0..0000000000 --- a/source/components.rst +++ /dev/null @@ -1,97 +0,0 @@ -:orphan: - -.. _components: - -Available Components -******************** - -Classification -============== - -A list of all classification algorithms considered in the ParamSklearn search space. - -.. autoclass:: ParamSklearn.components.classification.adaboost.AdaboostClassifier - :members: - -.. autoclass:: ParamSklearn.components.classification.bernoulli_nb.BernoulliNB - :members: - -.. autoclass:: ParamSklearn.components.classification.extra_trees.ExtraTreesClassifier - :members: - -.. autoclass:: ParamSklearn.components.classification.gaussian_nb.GaussianNB - :members: - -.. autoclass:: ParamSklearn.components.classification.gradient_boosting.GradientBoostingClassifier - :members: - -.. autoclass:: ParamSklearn.components.classification.k_nearest_neighbors.KNearestNeighborsClassifier - :members: - -.. autoclass:: ParamSklearn.components.classification.liblinear.LibLinear_SVC - :members: - -.. autoclass:: ParamSklearn.components.classification.libsvm_svc.LibSVM_SVC - :members: - -.. autoclass:: ParamSklearn.components.classification.multinomial_nb.MultinomialNB - :members: - -.. autoclass:: ParamSklearn.components.classification.random_forest.RandomForest - :members: - -.. autoclass:: ParamSklearn.components.classification.sgd.SGD - :members: - -Regression -========== - -A list of all regression algorithms considered in the ParamSklearn search space. - -.. autoclass:: ParamSklearn.components.regression.gaussian_process.GaussianProcess - :members: - -.. autoclass:: ParamSklearn.components.regression.gradient_boosting.GradientBoosting - :members: - -.. autoclass:: ParamSklearn.components.regression.random_forest.RandomForest - :members: - -.. autoclass:: ParamSklearn.components.regression.ridge_regression.RidgeRegression - :members: - - -Preprocessing -============= - -.. autoclass:: ParamSklearn.components.preprocessing.densifier.Densifier - :members: - -.. autoclass:: ParamSklearn.components.preprocessing.imputation.Imputation - :members: - -.. autoclass:: ParamSklearn.components.preprocessing.kitchen_sinks.RandomKitchenSinks - :members: - -.. autoclass:: ParamSklearn.components.preprocessing.no_preprocessing.NoPreprocessing - :members: - -.. autoclass:: ParamSklearn.components.preprocessing.pca.PCA - :members: - -.. autoclass:: ParamSklearn.components.preprocessing.random_trees_embedding.RandomTreesEmbedding - :members: - -.. autoclass:: ParamSklearn.components.preprocessing.rescaling.Rescaling - :members: - -.. autoclass:: ParamSklearn.components.preprocessing.select_percentile_classification.SelectPercentileClassification - :members: - -.. autoclass:: ParamSklearn.components.preprocessing.select_percentile_regression.SelectPercentileRegression - :members: - -.. autoclass:: ParamSklearn.components.preprocessing.sparse_filtering.SparseFiltering - :members: - -.. autoclass:: ParamSklearn.components.preprocessing.truncatedSVD.TruncatedSVD diff --git a/source/conf.py b/source/conf.py index 9381aebdef..1839821aba 100644 --- a/source/conf.py +++ b/source/conf.py @@ -20,9 +20,14 @@ # If your documentation needs a minimal Sphinx version, state it here. # needs_sphinx = '1.0' +import os +import sys + # Mock out stuff for readthedocs.org -import sys +#on_rtd = os.environ.get('READTHEDOCS', None) == 'True' +#if on_rtd: + try: from mock import Mock as MagicMock except: @@ -46,8 +51,9 @@ class BaseEstimator(object): 'arff', 'pandas', 'Cython', - 'numpy', + 'numpy', 'numpy.random', 'scipy', 'scipy.sparse', 'scipy.stats', 'scipy.linalg', + 'scipy.sparse.linalg', 'sklearn', 'sklearn.base', 'sklearn.cross_validation', @@ -58,16 +64,19 @@ class BaseEstimator(object): 'sklearn.utils', 'psutil','pyyaml','pandas', 'matplotlib', - 'autosklearn.pipeline', - 'autosklearn.pipeline.implementations', + 'autosklearn.cli.base_interface', 'autosklearn.pipeline.implementations.OneHotEncoder', 'autosklearn.pipeline.implementations.Imputation', 'autosklearn.pipeline.implementations.StandardScaler', + 'autosklearn.pipeline.implementations.MultilabelClassifier', 'autosklearn.pipeline.classification', 'autosklearn.pipeline.regression', 'HPOlibConfigSpace', 'HPOlibConfigSpace.converters', - 'HPOlibConfigSpace.configuration_space'] + 'HPOlibConfigSpace.configuration_space', + 'HPOlibConfigSpace.hyperparameters', + 'HPOlibConfigSpace.conditions', + 'HPOlibConfigSpace.forbidden'] sys.modules.update((mod_name, Mock()) for mod_name in MOCK_MODULES) @@ -110,7 +119,8 @@ class BaseEstimator(object): # General information about the project. project = u'AutoSklearn' -copyright = u'2015, Matthias Feurer, Aaron Klein, Katharina Eggensperger' +copyright = u'2014-2016, Matthias Feurer, Aaron Klein, Katharina ' \ + u'Eggensperger, Jost Tobias Springenberg, Manuel Blum, Frank Hutter' # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the diff --git a/source/extending.rst b/source/extending.rst new file mode 100644 index 0000000000..a8a683d1bc --- /dev/null +++ b/source/extending.rst @@ -0,0 +1,153 @@ +:orphan: + +.. _extending: + +====================== +Extending auto-sklearn +====================== + +auto-sklearn can be easily extended with new classification, regression and +feature preprocessing methods. In order to do so, a user has to implement a +wrapper class and make it known to auto-sklearn. This manual will walk you +through the process. + + +Writing a component +=================== + +Depending on the purpose, the component has to be a subclass of one of the +following base classes: + +* classification: :class:`autosklearn.pipeline.components.base.AutoSklearnClassificationAlgorithm` +* regression: :class:`autosklearn.pipeline.components.base.AutoSklearnRegressionAlgorithm` +* proprocessing: :class:`autosklearn.pipeline.components.base.AutoSklearnPreprocessingAlgorithm` + +In general, these classes are wrappers around existing machine learning +models and only add the functionality auto-sklearn needs. Of course you can +also implement a machine learning algorithm directly inside a component. + +Each component has to implement a method which returns its configuration +space, a method for querying properties of the component and methods like +`fit()`, `predict()` or `transform()` based on the task of the component. +These are described in the subsections +:ref:`get_hyperparameter_search_space` and :ref:`get_properties` + +After writing a component class, you have to tell auto-sklearn about its +existence. You have to add it with the following function calls, depending on +the type of component: + +.. autofunction:: autosklearn.pipeline.components.classification.add_classifier + +.. autofunction:: autosklearn.pipeline.components.regression.add_regressor + +.. autofunction:: autosklearn.pipeline.components.feature_preprocessing.add_preprocessor + + +.. _get_hyperparameter_search_space: + +get_hyperparameter_search_space() +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Return an instance of ``HPOlibConfigSpace.configuration_space +.ConfigurationSpace``. + +See also the abstract definitions: +:meth:`AutoSklearnClassificationAlgorithm.get_hyperparameter_search_space() ` +:meth:`AutoSklearnRegressionAlgorithm.get_hyperparameter_search_space() ` +:meth:`AutoSklearnPreprocessingAlgorithm.get_hyperparameter_search_space() ` + +To find out about how to create a ``ConfigurationSpace``-object, please look +at the source code on `github.com `_. + +.. _get_properties: + +get_properties() +~~~~~~~~~~~~~~~~ + +Return a dictionary which defines how the component can be used when +constructing a machine learning pipeline. The following fields must be +specified: + +* shortname : str + an abbreviation of the component +* name : str + the full name of the component +* handles_regression : bool + whether the component can handle regression data +* handles_classification : bool + whether the component can handle classification data +* handles_multiclass : bool + whether the component can handle multiclass classification data +* handles_multilabel : bool + whether the component can multilabel classification data +* is_deterministic : bool + whether the component gives the same result when using several times, + but with the same random seed +* input : tuple + type of input data the component can handle, can have multiple values: + + * **autosklearn.constants.DENSE** + dense data arrays, mutually exclusive with autosklearn.constants.SPARSE + * **autosklearn.constants.SPARSE** + sparse data matrices, mutually exclusive with autosklearn.constants.DENSE + * **autosklearn.constants.UNSIGNED_DATA** + unsigned data array, meaning only positive input, mutually exclusive + with autosklearn.constants.SIGNED_DATA + * **autosklearn.constants.SIGNED_DATA** + signed data array, meaning both positive and negative input values, + mutually exclusive with autosklearn.constants.UNSIGNED_DATA +* output : tuple + type of output data the component produces + + * **autosklearn.constants.PREDICTIONS** + predictions, for example by a classifier + * **autosklearn.constants.INPUT** + data in the same form as the input + * **autosklearn.constants.DENSE** + dense data arrays, mutually exclusive with autosklearn.constants.SPARSE. + This implies that sparse data will be converted into a dense + representation. + * **autosklearn.constants.SPARSE** + sparse data matrices, mutually exclusive with + autosklearn.constants.DENSE. This implies that dense data will + be converted into a sparse representation + * **autosklearn.constants.UNSIGNED_DATA** + unsigned data array, meaning only positive input, mutually exclusive + with autosklearn.constants.SIGNED_DATA. This allows for algorithms which + can only work on positive data. + * **autosklearn.constants.SIGNED_DATA** + signed data array, meaning both positive and negative input values, + mutually exclusive with autosklearn.constants.UNSIGNED_DATA + +Classification +============== + +In addition two `get_properties()` and `get_hyperparameter_search_space()` +you have to implement +:meth:`AutoSklearnClassificationAlgorithm.fit() ` +and +:meth:`AutoSklearnClassificationAlgorithm.predict() ` +. These are an implementation of the `scikit-learn predictor API +`_. + +Regression +========== + +In addition two `get_properties()` and `get_hyperparameter_search_space()` +you have to implement +:meth:`AutoSklearnRegressionAlgorithm.fit() ` +and +:meth:`AutoSklearnRegressionAlgorithm.predict() ` +. These are an implementation of the `scikit-learn predictor API +`_. + +Feature Preprocessing +===================== + +In addition two `get_properties()` and `get_hyperparameter_search_space()` +you have to implement +:meth:`AutoSklearnPreprocessingAlgorithm.fit() ` +and +:meth:`AutoSklearnPreprocessingAlgorithm.transform() ` +. These are an implementation of the `scikit-learn predictor API +`_. diff --git a/source/extending_ParamSklearn.rst b/source/extending_ParamSklearn.rst deleted file mode 100644 index 4b1123bf49..0000000000 --- a/source/extending_ParamSklearn.rst +++ /dev/null @@ -1,4 +0,0 @@ -Extending ParamSklearn -********************** - -.. automodule:: ParamSklearn.components diff --git a/source/index.rst b/source/index.rst index 5dec85da48..ef8a2ab4d7 100644 --- a/source/index.rst +++ b/source/index.rst @@ -55,11 +55,12 @@ with Ubuntu. It should run on other Linux distributions, but won't work on a MAC or on a windows PC. It requires scikit-learn 0.16.1, which in turn requires numpy and scipy. -*auto-sklearn* has a dependency, which are not yet automatically resolved: +*auto-sklearn* has at least one dependency, which is not yet automatically +resolved: * `HPOlibConfigSpace `_ -Please install these manually with: +Please install all dependencies manually with: .. code:: bash @@ -77,10 +78,12 @@ We recommend installing *auto-sklearn* into a `virtual environment seen strange things happening when installing it using :bash:`python setup.py --user`. -API -*** +Manual +****** -.. autoclass:: autosklearn.classification.AutoSklearnClassifier +* :ref:`API` +* :ref:`resampling` +* :ref:`extending` License diff --git a/source/installation.rst b/source/installation.rst deleted file mode 100644 index 9c8eaa0d42..0000000000 --- a/source/installation.rst +++ /dev/null @@ -1,4 +0,0 @@ -Install ParamSklearn -******************** - -Please see the file `README.md`. \ No newline at end of file diff --git a/source/introduction.rst b/source/introduction.rst deleted file mode 100644 index 43a62256ad..0000000000 --- a/source/introduction.rst +++ /dev/null @@ -1,40 +0,0 @@ -Introduction to ParamSklearn -**************************** - -What is ParamSklearn? -===================== - -.. automodule:: ParamSklearn - -Get involved -============ - -License -======= -We chose to license ParamSklearn the same way as scikit-learn. It is available under the open source and commercially usable 3-clause BSD license. - -Copyright (c) 2014, Matthias Feurer -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -* Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. -* Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. -* Neither the name of the University of Freiburg, nor the - names of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY -DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/source/resampling.rst b/source/resampling.rst new file mode 100644 index 0000000000..3a03319a89 --- /dev/null +++ b/source/resampling.rst @@ -0,0 +1,9 @@ +:orphan: + +.. _resampling: + +Resampling strategies +********************* + +Examples for using holdout and cross-validation can be found in the example +directory. \ No newline at end of file diff --git a/test/.data/adult/adult_feat.type b/test/.data/adult/adult_feat.type new file mode 100755 index 0000000000..a9bb66ac93 --- /dev/null +++ b/test/.data/adult/adult_feat.type @@ -0,0 +1,24 @@ +Categorical +Numerical +Numerical +Categorical +Numerical +Numerical +Categorical +Categorical +Numerical +Categorical +Categorical +Numerical +Categorical +Categorical +Numerical +Categorical +Numerical +Categorical +Categorical +Numerical +Numerical +Categorical +Numerical +Categorical diff --git a/test/.data/adult/adult_public.info b/test/.data/adult/adult_public.info new file mode 100755 index 0000000000..e969bf635a --- /dev/null +++ b/test/.data/adult/adult_public.info @@ -0,0 +1,16 @@ +usage = '' +name = 'adult' +task = 'multilabel.classification' +target_type = 'Binary' +feat_type = 'Mixed' +metric = 'f1_metric' +time_budget = 300 +feat_num = 24 +target_num = 3 +label_num = 3 +train_num = 34190 +valid_num = 4884 +test_num = 9768 +has_categorical = 1 +has_missing = 1 +is_sparse = 0 diff --git a/test/.data/adult/adult_test.data b/test/.data/adult/adult_test.data new file mode 100755 index 0000000000..c197ff9f77 --- /dev/null +++ b/test/.data/adult/adult_test.data @@ -0,0 +1,50 @@ +6 60 48 1 59 73289 2 1 0 14 6 0 7 1 16 2 0 3 3 181758 0 1 37 2 +5 50 30 1 67 212490 1 1 0 1 3 0 1 1 13 1 0 1 1 112115 0 1 50 13 +5 58 28 21 23 289293 1 1 0 11 4 0 1 1 14 4 0 3 3 184806 0 1 39 1 +2 45 51 1 59 154950 1 1 0 6 3 3464 15 1 12 4 0 1 2 177727 0 1 20 7 +12 40 18 1 43 93449 1 3 0 4 4 0 11 1 9 4 3103 1 3 184016 0 1 27 1 +6 37 59 1 33 182074 1 1 0 1 4 0 7 1 13 5 4650 5 3 113838 0 5 42 7 +6 50 40 1 26 164299 5 1 0 1 4 0 1 1 13 4 0 3 3 27444 0 5 71 2 +9 40 31 1 17 386120 1 2 0 4 3 0 13 1 9 4 0 1 1 145439 0 1 34 4 +9 40 21 1 75 211013 1 1 0 2 4 0 1 1 10 4 0 3 3 225823 0 NaN 40 11 +5 25 23 1 32 178649 1 1 0 1 4 0 2 1 13 3 0 3 1 365881 0 5 51 2 +8 40 36 1 35 154641 1 1 0 4 6 0 8 1 9 2 0 2 1 484024 0 1 31 2 +1 40 29 1 38 159449 1 NaN 0 4 6 0 4 1 9 7 0 3 3 198210 0 1 39 11 +13 50 46 1 50 192485 5 1 1887 6 3 0 6 1 12 10 0 1 2 238162 0 5 18 2 +2 35 33 1 45 103643 2 1 0 4 3 0 4 1 9 9 0 1 3 134737 0 1 44 4 +11 40 32 1 30 172714 5 1 0 4 3 2202 11 1 9 4 0 1 4 257849 0 2 29 13 +2 50 47 25 26 207277 1 1 0 4 3 0 11 1 9 3 0 1 2 120131 0 1 35 4 +5 50 30 1 54 116839 1 1 0 2 4 0 2 1 10 4 8614 2 1 225231 0 1 65 4 +9 16 19 1 26 104958 1 2 0 2 2 0 4 1 10 2 0 3 3 25429 0 1 35 4 +5 45 31 1 61 94937 4 5 0 1 4 7298 7 1 13 4 0 3 2 165949 0 1 52 1 +4 52 71 1 18 223660 2 3 2392 4 3 0 4 1 9 10 0 1 3 200540 1485 1 34 4 +9 12 50 6 41 142711 4 5 0 4 6 0 4 1 9 1 0 3 5 306707 0 1 24 2 +4 35 63 1 46 372317 1 1 0 4 1 0 4 1 9 1 0 1 3 236338 0 1 29 4 +2 15 23 1 18 278414 1 1 0 4 5 0 4 1 9 4 0 3 3 100345 0 1 60 4 +6 40 27 1 41 124808 1 5 0 1 4 0 4 1 13 6 13550 3 3 186454 0 1 64 1 +8 60 39 1 51 147510 1 1 0 2 3 0 2 1 10 1 0 1 3 38145 0 1 34 1 +6 16 30 1 42 243666 1 2 0 11 4 0 2 1 14 4 0 3 3 124569 0 1 26 4 +3 35 25 1 33 132670 1 1 0 13 2 0 1 1 6 2 0 1 1 190350 0 2 50 2 +5 65 30 1 17 105422 1 1 0 4 3 0 4 1 9 3 0 1 2 84119 0 1 51 1 +3 20 59 1 31 91384 2 1 0 4 4 0 2 1 9 3 0 5 1 49996 0 6 37 2 +6 55 32 1 51 230238 1 3 0 4 4 0 4 1 9 2 0 4 3 155193 0 1 71 9 +9 40 20 1 61 133654 1 1 0 2 4 0 3 NaN 10 4 0 3 2 346341 0 2 66 4 +9 40 28 1 32 89922 1 1 0 1 2 0 4 1 13 2 0 3 3 298696 0 5 48 4 +8 40 26 1 20 169180 1 6 0 4 4 0 2 1 9 4 0 2 2 127202 2206 NaN 23 4 +6 25 20 1 65 65325 1 3 0 2 6 0 4 1 10 5 0 3 2 148709 0 4 30 2 +5 40 51 1 29 145964 2 6 0 6 3 0 4 1 12 7 0 1 1 99185 0 1 35 4 +4 40 30 1 17 114520 1 1 0 2 4 0 7 1 10 13 0 3 1 97306 0 1 39 1 +9 50 52 1 42 168906 1 1 0 4 4 0 7 1 9 14 0 6 2 72743 0 NaN 68 2 +5 40 65 18 59 236222 1 NaN 0 14 3 0 1 1 16 15 0 1 3 115880 0 1 61 7 +9 47 46 1 71 219906 1 NaN 0 6 4 0 4 13 12 4 0 3 2 231515 0 5 47 4 +4 40 33 1 44 147654 1 1 0 1 3 0 1 1 13 2 0 1 1 150570 1485 NaN 31 2 +2 40 64 1 27 285004 5 6 0 4 3 0 8 1 9 1 3137 1 1 202984 0 1 36 2 +2 36 32 1 40 216608 1 5 0 1 4 0 6 1 13 13 0 3 3 178109 0 5 40 1 +4 52 48 1 45 139671 1 1 0 4 4 0 7 1 9 4 0 2 3 154033 0 1 34 10 +NaN 30 27 1 51 168334 NaN 1 0 2 4 0 6 1 10 13 0 2 3 188711 0 2 45 5 +3 40 22 1 52 246739 1 1 0 2 2 0 2 1 10 1 0 3 1 140001 0 1 40 6 +6 45 37 10 55 190290 1 2 0 6 3 0 4 1 12 9 0 1 1 193815 0 1 54 4 +5 50 67 1 23 152109 3 4 0 4 3 0 3 1 9 1 9386 1 3 73559 0 NaN 32 1 +3 60 32 1 33 75073 1 5 0 4 3 0 5 1 9 4 0 1 2 203181 0 NaN 21 4 +6 40 50 1 69 354739 5 1 0 11 1 10605 2 1 14 7 15024 1 2 259377 0 1 54 2 +6 40 28 1 36 188882 1 1 0 1 4 0 14 1 13 1 2174 3 5 32291 0 1 27 8 diff --git a/test/.data/adult/adult_train.data b/test/.data/adult/adult_train.data new file mode 100755 index 0000000000..df8361c1dc --- /dev/null +++ b/test/.data/adult/adult_train.data @@ -0,0 +1,200 @@ +11 45 34 1 55 127921 1 1 0 4 3 0 2 1 9 1 0 1 3 241885 0 1 44 4 +4 50 41 1 60 231619 1 2 0 2 3 0 2 1 10 5 0 1 3 104334 0 1 59 9 +2 40 36 1 26 119941 1 1 0 4 3 0 1 1 9 7 0 1 2 77953 0 1 44 15 +1 40 26 1 38 215766 1 1 0 2 3 0 6 1 10 2 3103 1 3 167350 0 1 31 4 +1 40 28 1 38 170525 1 1 0 7 6 3471 4 1 11 2 0 2 2 109857 0 1 18 1 +7 60 33 NaN 42 329408 1 1 0 4 3 0 11 1 9 1 0 1 1 51543 0 NaN 29 4 +2 48 67 1 25 137142 3 1 0 9 3 0 2 1 4 1 0 1 1 325373 0 1 22 4 +2 40 44 1 48 59313 1 5 0 1 3 0 1 1 13 4 0 1 1 210525 0 1 41 6 +NaN 45 20 1 35 175856 NaN 1 0 4 1 0 4 1 9 4 0 1 3 84375 0 1 35 1 +7 35 20 1 47 142766 1 1 0 4 2 3781 1 1 9 2 0 3 2 162688 0 2 34 4 +5 40 18 1 30 189666 1 1 0 4 1 0 2 1 9 2 0 1 1 163787 0 1 21 2 +9 10 19 1 37 26880 6 2 0 2 2 0 8 1 10 2 0 3 1 135162 0 5 35 4 +4 60 40 9 21 165218 1 1 0 2 3 0 4 4 10 4 0 1 1 184378 0 1 26 4 +5 45 44 1 36 218785 2 5 1977 4 3 0 4 1 9 4 0 1 1 179557 0 1 22 1 +6 40 36 1 28 254781 1 1 0 1 4 0 1 1 13 4 0 3 1 102568 0 2 56 15 +6 40 46 1 30 209900 1 2 0 1 4 0 11 1 13 1 0 2 1 125492 0 3 23 4 +4 30 25 1 21 161922 1 1 0 4 2 0 4 1 9 14 0 3 3 197130 0 2 24 4 +6 50 23 1 67 294434 1 4 0 1 4 0 4 1 13 1 0 3 3 203924 0 1 45 2 +4 40 45 1 51 185216 3 5 0 4 6 0 2 1 9 4 0 3 1 81534 0 1 44 4 +3 39 22 1 53 275095 6 1 0 2 5 0 2 29 10 3 0 3 1 264102 0 1 38 1 +9 40 43 1 42 186934 6 5 0 4 6 0 4 1 9 4 0 3 1 218542 0 1 42 4 +NaN 40 27 1 41 210448 NaN 1 0 4 4 0 13 1 9 2 0 3 1 204074 0 2 20 4 +6 30 55 1 46 170721 1 1 0 14 3 4865 2 1 16 3 15024 1 4 116878 0 3 20 6 +6 40 44 33 42 166304 1 1 0 1 3 0 3 NaN 13 1 99999 1 1 227065 0 1 20 2 +2 40 38 1 24 324445 1 1 0 4 2 0 1 1 9 4 0 3 2 218490 0 1 57 9 +6 40 53 1 38 27242 6 1 0 14 3 0 2 1 16 1 0 1 3 71417 0 1 47 2 +8 40 46 1 25 443809 1 NaN 0 13 4 0 2 1 6 4 0 3 5 161508 0 6 42 4 +NaN 40 31 1 24 133503 NaN NaN 0 1 4 0 9 1 13 9 0 3 3 317761 0 1 25 8 +5 40 33 1 55 46868 1 1 0 4 6 0 2 1 9 1 0 2 2 180551 0 2 25 4 +6 25 63 1 64 229465 1 1 0 4 1 0 4 1 9 3 0 1 3 151364 0 6 22 2 +9 40 63 1 30 266070 1 1 0 4 6 0 3 1 9 1 0 5 2 38352 0 1 49 4 +3 40 55 1 66 200352 2 1 0 4 3 0 4 1 9 2 0 1 1 271795 0 1 24 2 +6 60 33 1 27 225395 2 1 0 1 3 0 8 1 13 1 0 1 1 175502 0 NaN 47 4 +4 15 81 1 30 100669 1 1 0 2 3 0 4 1 10 1 0 1 1 122651 0 1 32 4 +9 40 23 1 31 192995 1 1 0 4 4 0 1 1 9 4 0 3 5 85139 0 1 26 4 +5 45 44 1 42 138994 3 1 0 2 4 3137 1 1 10 2 2202 3 6 56236 0 1 55 4 +7 40 38 1 30 203488 1 1 0 4 4 0 11 1 9 1 0 3 1 175441 0 1 25 4 +9 50 38 1 59 190205 1 5 0 11 4 0 8 17 14 7 0 3 1 353263 0 1 37 4 +6 3 75 1 27 326936 2 1 0 14 3 7688 1 1 16 7 4931 1 3 231741 0 3 49 1 +2 45 35 1 29 185764 1 1 0 4 3 0 4 1 9 8 0 1 2 173586 0 1 54 2 +6 45 32 1 27 331894 1 1 1902 11 3 0 15 9 14 13 0 1 1 154210 0 1 49 11 +13 40 31 1 46 196125 5 1 0 4 3 0 4 1 9 13 0 1 2 206297 0 NaN 65 1 +NaN 24 36 1 39 213092 NaN 1 0 3 2 0 6 1 7 2 0 3 1 320183 0 5 30 1 +8 40 56 1 34 112507 1 1 0 4 3 0 2 1 9 11 0 1 3 53481 0 6 28 11 +6 45 33 1 69 101266 1 1 0 1 4 0 4 1 13 1 10520 3 3 356823 0 4 28 4 +11 40 34 1 61 161155 1 1 0 4 3 0 1 1 9 2 0 1 4 381153 0 1 44 4 +6 60 38 16 22 105422 1 5 0 5 3 0 10 1 15 7 0 1 1 348739 0 2 37 1 +6 5 30 1 20 146365 6 1 0 2 4 0 4 1 10 4 0 3 2 61989 0 1 36 2 +9 40 53 1 22 31826 1 1 0 11 1 0 11 1 14 2 0 1 3 285621 0 1 53 3 +7 40 27 1 36 142470 1 1 0 4 6 99999 4 1 9 4 0 3 1 188909 0 1 52 10 +NaN 40 41 1 44 245361 NaN 1 0 4 4 0 4 1 9 4 0 2 1 119207 0 2 39 4 +11 40 18 1 38 156033 4 1 0 4 2 0 2 1 9 2 0 3 2 263162 0 3 22 15 +5 40 76 1 41 289886 1 1 0 1 3 0 11 1 13 4 0 1 3 125784 0 1 25 4 +3 40 26 1 38 139012 1 3 0 1 4 0 4 1 13 7 0 3 3 55929 0 1 39 4 +8 40 51 1 39 405526 1 2 0 4 3 0 3 1 9 2 0 1 1 136913 0 1 34 4 +10 70 64 1 17 37937 1 1 0 4 3 0 2 1 9 4 0 1 1 298546 0 1 53 2 +4 40 26 16 47 199806 1 2 0 1 2 0 2 1 13 2 0 3 3 188767 0 1 20 1 +5 35 25 1 26 98466 1 5 0 1 4 0 7 1 13 1 0 3 1 160300 0 5 52 4 +13 40 41 1 31 378723 5 1 0 4 6 0 3 1 9 4 0 2 1 216116 2057 1 41 2 +3 38 22 1 25 336951 1 1 0 2 2 0 11 1 10 7 0 3 6 195075 0 1 29 2 +5 40 46 1 23 157332 1 1 0 4 3 0 2 1 9 11 0 1 1 29696 0 4 62 2 +10 45 28 1 35 232782 2 4 0 7 2 0 4 1 11 11 10520 3 1 29974 0 1 45 1 +2 40 36 1 19 211804 1 1 0 2 3 0 4 1 10 2 0 1 3 241306 0 1 57 1 +5 50 63 1 66 73019 1 1 0 4 6 0 4 1 9 4 0 5 3 181929 0 3 34 11 +4 50 52 1 46 148084 1 1 0 2 4 0 2 1 10 4 0 2 2 95128 0 NaN 34 2 +2 40 20 5 26 244408 1 2 0 4 2 0 7 1 9 4 0 3 2 257509 0 1 49 1 +NaN 30 34 1 40 258339 NaN 1 0 1 3 0 4 1 13 2 0 1 1 35595 0 1 64 11 +7 40 42 1 36 190759 1 1 0 4 3 0 9 1 9 4 0 1 3 124692 0 1 32 2 +10 40 19 1 23 400004 1 1 0 4 2 0 1 1 9 4 0 3 2 220819 0 1 26 4 +3 42 31 5 33 164190 1 1 0 7 3 0 1 1 11 4 0 1 1 77634 0 1 69 4 +NaN 48 33 1 55 158363 NaN 1 0 4 6 0 7 1 9 1 0 2 3 33404 0 2 58 4 +9 46 55 1 39 57233 4 1 0 4 3 0 4 1 9 4 0 1 3 171870 0 1 60 4 +6 80 27 1 34 97176 1 2 0 14 3 0 1 1 16 4 0 1 1 201017 0 1 62 4 +13 40 20 1 45 174533 1 1 0 2 4 0 4 1 10 13 0 3 5 20057 0 6 51 13 +NaN 40 41 1 67 320084 NaN NaN 0 8 1 0 4 40 5 1 0 1 1 217921 0 1 38 7 +NaN 40 38 1 56 411068 NaN NaN 0 2 3 0 4 1 10 10 0 1 5 320811 0 1 20 1 +4 25 35 21 34 147921 1 5 0 4 4 7688 4 1 9 4 0 3 1 140752 0 1 27 2 +6 40 43 1 60 93415 1 6 1902 11 3 15024 4 1 14 4 0 1 1 256813 0 1 38 2 +8 40 61 1 43 172256 1 5 0 4 2 0 4 1 9 2 0 2 1 221534 0 1 44 1 +2 40 37 1 24 222221 3 1 0 2 6 0 7 5 10 2 0 2 3 95634 0 1 25 4 +13 40 35 1 27 217304 1 NaN 0 11 4 0 4 1 14 1 0 6 1 342642 0 1 31 1 +7 30 22 1 41 229180 1 1 0 1 2 0 4 1 13 4 0 3 1 195767 0 2 18 1 +2 35 45 1 22 366618 1 1 0 7 3 0 2 1 11 4 0 1 3 180931 0 1 57 2 +4 40 50 1 63 325372 3 1 0 11 3 0 4 1 14 4 0 1 1 240374 1719 1 34 13 +10 70 37 1 45 147548 2 1 0 4 3 0 4 1 9 2 0 1 2 33394 0 1 44 1 +3 16 20 1 25 246011 1 NaN 0 2 2 0 4 1 10 2 0 3 1 196745 0 1 47 7 +NaN 40 39 1 53 197332 NaN 3 0 4 6 0 2 1 9 14 0 2 1 71701 0 1 22 2 +5 40 33 1 43 99199 1 1 0 11 4 4064 2 1 14 4 0 3 3 101562 0 1 17 2 +NaN 40 64 1 48 192149 NaN 5 0 9 4 0 9 1 4 4 0 5 3 286732 0 1 27 1 +3 60 36 1 25 149650 1 1 0 4 3 0 2 1 9 2 0 1 3 151835 0 1 30 2 +2 40 28 1 32 158685 1 1 0 13 3 0 4 1 6 11 0 1 2 263015 0 1 32 1 +2 40 39 1 55 174127 1 1 0 2 3 0 2 1 10 13 0 1 3 329980 0 1 44 2 +NaN 30 31 1 34 253860 NaN 1 0 4 3 0 2 21 9 13 0 1 1 505438 0 1 23 11 +1 80 37 1 43 155066 1 1 0 4 3 0 2 1 9 4 0 1 2 117381 0 2 33 4 +NaN 40 22 1 30 127366 NaN 1 0 2 2 0 13 1 10 3 0 3 3 367655 0 5 41 4 +7 30 20 1 23 245487 1 1 0 2 4 0 4 1 10 1 0 3 2 219835 0 1 23 4 +8 56 29 1 26 140644 1 3 0 4 6 0 2 1 9 2 0 2 1 190562 0 5 27 4 +4 40 35 1 49 178326 1 NaN 0 1 3 0 4 1 13 4 0 1 2 218955 0 2 65 15 +6 55 28 1 42 170336 1 3 0 5 6 0 7 1 15 3 0 2 2 187160 0 1 44 4 +4 25 60 1 21 141118 2 1 0 4 1 1797 2 1 9 1 0 1 1 184362 0 6 21 13 +2 40 59 1 30 162297 4 1 1887 4 3 0 2 1 9 11 0 1 4 117299 0 1 28 4 +8 40 55 1 23 349910 1 1 0 2 3 7688 1 1 10 2 0 1 3 173422 0 1 49 4 +4 20 21 1 24 393376 1 1 0 2 2 0 2 1 10 8 0 3 1 34616 0 1 47 1 +8 40 23 1 49 163867 1 3 0 4 2 0 10 1 9 1 0 3 1 162282 0 1 50 2 +6 60 46 1 20 305090 5 1 0 1 3 0 4 1 13 1 0 1 5 122177 0 1 30 4 +6 40 43 1 61 54929 1 NaN 0 2 4 0 7 1 10 10 0 2 3 102895 0 1 56 11 +2 40 38 1 43 181557 1 1 0 2 3 0 2 1 10 2 0 1 3 212245 0 1 48 4 +9 40 25 1 23 158319 5 1 0 1 3 0 7 1 13 5 0 1 1 227886 0 1 30 13 +1 40 24 1 33 218899 6 2 0 4 3 0 1 1 9 2 0 1 1 155775 0 1 43 4 +3 40 51 1 59 193511 5 4 0 8 5 0 10 1 5 1 0 4 1 114508 0 1 52 2 +9 40 30 1 18 189203 1 3 0 4 2 0 7 1 9 2 0 2 2 110594 0 4 35 4 +6 15 81 1 45 83893 1 1 0 12 3 0 15 18 2 4 0 1 2 100675 0 NaN 29 7 +9 40 23 21 25 260046 1 1 0 2 4 0 4 1 10 4 0 3 3 132053 0 1 37 11 +3 21 45 1 65 116975 1 5 0 9 6 0 4 1 4 4 0 5 3 347025 1887 1 56 9 +5 40 47 NaN 62 23037 1 1 1138 1 6 0 2 1 13 8 0 2 1 50092 0 1 57 1 +5 50 33 1 29 64940 1 1 0 6 3 0 1 1 12 2 0 1 1 219553 0 1 36 4 +NaN 20 68 1 76 53497 NaN 1 0 2 3 0 4 1 10 4 0 1 3 407338 0 1 17 4 +4 40 45 1 56 112761 2 1 0 6 3 0 1 1 12 11 5178 1 2 244194 0 5 26 4 +5 40 55 1 28 142297 1 1 0 5 3 0 13 1 15 5 99999 1 3 115439 0 1 31 1 +3 36 45 1 32 36228 1 1 0 3 1 0 1 1 7 3 0 1 4 45857 0 4 23 1 +8 60 54 1 32 99894 1 6 0 2 4 0 2 1 10 3 0 2 3 150999 0 1 43 2 +2 40 40 NaN 26 261677 1 1 0 4 3 0 6 1 9 4 0 1 1 168113 0 1 26 3 +11 60 61 NaN 36 275507 2 1 0 1 3 0 2 1 13 1 0 1 1 352448 0 1 39 4 +5 40 47 1 58 140206 5 1 0 4 3 0 10 1 9 4 0 1 3 166863 0 1 42 4 +6 24 40 1 58 290763 1 1 0 11 1 594 13 1 14 4 15024 1 1 99604 0 1 36 4 +9 20 43 4 49 296485 1 1 0 4 6 0 2 1 9 4 0 4 3 199657 0 1 24 11 +6 40 47 1 29 113364 5 1 0 1 3 0 4 1 13 2 0 1 1 39986 0 1 34 7 +2 40 42 1 57 285131 1 5 0 4 6 0 4 1 9 6 0 2 7 236323 0 NaN 42 2 +1 40 30 1 27 158688 1 1 0 1 4 0 4 1 13 2 0 3 3 100734 0 4 23 4 +6 50 28 1 47 209641 2 1 0 14 3 0 2 1 16 1 0 1 1 146735 0 1 34 2 +4 60 47 30 30 345697 1 1 0 1 4 0 2 1 13 7 0 3 1 262244 0 1 36 9 +9 25 23 1 18 133503 1 1 0 2 2 0 8 1 10 15 0 3 1 123586 0 1 40 4 +3 20 24 1 61 143533 1 1 0 4 2 0 5 NaN 9 2 0 3 3 229553 0 1 41 1 +4 40 46 1 62 166459 1 1 0 2 3 0 2 1 10 2 15024 1 4 117849 0 2 20 4 +6 60 47 1 27 191429 4 1 0 1 3 0 2 NaN 13 6 7298 1 3 169549 0 2 38 1 +8 40 26 1 27 111567 1 1 0 1 5 0 2 1 13 4 0 3 3 59367 0 1 44 8 +1 40 27 1 39 197919 1 1 0 6 3 0 3 1 12 4 0 1 1 130807 1887 1 29 4 +5 35 24 1 31 132112 1 2 0 7 2 0 1 1 11 4 0 3 3 306779 0 1 38 4 +4 10 17 21 67 126779 1 1 0 10 4 0 6 NaN 8 2 0 3 1 160118 0 5 53 6 +7 40 46 1 39 212213 1 1 0 4 3 0 6 1 9 1 0 1 1 216164 0 1 44 3 +12 40 60 1 33 276218 1 1 0 6 5 0 2 NaN 12 3 0 2 1 420842 0 5 37 2 +6 30 25 1 44 421223 5 1 0 1 4 15024 4 1 13 1 0 3 1 48317 0 5 45 2 +4 45 47 1 27 119742 3 1 0 2 3 0 2 1 10 4 0 1 2 337825 0 1 46 2 +9 35 34 1 36 167087 1 1 0 4 6 0 6 1 9 1 0 2 3 136997 0 1 56 3 +2 40 37 1 44 325374 1 6 0 1 3 0 7 1 13 4 0 1 2 192939 0 1 47 2 +1 50 37 1 53 257621 1 5 1485 2 3 0 3 1 10 4 0 1 1 261241 0 1 53 4 +6 32 45 1 20 284343 1 2 0 2 6 0 4 1 10 3 0 2 1 102076 1672 2 66 2 +8 40 46 1 32 124111 1 2 0 4 3 0 2 1 9 1 0 1 2 358886 0 2 55 13 +2 40 64 1 39 308608 1 1 2179 3 3 0 1 1 7 1 0 1 2 181232 0 1 54 6 +9 40 23 21 31 193012 1 5 0 1 4 0 3 1 13 4 0 3 3 140798 0 1 71 11 +9 40 19 1 31 111971 6 1 0 4 2 0 1 1 9 11 0 3 1 176634 0 1 35 4 +4 40 45 1 49 205947 1 1 0 6 6 0 4 13 12 4 0 5 3 297676 0 5 34 3 +11 40 45 1 33 311446 1 1 0 2 3 0 1 1 10 4 0 1 3 362883 0 1 52 2 +NaN 80 34 1 42 102058 NaN 6 0 4 3 0 5 1 9 1 2885 1 3 205256 0 NaN 38 4 +3 20 18 1 17 257017 1 5 0 4 2 0 7 1 9 2 0 3 4 338836 0 5 42 11 +9 48 25 1 50 180869 1 1 0 4 4 0 2 1 9 2 0 3 2 171114 0 1 42 4 +1 40 65 1 42 409172 2 1 0 5 4 0 4 1 15 2 0 2 1 55894 0 1 37 7 +7 35 22 1 28 280093 1 1 0 2 2 0 4 1 10 1 0 3 3 181557 0 1 47 4 +NaN 10 48 1 53 117210 NaN 1 0 3 6 0 9 29 7 2 0 2 1 155509 0 NaN 21 2 +3 40 19 1 41 116138 1 6 0 4 2 0 4 1 9 2 0 3 3 225294 0 NaN 18 2 +NaN 40 60 22 38 348960 NaN 1 0 7 4 0 2 1 11 1 0 5 1 366531 0 1 38 2 +3 45 41 1 19 177675 2 1 0 2 6 0 2 1 10 1 0 2 1 154374 1887 1 38 2 +13 40 34 1 27 111128 6 1 0 2 3 0 11 1 10 4 0 1 2 189843 0 1 26 7 +6 37 58 1 18 219863 5 1 0 1 4 0 13 1 13 1 0 3 3 215245 0 1 49 4 +3 30 36 1 20 50164 1 1 0 2 6 0 13 1 10 1 0 2 3 345310 1980 1 70 2 +5 65 45 1 59 142030 2 NaN 0 9 3 0 11 1 4 13 0 1 3 155489 0 3 50 2 +4 20 17 1 30 77665 1 1 0 3 2 0 2 1 7 4 0 3 1 262511 0 NaN 54 6 +4 40 27 1 29 150817 1 1 0 2 4 0 2 1 10 3 0 3 2 129528 0 1 27 1 +11 70 39 1 32 160035 1 NaN 0 10 3 0 1 1 8 6 15024 1 3 322143 0 2 33 14 +9 40 22 1 49 273640 1 1 0 2 2 0 6 1 10 4 0 3 3 416165 1977 1 44 1 +2 40 37 1 17 119859 1 2 0 13 6 0 6 1 6 2 0 2 1 385452 0 1 31 11 +4 40 23 1 27 219838 1 2 0 2 3 0 1 1 10 2 0 1 1 165064 0 5 35 1 +4 30 20 1 32 196630 1 2 0 2 2 8614 14 1 10 9 0 3 3 206869 0 1 50 1 +7 40 31 1 54 118941 1 3 0 4 4 0 1 21 9 2 0 6 1 256609 0 1 51 1 +5 38 47 1 29 147476 6 1 0 7 3 0 7 1 11 2 0 1 1 207120 0 1 48 2 +7 30 24 21 64 303954 1 1 0 2 2 0 2 1 10 4 0 3 1 177287 0 6 57 4 +6 60 46 1 25 81132 3 4 0 5 3 0 11 1 15 4 99999 1 3 120131 0 1 20 4 +2 53 32 1 38 146660 1 3 0 4 4 0 4 1 9 1 0 4 1 152156 0 2 24 4 +2 40 50 1 20 423605 1 1 0 2 3 0 2 1 10 4 0 1 1 283676 0 3 50 4 +13 16 65 1 39 119177 1 1 0 8 3 0 2 1 5 4 0 1 1 274637 0 1 45 2 +3 10 45 1 37 180624 1 1 0 10 2 0 2 21 8 1 0 3 3 358701 0 1 37 4 +11 40 31 1 48 403625 1 NaN 0 4 2 0 2 1 9 5 0 3 3 224234 0 1 44 3 +2 50 39 1 27 28683 1 1 0 11 4 0 11 1 14 1 0 3 1 192702 0 1 40 2 +11 45 29 1 33 255407 1 1 0 4 4 0 4 1 9 4 0 3 1 146719 0 NaN 37 2 +2 35 26 1 42 159247 1 6 0 4 2 0 4 1 9 2 0 3 3 167350 0 1 54 15 +4 35 22 1 36 150084 1 1 0 2 2 0 8 1 10 15 0 3 1 288132 0 NaN 47 4 +6 40 27 1 22 276369 1 1 0 1 4 0 7 1 13 4 0 3 2 142621 0 1 52 4 +2 45 34 1 29 186845 1 2 0 13 3 0 4 1 6 4 0 1 3 144949 0 5 30 4 +6 40 28 1 26 39054 1 1 0 11 4 0 2 1 14 2 0 3 1 355259 0 1 43 4 +2 40 26 1 47 82488 1 NaN 0 4 3 0 2 1 9 5 0 1 3 463194 0 1 58 2 +NaN 30 68 1 28 36989 NaN 1 1510 11 3 0 3 1 14 4 0 1 3 150250 0 2 44 1 +3 40 20 1 42 165468 1 1 0 4 4 0 4 1 9 2 0 3 1 181675 0 1 59 4 +1 35 39 1 25 124483 1 1 0 2 3 0 5 8 10 4 0 1 1 79586 2559 1 48 4 +3 20 18 1 61 179446 1 1 0 2 2 0 4 21 10 1 0 3 3 184693 0 1 57 4 +6 20 42 1 18 121055 1 1 0 1 6 0 11 1 13 7 25236 2 1 259727 0 1 57 2 +5 45 32 1 36 225603 2 1 0 13 3 0 1 1 6 4 0 1 1 52647 0 1 50 4 +10 60 31 NaN 49 155403 1 1 0 9 5 0 2 22 4 6 0 3 1 361497 0 1 35 4 +9 35 56 1 43 190151 1 4 0 4 2 2174 4 1 9 1 0 3 4 183169 0 2 25 6 +4 50 28 1 52 268832 2 1 0 1 4 0 13 1 13 1 0 3 3 190391 0 1 35 4 +3 25 61 1 27 41356 5 1 0 4 4 0 4 1 9 3 0 6 4 119563 0 1 36 2 diff --git a/test/.data/adult/adult_train.solution b/test/.data/adult/adult_train.solution new file mode 100755 index 0000000000..50ff5d24d7 --- /dev/null +++ b/test/.data/adult/adult_train.solution @@ -0,0 +1,200 @@ +1 1 1 +1 1 0 +0 1 1 +1 1 0 +1 0 1 +1 1 1 +1 1 1 +1 1 1 +1 0 1 +1 1 1 +1 0 1 +1 0 1 +1 1 1 +1 1 0 +1 0 0 +0 0 1 +1 0 1 +1 1 1 +0 1 1 +0 1 1 +0 0 1 +1 1 1 +1 1 0 +1 1 0 +1 1 1 +1 1 1 +0 1 1 +1 1 1 +1 0 1 +1 0 1 +1 0 1 +1 1 1 +1 1 1 +1 1 1 +1 0 1 +0 1 1 +0 1 1 +1 0 0 +1 1 1 +1 1 0 +0 1 0 +1 1 0 +0 1 1 +1 1 1 +1 0 0 +1 1 1 +1 1 1 +1 0 1 +1 0 0 +0 0 1 +1 1 1 +1 1 1 +1 1 1 +1 0 1 +1 1 1 +1 1 1 +1 1 1 +1 1 1 +0 0 0 +1 0 1 +1 1 0 +1 1 0 +1 1 0 +1 1 0 +1 1 1 +1 1 1 +1 1 1 +1 1 1 +1 0 1 +1 1 1 +1 1 1 +1 1 0 +1 1 1 +0 0 1 +0 0 1 +1 1 1 +1 1 1 +1 1 0 +1 0 1 +1 0 1 +1 1 1 +1 1 1 +1 1 1 +1 1 0 +1 1 1 +1 0 1 +1 0 1 +1 0 1 +1 0 1 +1 1 0 +1 1 1 +1 1 0 +1 1 1 +1 1 0 +1 1 1 +1 1 1 +1 1 1 +1 1 1 +1 1 1 +1 0 1 +1 1 0 +1 1 1 +1 0 1 +1 1 1 +1 1 1 +1 1 1 +1 1 1 +1 1 1 +1 1 1 +1 0 1 +1 0 1 +1 1 1 +1 0 1 +1 0 1 +0 1 1 +1 1 1 +1 1 1 +1 1 0 +1 1 0 +1 0 1 +1 1 1 +1 1 1 +1 1 1 +1 1 0 +1 0 0 +1 0 1 +1 1 1 +1 0 1 +1 0 1 +1 1 0 +0 1 0 +0 1 1 +0 0 1 +1 1 0 +1 1 0 +1 1 1 +1 1 1 +1 1 1 +1 0 1 +1 1 1 +1 0 1 +1 1 1 +1 1 0 +1 0 1 +1 1 0 +1 1 1 +1 0 1 +1 1 1 +1 1 1 +1 0 1 +1 0 1 +1 0 1 +1 1 1 +1 1 1 +1 1 1 +1 0 1 +1 1 1 +0 1 1 +0 0 1 +0 0 1 +1 0 1 +1 1 1 +1 1 1 +1 0 1 +1 0 1 +1 1 0 +1 1 1 +1 0 1 +1 1 0 +1 0 1 +1 0 1 +1 1 1 +1 0 1 +1 1 1 +1 1 0 +1 0 1 +1 1 0 +1 1 1 +1 1 0 +1 1 1 +1 1 1 +0 1 1 +1 0 1 +1 0 1 +1 1 1 +1 0 1 +1 0 1 +1 1 0 +1 0 0 +1 1 1 +1 1 1 +1 1 1 +0 1 0 +1 0 1 +0 0 0 +1 1 1 +1 1 1 +1 0 1 +1 1 1 +1 0 1 diff --git a/test/.data/adult/adult_valid.data b/test/.data/adult/adult_valid.data new file mode 100755 index 0000000000..6aed845767 --- /dev/null +++ b/test/.data/adult/adult_valid.data @@ -0,0 +1,50 @@ +6 35 64 1 47 45522 1 4 0 11 4 0 4 1 14 2 0 3 1 256019 0 1 40 1 +4 40 60 1 28 208238 1 1 0 1 3 0 2 1 13 12 0 1 2 145995 0 1 30 2 +1 30 46 1 38 205246 1 2 0 4 4 0 16 1 9 4 0 2 4 295791 0 1 40 1 +5 40 52 1 34 170125 1 5 0 2 3 0 2 1 10 4 0 1 5 117674 0 1 68 2 +2 48 32 1 59 398827 1 1 0 4 3 0 3 1 9 11 0 1 5 42596 0 3 24 2 +2 40 25 1 20 207202 1 1 0 1 4 0 4 21 13 4 0 3 3 308144 0 1 55 3 +4 44 22 1 37 177905 1 1 0 2 4 0 4 1 10 3 0 3 3 147397 0 5 29 4 +5 40 32 37 41 157473 4 1 0 4 4 0 7 1 9 4 0 3 1 131534 0 1 38 2 +5 40 49 1 36 139391 1 1 0 2 4 0 1 1 10 5 0 3 1 36032 0 4 44 4 +8 30 18 1 42 179048 1 1 0 4 4 15024 6 1 9 3 0 3 3 155752 2042 1 27 14 +5 40 39 1 43 234387 2 5 0 4 3 0 7 1 9 6 0 1 2 52187 0 1 28 11 +5 44 30 1 18 99761 1 1 0 1 4 0 6 1 13 2 0 3 2 206512 0 1 44 2 +4 20 22 1 51 497788 1 1 0 7 2 0 13 1 11 2 0 3 2 213834 0 2 21 2 +5 40 36 1 23 297152 2 1 0 1 1 0 5 1 13 3 0 1 1 294672 0 1 36 11 +13 40 33 1 25 120277 5 1 0 2 3 0 1 1 10 4 0 1 2 154874 0 1 35 14 +9 25 31 1 44 140092 6 1 0 11 4 0 1 1 14 5 0 3 3 151763 0 1 37 4 +2 50 59 1 27 46247 1 NaN 0 4 3 0 4 1 9 4 0 1 3 198435 0 1 32 2 +6 40 43 1 35 189702 1 1 0 1 4 0 4 1 13 4 0 2 3 178417 0 3 44 1 +2 45 46 1 39 107231 3 1 0 10 3 0 2 NaN 8 8 0 1 1 175958 0 5 28 2 +8 40 39 1 37 240521 1 NaN 0 2 3 0 1 1 10 4 0 1 3 193689 0 1 30 2 +9 40 48 1 35 300760 1 1 0 7 6 0 4 1 11 4 0 3 1 167159 0 1 27 4 +11 40 49 1 59 197462 1 1 0 3 3 0 4 1 7 2 0 1 4 239865 0 3 37 11 +7 48 26 1 31 72393 1 NaN 0 4 3 0 4 1 9 4 0 1 3 177951 0 2 22 13 +4 15 23 1 31 129009 1 1 0 1 4 0 1 1 13 4 0 3 2 240398 0 NaN 19 14 +4 40 28 1 38 391074 1 2 0 7 4 0 4 1 11 4 0 3 6 189186 0 5 53 3 +6 20 28 NaN 22 194138 1 1 0 1 4 0 4 1 13 11 0 3 3 56340 0 NaN 34 2 +5 48 21 1 46 117381 1 1 0 2 4 0 2 21 10 1 0 3 2 129674 0 4 51 2 +6 40 61 1 18 274907 5 6 0 4 4 0 2 1 9 4 0 5 2 260167 0 5 55 4 +9 50 33 1 27 268051 1 1 0 2 6 0 1 1 10 1 0 2 2 119017 0 1 27 15 +7 40 41 1 35 176566 5 1 0 2 3 0 2 1 10 14 0 1 3 488706 0 1 23 4 +4 45 52 11 52 190786 1 1 0 4 3 0 2 1 9 7 0 1 1 217663 0 1 45 3 +1 40 35 1 18 399904 6 1 0 4 2 0 6 1 9 4 0 3 2 98776 0 5 29 1 +3 40 52 1 58 264834 1 1 0 1 5 0 4 29 13 9 0 6 1 82285 0 1 27 1 +4 10 18 1 36 31725 1 2 0 2 2 0 3 1 10 1 0 3 3 171088 0 5 22 13 +11 48 39 1 39 211968 1 1 0 4 3 0 4 1 9 13 7298 1 3 33355 0 1 32 4 +6 50 30 1 28 126319 2 2 0 11 4 0 2 8 14 11 0 2 2 116666 0 1 51 2 +4 38 20 1 33 103345 1 2 0 4 6 0 4 1 9 2 0 3 3 267706 0 3 27 2 +4 40 38 1 20 95949 1 1 0 4 1 0 1 1 9 1 0 1 5 177134 0 2 50 3 +4 50 67 1 23 191024 3 1 0 1 3 0 9 1 13 4 0 1 3 273239 0 1 43 4 +3 35 21 1 20 436361 1 1 0 6 4 0 6 19 12 1 0 3 1 211385 0 1 50 4 +11 40 46 1 19 177720 1 1 0 4 3 0 7 1 9 1 0 1 3 28334 0 1 48 6 +2 40 25 1 24 81132 1 1 0 2 4 15024 4 1 10 3 0 3 3 187540 0 5 45 4 +9 20 19 1 26 172846 1 1 0 4 2 0 1 1 9 2 0 3 3 393712 0 1 57 4 +2 40 90 1 22 174233 1 1 0 4 3 0 1 11 9 2 0 1 3 225063 0 6 40 13 +6 50 26 12 62 29235 1 1 0 1 4 0 11 1 13 4 0 3 3 38232 0 1 67 2 +3 40 38 1 23 183850 1 1 0 15 1 27828 4 37 3 1 0 1 2 43311 0 2 29 4 +2 58 30 1 33 173652 1 4 0 2 3 0 1 1 10 4 0 1 1 151967 0 1 36 4 +2 40 43 1 27 55854 1 4 0 4 3 0 5 1 9 10 0 1 2 403276 625 1 45 4 +2 40 46 1 31 192060 6 1 0 1 3 0 2 1 13 10 0 1 3 121586 0 1 63 4 +2 48 26 NaN 49 154164 1 1 0 4 2 0 9 1 9 1 0 3 1 164386 0 1 47 4 diff --git a/test/.data/cadata/cadata_feat.type b/test/.data/cadata/cadata_feat.type new file mode 100755 index 0000000000..43f9a9da03 --- /dev/null +++ b/test/.data/cadata/cadata_feat.type @@ -0,0 +1,16 @@ +Numerical +Numerical +Numerical +Numerical +Numerical +Numerical +Numerical +Numerical +Numerical +Numerical +Numerical +Numerical +Numerical +Numerical +Numerical +Numerical diff --git a/test/.data/cadata/cadata_public.info b/test/.data/cadata/cadata_public.info new file mode 100755 index 0000000000..ff414fd439 --- /dev/null +++ b/test/.data/cadata/cadata_public.info @@ -0,0 +1,16 @@ +usage = 'AutoML challenge 2014' +name = 'cadata' +task = 'regression' +target_type = 'Numerical' +feat_type = 'Numerical' +metric = 'r2_metric' +feat_num = 16 +target_num = 1 +label_num = 0 +train_num = 5000 +valid_num = 5000 +test_num = 10640 +has_categorical = 0 +has_missing = 0 +is_sparse = 0 +time_budget = 200 diff --git a/test/.data/cadata/cadata_test.data b/test/.data/cadata/cadata_test.data new file mode 100755 index 0000000000..5dc00f5814 --- /dev/null +++ b/test/.data/cadata/cadata_test.data @@ -0,0 +1,50 @@ +1.1667 16 172 -118.17 451 1010 1854 209 52 -118.51 -117.22 604 37.59 187 37.81 -122.3 +4.8371 31 448 -118.12 228 2727 1212 462 17 -122.09 -118.29 1691 36.06 448 33.98 -117.49 +2.3594 10 367 -117.35 325 2972 1377 635 30 -117.29 -119.82 1940 38.47 590 37.29 -120.46 +3.6818 19 696 -118.56 573 1813 1971 393 35 -117.29 -117.74 1093 34.16 374 37.65 -122.08 +5.2713 32 300 -118.19 424 1930 1368 354 36 -120.02 -119.83 915 34 328 33.82 -118.1 +1.425 16 467 -117.68 280 382 5210 86 17 -117.04 -118.22 272 34.03 81 33.83 -117.92 +1.7135 36 514 -118.98 237 1639 1002 367 42 -118.62 -118.44 929 33.77 366 37.85 -122.27 +4.5304 27 584 -117.13 430 3041 3477 677 34 -122.01 -117.05 1920 34.44 640 33.91 -118.32 +2.4097 30 1524 -117.68 314 2109 1241 427 19 -117.97 -121.41 1742 33.84 426 37.92 -121.25 +4.875 35 314 -117.15 326 981 1540 222 31 -120.88 -120.97 734 33.88 239 33.9 -118.35 +1.9309 16 174 -122 276 1761 937 515 31 -117.25 -118.34 1810 33.94 468 33.75 -117.86 +4.8966 25 345 -120.44 438 4126 1399 696 45 -122.24 -117.14 1722 34.85 668 37.48 -122.24 +3.6811 22 742 -118.46 588 3225 2209 726 34 -117.62 -123.28 1958 34.07 656 37.6 -122.31 +2.425 13 970 -118.32 852 1989 2098 401 39 -118.16 -122.3 805 40.07 341 34.14 -117.29 +4.665 18 600 -118.29 326 4795 2019 710 14 -118.22 -118.32 2047 34.2 640 36.92 -119.81 +4.1099 48 461 -122.21 480 7949 2603 1309 10 -118.39 -122.85 3176 33.8 1163 40.57 -122.42 +3.3693 15 307 -118.99 1777 2555 2572 510 30 -121.87 -119.8 1347 34.13 467 35.03 -117.82 +4.2 25 425 -119.7 1075 1255 3313 252 35 -118.08 -117.99 685 34.15 279 33.86 -118.11 +3.8776 30 474 -119.14 1182 13796 1630 2372 11 -118.48 -122.15 6000 41.48 2250 38.69 -121.32 +3.5729 36 389 -121.13 293 2737 3374 654 17 -117.89 -117.78 910 38.64 492 33.4 -117.65 +2.2625 46 536 -121.03 436 2063 1804 484 48 -121.91 -121.48 1054 33.97 466 37.85 -122.28 +3.3125 8 655 -117.98 608 1970 1189 332 22 -121.14 -117.4 1066 37.81 319 36.79 -119.9 +1.7083 45 445 -116.89 292 686 1608 127 11 -119.21 -120.27 246 39.14 86 38.95 -122.63 +3.575 45 248 -118.21 412 1806 2734 322 51 -118.34 -121.93 709 37.39 298 33.96 -117.4 +7.0935 17 877 -118.31 1390 2508 5743 402 22 -118.22 -116.92 1254 34.04 395 33.82 -118.32 +2.1734 36 1753 -118.58 500 661 11294 146 34 -119.96 -118.43 742 37.99 143 33.91 -118.23 +3.9643 38 335 -116.93 1514 1832 3568 415 27 -116.9 -117.02 1480 37.64 414 34.16 -119.18 +3.5547 29 864 -121.98 658 372 3054 68 31 -118.32 -117.92 479 34.04 67 35.95 -121.32 +2.3261 25 1734 -116.21 142 1406 2200 413 20 -121.55 -121.02 850 34.02 412 32.75 -117.12 +4.7094 29 214 -119.32 508 12713 853 2558 14 -121.26 -122.03 4741 38.09 2412 37.92 -122.05 +0.7917 9 359 -122.09 312 107 5591 79 52 -118.26 -116.96 167 37.36 53 37.95 -121.29 +2.1212 16 438 -117.02 629 3806 3691 794 16 -118.16 -117.88 1501 38.58 714 37.35 -118.18 +1.6607 40 498 -118.38 345 164 2982 30 18 -117.07 -121.02 104 37.63 32 37.37 -120.67 +4.87 25 881 -121.9 651 2899 3758 499 19 -119.77 -119.05 1356 36.83 512 33.03 -117.27 +4.9306 5 651 -121.93 379 1131 340 236 17 -122.01 -118.36 622 32.81 244 33.85 -117.86 +3.0862 35 536 -122.31 662 2457 2832 552 35 -116.98 -119.18 1159 34.08 523 34.05 -118.37 +3.75 15 198 -118.4 375 1788 1717 313 52 -121.73 -119.28 792 39.14 294 34.1 -118.1 +3.2452 18 472 -119.55 152 5199 6932 1023 17 -118.29 -117.06 2036 34.13 890 38.58 -122.83 +4.2708 46 613 -120.62 1402 2920 790 601 24 -117.65 -121.89 1460 37.36 598 34.31 -118.46 +5.3561 49 273 -117.12 615 2658 2098 484 29 -117.65 -118.14 1318 36.83 498 37.27 -121.98 +3.8201 37 493 -121.83 504 6269 3244 1279 22 -121.85 -116.96 5587 33.77 1251 34.02 -117.91 +2.0417 34 129 -118.95 667 2692 1240 481 27 -121.92 -119.71 1518 34.06 447 37.33 -120.89 +6.0368 31 777 -121.88 123 3266 3291 529 22 -122.27 -117.25 1595 38.15 494 34.29 -118.72 +6.7192 25 309 -118.08 215 2296 3260 329 30 -118.18 -121.32 847 37.86 322 37.87 -122.05 +3.4896 52 338 -117.9 388 3420 1032 691 28 -117.37 -122.69 1502 37.84 656 33.97 -117.31 +3.3125 31 546 -121.58 335 2668 70 510 26 -122 -121.79 1437 39.96 505 39.07 -121.7 +4.1 28 347 -122.56 241 746 18448 172 30 -117.82 -121.75 1048 38.33 163 33.73 -117.9 +4.9879 10 386 -120.98 431 1812 1833 294 28 -117.38 -117.97 853 37.69 278 37.67 -121.87 +3.6667 47 224 -117.07 741 1294 3711 308 40 -117.65 -118.23 1177 34.09 301 37.66 -122.41 +2.9464 16 144 -121.36 505 6308 699 1167 19 -117.25 -117.46 3012 33.88 1112 38.56 -121.37 diff --git a/test/.data/cadata/cadata_train.data b/test/.data/cadata/cadata_train.data new file mode 100755 index 0000000000..4cb6375b2c --- /dev/null +++ b/test/.data/cadata/cadata_train.data @@ -0,0 +1,200 @@ +3.5962 29 274 -118.34 550 1597 1220 301 36 -120.89 -117.3 632 33.96 262 40.31 -121.24 +3.9696 9 795 -121.45 379 352 2430 41 25 -117.19 -121.2 99 35.36 34 34.01 -117.61 +2.2417 44 230 -120.38 235 833 3287 188 48 -118.88 -121.48 652 33.26 165 38.09 -122.25 +2.6713 52 300 -117.32 345 867 3694 199 52 -119.19 -117.58 391 36.83 187 36.62 -121.92 +2.1658 44 442 -117.7 244 1947 361 383 29 -118.33 -117.99 925 37.3 337 39.5 -121.58 +4.5417 28 169 -115.37 547 2692 1416 477 21 -120.77 -121.94 1330 37.39 456 34.25 -118.47 +5.451 18 152 -121.06 261 2922 1435 507 35 -122.22 -119.82 1130 33.12 485 34.17 -118.43 +5.4218 34 224 -121.59 333 4274 4251 715 16 -118.42 -118.34 2240 34.04 704 37.23 -121.76 +3.2813 34 737 -117.77 1673 940 2373 219 46 -121.94 -119.51 599 33.95 214 33.89 -118.16 +5.2485 34 277 -120.15 1344 1813 3741 313 35 -117.85 -119.9 825 33.83 316 33.94 -118.12 +2.5363 16 666 -122.83 724 2526 1876 579 44 -121.96 -122 1423 38.39 573 33.97 -118.33 +2.6902 17 372 -118.3 310 2052 1880 405 17 -121.21 -118.05 975 33.92 340 36.8 -119.24 +1.8667 21 432 -122.43 478 1325 1898 280 50 -118.19 -118.02 811 33.67 281 36.74 -119.77 +6.3767 23 478 -122.24 550 3930 1802 661 24 -117.86 -118.27 1831 33.73 616 33.67 -117.81 +2.2448 28 163 -117.8 103 1815 1155 375 20 -121.28 -118.44 1665 33.34 357 36.7 -119.54 +0.8907 15 195 -122.71 272 80 1819 26 34 -124.09 -121.69 125 39.16 35 34.12 -118.24 +5.5942 33 216 -120.06 625 3136 1882 501 5 -117.7 -117.23 1327 36.83 467 38.13 -121.32 +3.6964 14 211 -118.56 841 3652 5464 967 47 -117.76 -117.91 1438 39.15 887 34.06 -118.4 +1.2434 22 40 -120.85 688 249 4213 78 52 -122.1 -118.45 396 34.66 85 37.8 -122.27 +4.5337 25 19 -117.83 753 4077 878 777 31 -121.62 -119.86 2544 38.26 738 37.68 -122.47 +5.5501 19 626 -118.25 123 3318 2303 502 17 -118.09 -122.27 1520 35.29 498 33.72 -117.92 +4.918 36 584 -117.11 657 1820 2910 313 36 -122.04 -117.86 899 34.28 295 33.83 -118.11 +4.2237 15 180 -117.09 360 5024 4401 881 25 -121.94 -117.1 1994 34.14 838 38.27 -122.45 +6.077 13 330 -118.35 414 1973 2437 367 34 -118.42 -118.18 843 33.89 345 33.88 -118.39 +4.375 42 529 -117.98 297 1596 6875 276 52 -122.04 -118.2 642 37.32 273 37.74 -122.45 +5.0476 18 999 -120.02 704 1258 3005 333 32 -123.53 -118.43 645 34.15 334 33.73 -118.11 +1.4384 15 174 -118.63 619 3223 2635 940 5 -120.6 -117.76 3284 35.26 854 32.55 -117.06 +3.017 31 915 -122.07 451 224 1788 38 52 -118.49 -121.3 120 32.76 45 39.25 -122.08 +1.1553 52 91 -122.17 1089 2289 2663 611 12 -118.4 -119.2 919 32.91 540 38.54 -122.81 +4.7794 24 703 -121.89 208 1669 1239 276 26 -118.17 -117.67 951 34.21 278 37.44 -121.91 +3.1042 13 393 -120.97 398 1091 48 269 37 -117 -118.4 905 36.72 242 34.07 -118.03 +2.565 20 190 -116.32 864 3457 4076 1021 52 -120.13 -118.46 2286 33.91 994 37.79 -122.42 +5.8838 21 115 -116.99 449 4650 1831 748 24 -121.41 -122.27 2374 33.81 702 34.24 -119.02 +3.6034 46 734 -117.96 292 1414 972 463 16 -120.15 -118.61 793 38.82 439 34.44 -119.82 +3.8672 16 438 -124.22 365 1007 2468 224 42 -122.36 -121.25 776 34.07 228 33.9 -118.07 +2.6368 25 179 -118.24 343 2123 4140 387 34 -122.43 -119.07 1310 35.34 368 37.72 -121.22 +2.5568 44 419 -118.22 356 1359 2307 359 35 -119.26 -118.15 655 33.72 341 34.01 -118.34 +3.1364 23 553 -117.86 804 2817 1295 604 30 -121.13 -117.86 1089 32.8 412 34.81 -118.95 +4.825 29 681 -119.8 538 2578 3137 551 13 -122.03 -120.85 1680 33.97 528 37.59 -122.07 +3.3017 35 39 -117.16 393 639 1197 197 5 -122.41 -122.02 666 34.03 197 33.74 -117.93 +2.5982 24 412 -122.33 69 2210 1330 643 42 -117.62 -121.94 1228 37.35 605 34.09 -118.35 +3.5607 36 19 -118.34 315 3691 2491 640 21 -121.98 -120.25 1758 37.71 603 38.83 -121.21 +4.2898 11 1451 -117.97 352 1975 3275 389 40 -117.99 -121.42 1116 37.44 378 34.09 -118.06 +2.4931 15 7 -118.03 1141 2136 1190 557 26 -117.81 -121.96 1528 33.94 537 33.78 -117.96 +4 31 312 -117.35 437 5257 2567 1360 37 -118.16 -118.18 2128 34.1 1264 37.45 -122.18 +0.6775 47 639 -122.46 232 2806 2352 1944 52 -117.66 -122.49 2232 34.3 1605 34.05 -118.25 +3.5504 50 219 -117.05 158 5873 4291 1455 11 -116.87 -117.97 3089 34.02 1365 34.18 -118.85 +3.0978 22 684 -117.2 851 2459 2809 492 28 -122.91 -122.16 1230 33.87 498 34.08 -117.68 +4.5461 21 499 -120.87 335 2819 1471 479 16 -120.24 -117.74 1068 34.02 365 35.1 -120.3 +15.0001 29 375 -118.08 578 1482 3914 171 52 -118.34 -117.13 531 33.54 161 34.07 -118.33 +2.6023 52 705 -119.73 575 2364 5775 631 14 -117.25 -118 1300 33.71 625 38.46 -122.66 +6.7851 20 919 -118.25 634 2964 2432 436 45 -119.62 -118.38 1067 33.93 426 37.81 -122.2 +3.1645 25 679 -118.47 679 1358 4077 231 37 -119.34 -122.13 586 38.61 214 37.52 -121.14 +3.1797 17 5 -117.92 625 707 6352 166 48 -122.42 -122.17 458 34.4 172 38.44 -122.72 +2.9624 42 356 -122.04 625 7963 2246 1881 16 -117.86 -118.31 3769 34.23 1804 32.77 -117.04 +4.6875 16 224 -122.26 416 1742 3170 340 36 -122.31 -117.92 857 35.34 341 33.82 -118.11 +2.8672 32 739 -122.48 632 1692 543 398 30 -121.48 -118.43 1130 32.85 365 34.06 -118.13 +3.9167 17 108 -117.85 342 2625 1805 673 20 -121.8 -119.04 1184 33.77 606 33.74 -118.3 +2.6182 39 595 -118.23 417 1617 1172 493 34 -119.74 -120.25 1530 37.69 500 33.91 -118.3 +1.3882 23 464 -117.97 1154 1059 1751 268 47 -122.14 -121.56 693 33.8 241 36.33 -119.65 +2.6944 17 247 -117.92 164 1216 2434 240 36 -118.31 -118.15 647 34.16 228 40.54 -122.38 +3.9565 33 453 -121.72 451 2675 1980 585 48 -117.91 -118.39 1773 33.87 540 37.72 -122.44 +2.875 10 487 -118.17 391 1788 2181 368 44 -118.49 -117.27 933 37.85 329 37.95 -122.34 +3.2955 52 418 -120.65 203 3874 319 676 19 -118.25 -117.09 2441 34.07 707 37.35 -120.6 +6.7544 52 315 -118.08 197 3358 1433 504 11 -120.87 -118.43 1690 33.8 482 34.02 -117.66 +5.1298 36 544 -117.64 528 2248 1728 448 17 -118.03 -121.56 878 33.89 423 33.41 -117.59 +3.3239 36 387 -119.78 389 3995 1038 778 9 -118.48 -121.48 1691 36.9 712 36.82 -119.85 +4.0328 29 5 -122.41 595 2690 1834 459 16 -122.28 -117.14 1253 33.97 393 33.2 -117.15 +4.0833 40 590 -119.71 235 5001 1703 830 20 -122.27 -117.97 2330 36.75 830 38.64 -121.3 +4.0426 31 844 -121.86 696 2809 2705 450 15 -121.28 -122.44 1267 33.89 408 39.23 -121 +3.9024 52 428 -122.43 448 3853 989 761 13 -121.01 -118.15 1685 33.16 669 34.03 -117.32 +3.4543 28 947 -118.02 527 4609 3815 1005 12 -117.9 -119.63 2293 33.17 960 38.32 -122.28 +4.9562 38 824 -120.45 283 455 1911 92 45 -122.45 -120.48 394 33.67 89 33.82 -118.21 +4.7083 18 278 -118.56 720 4461 2347 864 20 -117.29 -118.4 2042 34.08 808 37.32 -121.99 +5.1149 27 170 -118.3 702 2199 2933 361 22 -122.18 -118.96 1270 34.2 386 37.31 -121.79 +3.2833 26 367 -122.87 652 1462 2206 241 33 -121.83 -122.69 569 37.16 231 39.5 -121.52 +4.375 31 2267 -117.7 60 1371 1875 236 33 -117.14 -117.38 715 37.33 227 33.93 -117.44 +3.6654 52 572 -117.88 417 881 1716 159 35 -119.18 -117.71 605 39.5 170 33.91 -118.32 +7.7773 37 201 -116.01 352 1054 3944 209 33 -121.72 -122.29 400 37.67 161 37.34 -122.38 +4.2716 21 213 -122.39 426 3659 1067 652 9 -121.45 -117.32 1889 33.95 632 38.53 -122.78 +5.6482 14 407 -122.41 270 1651 1776 269 35 -117.76 -118.49 707 36.09 252 33.9 -118.37 +3.8904 44 472 -120.84 433 3049 2079 582 21 -122.26 -118.14 2355 37.96 585 32.71 -116.99 +2.1927 42 598 -121.51 587 3245 1963 1190 29 -122.31 -115.55 3906 34.04 1102 34.09 -118.3 +2.7955 34 300 -118.25 518 1796 2246 380 23 -121.15 -119.25 939 35.63 330 38.69 -122.03 +2.5352 19 542 -117.23 753 4495 4524 856 13 -121.37 -121.81 1149 37.74 459 38.25 -120.37 +3.7414 52 624 -118.8 562 2049 3024 330 29 -118.15 -119.33 787 36.34 309 38.5 -121.5 +8.758 25 379 -117.11 568 2040 4466 294 30 -118.3 -120.37 787 37.89 278 37.35 -122.06 +4.2083 36 288 -119.32 778 1729 2666 396 33 -118.34 -117.09 1073 37.28 344 33.91 -118.32 +7.5 15 257 -117.06 2837 2580 1764 372 8 -118.03 -118.31 1111 33.8 393 36.85 -119.88 +4.5458 36 90 -118.01 615 4685 3913 965 6 -121.64 -118.15 2180 38.11 909 34.28 -118.77 +2.8676 23 441 -119.3 185 1055 2949 211 30 -118.08 -118.31 629 36.98 170 37.95 -121.22 +6.9473 21 361 -118.88 753 7357 2746 963 19 -117.08 -117.89 3018 37.78 981 37.23 -121.87 +2.6696 32 647 -119.18 234 2010 1582 433 19 -121.3 -118.17 910 37.47 390 37.98 -120.4 +4.4567 17 780 -121.95 517 2183 1379 364 27 -117.3 -118.32 1458 39.29 388 34.17 -119.19 +4.4375 35 654 -118.23 359 12045 765 2162 5 -119.84 -118.09 5640 38.6 1997 33.09 -117.1 +4.9107 33 626 -122.89 631 2511 301 465 19 -118.19 -118.74 1551 37.66 450 36.84 -121.7 +2.6442 36 389 -118.33 654 859 1237 239 47 -122.48 -122.47 913 39.51 234 34.09 -118.23 +5.0947 52 279 -117.19 1222 1358 2130 247 33 -117.73 -122.5 738 38.61 235 34.26 -118.46 +1.6641 17 474 -121.82 214 1009 1904 225 43 -122.25 -122.58 604 37.32 218 38.63 -121.43 +3.2833 4 444 -117.06 273 1340 3857 298 38 -122.31 -122.07 766 40.99 241 37.95 -122.34 +1.5909 52 245 -117.25 170 626 3100 256 44 -118.34 -122.01 572 37.75 229 32.72 -117.17 +5.0483 19 691 -118.54 678 1770 1607 362 35 -118.49 -122.09 1083 33.8 355 34.2 -118.56 +3.0139 16 618 -118.07 621 4091 1234 864 11 -118.3 -118.38 1927 33.67 765 33.15 -117.2 +3.1378 39 266 -122.41 709 2312 3216 592 13 -117.97 -118.16 2038 38.54 559 33.92 -117.95 +2.125 13 420 -120.97 877 1149 1277 280 37 -117.06 -118.16 1016 37.95 250 33.88 -118.22 +2.5272 8 389 -117.08 1043 4937 1781 1139 5 -121.02 -118.31 2204 37.72 812 34.1 -117.41 +4.1029 48 446 -122.44 520 393 3606 76 33 -117.98 -118.59 330 37.64 80 32.58 -117.1 +2.824 17 619 -116.91 136 2149 4407 527 36 -122.02 -121.76 1359 37.74 481 33.97 -118.03 +3.1513 8 380 -118.1 411 2591 5083 486 10 -120 -118.12 1255 33.71 425 33.71 -117.34 +3.5179 30 223 -117.71 469 3058 1544 567 37 -121.66 -117.91 1351 36.97 523 38.01 -121.8 +3.6797 4 548 -118.24 690 6638 554 1634 21 -118.4 -117.94 3240 37.81 1568 33.79 -118.32 +2.567 22 298 -121.98 676 3157 3929 637 21 -117.36 -122.73 2268 33.96 620 37.38 -120.64 +2.2292 21 161 -119.8 656 2899 1645 745 5 -122 -116.95 1593 36.98 633 37.51 -120.85 +2.7679 9 1649 -118.26 374 1091 1087 233 33 -117.95 -115.58 890 39.83 226 34.01 -118.08 +3.0139 39 362 -118.3 460 2053 3201 382 34 -118.19 -117.28 1258 37.31 380 33.92 -118.3 +3.8068 27 616 -122.62 1446 1752 14281 328 19 -122.58 -118.11 873 39.04 336 38.11 -122.6 +2.3456 10 32 -119.69 460 2112 3674 493 45 -118.39 -118.37 1406 36.95 452 37.94 -122.35 +3.025 31 828 -118.24 555 1808 2459 440 25 -117.07 -119.57 1342 33.37 454 33.88 -117.87 +3.1667 27 281 -118.38 303 1146 2334 338 28 -122.36 -118.09 672 32.83 292 33.05 -117.29 +2.5893 20 162 -117.03 376 3494 1750 662 29 -120.05 -118.53 1781 34.89 616 36.8 -119.76 +5.1282 39 772 -117.66 468 2683 3759 475 35 -121.81 -118.21 1498 34.23 484 37.59 -122.49 +9.1531 27 1635 -118.37 685 494 1856 81 25 -122.25 -121.96 254 35.49 85 37.74 -121.77 +6.5954 38 273 -117.93 465 2036 2090 272 17 -115.57 -122.2 713 34.02 265 33.44 -117.61 +5.6856 20 149 -118.36 348 2254 3129 400 9 -118.35 -121.95 694 32.74 243 39.36 -120.15 +4.1674 9 213 -117.82 563 3163 2207 832 10 -117.14 -122.4 1537 33.75 797 37.28 -121.93 +3.8819 28 621 -117.93 236 1489 1041 304 39 -122.02 -122.11 700 38.26 268 34.44 -119.72 +2.7153 36 325 -118.1 74 1111 2850 226 16 -121.92 -117.36 317 34.16 199 37.3 -121.93 +3.2619 45 1045 -122.03 98 2019 870 411 25 -122.41 -118.01 888 40.57 326 38.04 -121.63 +4.0921 12 458 -119.59 951 2192 1578 406 20 -117.09 -116.94 1766 33.66 393 32.69 -117.07 +5.1874 37 181 -121.31 309 1816 6577 338 42 -117.09 -122.04 897 34.14 306 33.89 -118.33 +7.8336 20 391 -117.9 569 2489 2723 314 25 -117.27 -117.19 911 34.1 309 32.79 -117.07 +3.8571 37 222 -122.28 40 1004 14652 220 34 -123.53 -117.09 772 34.23 217 34.12 -117.87 +4.7831 18 280 -117.87 500 3421 1287 656 24 -117.53 -121.4 2220 32.92 645 34.22 -119.03 +5.131 29 663 -121.02 402 4013 2034 673 17 -120.64 -118.29 2263 34.03 661 32.84 -117.02 +3.625 25 356 -118.28 290 2453 2460 648 28 -119.73 -121.92 1082 33.79 617 32.79 -117.23 +10.8805 31 359 -118.49 508 7665 1401 999 10 -120.45 -117 3517 33.66 998 34.28 -118.54 +2.7019 25 1190 -117.78 160 2431 1683 655 33 -121.34 -117.03 1854 36.62 603 37.67 -122.09 +2.4896 12 529 -118.05 454 1552 4613 290 38 -120.1 -117.98 873 38.67 291 40.79 -124.14 +3.2632 39 466 -122.75 814 2183 1694 465 52 -118.28 -118.86 1129 33.59 460 37.8 -122.22 +2.5395 20 277 -118.3 266 1040 2308 231 35 -118.29 -116.31 1040 34.26 242 34.12 -117.99 +2.9107 39 287 -117.02 403 732 1384 145 7 -118.22 -118.09 431 34.1 132 34.06 -117.7 +4.3939 25 544 -118.43 132 14034 1549 3020 22 -121.32 -121.47 6266 37.31 2952 37.5 -122.31 +3.3869 52 180 -122.86 238 1669 1616 314 30 -121.27 -117.95 837 36.99 325 36.55 -119.39 +4.5156 40 1096 -122.32 1856 1806 5765 293 35 -118.5 -117.98 683 37.74 295 39.53 -121.53 +3.2969 34 428 -118.17 1105 1316 3085 263 38 -121.79 -122.5 671 37.47 278 33.84 -117.92 +3.6201 39 411 -119.64 312 3481 3679 808 21 -118.3 -122.11 1866 36.96 746 34.11 -117.81 +4.7 39 493 -122.04 736 1866 2548 300 37 -118.44 -119.06 822 35.37 305 37.61 -122.42 +2.6618 31 444 -119.71 483 1170 2285 303 37 -117.44 -121.92 766 37.27 302 38.28 -122.27 +7.6717 40 507 -119.72 438 4048 1799 513 26 -119.4 -118.46 1486 33.85 498 37.81 -122.12 +2.5259 25 587 -117.3 438 2914 2719 683 35 -121.83 -120.6 1562 33.77 638 32.73 -117.23 +4.1094 43 33 -117.65 311 1930 4145 363 14 -117.15 -117.07 990 37.03 322 38 -121.9 +4.2972 4 623 -119.75 155 2199 19234 529 34 -117.89 -118.37 1193 37.75 532 37.44 -122.16 +2.7083 43 713 -122.04 280 1214 2259 281 46 -118.38 -121.4 701 36.34 294 34.09 -117.65 +2.2125 35 453 -118.68 298 1552 2269 444 34 -117.09 -118.25 2093 39.12 413 33.9 -118.2 +2.8542 23 304 -118.09 895 814 2512 216 52 -121.59 -121.86 327 33.95 181 38.56 -121.48 +5.6022 16 525 -117.26 656 3029 3502 500 31 -124.08 -122.26 1236 34.52 487 38.04 -122.2 +1.7159 20 284 -118.36 706 2174 587 481 49 -122.41 -118.35 1861 34.02 484 33.99 -118.28 +1.1903 23 292 -121.31 314 1657 861 362 13 -118.08 -121.42 1186 37.34 376 35.58 -119.35 +3.2303 8 324 -117.23 444 2277 1991 459 17 -117.99 -118.28 1149 34.16 476 39.17 -121.02 +4.163 25 534 -117.11 588 140 882 35 30 -122.17 -122.11 103 37.55 35 38.36 -121.98 +2.5164 17 332 -122.83 751 2191 1897 531 36 -120.08 -118.3 1563 33.89 524 37.96 -122.35 +5.7705 27 298 -117.02 618 3715 6933 575 25 -118.2 -118.34 1640 33.61 572 33.78 -118.04 +6.7528 22 450 -118.09 322 2577 1337 404 30 -116.94 -117.88 1076 38.52 374 33.89 -117.94 +2.6964 24 379 -122.13 564 855 4273 199 29 -117.28 -122.31 785 34.06 169 34.05 -118.19 +3.8644 29 158 -120.84 993 2994 3273 543 47 -117.37 -117.96 1651 33.97 561 34.07 -118.16 +1.3304 33 714 -117.33 1116 1592 2924 304 28 -122.28 -120.91 962 33.2 282 36.7 -119.8 +1.4615 17 153 -117.07 659 1457 1901 372 20 -117.05 -118.21 1000 33.81 346 37.35 -120.62 +4.3428 12 336 -117.81 607 4835 1419 854 20 -117.25 -122.44 2983 38.02 834 33.2 -117.28 +6.5764 35 601 -118.19 176 1665 1582 247 17 -120.86 -117.22 755 40.42 254 33.89 -117.95 +1.3029 26 474 -121.99 458 2137 4615 448 52 -117.48 -118.33 1194 33.07 444 36.74 -119.76 +2.345 38 571 -119.63 4522 715 2800 282 38 -118.29 -122 1174 34.16 300 34.06 -118.26 +2.9063 45 233 -119.82 378 1931 950 329 52 -122.72 -122.05 1025 37.92 293 37.73 -122.39 +3.0393 42 254 -118.18 404 3431 1723 934 17 -122.24 -118.29 2365 34.01 810 33.96 -118.36 +3.625 19 991 -118.45 386 991 2558 210 21 -122.25 -122 695 39.75 203 32.69 -117.05 +1.6645 24 622 -119.7 602 973 2777 221 37 -119.4 -115.52 842 34.15 178 33.94 -118.27 +10.3953 19 1049 -118.29 792 2887 2330 351 8 -122.23 -118.32 1176 37.31 351 33.58 -117.69 +3.1923 35 482 -122.32 93 1611 5534 410 42 -117.05 -118.17 879 34.17 386 34.18 -118.52 +3.6736 19 1236 -121.98 328 2495 1261 551 16 -118.77 -117.06 2314 33.76 567 34.3 -118.47 +1.8333 42 137 -118.27 428 1001 3188 205 48 -121.87 -122.39 605 37.37 175 38.54 -121.46 +4.2037 16 445 -117.23 249 2145 2361 340 23 -117.68 -118.08 1022 38.95 349 38.67 -121.3 +3.9712 21 777 -118.46 366 3769 1711 839 16 -122.19 -122.29 1986 37.27 815 37.95 -122.47 +2.6546 16 1146 -118.18 447 620 2335 133 41 -120.76 -121.76 642 37.69 162 33.91 -118.28 +2.7188 27 788 -118.26 184 2844 1098 551 32 -118.28 -117.41 1337 38.61 516 34.94 -120.42 +1.9472 4 307 -117.66 593 2277 1354 498 40 -118.3 -118.85 1391 32.68 453 33.93 -116.98 +2.5185 22 198 -118.09 1100 1191 1715 345 36 -121.19 -117.28 1193 34.03 295 33.93 -118.2 +2.5625 8 654 -117.65 547 186 574 48 26 -121.28 -118.07 102 37.45 39 33.51 -116.42 +2.5833 48 522 -117.89 174 2287 6039 531 30 -119.87 -115.52 1796 38.39 503 34.1 -117.48 +5.1741 26 207 -118.02 266 1416 9944 249 16 -117.25 -118.09 636 33.48 244 33.7 -117.79 +4.9688 50 623 -121.54 252 1497 4794 243 15 -121.16 -119.66 730 33.87 242 38.68 -121.25 +4.7026 17 379 -120.45 325 2211 2020 502 34 -122.05 -119.72 1113 37.4 488 33.81 -118.36 +1.6521 15 363 -122.43 304 3446 1649 950 36 -121.28 -120.09 2460 37.85 847 38.52 -121.44 +4.0125 33 206 -122.42 460 3415 2988 631 29 -117.85 -117.02 1527 37.93 597 35.44 -119.02 +2.7989 28 231 -118.42 239 3044 617 565 27 -122.16 -118.28 1583 35.4 514 38.52 -121.98 +5.0551 15 534 -122.24 521 2178 856 421 52 -118.13 -117.86 940 38 423 37.89 -122.29 +5.133 19 211 -122.78 161 3617 2523 597 17 -118.4 -121.84 1176 33.88 571 33.51 -117.72 +3.2325 17 406 -122.33 274 4667 1718 875 28 -122.06 -121.33 2404 34.18 841 35.4 -118.96 +3.2981 24 662 -118.03 1611 1862 20377 472 52 -115.58 -117.35 872 33.95 471 37.77 -122.43 +3.8864 31 1093 -121.36 464 1493 1261 331 33 -122.62 -122.56 1571 33.83 354 34.04 -117.94 +2.9524 24 1027 -122.42 833 2852 2570 740 31 -119.8 -122.01 3100 33.83 725 34.06 -118.1 +4.1812 27 163 -117.08 838 2250 3257 430 17 -118.48 -117.99 1218 37.22 468 36.33 -119.34 diff --git a/test/.data/cadata/cadata_train.solution b/test/.data/cadata/cadata_train.solution new file mode 100755 index 0000000000..8f806fb379 --- /dev/null +++ b/test/.data/cadata/cadata_train.solution @@ -0,0 +1,200 @@ +93600 +500000 +87900 +234600 +57600 +238900 +341800 +233900 +190900 +323800 +158800 +94400 +62800 +269000 +58900 +154200 +186900 +500001 +500001 +306700 +274200 +225200 +262300 +472700 +349500 +500001 +108800 +112500 +139300 +225800 +152000 +225000 +232600 +150000 +162700 +165600 +312500 +123500 +222000 +87500 +315800 +151900 +251600 +236100 +394300 +350000 +173800 +137200 +270800 +500001 +221100 +323500 +170800 +140400 +144700 +218200 +198500 +285200 +172600 +53800 +75300 +268500 +133400 +88600 +207900 +246000 +91300 +294600 +160000 +191700 +122400 +194500 +165700 +217700 +235700 +82600 +129900 +184500 +456300 +250800 +294800 +113800 +253300 +96300 +113700 +98500 +500001 +180500 +256200 +208200 +76900 +361400 +121200 +191100 +353000 +231900 +136100 +210300 +67000 +111700 +262500 +221000 +199000 +137000 +101900 +92000 +122700 +167900 +154300 +130800 +271100 +70400 +127500 +176400 +154700 +201600 +105200 +156900 +300000 +70900 +262500 +418800 +346200 +138100 +214000 +289900 +233300 +183800 +135000 +230800 +277600 +174500 +214200 +148300 +266700 +500001 +154000 +81000 +227700 +139200 +95300 +491200 +80400 +91200 +220000 +150400 +341300 +136200 +416500 +240200 +162200 +405900 +116300 +103200 +125000 +197000 +95000 +63200 +149500 +112500 +114200 +247100 +459600 +122200 +241500 +51300 +69200 +152100 +349000 +69100 +225000 +192000 +129200 +144300 +94900 +500001 +221800 +192200 +58200 +125400 +187500 +159600 +133700 +73200 +138800 +103100 +90600 +227700 +135600 +356800 +69700 +84400 +126700 +232200 +324000 +89000 +222700 +158900 +178800 +93700 diff --git a/test/.data/cadata/cadata_valid.data b/test/.data/cadata/cadata_valid.data new file mode 100755 index 0000000000..03dab0aead --- /dev/null +++ b/test/.data/cadata/cadata_valid.data @@ -0,0 +1,100 @@ +3.7054 49 508 -119.85 374 1784 5154 440 28 -118.39 -121.44 1255 34.52 433 33.87 -117.97 +3.5694 34 19 -122.41 279 1115 1695 268 31 -120.47 -117.2 1369 34.25 259 33.73 -117.86 +2.8447 32 231 -118.35 285 2722 722 511 16 -122.43 -117.19 1366 34.01 495 40.2 -122.38 +2.5556 28 528 -118.16 290 2432 3148 586 13 -122.19 -121.94 1441 35.89 606 38.1 -121.28 +2.7361 38 352 -117.19 384 996 977 264 52 -122.27 -118.14 341 34.23 160 33.34 -118.32 +2.9028 24 207 -118.21 62 2127 2320 581 11 -122.13 -122.19 1989 34.02 530 34.22 -118.37 +1.8781 26 106 -118.16 309 1478 4333 413 29 -121.62 -117.91 1580 34.22 394 34.06 -118.21 +2.067 8 112 -122.17 535 2017 1239 462 31 -122.15 -122.33 1462 34.1 457 34.06 -117.96 +2.767 14 247 -116.72 4 1809 9761 424 42 -121.96 -118.37 1094 37.67 382 34.04 -118.37 +4.0474 49 240 -119.04 323 2622 771 467 34 -118.48 -120.71 1233 35.41 476 34.16 -118.43 +6.1159 17 77 -118.39 240 1282 2335 189 52 -122.62 -118.28 431 36.54 187 34.14 -118.08 +4.0708 28 766 -118.25 321 7591 3240 1710 28 -116.66 -117.08 3420 37.69 1635 33.82 -118.35 +3.5 52 364 -119.29 864 679 2696 159 46 -118.1 -122.41 382 37.37 143 34.27 -119.25 +4.1719 46 231 -117.65 525 1987 4609 335 17 -117.13 -118.43 1152 37.35 313 36.76 -119.89 +3.724 21 635 -122.27 342 1250 1713 236 38 -118.29 -118.38 631 38.68 279 37.92 -122.31 +5.0025 18 824 -119.34 544 3233 796 553 32 -121.77 -122.22 1678 33.68 545 34.2 -118.62 +4.6648 18 358 -116.54 399 2131 2806 329 21 -118.2 -122.31 1094 34.07 353 34.87 -120.43 +5.3946 27 1063 -118.4 579 336 3643 60 27 -117.31 -119.01 195 38.44 68 39.15 -121.63 +13.1867 21 638 -117.37 364 1575 1468 183 34 -117.31 -118.01 511 33.63 180 37.36 -122.11 +3.2375 32 505 -118.52 359 2554 502 540 17 -118.55 -117.87 723 37.97 319 38.97 -122.7 +5.3074 16 392 -117.18 405 1339 7880 284 18 -117.98 -122.03 761 33.83 290 33.03 -117.08 +4.3693 28 516 -118.3 314 3192 3462 565 44 -122.47 -118.27 1439 37.15 568 37.01 -121.58 +4.2841 37 240 -117.9 663 2690 710 410 8 -118.41 -124.01 1085 39.76 381 35.63 -120.67 +4.2727 15 753 -117.02 719 1167 3269 250 47 -118.37 -121.88 953 33.38 253 37.72 -122.4 +3.1607 21 187 -117.23 346 1643 2483 489 28 -122.46 -119.11 1142 39.15 458 34.26 -118.3 +5.1582 52 617 -118.47 139 3084 4575 505 26 -122.14 -118.32 1557 34.2 501 37.94 -121.96 +3.7609 17 306 -120.96 199 3157 1125 721 6 -116.63 -121.27 1695 33.79 710 33.55 -117.67 +3.6182 25 410 -117.76 680 2295 2207 424 27 -118.32 -118.17 1252 33.85 350 37.44 -120.75 +2.7361 43 596 -116.99 400 2370 5153 540 21 -122.23 -118.26 1488 38.63 554 34.86 -118.17 +5.7843 20 169 -117.26 76 2494 1115 414 5 -120.36 -119.16 1416 33.74 421 32.78 -115.58 +3.6091 24 1076 -117.93 474 6862 4166 1292 16 -122.7 -121.26 3562 33.79 1126 34.87 -117 +1.995 36 1049 -118.24 546 1755 2715 530 8 -118.23 -118.09 1687 39.94 511 33.37 -117.25 +3.2604 6 272 -117.23 367 816 1226 159 30 -118 -118.29 531 37.88 147 37.61 -120.76 +3.3516 35 241 -119.73 400 2366 4999 505 32 -123.11 -118.39 1283 38.45 477 33.86 -117.96 +2.0677 3 483 -122.48 377 1813 3320 501 29 -122.11 -120.91 1170 32.91 482 33.79 -117.96 +3.184 46 800 -116.96 441 1391 1086 393 20 -121.62 -118.19 856 39.73 360 33.65 -117.92 +4.6184 27 303 -122.33 641 1705 2136 299 36 -118.11 -118.23 871 34.49 296 33.95 -118.02 +4.6364 52 231 -117.98 1164 1254 1859 263 35 -118 -118.28 1092 33.3 268 34.05 -117.96 +5.3307 31 332 -122.79 482 5609 1212 952 16 -122.22 -117.58 2624 33.9 934 33.87 -117.78 +6.9223 21 427 -117.78 240 7480 3331 1084 23 -119.95 -122.03 3037 37.85 1058 34.2 -118.65 +2.25 25 262 -118.27 423 1952 1263 397 30 -118.13 -120.76 961 37.68 333 40.02 -122.18 +4.7986 41 213 -117.21 734 1704 831 277 52 -118.34 -122.02 746 35.13 262 34.19 -118.3 +1.9191 35 696 -118.15 727 1296 7179 287 9 -122.06 -117.25 768 37.8 260 39.93 -122.2 +5.5456 23 254 -117.97 132 3824 2326 559 18 -122.15 -121.09 241 38.34 106 39.19 -120.1 +4.8448 15 238 -118.08 293 2906 1582 578 31 -115.58 -119.76 1806 37.76 553 33.84 -118.08 +6.1185 52 552 -118.03 1081 1937 2001 286 26 -118.13 -120.47 769 34.05 274 38.66 -121.19 +3.0217 36 807 -118.28 284 438 1257 103 52 -121.55 -117.94 176 33.96 99 38.57 -121.47 +2.4167 15 445 -122.42 690 904 2614 191 36 -121.45 -118.43 627 33.74 191 34.19 -118.39 +4.9375 27 295 -117.12 438 2512 2456 575 19 -121.97 -119.02 1275 34.12 544 33.87 -118.36 +7.3841 39 903 -117.8 388 422 1764 63 40 -117.98 -121.98 158 37.95 63 37.65 -120.98 +4.7125 42 1088 -117.34 474 2209 385 353 27 -122.08 -116.45 1034 37.7 344 34.21 -118.58 +2.1108 20 514 -115.57 59 1425 1332 438 44 -114.73 -122.21 1121 36.76 374 34.08 -118.25 +3.875 13 279 -118.38 671 1880 3055 367 39 -118.22 -118.26 954 34.1 349 34.05 -118.14 +5.6194 18 1458 -121.88 310 2476 1447 368 32 -117.93 -118.33 1048 32.73 367 37.72 -122.08 +3.25 33 805 -121.42 1279 2471 3572 431 19 -118.25 -121.43 1040 34.19 426 36.08 -119.03 +11.6677 46 502 -117.14 576 1080 2011 135 37 -122.25 -122.08 366 33.73 142 33.75 -118.32 +4.5057 52 754 -117.02 359 1548 784 506 10 -121.76 -122.18 1535 37.38 424 33.82 -117.92 +4.9 36 1175 -117.87 436 1773 3367 360 42 -117.68 -118.97 815 34.05 299 33.83 -118.19 +4.3723 28 370 -122.41 556 4741 2040 835 19 -122.02 -118.43 2903 37.32 796 33.93 -117.5 +2.6742 33 660 -120.45 307 5896 2337 1464 25 -117.88 -118 4149 33.6 1362 33.89 -118.18 +3.7167 7 879 -118.32 256 1475 1120 308 17 -121.29 -121.6 549 37.34 293 32.8 -117.05 +6.6004 10 139 -118.09 360 1528 1323 264 17 -122.68 -121.93 606 37.8 251 38.48 -122.6 +3.6991 18 649 -118.5 870 3694 1129 1036 19 -122.48 -116.93 2496 39.18 986 37.34 -122.04 +2.5875 36 345 -118.31 631 1038 1175 252 28 -117.34 -118.31 912 38.56 245 33.92 -118.15 +5.6062 17 171 -117.79 427 2762 3042 496 26 -118.31 -118.32 1716 33.74 459 37.46 -121.91 +2.8203 24 279 -120.35 693 2630 3137 722 27 -117.23 -122.44 1414 38.09 634 37.65 -122.09 +0.9204 17 113 -118.16 890 987 9117 240 43 -122.42 -117.66 1253 34.08 237 38.59 -121.48 +11.7894 8 1863 -118.19 255 2257 778 285 45 -121.89 -118.46 759 38.61 305 34.14 -118.17 +4.825 11 345 -118.48 685 1814 2232 325 45 -118.29 -122.46 709 36.46 311 34.05 -118.52 +7.7317 5 158 -118.44 359 3892 2584 520 16 -117.32 -122.19 1454 33.84 524 33.01 -117.25 +2.7428 17 532 -118.51 831 2138 1997 567 33 -117.27 -122.02 1072 37.3 528 33.89 -118.29 +3.4286 52 415 -122.1 329 2705 3225 649 44 -118.26 -118.05 1676 38.39 654 34.16 -118.33 +4.4423 17 2838 -120.25 468 2795 2229 622 28 -118.16 -118.26 1173 37.79 545 34.16 -118.46 +1.9338 13 1047 -122.39 617 658 3506 218 44 -121.19 -121.86 869 33.83 212 32.7 -117.14 +2.905 26 852 -121.88 208 2724 3437 579 37 -122.02 -118 1400 33.2 540 38 -121.81 +6.1949 25 362 -116.95 282 3135 1885 480 26 -118.41 -117.82 1474 33.2 458 34.19 -118.86 +3.6301 34 288 -118.36 46 8206 1382 1523 7 -117.13 -122.22 4399 37.21 1423 33.25 -117.32 +3.5234 34 1243 -118.36 148 1158 1863 253 52 -117.11 -118.29 528 33.93 253 33.99 -118.46 +3.5775 21 292 -121.83 125 3230 1657 587 33 -122.18 -121.99 1579 37.79 560 37.68 -121.01 +2.6053 6 264 -121.89 320 1862 1221 429 33 -118.19 -118.41 971 34.03 389 37.8 -121.21 +4.4464 31 324 -120.65 282 1204 16921 268 38 -122.46 -116.34 921 34.23 247 37.64 -122.41 +2.1955 37 154 -115.57 380 2745 2895 543 46 -122.16 -122.46 1423 39.15 482 35.13 -119.46 +4.9432 24 105 -117.9 237 1090 4713 164 10 -117.11 -122.41 470 37.04 158 36.35 -119.67 +3.2847 29 511 -121.49 292 1648 4038 285 35 -118.14 -121.13 792 38.54 265 36.09 -119.56 +3.0625 28 607 -118.17 580 790 1025 199 32 -121.94 -121.3 1196 34.23 201 33.75 -117.92 +2.1875 27 418 -120.43 415 1126 1516 289 43 -118.45 -122.07 1132 38.56 294 32.7 -117.14 +3.1065 17 166 -118.11 287 1823 3807 410 36 -124.13 -121.37 1589 33.67 387 37.35 -121.93 +4.5833 34 274 -121.29 153 1442 2081 285 44 -118.34 -118.23 859 34.06 292 34.25 -118.3 +3.4419 25 422 -121.89 638 7626 3832 1570 15 -119.73 -119.71 3823 38.02 1415 33.87 -117.6 +3.3281 36 864 -122.07 445 2643 8295 502 18 -122.21 -119.73 1755 34.07 541 38.42 -121.37 +2.6548 52 631 -118.12 1398 1095 2026 340 27 -120.25 -122.36 1300 34.12 318 33.98 -118.22 +2.8977 16 348 -117.97 572 3490 1700 816 19 -118.18 -118.2 2818 36.05 688 34.09 -117.63 +4.9091 29 461 -118.34 948 2321 1877 480 33 -118.36 -122.62 1230 39.35 451 37.36 -121.99 +3.2891 5 305 -119.23 254 3794 1555 772 27 -118.13 -118.18 1756 33.69 724 38.05 -122.14 +6.0224 34 660 -121.45 477 51 1974 12 38 -122.43 -122.38 41 34.17 10 33.8 -117.89 +3.662 18 1264 -117.98 430 1794 1621 276 8 -117.03 -117.92 690 34.09 271 34.48 -117.27 +2.0243 18 427 -122.61 748 2704 3736 698 18 -117.81 -118.27 1611 37.34 597 34.51 -117.31 +2.5658 39 528 -115.57 476 1578 3038 460 29 -122.09 -117.93 1236 36.31 461 32.64 -117.1 +2.2244 25 337 -122.52 812 1307 1222 314 24 -118.33 -120.67 917 34.21 291 38.43 -121.83 +3.1641 15 510 -117.13 247 1802 1435 335 18 -117.67 -122.5 1110 35.12 329 37.55 -120.8 diff --git a/test/automl/test_estimators.py b/test/automl/test_estimators.py index fdd4ec07ff..ca9b1d928f 100644 --- a/test/automl/test_estimators.py +++ b/test/automl/test_estimators.py @@ -73,7 +73,13 @@ def test_feat_type_wrong_arguments(self): X=X, y=y, feat_type=[True]) self.assertRaisesRegexp(ValueError, - 'Array feat_type must only contain bools.', + 'Array feat_type must only contain strings.', + cls.fit, + X=X, y=y, feat_type=[True]*100) + + self.assertRaisesRegexp(ValueError, + 'Only `Categorical` and `Numerical` are ' + 'valid feature types, you passed `Car`', cls.fit, X=X, y=y, feat_type=['Car']*100) @@ -131,7 +137,7 @@ def test_fit_pSMAC(self): score = automl.score(X_test, Y_test) self.assertEqual(len(os.listdir(os.path.join(output, '.auto-sklearn', - 'ensemble_indices'))), 1) + 'ensembles'))), 1) self.assertGreaterEqual(score, 0.90) self.assertEqual(automl._task, MULTICLASS_CLASSIFICATION) diff --git a/test/automl/test_start_automl.py b/test/automl/test_start_automl.py index 2cd4765be8..0c7c7cb1fd 100644 --- a/test/automl/test_start_automl.py +++ b/test/automl/test_start_automl.py @@ -12,6 +12,7 @@ import autosklearn.automl import autosklearn.pipeline.util as putil +from autosklearn.util import setup_logger, get_logger from autosklearn.constants import * from autosklearn.cli.base_interface import store_and_or_load_data @@ -109,19 +110,28 @@ def test_automl_outputs(self): self._tearDown(output) def test_do_dummy_prediction(self): - output = os.path.join(self.test_dir, '..', - '.tmp_test_do_dummy_prediction') - self._setUp(output) - - name = '401_bac' - dataset = os.path.join(self.test_dir, '..', '.data', name) - - auto = autosklearn.automl.AutoML( - output, output, 15, 15, - initial_configurations_via_metalearning=25) - auto._backend._make_internals_directory() - D = store_and_or_load_data(dataset, output) - auto._do_dummy_prediction(D) - - del auto - self._tearDown(output) + for name in ['401_bac', '31_bac', 'adult', 'cadata']: + output = os.path.join(self.test_dir, '..', + '.tmp_test_do_dummy_prediction') + self._setUp(output) + + dataset = os.path.join(self.test_dir, '..', '.data', name) + + auto = autosklearn.automl.AutoML( + output, output, 15, 15, + initial_configurations_via_metalearning=25) + setup_logger() + auto._logger = get_logger('test_do_dummy_predictions') + auto._backend._make_internals_directory() + D = store_and_or_load_data(dataset, output) + auto._do_dummy_prediction(D) + + # Assure that the dummy predictions are not in the current working + # directory, but in the output directory (under output) + self.assertFalse(os.path.exists(os.path.join(os.getcwd(), + '.auto-sklearn'))) + self.assertTrue(os.path.exists(os.path.join(output, + '.auto-sklearn'))) + + del auto + self._tearDown(output) diff --git a/test/evaluation/test_cv_evaluator.py b/test/evaluation/test_cv_evaluator.py deleted file mode 100644 index 460bba593b..0000000000 --- a/test/evaluation/test_cv_evaluator.py +++ /dev/null @@ -1,220 +0,0 @@ -# -*- encoding: utf-8 -*- -from __future__ import print_function -import copy -import functools -import os -import unittest - -import numpy as np -from numpy.linalg import LinAlgError - -from autosklearn.constants import * -from autosklearn.data.competition_data_manager import CompetitionDataManager -from autosklearn.evaluation.cv_evaluator import CVEvaluator -from autosklearn.util.pipeline import get_configuration_space -from autosklearn.pipeline.util import get_dataset - -N_TEST_RUNS = 10 - - -class Dummy(object): - pass - - -class CVEvaluator_Test(unittest.TestCase): - _multiprocess_can_split_ = True - - def test_evaluate_multiclass_classification(self): - X_train, Y_train, X_test, Y_test = get_dataset('iris') - - X_valid = X_test[:25, ] - Y_valid = Y_test[:25, ] - X_test = X_test[25:, ] - Y_test = Y_test[25:, ] - - D = Dummy() - D.info = { - 'metric': BAC_METRIC, - 'task': MULTICLASS_CLASSIFICATION, - 'is_sparse': False, - 'label_num': 3 - } - D.data = { - 'X_train': X_train, - 'Y_train': Y_train, - 'X_valid': X_valid, - 'X_test': X_test - } - D.feat_type = ['numerical', 'Numerical', 'numerical', 'numerical'] - - configuration_space = get_configuration_space( - D.info, - include_estimators=['extra_trees'], - include_preprocessors=['select_rates']) - - err = np.zeros([N_TEST_RUNS]) - num_models_better_than_random = 0 - for i in range(N_TEST_RUNS): - print('Evaluate configuration: %d; result:' % i) - configuration = configuration_space.sample_configuration() - D_ = copy.deepcopy(D) - evaluator = CVEvaluator(D_, configuration, with_predictions=True) - - if not self._fit(evaluator): - print() - continue - e_, Y_optimization_pred, Y_valid_pred, Y_test_pred = \ - evaluator.predict() - err[i] = e_ - print(err[i], configuration['classifier:__choice__']) - - num_targets = len(np.unique(Y_train)) - self.assertTrue(np.isfinite(err[i])) - self.assertGreaterEqual(err[i], 0.0) - # Test that ten models were trained - self.assertEqual(len(evaluator.models), 10) - self.assertEqual(Y_optimization_pred.shape[0], Y_train.shape[0]) - self.assertEqual(Y_optimization_pred.shape[1], num_targets) - self.assertEqual(Y_valid_pred.shape[0], Y_valid.shape[0]) - self.assertEqual(Y_valid_pred.shape[1], num_targets) - self.assertEqual(Y_test_pred.shape[0], Y_test.shape[0]) - self.assertEqual(Y_test_pred.shape[1], num_targets) - # Test some basic statistics of the dataset - if err[i] < 0.5: - self.assertTrue(0.3 < Y_valid_pred.mean() < 0.36666) - self.assertGreaterEqual(Y_valid_pred.std(), 0.01) - self.assertTrue(0.3 < Y_test_pred.mean() < 0.36666) - self.assertGreaterEqual(Y_test_pred.std(), 0.01) - num_models_better_than_random += 1 - self.assertGreater(num_models_better_than_random, 5) - - def test_evaluate_multiclass_classification_partial_fit(self): - X_train, Y_train, X_test, Y_test = get_dataset('iris') - - X_valid = X_test[:25, ] - Y_valid = Y_test[:25, ] - X_test = X_test[25:, ] - Y_test = Y_test[25:, ] - - D = Dummy() - D.info = { - 'metric': BAC_METRIC, - 'task': MULTICLASS_CLASSIFICATION, - 'is_sparse': False, - 'label_num': 3 - } - D.data = { - 'X_train': X_train, - 'Y_train': Y_train, - 'X_valid': X_valid, - 'X_test': X_test - } - D.feat_type = ['numerical', 'Numerical', 'numerical', 'numerical'] - - configuration_space = get_configuration_space( - D.info, - include_estimators=['lda'], - include_preprocessors=['select_rates']) - - err = np.zeros([N_TEST_RUNS]) - num_models_better_than_random = 0 - for i in range(N_TEST_RUNS): - print('Evaluate configuration: %d; result:' % i) - configuration = configuration_space.sample_configuration() - D_ = copy.deepcopy(D) - evaluator = CVEvaluator(D_, configuration, with_predictions=True) - - if not self._partial_fit(evaluator, fold=i % 10): - print() - continue - e_, Y_optimization_pred, Y_valid_pred, Y_test_pred = \ - evaluator.predict() - err[i] = e_ - print(err[i], configuration['classifier:__choice__']) - - self.assertTrue(np.isfinite(err[i])) - self.assertGreaterEqual(err[i], 0.0) - # Test that only one model was trained - self.assertEqual(len(evaluator.models), 10) - self.assertEqual(1, np.sum([True if model is not None else False - for model in evaluator.models])) - self.assertLess(Y_optimization_pred.shape[0], 13) - self.assertEqual(Y_valid_pred.shape[0], Y_valid.shape[0]) - self.assertEqual(Y_test_pred.shape[0], Y_test.shape[0]) - # Test some basic statistics of the dataset - if err[i] < 0.5: - self.assertTrue(0.3 < Y_valid_pred.mean() < 0.36666) - self.assertGreaterEqual(Y_valid_pred.std(), 0.01) - self.assertTrue(0.3 < Y_test_pred.mean() < 0.36666) - self.assertGreaterEqual(Y_test_pred.std(), 0.01) - num_models_better_than_random += 1 - self.assertGreaterEqual(num_models_better_than_random, 5) - - def test_with_abalone(self): - dataset = 'abalone' - dataset_path = os.path.join(os.path.dirname(__file__), '.datasets', - dataset) - D = CompetitionDataManager(dataset_path) - configuration_space = get_configuration_space( - D.info, - include_estimators=['extra_trees'], - include_preprocessors=['no_preprocessing']) - - errors = [] - for i in range(N_TEST_RUNS): - configuration = configuration_space.sample_configuration() - D_ = copy.deepcopy(D) - evaluator = CVEvaluator(D_, configuration, cv_folds=3) - if not self._fit(evaluator): - continue - err = evaluator.predict() - self.assertLess(err, 0.99) - self.assertTrue(np.isfinite(err)) - errors.append(err) - # This is a reasonable bound - self.assertEqual(10, len(errors)) - self.assertLess(min(errors), 0.77) - - def _fit(self, evaluator): - return self.__fit(evaluator.fit) - - def _partial_fit(self, evaluator, fold): - partial_fit = functools.partial(evaluator.partial_fit, fold=fold) - return self.__fit(partial_fit) - - def __fit(self, function_handle): - """Allow us to catch known and valid exceptions for all evaluate - scripts.""" - try: - function_handle() - return True - except ValueError as e: - if 'Floating-point under-/overflow occurred at epoch' in e.args[0] or \ - 'removed all features' in e.args[0] or \ - 'failed to create intent' in e.args[0]: - pass - else: - raise e - except LinAlgError as e: - if 'not positive definite, even with jitter' in e.args[0]: - pass - else: - raise e - except AttributeError as e: - # Some error in QDA - if 'log' == e.args[0]: - pass - else: - raise e - except RuntimeWarning as e: - if 'invalid value encountered in sqrt' in e.args[0]: - pass - elif 'divide by zero encountered in divide' in e.args[0]: - pass - else: - raise e - except UserWarning as e: - if 'FastICA did not converge' in e.args[0]: - pass - else: - raise e diff --git a/test/evaluation/test_holdout_evaluator.py b/test/evaluation/test_holdout_evaluator.py deleted file mode 100644 index 9c184fe766..0000000000 --- a/test/evaluation/test_holdout_evaluator.py +++ /dev/null @@ -1,467 +0,0 @@ -# -*- encoding: utf-8 -*- -from __future__ import print_function -import copy -import os -import shutil -import sys -import traceback -import unittest - -import numpy as np -from numpy.linalg import LinAlgError -import sklearn.datasets - -from autosklearn.pipeline.util import get_dataset - -from autosklearn.constants import * -from autosklearn.data.competition_data_manager import CompetitionDataManager -from autosklearn.evaluation.holdout_evaluator import HoldoutEvaluator -from autosklearn.util.data import convert_to_bin -from autosklearn.util.pipeline import get_configuration_space - -N_TEST_RUNS = 10 - - -class Dummy(object): - def __init__(self): - self.name = 'dummy' - - -class HoldoutEvaluator_Test(unittest.TestCase): - _multiprocess_can_split_ = True - - def test_evaluate_multiclass_classification(self): - X_train, Y_train, X_test, Y_test = get_dataset('iris') - X_valid = X_test[:25, ] - Y_valid = Y_test[:25, ] - X_test = X_test[25:, ] - Y_test = Y_test[25:, ] - - D = Dummy() - D.info = { - 'metric': BAC_METRIC, - 'task': MULTICLASS_CLASSIFICATION, - 'is_sparse': False, - 'label_num': 3 - } - D.data = { - 'X_train': X_train, - 'Y_train': Y_train, - 'X_valid': X_valid, - 'X_test': X_test - } - D.feat_type = ['numerical', 'Numerical', 'numerical', 'numerical'] - - configuration_space = get_configuration_space( - D.info, - include_estimators=['lda'], - include_preprocessors=['pca']) - - err = np.zeros([N_TEST_RUNS]) - for i in range(N_TEST_RUNS): - print('Evaluate configuration: %d; result:' % i) - configuration = configuration_space.sample_configuration() - D_ = copy.deepcopy(D) - evaluator = HoldoutEvaluator(D_, configuration) - - if not self._fit(evaluator): - continue - err[i] = evaluator.predict() - print(err[i]) - - self.assertTrue(np.isfinite(err[i])) - self.assertGreaterEqual(err[i], 0.0) - - def test_evaluate_multiclass_classification_all_metrics(self): - X_train, Y_train, X_test, Y_test = get_dataset('iris') - X_valid = X_test[:25, ] - Y_valid = Y_test[:25, ] - X_test = X_test[25:, ] - Y_test = Y_test[25:, ] - - D = Dummy() - D.info = { - 'metric': BAC_METRIC, - 'task': MULTICLASS_CLASSIFICATION, - 'is_sparse': False, - 'label_num': 3 - } - D.data = { - 'X_train': X_train, - 'Y_train': Y_train, - 'X_valid': X_valid, - 'X_test': X_test - } - D.feat_type = ['numerical', 'Numerical', 'numerical', 'numerical'] - - configuration_space = get_configuration_space( - D.info, - include_estimators=['lda'], - include_preprocessors=['pca']) - - # Test all scoring functions - err = [] - for i in range(N_TEST_RUNS): - print('Evaluate configuration: %d; result:' % i) - configuration = configuration_space.sample_configuration() - D_ = copy.deepcopy(D) - evaluator = HoldoutEvaluator(D_, configuration, - all_scoring_functions=True) - if not self._fit(evaluator): - continue - - err.append(evaluator.predict()) - print(err[-1]) - - self.assertIsInstance(err[-1], dict) - for key in err[-1]: - self.assertEqual(len(err[-1]), 5) - self.assertTrue(np.isfinite(err[-1][key])) - self.assertGreaterEqual(err[-1][key], 0.0) - - def test_evaluate_multilabel_classification(self): - X_train, Y_train, X_test, Y_test = get_dataset('iris') - Y_train = np.array(convert_to_bin(Y_train, 3)) - Y_train[:, -1] = 1 - Y_test = np.array(convert_to_bin(Y_test, 3)) - Y_test[:, -1] = 1 - - X_valid = X_test[:25, ] - Y_valid = Y_test[:25, ] - X_test = X_test[25:, ] - Y_test = Y_test[25:, ] - - D = Dummy() - D.info = { - 'metric': F1_METRIC, - 'task': MULTILABEL_CLASSIFICATION, - 'is_sparse': False, - 'label_num': 3 - } - D.data = { - 'X_train': X_train, - 'Y_train': Y_train, - 'X_valid': X_valid, - 'X_test': X_test - } - D.feat_type = ['numerical', 'Numerical', 'numerical', 'numerical'] - - configuration_space = get_configuration_space( - D.info, - include_estimators=['extra_trees'], - include_preprocessors=['no_preprocessing']) - - err = np.zeros([N_TEST_RUNS]) - for i in range(N_TEST_RUNS): - print('Evaluate configuration: %d; result:' % i) - configuration = configuration_space.sample_configuration() - D_ = copy.deepcopy(D) - evaluator = HoldoutEvaluator(D_, configuration) - if not self._fit(evaluator): - continue - err[i] = evaluator.predict() - print(err[i]) - - self.assertTrue(np.isfinite(err[i])) - self.assertGreaterEqual(err[i], 0.0) - - def test_evaluate_binary_classification(self): - X_train, Y_train, X_test, Y_test = get_dataset('iris') - - eliminate_class_two = Y_train != 2 - X_train = X_train[eliminate_class_two] - Y_train = Y_train[eliminate_class_two] - - eliminate_class_two = Y_test != 2 - X_test = X_test[eliminate_class_two] - Y_test = Y_test[eliminate_class_two] - - X_valid = X_test[:25, ] - Y_valid = Y_test[:25, ] - X_test = X_test[25:, ] - Y_test = Y_test[25:, ] - - D = Dummy() - D.info = { - 'metric': AUC_METRIC, - 'task': BINARY_CLASSIFICATION, - 'is_sparse': False, - 'label_num': 2 - } - D.data = { - 'X_train': X_train, - 'Y_train': Y_train, - 'X_valid': X_valid, - 'X_test': X_test - } - D.feat_type = ['numerical', 'Numerical', 'numerical', 'numerical'] - - configuration_space = get_configuration_space( - D.info, - include_estimators=['lda'], - include_preprocessors=['pca']) - - err = np.zeros([N_TEST_RUNS]) - for i in range(N_TEST_RUNS): - print('Evaluate configuration: %d; result:' % i) - configuration = configuration_space.sample_configuration() - D_ = copy.deepcopy(D) - evaluator = HoldoutEvaluator(D_, configuration) - - if not self._fit(evaluator): - continue - err[i] = evaluator.predict() - self.assertTrue(np.isfinite(err[i])) - print(err[i]) - - self.assertGreaterEqual(err[i], 0.0) - - def test_evaluate_regression(self): - X_train, Y_train, X_test, Y_test = get_dataset('boston') - - X_valid = X_test[:200, ] - Y_valid = Y_test[:200, ] - X_test = X_test[200:, ] - Y_test = Y_test[200:, ] - - D = Dummy() - D.info = { - 'metric': R2_METRIC, - 'task': REGRESSION, - 'is_sparse': False, - 'label_num': 1 - } - D.data = { - 'X_train': X_train, - 'Y_train': Y_train, - 'X_valid': X_valid, - 'X_test': X_test - } - D.feat_type = ['numerical', 'Numerical', 'numerical', 'numerical', - 'numerical', 'numerical', 'numerical', 'numerical', - 'numerical', 'numerical', 'numerical'] - - configuration_space = get_configuration_space( - D.info, - include_estimators=['extra_trees'], - include_preprocessors=['no_preprocessing']) - - err = np.zeros([N_TEST_RUNS]) - for i in range(N_TEST_RUNS): - print('Evaluate configuration: %d; result:' % i) - configuration = configuration_space.sample_configuration() - D_ = copy.deepcopy(D) - evaluator = HoldoutEvaluator(D_, configuration) - if not self._fit(evaluator): - continue - err[i] = evaluator.predict() - self.assertTrue(np.isfinite(err[i])) - print(err[i]) - - self.assertGreaterEqual(err[i], 0.0) - - def test_with_abalone(self): - dataset = 'abalone' - dataset_path = os.path.join(os.path.dirname(__file__), '.datasets', - dataset) - D = CompetitionDataManager(dataset_path) - configuration_space = get_configuration_space( - D.info, - include_estimators=['extra_trees'], - include_preprocessors=['no_preprocessing']) - - errors = [] - for i in range(N_TEST_RUNS): - configuration = configuration_space.sample_configuration() - D_ = copy.deepcopy(D) - evaluator = HoldoutEvaluator(D_, configuration) - if not self._fit(evaluator): - continue - err = evaluator.predict() - self.assertLess(err, 0.99) - self.assertTrue(np.isfinite(err)) - errors.append(err) - # This is a reasonable bound - self.assertEqual(10, len(errors)) - self.assertLess(min(errors), 0.77) - - def test_5000_classes(self): - weights = ([0.0002] * 4750) + ([0.0001] * 250) - X, Y = sklearn.datasets.make_classification(n_samples=10000, - n_features=20, - n_classes=5000, - n_clusters_per_class=1, - n_informative=15, - n_redundant=5, - n_repeated=0, - weights=weights, - flip_y=0, - class_sep=1.0, - hypercube=True, - shift=None, - scale=1.0, - shuffle=True, - random_state=1) - - self.assertEqual(250, np.sum(np.bincount(Y) == 1)) - D = Dummy() - D.info = { - 'metric': ACC_METRIC, - 'task': MULTICLASS_CLASSIFICATION, - 'is_sparse': False, - 'label_num': 1 - } - D.data = {'X_train': X, 'Y_train': Y, 'X_valid': X, 'X_test': X} - D.feat_type = ['numerical'] * 5000 - - configuration_space = get_configuration_space( - D.info, - include_estimators=['lda'], - include_preprocessors=['no_preprocessing']) - configuration = configuration_space.sample_configuration() - D_ = copy.deepcopy(D) - evaluator = HoldoutEvaluator(D_, configuration) - evaluator.fit() - - def _fit(self, evaluator): - """Allow us to catch known and valid exceptions for all evaluate - scripts.""" - try: - evaluator.fit() - return True - except KeyError as e: - if 'Floating-point under-/overflow occurred at epoch' in e.args[0] or \ - 'removed all features' in e.args[0] or \ - 'failed to create intent' in e.args[0]: - pass - else: - traceback.print_exc() - raise e - except LinAlgError as e: - if 'not positive definite, even with jitter' in e.args[0]: - pass - else: - traceback.print_exc() - raise e - except AttributeError as e: - # Some error in QDA - if 'log' == e.args[0]: - pass - else: - traceback.print_exc() - raise e - except RuntimeWarning as e: - if 'invalid value encountered in sqrt' in e.args[0]: - pass - elif 'divide by zero encountered in divide' in e.args[0]: - pass - else: - traceback.print_exc() - raise e - except UserWarning as e: - if 'FastICA did not converge' in e.args[0]: - pass - else: - traceback.print_exc() - raise e - - def test_file_output(self): - output_dir = os.path.join(os.getcwd(), '.test') - - try: - shutil.rmtree(output_dir) - except Exception: - pass - - X_train, Y_train, X_test, Y_test = get_dataset('boston') - X_valid = X_test[:25, ] - Y_valid = Y_test[:25, ] - X_test = X_test[25:, ] - Y_test = Y_test[25:, ] - - D = Dummy() - D.info = { - 'metric': R2_METRIC, - 'task': REGRESSION, - 'is_sparse': False, - 'label_num': 3 - } - D.data = { - 'X_train': X_train, - 'Y_train': Y_train, - 'X_valid': X_valid, - 'X_test': X_test - } - D.feat_type = ['numerical', 'Numerical', 'numerical', 'numerical'] - D.name = 'test' - - configuration_space = get_configuration_space(D.info) - - while True: - configuration = configuration_space.sample_configuration() - evaluator = HoldoutEvaluator(D, configuration, - with_predictions=True, - all_scoring_functions=True, - output_dir=output_dir, - output_y_test=True) - - if not self._fit(evaluator): - continue - evaluator.predict() - evaluator.file_output() - - self.assertTrue(os.path.exists(os.path.join( - output_dir, '.auto-sklearn', 'true_targets_ensemble.npy'))) - break - - def test_predict_proba_binary_classification(self): - X_train, Y_train, X_test, Y_test = get_dataset('iris') - - eliminate_class_two = Y_train != 2 - X_train = X_train[eliminate_class_two] - Y_train = Y_train[eliminate_class_two] - - eliminate_class_two = Y_test != 2 - X_test = X_test[eliminate_class_two] - Y_test = Y_test[eliminate_class_two] - - X_valid = X_test[:25, ] - Y_valid = Y_test[:25, ] - X_test = X_test[25:, ] - Y_test = Y_test[25:, ] - - class Dummy2(object): - - def predict_proba(self, y, batch_size=200): - return np.array([[0.1, 0.9], [0.7, 0.3]]) - - model = Dummy2() - task_type = BINARY_CLASSIFICATION - - D = Dummy() - D.info = { - 'metric': BAC_METRIC, - 'task': task_type, - 'is_sparse': False, - 'label_num': 3 - } - D.data = { - 'X_train': X_train, - 'Y_train': Y_train, - 'X_valid': X_valid, - 'X_test': X_test - } - D.feat_type = ['numerical', 'Numerical', 'numerical', 'numerical'] - - configuration_space = get_configuration_space( - D.info, - include_estimators=['lda'], - include_preprocessors=['select_rates']) - configuration = configuration_space.sample_configuration() - - evaluator = HoldoutEvaluator(D, configuration) - pred = evaluator.predict_proba(None, model, task_type) - expected = [[0.9], [0.3]] - for i in range(len(expected)): - self.assertEqual(expected[i], pred[i]) - diff --git a/test/evaluation/test_nested_cv_evaluator.py b/test/evaluation/test_nested_cv_evaluator.py deleted file mode 100644 index c06fa8bd3f..0000000000 --- a/test/evaluation/test_nested_cv_evaluator.py +++ /dev/null @@ -1,181 +0,0 @@ -# -*- encoding: utf-8 -*- -from __future__ import print_function -import copy -import os -import traceback -import unittest - -import numpy as np -from numpy.linalg import LinAlgError - -from autosklearn.constants import * -from autosklearn.data.competition_data_manager import CompetitionDataManager -from autosklearn.evaluation.nested_cv_evaluator import NestedCVEvaluator -from autosklearn.util.pipeline import get_configuration_space -from autosklearn.pipeline.util import get_dataset - -N_TEST_RUNS = 10 - - -class Dummy(object): - pass - - -class NestedCVEvaluator_Test(unittest.TestCase): - _multiprocess_can_split_ = True - - def test_evaluate_multiclass_classification(self): - X_train, Y_train, X_test, Y_test = get_dataset('iris') - - X_valid = X_test[:25, ] - Y_valid = Y_test[:25, ] - X_test = X_test[25:, ] - Y_test = Y_test[25:, ] - - D = Dummy() - D.info = { - 'metric': ACC_METRIC, - 'task': MULTICLASS_CLASSIFICATION, - 'is_sparse': False, - 'label_num': 3 - } - D.data = { - 'X_train': X_train, - 'Y_train': Y_train, - 'X_valid': X_valid, - 'X_test': X_test - } - D.feat_type = ['numerical', 'Numerical', 'numerical', 'numerical'] - - configuration_space = get_configuration_space( - D.info, - include_estimators=['lda'], - include_preprocessors=['pca']) - - err = np.zeros([N_TEST_RUNS]) - num_models_better_than_random = 0 - for i in range(N_TEST_RUNS): - print('Evaluate configuration: %d; result:' % i) - configuration = configuration_space.sample_configuration() - D_ = copy.deepcopy(D) - evaluator = NestedCVEvaluator(D_, configuration, - with_predictions=True, - all_scoring_functions=True) - - if not self._fit(evaluator): - continue - e_, Y_optimization_pred, Y_valid_pred, Y_test_pred = \ - evaluator.predict() - err[i] = e_[ACC_METRIC] - print(err[i], configuration['classifier:__choice__']) - print(e_['outer:bac_metric'], e_[BAC_METRIC]) - - # Test the outer CV - num_targets = len(np.unique(Y_train)) - self.assertTrue(np.isfinite(err[i])) - self.assertGreaterEqual(err[i], 0.0) - # Test that ten models were trained - self.assertEqual(len(evaluator.outer_models), 5) - self.assertTrue(all([model is not None - for model in evaluator.outer_models])) - - self.assertEqual(Y_optimization_pred.shape[0], Y_train.shape[0]) - self.assertEqual(Y_optimization_pred.shape[1], num_targets) - self.assertEqual(Y_valid_pred.shape[0], Y_valid.shape[0]) - self.assertEqual(Y_valid_pred.shape[1], num_targets) - self.assertEqual(Y_test_pred.shape[0], Y_test.shape[0]) - self.assertEqual(Y_test_pred.shape[1], num_targets) - # Test some basic statistics of the predictions - if err[i] < 0.5: - self.assertTrue(0.3 < Y_valid_pred.mean() < 0.36666) - self.assertGreaterEqual(Y_valid_pred.std(), 0.1) - self.assertTrue(0.3 < Y_test_pred.mean() < 0.36666) - self.assertGreaterEqual(Y_test_pred.std(), 0.1) - num_models_better_than_random += 1 - - # Test the inner CV - self.assertEqual(len(evaluator.inner_models), 5) - for fold in range(5): - self.assertEqual(len(evaluator.inner_models[fold]), 5) - self.assertTrue(all([model is not None - for model in evaluator.inner_models[fold] - ])) - self.assertGreaterEqual(len(evaluator.outer_indices[fold][0]), - 75) - for inner_fold in range(5): - self.assertGreaterEqual( - len(evaluator.inner_indices[fold][inner_fold][0]), 60) - - self.assertGreater(num_models_better_than_random, 9) - - def test_with_abalone(self): - dataset = 'abalone' - dataset_path = os.path.join(os.path.dirname(__file__), '.datasets', - dataset) - D = CompetitionDataManager(dataset_path) - configuration_space = get_configuration_space( - D.info, - include_estimators=['extra_trees'], - include_preprocessors=['no_preprocessing']) - - errors = [] - for i in range(N_TEST_RUNS): - configuration = configuration_space.sample_configuration() - D_ = copy.deepcopy(D) - evaluator = NestedCVEvaluator(D_, configuration, - inner_cv_folds=2, - outer_cv_folds=2) - if not self._fit(evaluator): - continue - err = evaluator.predict() - self.assertLess(err, 0.99) - self.assertTrue(np.isfinite(err)) - errors.append(err) - # This is a reasonable bound - self.assertEqual(10, len(errors)) - self.assertLess(min(errors), 0.77) - - def _fit(self, evaluator): - return self.__fit(evaluator.fit) - - def __fit(self, function_handle): - """Allow us to catch known and valid exceptions for all evaluate - scripts.""" - try: - function_handle() - return True - except ValueError as e: - if 'Floating-point under-/overflow occurred at epoch' in e.args[0] or \ - 'removed all features' in e.args[0] or \ - 'failed to create intent' in e.args[0]: - pass - else: - traceback.print_exc() - raise e - except LinAlgError as e: - if 'not positive definite, even with jitter' in e.args[0]: - pass - else: - traceback.print_exc() - raise e - except AttributeError as e: - # Some error in QDA - if 'log' == e.args[0]: - pass - else: - traceback.print_exc() - raise e - except RuntimeWarning as e: - if 'invalid value encountered in sqrt' in e.args[0]: - pass - elif 'divide by zero encountered in divide' in e.args[0]: - pass - else: - traceback.print_exc() - raise e - except UserWarning as e: - if 'FastICA did not converge' in e.args[0]: - pass - else: - traceback.print_exc() - raise e diff --git a/test/scores/test_libscores.py b/test/scores/test_libscores.py deleted file mode 100644 index afea703c83..0000000000 --- a/test/scores/test_libscores.py +++ /dev/null @@ -1,70 +0,0 @@ -# -*- encoding: utf-8 -*- -from __future__ import print_function -import unittest - -import numpy as np - -from autosklearn.metrics import acc_metric - - -class LibScoresTest(unittest.TestCase): - _multiprocess_can_split_ = True - - def test_accuracy_metric_4_binary_classification(self): - # 100% correct - expected = np.array([0, 1, 1, 1, 0, 0, 1, 1, 1, 0]).reshape((-1, 1)) - prediction = expected.copy() - score = acc_metric(expected, prediction) - self.assertEqual(1, score) - - # 100% incorrect - prediction = (expected.copy() - 1) * -1 - score = acc_metric(expected, prediction) - self.assertAlmostEqual(-1, score) - - # Random - prediction = np.array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1]) - score = acc_metric(expected, prediction) - self.assertAlmostEqual(0, score) - - def test_accuracy_metric_4_multiclass_classification(self): - # 100% correct - expected = np.array([[0, 0, 1, 1, 0, 1, 0, 1, 0, 1], - [1, 1, 0, 0, 1, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, - 1, 0, 1, 0]]) - prediction = expected.copy() - score = acc_metric(expected, prediction) - self.assertEqual(1, score) - - # 100% incorrect - prediction = (expected.copy() - 1) * -1 - score = acc_metric(expected, prediction) - self.assertAlmostEqual(-1, score) - - # Pseudorandom - prediction = np.array([[1, 0, 0, 1, 0, 0, 1, 0, 0, 1], [0, 1, 0, 0, 1, - 0, 0, 1, 0, 0], - [0, 0, 1, 0, 0, 1, 0, 0, 1, 0]]) - score = acc_metric(expected, prediction) - self.assertAlmostEqual(0.33333333, score) - - def test_accuracy_metric_4_multilabel_classification(self): - # 100% correct - expected = np.array([[0, 0, 1, 1, 0, 1, 0, 1, 0, 1], - [1, 1, 0, 0, 1, 0, 1, 0, 1, 0], [1, 1, 0, 0, 1, 0, - 1, 0, 1, 0]]) - prediction = expected.copy() - score = acc_metric(expected, prediction) - self.assertEqual(1, score) - - # 100% incorrect - prediction = (expected.copy() - 1) * -1 - score = acc_metric(expected, prediction) - self.assertAlmostEqual(-1, score) - - # Pseudorandom - prediction = np.array([[0, 0, 0, 0, 0, 1, 1, 1, 1, 1], [0, 0, 0, 0, 0, - 1, 1, 1, 1, 1], - [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]]) - score = acc_metric(expected, prediction) - self.assertAlmostEqual(-0.0666666666, score) diff --git a/test/cli/__init__.py b/test/test_cli/__init__.py similarity index 100% rename from test/cli/__init__.py rename to test/test_cli/__init__.py diff --git a/test/cli/test_HPOlib_interface.py b/test/test_cli/test_HPOlib_interface.py similarity index 82% rename from test/cli/test_HPOlib_interface.py rename to test/test_cli/test_HPOlib_interface.py index d811a38f23..ee3ccfe5f8 100644 --- a/test/cli/test_HPOlib_interface.py +++ b/test/test_cli/test_HPOlib_interface.py @@ -51,16 +51,17 @@ def setUp(self): 'rescaling:strategy': 'min/max' } + self.output_directory = os.path.join(os.getcwd(), + '.test_HPOlib_interface') + try: - path = os.path.join(os.getcwd(), '.auto-sklearn', 'datamanager.pkl') - os.remove(path) + shutil.rmtree(self.output_directory) except Exception: pass def tearDown(self): try: - path = os.path.join(os.getcwd(), '.auto-sklearn', 'datamanager.pkl') - os.remove(path) + shutil.rmtree(self.output_directory) except Exception: pass @@ -71,12 +72,13 @@ def test_holdout(self, patch): (self.dataset_string, self.param_string) sys.argv = shlex.split(call) - HPOlib_interface.main() + HPOlib_interface.main(output_dir=self.output_directory) self.assertEqual(patch.call_count, 1) call_args, call_kwargs = patch.call_args self.assertEqual(call_args, (self.dataset_string, 'holdout', '1', self.params)) - self.assertEqual(call_kwargs, {'mode_args': None}) + self.assertEqual(call_kwargs, {'mode_args': None, + 'output_dir': self.output_directory}) @mock.patch('autosklearn.cli.base_interface.main') def test_holdout_iterative_fit(self, patch): @@ -85,13 +87,14 @@ def test_holdout_iterative_fit(self, patch): (self.dataset_string, self.param_string) sys.argv = shlex.split(call) - HPOlib_interface.main() + HPOlib_interface.main(output_dir=self.output_directory) self.assertEqual(patch.call_count, 1) call_args, call_kwargs = patch.call_args self.assertEqual(call_args, (self.dataset_string, 'holdout-iterative-fit', '1', self.params)) - self.assertEqual(call_kwargs, {'mode_args': None}) + self.assertEqual(call_kwargs, {'mode_args': None, + 'output_dir': self.output_directory}) @mock.patch('autosklearn.cli.base_interface.main') def test_testset(self, patch): @@ -101,12 +104,13 @@ def test_testset(self, patch): (self.dataset_string, self.param_string) sys.argv = shlex.split(call) - HPOlib_interface.main() + HPOlib_interface.main(output_dir=self.output_directory) self.assertEqual(patch.call_count, 1) call_args, call_kwargs = patch.call_args self.assertEqual(call_args, (self.dataset_string, 'test', '1', self.params)) - self.assertEqual(call_kwargs, {'mode_args': None}) + self.assertEqual(call_kwargs, {'mode_args': None, + 'output_dir': self.output_directory}) @mock.patch('autosklearn.cli.base_interface.main') def test_cv(self, patch): @@ -116,12 +120,13 @@ def test_cv(self, patch): (self.dataset_string, self.param_string) sys.argv = shlex.split(call) - HPOlib_interface.main() + HPOlib_interface.main(output_dir=self.output_directory) self.assertEqual(patch.call_count, 1) call_args, call_kwargs = patch.call_args self.assertEqual(call_args, (self.dataset_string, 'cv', '1', self.params)) - self.assertEqual(call_kwargs, {'mode_args': {'folds': 3}}) + self.assertEqual(call_kwargs, {'mode_args': {'folds': 3}, + 'output_dir': self.output_directory}) @mock.patch('autosklearn.cli.base_interface.main') def test_partial_cv(self, patch): @@ -133,13 +138,14 @@ def test_partial_cv(self, patch): (self.dataset_string, fold, self.param_string) sys.argv = shlex.split(call) - HPOlib_interface.main() + HPOlib_interface.main(output_dir=self.output_directory) self.assertEqual(patch.call_count, fold+1) call_args, call_kwargs = patch.call_args self.assertEqual(call_args, (self.dataset_string, 'partial-cv', '1', self.params)) self.assertEqual(call_kwargs, {'mode_args': {'folds': 3, - 'fold': fold}}) + 'fold': fold}, + 'output_dir': self.output_directory}) @mock.patch('autosklearn.cli.base_interface.main') def test_nested_cv(self, patch): @@ -149,10 +155,11 @@ def test_nested_cv(self, patch): (self.dataset_string, self.param_string) sys.argv = shlex.split(call) - HPOlib_interface.main() + HPOlib_interface.main(output_dir=self.output_directory) self.assertEqual(patch.call_count, 1) call_args, call_kwargs = patch.call_args self.assertEqual(call_args, (self.dataset_string, 'nested-cv', '1', self.params)) self.assertEqual(call_kwargs, {'mode_args': {'outer_folds': 3, - 'inner_folds': 3}}) + 'inner_folds': 3}, + 'output_dir': self.output_directory}) diff --git a/test/cli/test_SMAC_interface.py b/test/test_cli/test_SMAC_interface.py similarity index 81% rename from test/cli/test_SMAC_interface.py rename to test/test_cli/test_SMAC_interface.py index 11269b87f8..06d097f5c4 100644 --- a/test/cli/test_SMAC_interface.py +++ b/test/test_cli/test_SMAC_interface.py @@ -49,17 +49,17 @@ def setUp(self): 'random_forest:n_estimators': '100', 'rescaling:strategy': 'min/max' } + self.output_directory = os.path.join(os.getcwd(), + '.test_SMAC_interface') try: - path = os.path.join(os.getcwd(), '.auto-sklearn', 'datamanager.pkl') - os.remove(path) + shutil.rmtree(self.output_directory) except Exception: pass def tearDown(self): try: - path = os.path.join(os.getcwd(), '.auto-sklearn', 'datamanager.pkl') - os.remove(path) + shutil.rmtree(self.output_directory) except Exception: pass @@ -70,12 +70,13 @@ def test_holdout(self, patch): (self.dataset_string, self.param_string) sys.argv = shlex.split(call) - SMAC_interface.main() + SMAC_interface.main(output_dir=self.output_directory) self.assertEqual(patch.call_count, 1) call_args, call_kwargs = patch.call_args self.assertEqual(call_args, (self.dataset_string, 'holdout', 1, self.params)) - self.assertEqual(call_kwargs, {'mode_args': None}) + self.assertEqual(call_kwargs, {'mode_args': None, + 'output_dir': self.output_directory}) @mock.patch('autosklearn.cli.base_interface.main') def test_holdout_iterative_fit(self, patch): @@ -84,13 +85,14 @@ def test_holdout_iterative_fit(self, patch): (self.dataset_string, self.param_string) sys.argv = shlex.split(call) - SMAC_interface.main() + SMAC_interface.main(output_dir=self.output_directory) self.assertEqual(patch.call_count, 1) call_args, call_kwargs = patch.call_args self.assertEqual(call_args, (self.dataset_string, 'holdout-iterative-fit', 1, self.params)) - self.assertEqual(call_kwargs, {'mode_args': None}) + self.assertEqual(call_kwargs, {'mode_args': None, + 'output_dir': self.output_directory}) @mock.patch('autosklearn.cli.base_interface.main') def test_testset(self, patch): @@ -99,12 +101,13 @@ def test_testset(self, patch): (self.dataset_string, self.param_string) sys.argv = shlex.split(call) - SMAC_interface.main() + SMAC_interface.main(output_dir=self.output_directory) self.assertEqual(patch.call_count, 1) call_args, call_kwargs = patch.call_args self.assertEqual(call_args, (self.dataset_string, 'test', 1, self.params)) - self.assertEqual(call_kwargs, {'mode_args': None}) + self.assertEqual(call_kwargs, {'mode_args': None, + 'output_dir': self.output_directory}) @mock.patch('autosklearn.cli.base_interface.main') def test_cv(self, patch): @@ -113,12 +116,13 @@ def test_cv(self, patch): (self.dataset_string, self.param_string) sys.argv = shlex.split(call) - SMAC_interface.main() + SMAC_interface.main(output_dir=self.output_directory) self.assertEqual(patch.call_count, 1) call_args, call_kwargs = patch.call_args self.assertEqual(call_args, (self.dataset_string, 'cv', 1, self.params)) - self.assertEqual(call_kwargs, {'mode_args': {'folds': 3}}) + self.assertEqual(call_kwargs, {'mode_args': {'folds': 3}, + 'output_dir': self.output_directory}) @mock.patch('autosklearn.cli.base_interface.main') def test_partial_cv(self, patch): @@ -128,13 +132,14 @@ def test_partial_cv(self, patch): (fold, self.dataset_string, self.param_string) sys.argv = shlex.split(call) - SMAC_interface.main() + SMAC_interface.main(output_dir=self.output_directory) self.assertEqual(patch.call_count, fold + 1) call_args, call_kwargs = patch.call_args self.assertEqual(call_args, (self.dataset_string, 'partial-cv', 1, self.params)) self.assertEqual(call_kwargs, {'mode_args': {'folds': 3, - 'fold': fold}}) + 'fold': fold}, + 'output_dir': self.output_directory}) @mock.patch('autosklearn.cli.base_interface.main') def test_nested_cv(self, patch): @@ -143,10 +148,11 @@ def test_nested_cv(self, patch): (self.dataset_string, self.param_string) sys.argv = shlex.split(call) - SMAC_interface.main() + SMAC_interface.main(output_dir=self.output_directory) self.assertEqual(patch.call_count, 1) call_args, call_kwargs = patch.call_args self.assertEqual(call_args, (self.dataset_string, 'nested-cv', 1, self.params)) self.assertEqual(call_kwargs, {'mode_args': {'outer_folds': 3, - 'inner_folds': 3}}) + 'inner_folds': 3}, + 'output_dir': self.output_directory}) diff --git a/test/cli/test_base_interface.py b/test/test_cli/test_base_interface.py similarity index 84% rename from test/cli/test_base_interface.py rename to test/test_cli/test_base_interface.py index e6f8239143..10de325764 100644 --- a/test/cli/test_base_interface.py +++ b/test/test_cli/test_base_interface.py @@ -47,17 +47,17 @@ def setUp(self): 'one_hot_encoding:minimum_fraction': '0.01', 'rescaling:__choice__': 'min/max' } + self.output_directory = os.path.join(os.getcwd(), + '.test_base_interface') try: - path = os.path.join(os.getcwd(), '.auto-sklearn', 'datamanager.pkl') - os.remove(path) + shutil.rmtree(self.output_directory) except Exception: pass def tearDown(self): try: - path = os.path.join(os.getcwd(), '.auto-sklearn', 'datamanager.pkl') - os.remove(path) + shutil.rmtree(self.output_directory) except Exception: pass @@ -66,7 +66,8 @@ def test_holdout(self, patch): autosklearn.cli.base_interface.main(self.dataset_string, 'holdout', '1', - self.params) + self.params, + output_dir=self.output_directory) # Returns the actual call call_args = patch.call_args[0][0] result = call_args.split(",")[3].strip() @@ -77,7 +78,8 @@ def test_holdout_iterative_fit(self, patch): autosklearn.cli.base_interface.main(self.dataset_string, 'holdout-iterative-fit', '1', - self.params) + self.params, + output_dir=self.output_directory) # Returns the actual call call_args = patch.call_args[0][0] result = call_args.split(",")[3].strip() @@ -88,7 +90,8 @@ def test_testset(self, patch): autosklearn.cli.base_interface.main(self.dataset_string, 'test', '1', - self.params) + self.params, + output_dir=self.output_directory) # Returns the actual call call_args = patch.call_args[0][0] result = call_args.split(",")[3].strip() @@ -100,7 +103,8 @@ def test_cv(self, patch): 'cv', '1', self.params, - mode_args={'folds': 3}) + mode_args={'folds': 3}, + output_dir=self.output_directory) # Returns the actual call call_args = patch.call_args[0][0] result = call_args.split(",")[3].strip() @@ -116,7 +120,8 @@ def test_partial_cv(self, patch): '1', params, mode_args={'folds': 3, - 'fold': fold}) + 'fold': fold}, + output_dir=self.output_directory) # Returns the actual call call_args = patch.call_args[0][0] result = call_args.split(",")[3].strip() @@ -131,7 +136,8 @@ def test_nested_cv(self, patch): '1', self.params, mode_args={'outer_folds': 3, - 'inner_folds': 3}) + 'inner_folds': 3}, + output_dir=self.output_directory) # Returns the actual call call_args = patch.call_args[0][0] result = call_args.split(",")[3].strip() diff --git a/test/evaluation/.datasets/abalone/abalone_feat.type b/test/test_evaluation/.datasets/abalone/abalone_feat.type similarity index 100% rename from test/evaluation/.datasets/abalone/abalone_feat.type rename to test/test_evaluation/.datasets/abalone/abalone_feat.type diff --git a/test/evaluation/.datasets/abalone/abalone_public.info b/test/test_evaluation/.datasets/abalone/abalone_public.info similarity index 100% rename from test/evaluation/.datasets/abalone/abalone_public.info rename to test/test_evaluation/.datasets/abalone/abalone_public.info diff --git a/test/evaluation/.datasets/abalone/abalone_test.data b/test/test_evaluation/.datasets/abalone/abalone_test.data similarity index 100% rename from test/evaluation/.datasets/abalone/abalone_test.data rename to test/test_evaluation/.datasets/abalone/abalone_test.data diff --git a/test/evaluation/.datasets/abalone/abalone_test.solution b/test/test_evaluation/.datasets/abalone/abalone_test.solution similarity index 100% rename from test/evaluation/.datasets/abalone/abalone_test.solution rename to test/test_evaluation/.datasets/abalone/abalone_test.solution diff --git a/test/evaluation/.datasets/abalone/abalone_train.data b/test/test_evaluation/.datasets/abalone/abalone_train.data similarity index 100% rename from test/evaluation/.datasets/abalone/abalone_train.data rename to test/test_evaluation/.datasets/abalone/abalone_train.data diff --git a/test/evaluation/.datasets/abalone/abalone_train.solution b/test/test_evaluation/.datasets/abalone/abalone_train.solution similarity index 100% rename from test/evaluation/.datasets/abalone/abalone_train.solution rename to test/test_evaluation/.datasets/abalone/abalone_train.solution diff --git a/test/evaluation/.datasets/abalone/abalone_valid.data b/test/test_evaluation/.datasets/abalone/abalone_valid.data similarity index 100% rename from test/evaluation/.datasets/abalone/abalone_valid.data rename to test/test_evaluation/.datasets/abalone/abalone_valid.data diff --git a/test/evaluation/.datasets/abalone/abalone_valid.solution b/test/test_evaluation/.datasets/abalone/abalone_valid.solution similarity index 100% rename from test/evaluation/.datasets/abalone/abalone_valid.solution rename to test/test_evaluation/.datasets/abalone/abalone_valid.solution diff --git a/test/scores/__init__.py b/test/test_evaluation/__init__.py similarity index 96% rename from test/scores/__init__.py rename to test/test_evaluation/__init__.py index cc3cd7becd..49b2047416 100644 --- a/test/scores/__init__.py +++ b/test/test_evaluation/__init__.py @@ -1,2 +1,4 @@ # -*- encoding: utf-8 -*- __author__ = 'feurerm' + + diff --git a/test/test_evaluation/evaluation_util.py b/test/test_evaluation/evaluation_util.py new file mode 100644 index 0000000000..9940c446b1 --- /dev/null +++ b/test/test_evaluation/evaluation_util.py @@ -0,0 +1,261 @@ +import functools +import os +import sys +import traceback + +if sys.version_info[0] == 2: + import unittest2 as unittest +else: + import unittest + +import numpy as np +from numpy.linalg import LinAlgError +import sklearn.datasets + +from autosklearn.constants import * +from autosklearn.util.data import convert_to_bin +from autosklearn.data.competition_data_manager import CompetitionDataManager +from autosklearn.pipeline.util import get_dataset + +N_TEST_RUNS = 5 + + +class Dummy(object): + pass + + +class BaseEvaluatorTest(unittest.TestCase): + def _fit(self, evaluator): + return self.__fit(evaluator.fit) + + def _partial_fit(self, evaluator, fold): + partial_fit = functools.partial(evaluator.partial_fit, fold=fold) + return self.__fit(partial_fit) + + def __fit(self, function_handle): + """Allow us to catch known and valid exceptions for all evaluate + scripts.""" + try: + function_handle() + return True + except KeyError as e: + if 'Floating-point under-/overflow occurred at epoch' in \ + e.args[0] or \ + 'removed all features' in e.args[0] or \ + 'failed to create intent' in e.args[0]: + pass + else: + traceback.print_exc() + raise e + except ValueError as e: + if 'Floating-point under-/overflow occurred at epoch' in e.args[ + 0] or \ + 'removed all features' in e.args[0] or \ + 'failed to create intent' in e.args[0]: + pass + else: + raise e + except LinAlgError as e: + if 'not positive definite, even with jitter' in e.args[0]: + pass + else: + raise e + except RuntimeWarning as e: + if 'invalid value encountered in sqrt' in e.args[0]: + pass + elif 'divide by zero encountered in divide' in e.args[0]: + pass + else: + raise e + except UserWarning as e: + if 'FastICA did not converge' in e.args[0]: + pass + else: + raise e + + +def get_multiclass_classification_datamanager(): + X_train, Y_train, X_test, Y_test = get_dataset('iris') + indices = list(range(X_train.shape[0])) + np.random.seed(1) + np.random.shuffle(indices) + X_train = X_train[indices] + Y_train = Y_train[indices] + + X_valid = X_test[:25, ] + Y_valid = Y_test[:25, ] + X_test = X_test[25:, ] + Y_test = Y_test[25:, ] + + D = Dummy() + D.info = { + 'metric': BAC_METRIC, + 'task': MULTICLASS_CLASSIFICATION, + 'is_sparse': False, + 'label_num': 3 + } + D.data = { + 'X_train': X_train, + 'Y_train': Y_train, + 'X_valid': X_valid, + 'X_test': X_test + } + D.feat_type = ['numerical', 'Numerical', 'numerical', 'numerical'] + return D, 1.01 + + +def get_abalone_datamanager(): + dataset = 'abalone' + dataset_path = os.path.join(os.path.dirname(__file__), '.datasets', + dataset) + D = CompetitionDataManager(dataset_path) + return D, 0.87 + + +def get_multilabel_classification_datamanager(): + X_train, Y_train, X_test, Y_test = get_dataset('iris') + indices = list(range(X_train.shape[0])) + np.random.seed(1) + np.random.shuffle(indices) + X_train = X_train[indices] + Y_train = Y_train[indices] + + Y_train = np.array(convert_to_bin(Y_train, 3)) + #for i in range(Y_train_.shape[0]): + # Y_train_[:, Y_train[i]] = 1 + #Y_train = Y_train_ + Y_test = np.array(convert_to_bin(Y_test, 3)) + #for i in range(Y_test_.shape[0]): + # Y_test_[:, Y_test[i]] = 1 + #Y_test = Y_test_ + + X_valid = X_test[:25, ] + Y_valid = Y_test[:25, ] + X_test = X_test[25:, ] + Y_test = Y_test[25:, ] + + D = Dummy() + D.info = { + 'metric': ACC_METRIC, + 'task': MULTILABEL_CLASSIFICATION, + 'is_sparse': False, + 'label_num': 3 + } + D.data = { + 'X_train': X_train, + 'Y_train': Y_train, + 'X_valid': X_valid, + 'X_test': X_test + } + D.feat_type = ['numerical', 'Numerical', 'numerical', 'numerical'] + return D, 0.67 + + +def get_binary_classification_datamanager(): + X_train, Y_train, X_test, Y_test = get_dataset('iris') + indices = list(range(X_train.shape[0])) + np.random.seed(1) + np.random.shuffle(indices) + X_train = X_train[indices] + Y_train = Y_train[indices] + + eliminate_class_two = Y_train != 2 + X_train = X_train[eliminate_class_two] + Y_train = Y_train[eliminate_class_two] + + eliminate_class_two = Y_test != 2 + X_test = X_test[eliminate_class_two] + Y_test = Y_test[eliminate_class_two] + + X_valid = X_test[:25, ] + Y_valid = Y_test[:25, ] + X_test = X_test[25:, ] + Y_test = Y_test[25:, ] + + D = Dummy() + D.info = { + 'metric': AUC_METRIC, + 'task': BINARY_CLASSIFICATION, + 'is_sparse': False, + 'label_num': 2 + } + D.data = { + 'X_train': X_train, + 'Y_train': Y_train, + 'X_valid': X_valid, + 'X_test': X_test + } + D.feat_type = ['numerical', 'Numerical', 'numerical', 'numerical'] + return D, 1.01 + + +def get_regression_datamanager(): + X_train, Y_train, X_test, Y_test = get_dataset('boston') + indices = list(range(X_train.shape[0])) + np.random.seed(1) + np.random.shuffle(indices) + X_train = X_train[indices] + Y_train = Y_train[indices] + + X_valid = X_test[:200, ] + Y_valid = Y_test[:200, ] + X_test = X_test[200:, ] + Y_test = Y_test[200:, ] + + D = Dummy() + D.info = { + 'metric': R2_METRIC, + 'task': REGRESSION, + 'is_sparse': False, + 'label_num': 1 + } + D.data = { + 'X_train': X_train, + 'Y_train': Y_train, + 'X_valid': X_valid, + 'X_test': X_test + } + D.feat_type = ['numerical', 'Numerical', 'numerical', 'numerical', + 'numerical', 'numerical', 'numerical', 'numerical', + 'numerical', 'numerical', 'numerical'] + return D, 1.05 + + +def get_500_classes_datamanager(): + weights = ([0.002] * 475) + ([0.001] * 25) + X, Y = sklearn.datasets.make_classification(n_samples=1000, + n_features=20, + n_classes=500, + n_clusters_per_class=1, + n_informative=15, + n_redundant=5, + n_repeated=0, + weights=weights, + flip_y=0, + class_sep=1.0, + hypercube=True, + shift=None, + scale=1.0, + shuffle=True, + random_state=1) + + assert (25 == np.sum(np.bincount(Y) == 1), np.sum(np.bincount(Y) == 1)) + D = Dummy() + D.info = { + 'metric': ACC_METRIC, + 'task': MULTICLASS_CLASSIFICATION, + 'is_sparse': False, + 'label_num': 500 + } + D.data = {'X_train': X, 'Y_train': Y, 'X_valid': X, 'X_test': X} + D.feat_type = ['numerical'] * 20 + return D, 1.01 + + +def get_dataset_getters(): + return [get_binary_classification_datamanager, + get_multiclass_classification_datamanager, + get_multilabel_classification_datamanager, + get_500_classes_datamanager, + get_abalone_datamanager, + get_regression_datamanager] diff --git a/test/test_evaluation/test_cv_evaluator.py b/test/test_evaluation/test_cv_evaluator.py new file mode 100644 index 0000000000..07492c84a3 --- /dev/null +++ b/test/test_evaluation/test_cv_evaluator.py @@ -0,0 +1,62 @@ +# -*- encoding: utf-8 -*- +from __future__ import print_function +import copy +import os +import sys +import numpy as np + +from autosklearn.evaluation import CVEvaluator + +this_directory = os.path.dirname(__file__) +sys.path.append(this_directory) +from evaluation_util import get_dataset_getters, BaseEvaluatorTest + +N_TEST_RUNS = 5 + + +class CVEvaluator_Test(BaseEvaluatorTest): + _multiprocess_can_split_ = True + + def test_datasets(self): + for getter in get_dataset_getters(): + testname = '%s_%s' % (os.path.basename(__file__). + replace('.pyc', '').replace('.py', ''), + getter.__name__) + with self.subTest(testname): + D, upper_error_bound = getter() + output_directory = os.path.join(os.getcwd(), '.%s' % testname) + err = np.zeros([N_TEST_RUNS]) + for i in range(N_TEST_RUNS): + D_ = copy.deepcopy(D) + evaluator = CVEvaluator(D_, output_directory, None) + + evaluator.fit() + err[i] = evaluator.loss_and_predict()[0] + + self.assertTrue(np.isfinite(err[i])) + self.assertLessEqual(err[i], upper_error_bound) + for model_idx in range(10): + model = evaluator.models[model_idx] + self.assertIsNotNone(model) + + D_ = copy.deepcopy(D) + evaluator = CVEvaluator(D_, output_directory, None) + for j in range(5): + evaluator.partial_fit(j) + model = evaluator.models[j] + self.assertIsNotNone(model) + for j in range(5, 10): + model = evaluator.models[j] + self.assertIsNone(model) + + + +# for getter in get_dataset_getters(): +# D, upper_error_bound = getter() +# testname = '%s_%s' % (os.path.basename(__file__). +# replace('.pyc','').replace('.py', ''), +# getter.__name__) +# output_directory = os.path.join(os.getcwd(), '._%s' % testname) +# setattr(CVEvaluator_Test, 'test_%s' % testname, +# generate(D, upper_error_bound, output_directory)) +# print(getattr(CVEvaluator_Test, 'test_%s' % testname)) diff --git a/test/test_evaluation/test_holdout_evaluator.py b/test/test_evaluation/test_holdout_evaluator.py new file mode 100644 index 0000000000..e27e4ff4f4 --- /dev/null +++ b/test/test_evaluation/test_holdout_evaluator.py @@ -0,0 +1,134 @@ +# -*- encoding: utf-8 -*- +from __future__ import print_function +import copy +import os +import shutil +import sys + +import numpy as np + +from autosklearn.constants import * +from autosklearn.evaluation.holdout_evaluator import HoldoutEvaluator +from autosklearn.util.pipeline import get_configuration_space + +this_directory = os.path.dirname(__file__) +sys.path.append(this_directory) +from evaluation_util import get_regression_datamanager, BaseEvaluatorTest, \ + get_binary_classification_datamanager, get_dataset_getters + +N_TEST_RUNS = 10 + + +class Dummy(object): + def __init__(self): + self.name = 'dummy' + + +class HoldoutEvaluatorTest(BaseEvaluatorTest): + _multiprocess_can_split_ = True + + def teardown(self): + try: + shutil.rmtree(self.output_dir) + except Exception: + pass + + def test_file_output(self): + self.output_dir = os.path.join(os.getcwd(), '.test') + + D, _ = get_regression_datamanager() + D.name = 'test' + + configuration_space = get_configuration_space(D.info) + + while True: + configuration = configuration_space.sample_configuration() + evaluator = HoldoutEvaluator(D, self.output_dir, configuration, + with_predictions=True, + all_scoring_functions=True, + output_y_test=True) + + if not self._fit(evaluator): + continue + evaluator.loss_and_predict() + evaluator.file_output() + + self.assertTrue(os.path.exists(os.path.join( + self.output_dir, '.auto-sklearn', 'true_targets_ensemble.npy'))) + break + + def test_predict_proba_binary_classification(self): + self.output_dir = os.path.join(os.getcwd(), + '.test_predict_proba_binary_classification') + D, _ = get_binary_classification_datamanager() + + class Dummy2(object): + + def predict_proba(self, y, batch_size=200): + return np.array([[0.1, 0.9], [0.7, 0.3]]) + + model = Dummy2() + task_type = BINARY_CLASSIFICATION + + configuration_space = get_configuration_space( + D.info, + include_estimators=['extra_trees'], + include_preprocessors=['select_rates']) + configuration = configuration_space.sample_configuration() + + evaluator = HoldoutEvaluator(D, self.output_dir, configuration) + pred = evaluator.predict_proba(None, model, task_type, + D.data['Y_train']) + expected = [[0.9], [0.3]] + for i in range(len(expected)): + self.assertEqual(expected[i], pred[i][1]) + + def test_datasets(self): + for getter in get_dataset_getters(): + testname = '%s_%s' % (os.path.basename(__file__). + replace('.pyc', '').replace('.py', ''), + getter.__name__) + with self.subTest(testname): + D, upper_error_bound = getter() + output_directory = os.path.join(os.getcwd(), '.%s' % testname) + self.output_directory = output_directory + + err = np.zeros([N_TEST_RUNS]) + for i in range(N_TEST_RUNS): + D_ = copy.deepcopy(D) + evaluator = HoldoutEvaluator(D_, self.output_directory, None) + + evaluator.fit() + err[i] = evaluator.loss_and_predict()[0] + + self.assertTrue(np.isfinite(err[i])) + self.assertLessEqual(err[i], upper_error_bound) + + +# def generate(D, upper_error_bound, output_directory): +# def run_test(self): +# self.output_directory = output_directory +# +# err = np.zeros([N_TEST_RUNS]) +# for i in range(N_TEST_RUNS): +# D_ = copy.deepcopy(D) +# evaluator = HoldoutEvaluator(D_, self.output_directory, None) +# +# evaluator.fit() +# +# err[i] = evaluator.predict() +# +# self.assertTrue(np.isfinite(err[i])) +# self.assertLessEqual(err[i], upper_error_bound) +# +# return run_test +# +# +# for getter in get_dataset_getters(): +# D, upper_error_bound = getter() +# testname = '%s_%s' % (os.path.basename(__file__). +# replace('.pyc', '').replace('.py', ''), +# getter.__name__) +# output_directory = os.path.join(os.getcwd(), '.%s' % testname) +# setattr(HoldoutEvaluatorTest, 'test_%s' % testname, +# generate(D, upper_error_bound, output_directory)) diff --git a/test/test_evaluation/test_nested_cv_evaluator.py b/test/test_evaluation/test_nested_cv_evaluator.py new file mode 100644 index 0000000000..727c7b77ad --- /dev/null +++ b/test/test_evaluation/test_nested_cv_evaluator.py @@ -0,0 +1,82 @@ +# -*- encoding: utf-8 -*- +from __future__ import print_function +import copy +import os +import sys + +import numpy as np + +this_directory = os.path.dirname(__file__) +sys.path.append(this_directory) +from evaluation_util import get_dataset_getters, BaseEvaluatorTest + +from autosklearn.evaluation import NestedCVEvaluator + + +N_TEST_RUNS = 10 + + +class Dummy(object): + pass + + +class NestedCVEvaluator_Test(BaseEvaluatorTest): + _multiprocess_can_split_ = True + + def test_datasets(self): + for getter in get_dataset_getters(): + testname = '%s_%s' % (os.path.basename(__file__). + replace('.pyc', '').replace('.py', ''), + getter.__name__) + with self.subTest(testname): + D, upper_error_bound = getter() + output_directory = os.path.join(os.getcwd(), '.%s' % testname) + err = np.zeros([N_TEST_RUNS]) + for i in range(N_TEST_RUNS): + D_ = copy.deepcopy(D) + evaluator = NestedCVEvaluator(D_, output_directory, None) + + evaluator.fit() + + err[i] = evaluator.loss_and_predict()[0] + + self.assertTrue(np.isfinite(err[i])) + self.assertLessEqual(err[i], upper_error_bound) + for model_idx in range(5): + model = evaluator.outer_models[model_idx] + self.assertIsNotNone(model) + model = evaluator.inner_models[model_idx] + self.assertIsNotNone(model) + +# def generate(D, upper_error_bound, output_directory): +# def run_test(self): +# self.output_directory = output_directory +# +# err = np.zeros([N_TEST_RUNS]) +# for i in range(N_TEST_RUNS): +# D_ = copy.deepcopy(D) +# evaluator = NestedCVEvaluator(D_, self.output_directory, None) +# +# evaluator.fit() +# +# err[i] = evaluator.predict() +# +# self.assertTrue(np.isfinite(err[i])) +# self.assertLessEqual(err[i], upper_error_bound) +# for model_idx in range(5): +# model = evaluator.outer_models[model_idx] +# self.assertIsNotNone(model) +# model = evaluator.inner_models[model_idx] +# self.assertIsNotNone(model) +# +# return run_test +# +# +# for getter in get_dataset_getters(): +# D, upper_error_bound = getter() +# testname = '%s_%s' % (os.path.basename(__file__). +# replace('.pyc', '').replace('.py', ''), +# getter.__name__) +# output_directory = os.path.join(os.getcwd(), '._%s' % testname) +# setattr(NestedCVEvaluator_Test, 'test_%s' % testname, +# generate(D, upper_error_bound, output_directory)) \ No newline at end of file diff --git a/test/evaluation/test_resampling.py b/test/test_evaluation/test_resampling.py similarity index 100% rename from test/evaluation/test_resampling.py rename to test/test_evaluation/test_resampling.py diff --git a/test/evaluation/__init__.py b/test/test_metric/__init__.py similarity index 100% rename from test/evaluation/__init__.py rename to test/test_metric/__init__.py diff --git a/test/test_metric/test_classification_metrics.py b/test/test_metric/test_classification_metrics.py new file mode 100644 index 0000000000..95e813fee1 --- /dev/null +++ b/test/test_metric/test_classification_metrics.py @@ -0,0 +1,776 @@ +# -*- encoding: utf-8 -*- +from __future__ import print_function +import sys +if sys.version_info[0] == 2: + import unittest2 as unittest +else: + import unittest +import numpy as np +from autosklearn.constants import * +from autosklearn.metrics.util import normalize_array +from autosklearn.metrics import acc_metric, auc_metric, bac_metric, \ + f1_metric, pac_metric + + +def copy_and_preprocess_arrays(solution, prediction): + solution = solution.copy() + prediction = prediction.copy() + return solution, prediction + + +class AccuracyTest(unittest.TestCase): + _multiprocess_can_split_ = True + + def test_accuracy_metric_4_binary_classification(self): + # 100% correct + expected = np.array([0, 1, 1, 1, 0, 0, 1, 1, 1, 0]).reshape((-1, 1)) + prediction = np.array([[1., 0.], [0., 1.], [0., 1.], [0., 1.], + [1., 0.], [1., 0.], [0., 1.], [0., 1.], + [0., 1.], [1., 0.]]) + score = acc_metric(expected, prediction, task=BINARY_CLASSIFICATION) + self.assertEqual(1, score) + + # 100% incorrect + prediction = (prediction.copy() - 1) * -1 + score = acc_metric(expected, prediction, task=BINARY_CLASSIFICATION) + self.assertAlmostEqual(-1, score) + + # Random + prediction = np.array([[1., 0.], [1., 0.], [1., 0.], [1., 0.], [1., 0.], + [0., 1.], [0., 1.], [0., 1.], [0., 1.], [0., 1.]]) + score = acc_metric(expected, prediction, task=BINARY_CLASSIFICATION) + self.assertAlmostEqual(0, score) + + def test_accuracy_metric_4_multiclass_classification(self): + # 100% correct + expected = np.array([1, 1, 0, 0, 1, 0, 2, 0, 2, 1]) + prediction = np.array([[0.0, 1.0, 0.0], [0.0, 1.0, 0.0], + [1.0, 0.0, 0.0], [1.0, 0.0, 0.0], + [0.0, 1.0, 0.0], [1.0, 0.0, 0.0], + [0.0, 0.0, 1.0], [1.0, 0.0, 0.0], + [0.0, 0.0, 1.0], [0.0, 1.0, 0.0]]) + score = acc_metric(expected, prediction, task=MULTICLASS_CLASSIFICATION) + self.assertEqual(1, score) + + # 100% incorrect + prediction = (prediction.copy() - 1) * -1 + score = acc_metric(expected, prediction, task=MULTICLASS_CLASSIFICATION) + self.assertAlmostEqual(-0.5, score) + + # Pseudorandom + prediction = np.array([[1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 1.0], + [1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 1.0], + [1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 1.0], + [1.0, 0.0, 0.0]]) + score = acc_metric(expected, prediction, task=MULTICLASS_CLASSIFICATION) + self.assertAlmostEqual(0.1, score) + + def test_accuracy_metric_4_multilabel_classification(self): + # 100% correct + expected = np.array([[0, 1, 1], [0, 1, 1], [1, 0, 0], [1, 0, 0], + [0, 1, 1], [1, 0, 0], [0, 1, 1], [1, 0, 0], + [0, 1, 1], [1, 0, 0]]) + prediction = expected.copy() + score = acc_metric(expected, prediction.astype(float), + task=MULTILABEL_CLASSIFICATION) + self.assertEqual(1, score) + + # 100% incorrect + prediction = (prediction.copy() - 1) * -1 + score = acc_metric(expected, prediction.astype(float), + task=MULTILABEL_CLASSIFICATION) + self.assertAlmostEqual(-1, score) + + # Pseudorandom + prediction = np.array([[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], + [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [1.0, 1.0, 1.0], + [1.0, 1.0, 1.0], [1.0, 1.0, 1.0], [1.0, 1.0, 1.0], + [1.0, 1.0, 1.0]]) + score = acc_metric(expected, prediction, task=MULTILABEL_CLASSIFICATION) + self.assertAlmostEqual(-0.0666666666, score) + + +class AreaUnderCurveTest(unittest.TestCase): + _multiprocess_can_split_ = True + + def test_cases_binary_score_verification(self): + cases = [] + sol = np.array([0, 0, 1, 1]) + pred = np.array([[1, 0], [1, 0], [0, 1], [0, 1]]) + + cases.append(('perfect', sol, pred, 1.0)) + cases.append(('anti-perfect', sol, 1 - pred, -1.0)) + + uneven_proba = np.array( + [[0.7, 0.3], [0.4, 0.6], [0.49, 0.51], [0.2, 0.8]]) + + cases.append(('uneven proba', sol, uneven_proba, 0.5)) + + eps = 1.e-15 + ties = np.array([[0.5 + eps, 0.5 - eps], [0.5 - eps, 0.5 + eps], + [0.5 + eps, 0.5 - eps], [0.5 - eps, 0.5 + eps]]) + cases.append(('ties_broken', sol, ties, 0.0)) + + ties = np.array([[0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5]]) + cases.append(('ties', sol, ties, 0.0)) + + sol = np.array([0, 1, 1]) + pred = np.array([[0.5, 0.5], [0.5, 0.5], [0.5, 0.5]]) + cases.append(('even proba', sol, pred, 0.0)) + + _pred = np.array([[1, 0], [0, 1], [0, 1]]) + pred = np.array([sum(_pred) * 1. / len(_pred)] * len(_pred)) + cases.append(('correct PAC prior', sol, pred, 0.0)) + + pred = np.array([[1., 1.], [1., 1.], [1., 1.]]) + cases.append(('all positive', sol, pred, 0.0)) + + pred = np.array([[0, 0], [0, 0], [0, 0]]) + cases.append(('all negative', sol, pred, 0.0)) + + for case in cases: + testname, sol, pred, result = case + + pred = pred.astype(np.float32) + with self.subTest('%s' % testname): + sol, pred = copy_and_preprocess_arrays(sol, pred) + auc = auc_metric(sol, pred) + self.assertAlmostEqual(auc, result) + + def test_cases_multiclass_score_verification(self): + cases = [] + sol = np.array([0, 1, 0, 0]) + pred = np.array([[1.0, 0.0, 0.0], [0.0, 1.0, 0.0], + [1.0, 0.0, 0.0], [1.0, 0.0, 0.0]]) + + cases.append(('3 classes perfect', sol, pred, 0.333333333333)) + + pred = np.array([[0, 1, 0], [0, 0, 1], [0, 1, 0], [0, 0, 1]]) + cases.append(('all classes wrong', sol, pred, -0.555555555556)) + + pred = np.array([[1. / 3, 1. / 3, 1. / 3], [1. / 3, 1. / 3, 1. / 3], + [1. / 3, 1. / 3, 1. / 3], [1. / 3, 1. / 3, 1. / 3]]) + cases.append(('equi proba', sol, pred, -0.333333333333)) + + pred = np.array([[0.2, 0, 0.5], [0.8, 0.4, 0.1], [0.9, 0.1, 0.2], + [0.7, 0.3, 0.3]]) + cases.append(('sum(proba) < 1.0', sol, pred, -0.111111111111)) + + pred = np.array([[0.75, 0.25, 0.], [0.75, 0.25, 0.], [0.75, 0.25, 0.], + [0.75, 0.25, 0.]]) + cases.append(('predict prior', sol, pred, -0.333333333333)) + + for case in cases: + testname, sol, pred, result = case + + pred = pred.astype(np.float32) + with self.subTest('%s' % testname): + sol, pred = copy_and_preprocess_arrays(sol, pred) + bac = auc_metric(sol, pred, task=MULTICLASS_CLASSIFICATION) + self.assertAlmostEqual(bac, result) + + def test_cases_multilabel_1l(self): + cases = [] + num = 2 + + sol = np.array([[1, 1, 1], [0, 0, 0], [0, 0, 0], [0, 0, 0]]) + sol3 = sol[:, 0:num] + if num == 1: + sol3 = np.array([sol3[:, 0]]).transpose() + + cases.append(('{} labels perfect'.format(num), sol3, sol3, 1.0)) + + cases.append(('All wrong, in the multi-label sense', sol3, 1 - sol3, + -1.0)) + + pred = np.array([[0.5, 0.5, 0.5], [0.5, 0.5, 0.5], [0.5, 0.5, 0.5], + [0.5, 0.5, 0.5]]) + if num == 1: + pred = np.array([pred[:, 0]]).transpose() + else: + pred = pred[:, 0:num] + cases.append(('All equi proba: 0.5', sol3, pred, 0.0)) + + pred = np.array([[0.25, 0.25, 0.25], [0.25, 0.25, 0.25], [0.25, 0.25, 0.25], + [0.25, 0.25, 0.25]]) + if num == 1: + pred = np.array([pred[:, 0]]).transpose() + else: + pred = pred[:, 0:num] + cases.append(('All equi proba, prior: 0.25', sol3, pred, 0.0)) + + pred = np.array([[0.2, 0.2, 0.2], [0.8, 0.8, 0.8], [0.9, 0.9, 0.9], + [0.7, 0.7, 0.7]]) + if num == 1: + pred = np.array([pred[:, 0]]).transpose() + else: + pred = pred[:, 0:num] + cases.append(('Some proba', sol3, pred, -1.0)) + + pred = np.array([[0.2, 0.2, 0.2], [0.8, 0.8, 0.8], [0.9, 0.9, 0.9], + [0.7, 0.7, 0.7]]) + if num == 1: + pred = np.array([pred[:, 0]]).transpose() + else: + pred = pred[:, 0:num] + cases.append(('Invert both solution and prediction', 1 - sol3, pred, + 1.0)) + + for case in cases: + testname, sol, pred, result = case + + pred = pred.astype(np.float32) + with self.subTest('%s' % testname): + sol, pred = copy_and_preprocess_arrays(sol, pred) + auc = auc_metric(sol, pred, task=MULTILABEL_CLASSIFICATION) + self.assertAlmostEqual(auc, result) + + def test_cases_multilabel_2(self): + cases = [] + + sol4 = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1], [0, 0, 1]]) + cases.append(('Three labels perfect', sol4, sol4, 1.0)) + + cases.append(('Three classes all wrong, in the multi-label sense', + sol4, 1 - sol4, -1.0)) + + pred = np.array([[1 / 3, 1 / 3, 1 / 3], [1 / 3, 1 / 3, 1 / 3], + [1 / 3, 1 / 3, 1 / 3], [1 / 3, 1 / 3, 1 / 3]]) + cases.append(('Three classes equi proba', sol4, pred, 0.0)) + + pred = np.array([[0.2, 0, 0.5], [0.8, 0.4, 0.1], [0.9, 0.1, 0.2], + [0.7, 0.3, 0.3]]) + cases.append(('Three classes some proba that do not add up', sol4, + pred, 0.0)) + + pred = np.array([[0.25, 0.25, 0.5], [0.25, 0.25, 0.5], + [0.25, 0.25, 0.5], [0.25, 0.25, 0.5]]) + cases.append(('Three classes predict prior', sol4, pred, 0.0)) + + for case in cases: + testname, sol, pred, result = case + + pred = pred.astype(np.float32) + with self.subTest('%s' % testname): + sol, pred = copy_and_preprocess_arrays(sol, pred) + auc = auc_metric(sol, pred, task=MULTILABEL_CLASSIFICATION) + self.assertAlmostEqual(auc, result) + + +class BalancedAccurayTest(unittest.TestCase): + _multiprocess_can_split_ = True + + def test_cases_binary_score_verification(self): + cases = [] + sol = np.array([0, 0, 1, 1]) + pred = np.array([[1, 0], [1, 0], [0, 1], [0, 1]]) + + cases.append(('perfect', sol, pred, 1.0)) + cases.append(('anti-perfect', sol, 1 - pred, -1.0,)) + + uneven_proba = np.array( + [[0.7, 0.3], [0.4, 0.6], [0.49, 0.51], [0.2, 0.8]]) + + cases.append(('uneven proba', sol, uneven_proba, 0.5)) + + eps = 1.e-15 + ties = np.array([[0.5 + eps, 0.5 - eps], [0.5 - eps, 0.5 + eps], + [0.5 + eps, 0.5 - eps], [0.5 - eps, 0.5 + eps]]) + cases.append(('ties_broken', sol, ties, 0.0)) + + ties = np.array([[0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5]]) + cases.append(('ties', sol, ties, 0.0)) + + sol = np.array([0, 1, 1]) + pred = np.array([[0.5, 0.5], [0.5, 0.5], [0.5, 0.5]]) + cases.append(('even proba', sol, pred, 0.0)) + + _pred = np.array([[1, 0], [0, 1], [0, 1]]) + pred = np.array([sum(_pred) * 1. / len(_pred)] * len(_pred)) + cases.append(('correct PAC prior', sol, pred, 0.0)) + + pred = np.array([[1., 1.], [1., 1.], [1., 1.]]) + cases.append(('all positive', sol, pred, 0.0)) + + pred = np.array([[0, 0], [0, 0], [0, 0]]) + cases.append(('all negative', sol, pred, 0.0)) + + for case in cases: + testname, sol, pred, result = case + + pred = pred.astype(np.float32) + with self.subTest('%s' % testname): + sol, pred = copy_and_preprocess_arrays(sol, pred) + bac = bac_metric(sol, pred, task=BINARY_CLASSIFICATION) + self.assertAlmostEqual(bac, result) + + def test_cases_multiclass_score_verification(self): + cases = [] + sol = np.array([0, 1, 0, 0]) + pred = np.array([[1, 0, 0], [0, 1, 0], [1, 0, 0], [1, 0, 0]]) + + cases.append(('3 classes perfect', sol, pred, 1.0)) + + cases.append(('all classes wrong', sol, 1 - pred, 0.0)) + + pred = np.array([[1. / 3, 1. / 3, 1. / 3], [1. / 3, 1. / 3, 1. / 3], + [1. / 3, 1. / 3, 1. / 3], [1. / 3, 1. / 3, 1. / 3]]) + cases.append(('equi proba', sol, pred, 0.5)) + + pred = np.array([[0.2, 0, 0.5], [0.8, 0.4, 0.1], [0.9, 0.1, 0.2], + [0.7, 0.3, 0.3]]) + cases.append(('sum(proba) < 1.0', sol, pred, 0.333333333333)) + + pred = np.array([[0.75, 0.25, 0.], [0.75, 0.25, 0.], [0.75, 0.25, 0.], + [0.75, 0.25, 0.]]) + cases.append(('predict prior', sol, pred, 0.5)) + + for case in cases: + testname, sol, pred, result = case + + pred = pred.astype(np.float32) + with self.subTest('%s' % testname): + sol, pred = copy_and_preprocess_arrays(sol, pred) + bac = bac_metric(sol, pred, task=MULTICLASS_CLASSIFICATION) + self.assertAlmostEqual(bac, result) + + def test_cases_multilabel_1l(self): + cases = [] + num = 2 + + sol = np.array([[1, 1, 1], [0, 0, 0], [0, 0, 0], [0, 0, 0]]) + sol3 = sol[:, 0:num] + if num == 1: + sol3 = np.array([sol3[:, 0]]).transpose() + + cases.append(('{} labels perfect'.format(num), sol3, sol3, 1.0)) + + cases.append(('All wrong, in the multi-label sense', sol3, 1 - sol3, + -1.0)) + + pred = np.array([[0.5, 0.5, 0.5], [0.5, 0.5, 0.5], [0.5, 0.5, 0.5], + [0.5, 0.5, 0.5]]) + if num == 1: + pred = np.array([pred[:, 0]]).transpose() + else: + pred = pred[:, 0:num] + cases.append(('All equi proba: 0.5', sol3, pred, 0.0)) + + pred = np.array( + [[0.25, 0.25, 0.25], [0.25, 0.25, 0.25], [0.25, 0.25, 0.25], + [0.25, 0.25, 0.25]]) + if num == 1: + pred = np.array([pred[:, 0]]).transpose() + else: + pred = pred[:, 0:num] + cases.append(('All equi proba, prior: 0.25', sol3, pred, 0.0)) + + pred = np.array([[0.2, 0.2, 0.2], [0.8, 0.8, 0.8], [0.9, 0.9, 0.9], + [0.7, 0.7, 0.7]]) + if num == 1: + pred = np.array([pred[:, 0]]).transpose() + else: + pred = pred[:, 0:num] + cases.append(('Some proba', sol3, pred, -1.0)) + + pred = np.array([[0.2, 0.2, 0.2], [0.8, 0.8, 0.8], [0.9, 0.9, 0.9], + [0.7, 0.7, 0.7]]) + if num == 1: + pred = np.array([pred[:, 0]]).transpose() + else: + pred = pred[:, 0:num] + cases.append(('Invert both solution and prediction', 1 - sol3, pred, + 1.0)) + + for case in cases: + testname, sol, pred, result = case + + pred = pred.astype(np.float32) + with self.subTest('%s' % testname): + sol, pred = copy_and_preprocess_arrays(sol, pred) + bac = bac_metric(sol, pred, task=MULTILABEL_CLASSIFICATION) + self.assertAlmostEqual(bac, result) + + def test_cases_multilabel_2(self): + cases = [] + + sol4 = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1], [0, 0, 1]]) + cases.append(('Three labels perfect', sol4, sol4, 1.0)) + + cases.append(('Three classes all wrong, in the multi-label sense', + sol4, 1 - sol4, -1.0)) + + pred = np.array([[1 / 3, 1 / 3, 1 / 3], [1 / 3, 1 / 3, 1 / 3], + [1 / 3, 1 / 3, 1 / 3], [1 / 3, 1 / 3, 1 / 3]]) + cases.append(('Three classes equi proba', sol4, pred, 0.0)) + + pred = np.array([[0.2, 0, 0.5], [0.8, 0.4, 0.1], [0.9, 0.1, 0.2], + [0.7, 0.3, 0.3]]) + cases.append(('Three classes some proba that do not add up', sol4, + pred, -0.5)) + + pred = np.array([[0.25, 0.25, 0.5], [0.25, 0.25, 0.5], + [0.25, 0.25, 0.5], [0.25, 0.25, 0.5]]) + cases.append(('Three classes predict prior', sol4, pred, 0.0)) + + for case in cases: + testname, sol, pred, result = case + + pred = pred.astype(np.float32) + with self.subTest('_%s' % testname): + sol, pred = copy_and_preprocess_arrays(sol, pred) + bac = bac_metric(sol, pred, task=MULTILABEL_CLASSIFICATION) + self.assertAlmostEqual(bac, result) + + +class F1Test(unittest.TestCase): + _multiprocess_can_split_ = True + + def test_cases_binary_score_verification(self): + cases = [] + sol = np.array([0, 0, 1, 1]) + pred = np.array([[1, 0], [1, 0], [0, 1], [0, 1]]) + + cases.append(('perfect', sol, pred, 1.0)) + cases.append(('anti-perfect', sol, 1 - pred, -1.0)) + + uneven_proba = np.array( + [[0.7, 0.3], [0.4, 0.6], [0.49, 0.51], [0.2, 0.8]]) + + cases.append(('uneven proba', sol, uneven_proba, 0.60000000000000009)) + + # We cannot have lower eps for float32 + eps = 1.e-7 + ties = np.array([[0.5 + eps, 0.5 - eps], [0.5 - eps, 0.5 + eps], + [0.5 + eps, 0.5 - eps], [0.5 - eps, 0.5 + eps]]) + cases.append(('ties_broken', sol, ties, 0.0)) + + ties = np.array([[0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5]]) + cases.append(('ties', sol, ties, 0.333333333333)) + + sol = np.array([0, 1, 1]) + pred = np.array([[0.5, 0.5], [0.5, 0.5], [0.5, 0.5]]) + cases.append(('even proba', sol, pred, 0.60000000000000009)) + + _pred = np.array([[1, 0], [0, 1], [0, 1]]) + pred = np.array([sum(_pred) * 1. / len(_pred)] * len(_pred)) + cases.append(('correct PAC prior', sol, pred, 0.60000000000000009)) + + pred = np.array([[1., 1.], [1., 1.], [1., 1.]]) + cases.append(('all positive', sol, pred, 0.60000000000000009)) + + pred = np.array([[0, 0], [0, 0], [0, 0]]) + cases.append(('all negative', sol, pred, -1.0)) + + for case in cases: + testname, sol, pred, result = case + + pred = pred.astype(np.float32) + with self.subTest('%s' % testname): + sol, pred = copy_and_preprocess_arrays(sol, pred) + f1 = f1_metric(sol, pred, task=BINARY_CLASSIFICATION) + self.assertAlmostEqual(f1, result) + + def test_cases_multiclass_score_verification(self): + cases = [] + sol = np.array([0, 1, 0, 0]) + pred = np.array([[1.0, 0.0, 0.0], [0.0, 1.0, 0.0], + [1.0, 0.0, 0.0], [1.0, 0.0, 0.0]]) + + cases.append(('3 classes perfect', sol, pred, 1.0)) + + pred = np.array([[0, 1, 0], [0, 0, 1], [0, 1, 0], [0, 0, 1]]) + cases.append(('all classes wrong', sol, pred, -0.5)) + + pred = np.array([[1. / 3, 1. / 3, 1. / 3], [1. / 3, 1. / 3, 1. / 3], + [1. / 3, 1. / 3, 1. / 3], [1. / 3, 1. / 3, 1. / 3]]) + cases.append(('equi proba', sol, pred, 0.428571428571)) + + pred = np.array([[0.2, 0, 0.5], [0.8, 0.4, 0.1], [0.9, 0.1, 0.2], + [0.7, 0.3, 0.3]]) + cases.append(('sum(proba) < 1.0', sol, pred, -0.166666666667)) + + pred = np.array([[0.75, 0.25, 0.], [0.75, 0.25, 0.], [0.75, 0.25, 0.], + [0.75, 0.25, 0.]]) + cases.append(('predict prior', sol, pred, 0.428571428571)) + + for case in cases: + testname, sol, pred, result = case + + pred = pred.astype(np.float32) + with self.subTest('%s' % testname): + sol, pred = copy_and_preprocess_arrays(sol, pred) + bac = f1_metric(sol, pred, task=MULTICLASS_CLASSIFICATION) + self.assertAlmostEqual(bac, result) + + def test_cases_multilabel_1l(self): + cases = [] + num = 2 + + sol = np.array([[1, 1, 1], [0, 0, 0], [0, 0, 0], [0, 0, 0]]) + sol3 = sol[:, 0:num] + if num == 1: + sol3 = np.array([sol3[:, 0]]).transpose() + + cases.append(('{} labels perfect'.format(num), sol3, sol3, 1.0)) + + cases.append(('All wrong, in the multi-label sense', sol3, 1 - sol3, + -1.0)) + + pred = np.array([[0.5, 0.5, 0.5], [0.5, 0.5, 0.5], [0.5, 0.5, 0.5], + [0.5, 0.5, 0.5]]) + if num == 1: + pred = np.array([pred[:, 0]]).transpose() + else: + pred = pred[:, 0:num] + cases.append(('All equi proba: 0.5', sol3, pred, -0.2)) + + pred = np.array( + [[0.25, 0.25, 0.25], [0.25, 0.25, 0.25], [0.25, 0.25, 0.25], + [0.25, 0.25, 0.25]]) + if num == 1: + pred = np.array([pred[:, 0]]).transpose() + else: + pred = pred[:, 0:num] + cases.append(('All equi proba, prior: 0.25', sol3, pred, -1.0)) + + pred = np.array([[0.2, 0.2, 0.2], [0.8, 0.8, 0.8], [0.9, 0.9, 0.9], + [0.7, 0.7, 0.7]]) + if num == 1: + pred = np.array([pred[:, 0]]).transpose() + else: + pred = pred[:, 0:num] + cases.append(('Some proba', sol3, pred, -1.0)) + + pred = np.array([[0.2, 0.2, 0.2], [0.8, 0.8, 0.8], [0.9, 0.9, 0.9], + [0.7, 0.7, 0.7]]) + if num == 1: + pred = np.array([pred[:, 0]]).transpose() + else: + pred = pred[:, 0:num] + cases.append(('Invert both solution and prediction', 1 - sol3, pred, + 1.0)) + + for case in cases: + testname, sol, pred, result = case + + pred = pred.astype(np.float32) + with self.subTest('%s' % testname): + sol, pred = copy_and_preprocess_arrays(sol, pred) + bac = f1_metric(sol, pred, task=MULTILABEL_CLASSIFICATION) + self.assertAlmostEqual(bac, result) + + def test_cases_multilabel_2(self): + cases = [] + + sol4 = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1], [0, 0, 1]]) + cases.append(('Three labels perfect', sol4, sol4, 1.0)) + + cases.append(('Three classes all wrong, in the multi-label sense', + sol4, 1 - sol4, -1.0)) + + pred = np.array([[1 / 3, 1 / 3, 1 / 3], [1 / 3, 1 / 3, 1 / 3], + [1 / 3, 1 / 3, 1 / 3], [1 / 3, 1 / 3, 1 / 3]]) + cases.append(('Three classes equi proba', sol4, pred, -1.0)) + + pred = np.array([[0.2, 0, 0.5], [0.8, 0.4, 0.1], [0.9, 0.1, 0.2], + [0.7, 0.3, 0.3]]) + cases.append(('Three classes some proba that do not add up', sol4, + pred, -1.0)) + + pred = np.array([[0.25, 0.25, 0.5], [0.25, 0.25, 0.5], + [0.25, 0.25, 0.5], [0.25, 0.25, 0.5]]) + cases.append(('Three classes predict prior', sol4, pred, + -0.555555555556)) + + for case in cases: + testname, sol, pred, result = case + + pred = pred.astype(np.float32) + with self.subTest('%s' % testname): + sol, pred = copy_and_preprocess_arrays(sol, pred) + bac = f1_metric(sol, pred, task=MULTILABEL_CLASSIFICATION) + self.assertAlmostEqual(bac, result) + + +class PACTest(unittest.TestCase): + _multiprocess_can_split_ = True + + def test_cases_binary_score_verification(self): + cases = [] + sol = np.array([0, 0, 1, 1]) + pred = np.array([[1, 0], [1, 0], [0, 1], [0, 1]]) + + cases.append(('perfect', sol, pred, 1.0)) + cases.append(('anti-perfect', sol, 1 - pred, -1.0,)) + + uneven_proba = np.array( + [[0.7, 0.3], [0.4, 0.6], [0.49, 0.51], [0.2, 0.8]]) + + cases.append(('uneven proba', sol, uneven_proba, 0.162745170342)) + + eps = 1.e-15 + ties = np.array([[0.5 + eps, 0.5 - eps], [0.5 - eps, 0.5 + eps], + [0.5 + eps, 0.5 - eps], [0.5 - eps, 0.5 + eps]]) + cases.append(('ties_broken', sol, ties, 0.0)) + + ties = np.array([[0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5]]) + cases.append(('ties', sol, ties, 0.0)) + + sol = np.array([0, 1, 1]) + pred = np.array([[0.5, 0.5], [0.5, 0.5], [0.5, 0.5]]) + cases.append(('even proba', sol, pred, -0.0618725166757)) + + _pred = np.array([[1.0, 0.0], [0.0, 1.0], [0.0, 1.0]]) + pred = np.array([sum(_pred) * 1. / len(_pred)] * len(_pred)) + cases.append(('correct PAC prior', sol, pred, 0.0)) + + pred = np.array([[1., 1.], [1., 1.], [1., 1.]]) + cases.append(('all positive', sol, pred, -1.12374503314)) + + pred = np.array([[0, 0], [0, 0], [0, 0]]) + cases.append(('all negative', sol, pred, -1.1237237959)) + + for case in cases: + testname, sol, pred, result = case + + pred = pred.astype(np.float32) + with self.subTest('%s' % testname): + sol, pred = copy_and_preprocess_arrays(sol, pred) + bac = pac_metric(sol, pred, task=BINARY_CLASSIFICATION) + # Very inaccurate! + self.assertAlmostEqual(bac, result, places=1) + + def test_cases_multiclass_score_verification(self): + cases = [] + sol = np.array([0, 1, 0, 0]) + pred = np.array([[1, 0, 0], [0, 1, 0], [1, 0, 0], [1, 0, 0]]) + + cases.append(('3 classes perfect', sol, pred, 1.0)) + + pred = np.array([[0, 1, 0], [0, 0, 1], [0, 1, 0], [0, 0, 1]]) + cases.append(('all classes wrong', sol, pred, -1.32491508679)) + + pred = np.array([[0., 0., 0.], [0., 0., 0.], [0., 0., 0.], [0., 0., 0.]]) + cases.append(('equi proba (wrong test from the starting kit)', sol, + pred, -1.32491508679)) + + pred = np.array([[1. / 3, 1. / 3, 1. / 3], [1. / 3, 1. / 3, 1. / 3], + [1. / 3, 1. / 3, 1. / 3], [1. / 3, 1. / 3, 1. / 3]]) + cases.append(('equi proba', sol, pred, -0.54994340656358087)) + + pred = np.array([[0.2, 0, 0.5], [0.8, 0.4, 0.1], [0.9, 0.1, 0.2], + [0.7, 0.3, 0.3]]) + cases.append(('sum(proba) < 1.0', sol, pred, -0.315724404334)) + + pred = np.array([[0.75, 0.25, 0.], [0.75, 0.25, 0.], [0.75, 0.25, 0.], + [0.75, 0.25, 0.]]) + cases.append( + ('predict prior', sol, pred, 1.54870455579e-15)) + + for case in cases: + testname, sol, pred, result = case + + pred = pred.astype(np.float32) + with self.subTest('%s' % testname): + sol, pred = copy_and_preprocess_arrays(sol, pred) + bac = pac_metric(sol, pred, task=MULTICLASS_CLASSIFICATION) + if bac != -1.3096137080181987 and result != -1.32470836935: + self.assertAlmostEqual(bac, result, places=2) + + def test_cases_multilabel_1l(self): + cases = [] + num = 2 + + sol = np.array([[1, 1, 1], [0, 0, 0], [0, 0, 0], [0, 0, 0]]) + sol3 = sol[:, 0:num] + if num == 1: + sol3 = np.array([sol3[:, 0]]).transpose() + + cases.append(('{} labels perfect'.format(num), sol3, sol3, 1.0)) + + cases.append(('All wrong, in the multi-label sense', sol3, 1 - sol3, + -1.32491508679)) + + pred = np.array([[0.5, 0.5, 0.5], [0.5, 0.5, 0.5], [0.5, 0.5, 0.5], + [0.5, 0.5, 0.5]]) + if num == 1: + pred = np.array([pred[:, 0]]).transpose() + else: + pred = pred[:, 0:num] + cases.append(('All equi proba: 0.5', sol3, pred, -0.162457543395)) + + pred = np.array( + [[0.25, 0.25, 0.25], [0.25, 0.25, 0.25], [0.25, 0.25, 0.25], + [0.25, 0.25, 0.25]]) + if num == 1: + pred = np.array([pred[:, 0]]).transpose() + else: + pred = pred[:, 0:num] + cases.append(('All equi proba, prior: 0.25', sol3, pred, 0.0)) + + pred = np.array([[0.2, 0.2, 0.2], [0.8, 0.8, 0.8], [0.9, 0.9, 0.9], + [0.7, 0.7, 0.7]]) + if num == 1: + pred = np.array([pred[:, 0]]).transpose() + else: + pred = pred[:, 0:num] + cases.append(('Some proba', sol3, pred, -0.892199631436)) + + pred = np.array([[0.2, 0.2, 0.2], [0.8, 0.8, 0.8], [0.9, 0.9, 0.9], + [0.7, 0.7, 0.7]]) + if num == 1: + pred = np.array([pred[:, 0]]).transpose() + else: + pred = pred[:, 0:num] + cases.append(('Invert both solution and prediction', 1 - sol3, pred, + 0.5277086603)) + + for case in cases: + testname, sol, pred, result = case + + pred = pred.astype(np.float32) + with self.subTest('%s' % testname): + sol, pred = copy_and_preprocess_arrays(sol, pred) + bac = pac_metric(sol, pred, task=MULTILABEL_CLASSIFICATION) + # Very weak test + self.assertAlmostEqual(bac, result, places=1) + + def test_cases_multilabel_2(self): + cases = [] + + sol4 = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1], [0, 0, 1]]) + cases.append(('Three labels perfect', sol4, sol4, 1.0)) + + cases.append(('Three classes all wrong, in the multi-label sense', + sol4, 1 - sol4, -1.20548265539)) + + pred = np.array([[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]]) + cases.append(('Three classes equi proba (wrong test from StartingKit)', + sol4, pred, -1.20522116785)) + + pred = np.array([[1. / 3, 1. / 3, 1. / 3], [1. / 3, 1. / 3, 1. / 3], + [1. / 3, 1. / 3, 1. / 3], [1. / 3, 1. / 3, 1. / 3]]) + cases.append(('Three classes equi proba', sol4, pred, -1.20522116785)) + + pred = np.array([[0.2, 0, 0.5], [0.8, 0.4, 0.1], [0.9, 0.1, 0.2], + [0.7, 0.3, 0.3]]) + cases.append(('Three classes some proba that do not add up', sol4, + pred, -0.249775129382)) + + pred = np.array([[0.25, 0.25, 0.5], [0.25, 0.25, 0.5], + [0.25, 0.25, 0.5], [0.25, 0.25, 0.5]]) + cases.append(('Three classes predict prior', sol4, pred, 0.0)) + + for case in cases: + testname, sol, pred, result = case + + + pred = pred.astype(np.float32) + with self.subTest('%s' % testname): + sol, pred = copy_and_preprocess_arrays(sol, pred) + pac = pac_metric(sol, pred, task=MULTILABEL_CLASSIFICATION) + + # Another weak test + if pac != -1.1860048034278985 and result != -1.20522116785: + self.assertAlmostEqual(pac, result, places=3) \ No newline at end of file diff --git a/test/test_metric/test_libscores.py b/test/test_metric/test_libscores.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test/test_pipeline/components/classification/test_adaboost.py b/test/test_pipeline/components/classification/test_adaboost.py index 4905313498..2319cccb6b 100644 --- a/test/test_pipeline/components/classification/test_adaboost.py +++ b/test/test_pipeline/components/classification/test_adaboost.py @@ -2,9 +2,11 @@ from autosklearn.pipeline.components.classification.adaboost import \ AdaboostClassifier -from autosklearn.pipeline.util import _test_classifier +from autosklearn.pipeline.util import _test_classifier, _test_classifier_predict_proba import sklearn.metrics +import sklearn.ensemble +import numpy as np class AdaBoostComponentTest(unittest.TestCase): @@ -13,20 +15,57 @@ def test_default_configuration_iris(self): predictions, targets = \ _test_classifier(AdaboostClassifier) self.assertAlmostEqual(0.93999999999999995, - sklearn.metrics.accuracy_score(predictions, targets)) + sklearn.metrics.accuracy_score(targets, + predictions)) + + def test_default_configuration_iris_predict_proba(self): + for i in range(10): + predictions, targets = \ + _test_classifier_predict_proba(AdaboostClassifier) + self.assertAlmostEqual(0.34244204343758322, + sklearn.metrics.log_loss(targets, predictions)) def test_default_configuration_iris_sparse(self): for i in range(10): predictions, targets = \ _test_classifier(AdaboostClassifier, sparse=True) self.assertAlmostEqual(0.88, - sklearn.metrics.accuracy_score(predictions, - targets)) + sklearn.metrics.accuracy_score(targets, + predictions)) - def test_default_configuration_digits(self): + def test_default_configuration_multilabel(self): for i in range(10): predictions, targets = \ _test_classifier(classifier=AdaboostClassifier, - dataset='digits') - self.assertAlmostEqual(0.6915604128718883, - sklearn.metrics.accuracy_score(predictions, targets)) + dataset='digits', + make_multilabel=True) + self.assertAlmostEqual(0.80933874118770355, + sklearn.metrics.average_precision_score( + targets, predictions)) + + def test_default_configuration_multilabel_predict_proba(self): + for i in range(10): + predictions, targets = \ + _test_classifier_predict_proba(classifier=AdaboostClassifier, + make_multilabel=True) + self.assertEqual(predictions.shape, ((50, 3))) + self.assertAlmostEqual(0.97856971820815897, + sklearn.metrics.average_precision_score( + targets, predictions)) + + def test_default_configuration_binary(self): + for i in range(10): + predictions, targets = \ + _test_classifier(classifier=AdaboostClassifier, + dataset='digits', sparse=True, + make_binary=True) + self.assertAlmostEqual(0.93199757134183359, + sklearn.metrics.accuracy_score( + targets, predictions)) + + def test_target_algorithm_multioutput_multiclass_support(self): + cls = sklearn.ensemble.AdaBoostClassifier() + X = np.random.random((10, 10)) + y = np.random.randint(0, 1, size=(10, 10)) + self.assertRaisesRegexp(ValueError, 'bad input shape \(10, 10\)', + cls.fit, X, y) \ No newline at end of file diff --git a/test/test_pipeline/components/classification/test_bernoulli_nb.py b/test/test_pipeline/components/classification/test_bernoulli_nb.py index 498a40d832..81dd6d8b2e 100644 --- a/test/test_pipeline/components/classification/test_bernoulli_nb.py +++ b/test/test_pipeline/components/classification/test_bernoulli_nb.py @@ -4,7 +4,9 @@ BernoulliNB from autosklearn.pipeline.util import _test_classifier, _test_classifier_iterative_fit +import numpy as np import sklearn.metrics +import sklearn.naive_bayes class BernoulliNBComponentTest(unittest.TestCase): @@ -22,4 +24,19 @@ def test_default_configuration_iterative_fit(self): _test_classifier_iterative_fit(BernoulliNB) self.assertAlmostEqual(0.26000000000000001, sklearn.metrics.accuracy_score(predictions, - targets)) \ No newline at end of file + targets)) + + def test_default_configuration_binary(self): + for i in range(10): + predictions, targets = \ + _test_classifier(BernoulliNB, make_binary=True) + self.assertAlmostEqual(0.73999999999999999, + sklearn.metrics.accuracy_score( + predictions, targets)) + + def test_target_algorithm_multioutput_multiclass_support(self): + cls = sklearn.naive_bayes.BernoulliNB() + X = np.random.random((10, 10)) + y = np.random.randint(0, 1, size=(10, 10)) + self.assertRaisesRegexp(ValueError, 'bad input shape \(10, 10\)', + cls.fit, X, y) \ No newline at end of file diff --git a/test/test_pipeline/components/classification/test_decision_tree.py b/test/test_pipeline/components/classification/test_decision_tree.py index f8083cb17f..a4d27e7723 100644 --- a/test/test_pipeline/components/classification/test_decision_tree.py +++ b/test/test_pipeline/components/classification/test_decision_tree.py @@ -3,14 +3,15 @@ from autosklearn.pipeline.components.classification.decision_tree import DecisionTree from autosklearn.pipeline.util import _test_classifier, _test_classifier_predict_proba +import numpy as np import sklearn.metrics +import sklearn.tree class DecisionTreetComponentTest(unittest.TestCase): def test_default_configuration(self): for i in range(10): - predictions, targets = _test_classifier(DecisionTree, - dataset='iris') + predictions, targets = _test_classifier(DecisionTree) self.assertAlmostEqual(0.92, sklearn.metrics.accuracy_score(predictions, targets)) @@ -25,6 +26,39 @@ def test_default_configuration_sparse(self): def test_default_configuration_predict_proba(self): for i in range(10): predictions, targets = _test_classifier_predict_proba( - DecisionTree, dataset='iris') + DecisionTree) self.assertAlmostEqual(0.28069887755912964, - sklearn.metrics.log_loss(targets, predictions)) \ No newline at end of file + sklearn.metrics.log_loss(targets, predictions)) + + def test_default_configuration_binary(self): + for i in range(10): + predictions, targets = _test_classifier( + DecisionTree, make_binary=True) + self.assertAlmostEqual(1.0, + sklearn.metrics.accuracy_score( + targets, predictions)) + + def test_default_configuration_multilabel(self): + for i in range(10): + predictions, targets = _test_classifier( + DecisionTree, make_multilabel=True) + print(predictions, targets) + self.assertAlmostEqual(0.94120857699805072, + sklearn.metrics.average_precision_score( + targets, predictions)) + + def test_default_configuration_multilabel_predict_proba(self): + for i in range(10): + predictions, targets = _test_classifier_predict_proba( + DecisionTree, make_multilabel=True) + self.assertEqual(predictions.shape, ((50, 3))) + self.assertAlmostEqual(0.94589326168273546, + sklearn.metrics.average_precision_score( + targets, predictions)) + + def test_target_algorithm_multioutput_multiclass_support(self): + cls = sklearn.tree.DecisionTreeClassifier() + X = np.random.random((10, 10)) + y = np.random.randint(0, 1, size=(10, 10)) + # Running this without an exception is the purpose of this test! + cls.fit(X, y) \ No newline at end of file diff --git a/test/test_pipeline/components/classification/test_extra_trees.py b/test/test_pipeline/components/classification/test_extra_trees.py index fe926f1926..cc44b0045d 100644 --- a/test/test_pipeline/components/classification/test_extra_trees.py +++ b/test/test_pipeline/components/classification/test_extra_trees.py @@ -2,9 +2,12 @@ from autosklearn.pipeline.components.classification.extra_trees import \ ExtraTreesClassifier -from autosklearn.pipeline.util import _test_classifier, _test_classifier_iterative_fit +from autosklearn.pipeline.util import _test_classifier, \ + _test_classifier_iterative_fit, _test_classifier_predict_proba +import numpy as np import sklearn.metrics +import sklearn.ensemble class ExtraTreesComponentTest(unittest.TestCase): @@ -13,20 +16,61 @@ def test_default_configuration(self): predictions, targets = \ _test_classifier(ExtraTreesClassifier) self.assertAlmostEqual(0.95999999999999996, - sklearn.metrics.accuracy_score(predictions, targets)) + sklearn.metrics.accuracy_score(targets, predictions)) + + def test_default_configuration_predict_proba(self): + for i in range(10): + predictions, targets = \ + _test_classifier_predict_proba(ExtraTreesClassifier) + self.assertAlmostEqual(0.12052046298054782, + sklearn.metrics.log_loss( + targets, predictions)) def test_default_configuration_sparse(self): for i in range(10): predictions, targets = \ _test_classifier(ExtraTreesClassifier, sparse=True) self.assertAlmostEqual(0.71999999999999997, - sklearn.metrics.accuracy_score(predictions, - targets)) + sklearn.metrics.accuracy_score(targets, + predictions)) def test_default_configuration_iterative_fit(self): for i in range(10): predictions, targets = \ _test_classifier_iterative_fit(ExtraTreesClassifier) self.assertAlmostEqual(0.95999999999999996, - sklearn.metrics.accuracy_score(predictions, - targets)) \ No newline at end of file + sklearn.metrics.accuracy_score(targets, + predictions)) + + def test_default_configuration_binary(self): + for i in range(10): + predictions, targets = \ + _test_classifier(ExtraTreesClassifier, make_binary=True) + self.assertAlmostEqual(1, + sklearn.metrics.accuracy_score(targets, + predictions)) + + def test_default_configuration_multilabel(self): + for i in range(10): + predictions, targets = \ + _test_classifier(ExtraTreesClassifier, make_multilabel=True) + self.assertAlmostEqual(0.97060428849902536, + sklearn.metrics.average_precision_score( + targets, predictions)) + + def test_default_configuration_predict_proba_multilabel(self): + for i in range(10): + predictions, targets = \ + _test_classifier_predict_proba(ExtraTreesClassifier, + make_multilabel=True) + self.assertEqual(predictions.shape, ((50, 3))) + self.assertAlmostEqual(0.98976738180772728, + sklearn.metrics.average_precision_score( + targets, predictions)) + + def test_target_algorithm_multioutput_multiclass_support(self): + cls = sklearn.ensemble.ExtraTreesClassifier() + X = np.random.random((10, 10)) + y = np.random.randint(0, 1, size=(10, 10)) + # Running this without an exception is the purpose of this test! + cls.fit(X, y) \ No newline at end of file diff --git a/test/test_pipeline/components/classification/test_gaussian_nb.py b/test/test_pipeline/components/classification/test_gaussian_nb.py index 79d1007724..e53cc21055 100644 --- a/test/test_pipeline/components/classification/test_gaussian_nb.py +++ b/test/test_pipeline/components/classification/test_gaussian_nb.py @@ -4,7 +4,9 @@ GaussianNB from autosklearn.pipeline.util import _test_classifier, _test_classifier_iterative_fit +import numpy as np import sklearn.metrics +import sklearn.naive_bayes class GaussianNBComponentTest(unittest.TestCase): @@ -22,4 +24,19 @@ def test_default_configuration_iterative_fit(self): _test_classifier_iterative_fit(GaussianNB) self.assertAlmostEqual(0.95999999999999996, sklearn.metrics.accuracy_score(predictions, - targets)) \ No newline at end of file + targets)) + + def test_default_configuration_binary(self): + for i in range(10): + predictions, targets = _test_classifier(GaussianNB, + make_binary=True) + self.assertAlmostEqual(1.0, + sklearn.metrics.average_precision_score( + predictions, targets)) + + def test_target_algorithm_multioutput_multiclass_support(self): + cls = sklearn.naive_bayes.GaussianNB() + X = np.random.random((10, 10)) + y = np.random.randint(0, 1, size=(10, 10)) + self.assertRaisesRegexp(ValueError, 'bad input shape \(10, 10\)', + cls.fit, X, y) \ No newline at end of file diff --git a/test/test_pipeline/components/classification/test_gradient_boosting.py b/test/test_pipeline/components/classification/test_gradient_boosting.py index 18137a6fa5..cf05f977a7 100644 --- a/test/test_pipeline/components/classification/test_gradient_boosting.py +++ b/test/test_pipeline/components/classification/test_gradient_boosting.py @@ -5,6 +5,8 @@ from autosklearn.pipeline.util import _test_classifier, _test_classifier_iterative_fit import sklearn.metrics +import sklearn.ensemble +import numpy as np class GradientBoostingComponentTest(unittest.TestCase): @@ -21,4 +23,19 @@ def test_default_configuration_iterative_fit(self): _test_classifier_iterative_fit(GradientBoostingClassifier) self.assertAlmostEqual(0.95999999999999996, sklearn.metrics.accuracy_score(predictions, - targets)) \ No newline at end of file + targets)) + + def test_default_configuration_binary(self): + for i in range(10): + predictions, targets = _test_classifier( + GradientBoostingClassifier, make_binary=True) + self.assertAlmostEqual(1.0, + sklearn.metrics.accuracy_score(predictions, + targets)) + + def test_target_algorithm_multioutput_multiclass_support(self): + cls = sklearn.ensemble.GradientBoostingClassifier() + X = np.random.random((10, 10)) + y = np.random.randint(0, 1, size=(10, 10)) + self.assertRaisesRegexp(ValueError, 'bad input shape \(10, 10\)', + cls.fit, X, y) \ No newline at end of file diff --git a/test/test_pipeline/components/classification/test_k_nearest_neighbor.py b/test/test_pipeline/components/classification/test_k_nearest_neighbor.py index dcc3d57e14..a19ca23b51 100644 --- a/test/test_pipeline/components/classification/test_k_nearest_neighbor.py +++ b/test/test_pipeline/components/classification/test_k_nearest_neighbor.py @@ -4,7 +4,9 @@ KNearestNeighborsClassifier from autosklearn.pipeline.util import _test_classifier, _test_classifier_predict_proba +import numpy as np import sklearn.metrics +import sklearn.neighbors class KNearestNeighborsComponentTest(unittest.TestCase): @@ -28,4 +30,38 @@ def test_default_configuration_predict_proba(self): predictions, targets = \ _test_classifier_predict_proba(KNearestNeighborsClassifier) self.assertAlmostEqual(1.381551055796429, - sklearn.metrics.log_loss(targets, predictions)) \ No newline at end of file + sklearn.metrics.log_loss(targets, predictions)) + + def test_default_configuration_binary(self): + for i in range(10): + predictions, targets = \ + _test_classifier(KNearestNeighborsClassifier, make_binary=True) + self.assertAlmostEqual(1.0, + sklearn.metrics.accuracy_score(predictions, + targets)) + + def test_default_configuration_multilabel(self): + for i in range(10): + predictions, targets = \ + _test_classifier(KNearestNeighborsClassifier, + make_multilabel=True) + self.assertAlmostEqual(0.959999999999999, + sklearn.metrics.accuracy_score(predictions, + targets)) + + def test_default_configuration_predict_proba_multilabel(self): + for i in range(10): + predictions, targets = \ + _test_classifier_predict_proba(KNearestNeighborsClassifier, + make_multilabel=True) + self.assertEqual(predictions.shape, ((50, 3))) + self.assertAlmostEqual(0.97060428849902536, + sklearn.metrics.average_precision_score( + targets, predictions)) + + def test_target_algorithm_multioutput_multiclass_support(self): + cls = sklearn.neighbors.KNeighborsClassifier() + X = np.random.random((10, 10)) + y = np.random.randint(0, 1, size=(10, 10)) + # Running this without an exception is the purpose of this test! + cls.fit(X, y) \ No newline at end of file diff --git a/test/test_pipeline/components/classification/test_lda.py b/test/test_pipeline/components/classification/test_lda.py index 28915f0e35..11d29c1e83 100644 --- a/test/test_pipeline/components/classification/test_lda.py +++ b/test/test_pipeline/components/classification/test_lda.py @@ -1,9 +1,11 @@ import unittest from autosklearn.pipeline.components.classification.lda import LDA -from autosklearn.pipeline.util import _test_classifier +from autosklearn.pipeline.util import _test_classifier, _test_classifier_predict_proba +import numpy as np import sklearn.metrics +import sklearn.lda class LDAComponentTest(unittest.TestCase): @@ -22,3 +24,37 @@ def test_default_configuration_digits(self): self.assertAlmostEqual(0.88585306618093507, sklearn.metrics.accuracy_score(predictions, targets)) + + def test_default_configuration_iris_binary(self): + for i in range(10): + predictions, targets = \ + _test_classifier(LDA, make_binary=True) + self.assertAlmostEqual(1.0, + sklearn.metrics.accuracy_score(predictions, + targets)) + + def test_default_configuration_iris_multilabel(self): + for i in range(10): + predictions, targets = \ + _test_classifier(LDA, make_multilabel=True) + self.assertEqual(predictions.shape, ((50, 3))) + self.assertAlmostEqual(0.66, + sklearn.metrics.accuracy_score(predictions, + targets)) + + def test_default_configuration_predict_proba_multilabel(self): + for i in range(10): + predictions, targets = \ + _test_classifier_predict_proba(LDA, + make_multilabel=True) + self.assertEqual(predictions.shape, ((50, 3))) + self.assertAlmostEqual(0.96639166748245653, + sklearn.metrics.average_precision_score( + targets, predictions)) + + def test_target_algorithm_multioutput_multiclass_support(self): + cls = sklearn.lda.LDA() + X = np.random.random((10, 10)) + y = np.random.randint(0, 1, size=(10, 10)) + self.assertRaisesRegexp(ValueError, 'bad input shape \(10, 10\)', + cls.fit, X, y) diff --git a/test/test_pipeline/components/classification/test_liblinear.py b/test/test_pipeline/components/classification/test_liblinear.py index de30c1405d..5d2f2153c4 100644 --- a/test/test_pipeline/components/classification/test_liblinear.py +++ b/test/test_pipeline/components/classification/test_liblinear.py @@ -1,12 +1,43 @@ import unittest +import numpy as np +import sklearn.metrics +import sklearn.svm + from autosklearn.pipeline.components.classification.liblinear_svc import LibLinear_SVC -from autosklearn.pipeline.util import _test_classifier +from autosklearn.pipeline.util import _test_classifier, _test_classifier_predict_proba class LibLinearComponentTest(unittest.TestCase): def test_default_configuration(self): + for i in range(10): + predictions, targets = _test_classifier(LibLinear_SVC) + self.assertTrue(all(targets == predictions)) + + def test_default_configuration_sparse(self): + for i in range(10): + predictions, targets = _test_classifier(LibLinear_SVC, + sparse=True) + self.assertEquals(0.56, sklearn.metrics.accuracy_score( + targets, predictions)) + + def test_default_configuration_binary(self): for i in range(10): predictions, targets = _test_classifier(LibLinear_SVC, - dataset='iris') - self.assertTrue(all(targets == predictions)) \ No newline at end of file + make_binary=True) + self.assertTrue(all(targets == predictions)) + + def test_default_configuration_multilabel(self): + for i in range(10): + predictions, targets = _test_classifier(LibLinear_SVC, + make_multilabel=True) + self.assertAlmostEquals(0.84479797979797977, sklearn.metrics.average_precision_score( + targets, predictions)) + + def test_target_algorithm_multioutput_multiclass_support(self): + cls = sklearn.svm.LinearSVC() + + X = np.random.random((10, 10)) + y = np.random.randint(0, 1, size=(10, 10)) + self.assertRaisesRegexp(ValueError, 'bad input shape \(10, 10\)', + cls.fit, X, y) \ No newline at end of file diff --git a/test/test_pipeline/components/classification/test_libsvm_svc.py b/test/test_pipeline/components/classification/test_libsvm_svc.py index a62b464644..d2bd478d60 100644 --- a/test/test_pipeline/components/classification/test_libsvm_svc.py +++ b/test/test_pipeline/components/classification/test_libsvm_svc.py @@ -6,6 +6,7 @@ import numpy as np import sklearn.metrics +import sklearn.svm class LibSVM_SVCComponentTest(unittest.TestCase): @@ -53,3 +54,19 @@ def test_default_configuration_predict_proba(self): prediction = cls.predict_proba(X_test) self.assertAlmostEqual(sklearn.metrics.log_loss(Y_test, prediction), 0.69323680119641773) + + def test_default_configuration_binary(self): + for i in range(10): + predictions, targets = _test_classifier(LibSVM_SVC, + make_binary=True) + self.assertAlmostEqual(1.0, + sklearn.metrics.accuracy_score( + predictions, targets)) + + def test_target_algorithm_multioutput_multiclass_support(self): + cls = sklearn.svm.SVC() + + X = np.random.random((10, 10)) + y = np.random.randint(0, 1, size=(10, 10)) + self.assertRaisesRegexp(ValueError, 'bad input shape \(10, 10\)', + cls.fit, X, y) diff --git a/test/test_pipeline/components/classification/test_multinomial_nb.py b/test/test_pipeline/components/classification/test_multinomial_nb.py index 8f8bc42379..82f5da4552 100644 --- a/test/test_pipeline/components/classification/test_multinomial_nb.py +++ b/test/test_pipeline/components/classification/test_multinomial_nb.py @@ -7,6 +7,7 @@ import numpy as np import sklearn.metrics +import sklearn.naive_bayes class MultinomialNBComponentTest(unittest.TestCase): @@ -42,4 +43,19 @@ def test_default_configuration_negative_values(self): cls = cls.fit(X_train, Y_train) prediction = cls.predict(X_test) self.assertAlmostEqual(np.nanmean(prediction == Y_test), - 0.88888888888888884) \ No newline at end of file + 0.88888888888888884) + + def test_default_configuration_binary(self): + for i in range(10): + predictions, targets = \ + _test_classifier(MultinomialNB, make_binary=True) + self.assertAlmostEqual(1.0, + sklearn.metrics.accuracy_score( + predictions, targets)) + + def test_target_algorithm_multioutput_multiclass_support(self): + cls = sklearn.naive_bayes.MultinomialNB() + X = np.random.random((10, 10)) + y = np.random.randint(0, 1, size=(10, 10)) + self.assertRaisesRegexp(ValueError, 'bad input shape \(10, 10\)', + cls.fit, X, y) \ No newline at end of file diff --git a/test/test_pipeline/components/classification/test_passive_aggressive.py b/test/test_pipeline/components/classification/test_passive_aggressive.py index 56ec91b54a..8836040c90 100644 --- a/test/test_pipeline/components/classification/test_passive_aggressive.py +++ b/test/test_pipeline/components/classification/test_passive_aggressive.py @@ -2,9 +2,12 @@ from autosklearn.pipeline.components.classification.passive_aggressive import \ PassiveAggressive -from autosklearn.pipeline.util import _test_classifier, _test_classifier_iterative_fit +from autosklearn.pipeline.util import _test_classifier, \ + _test_classifier_iterative_fit, _test_classifier_predict_proba +import numpy as np import sklearn.metrics +import sklearn.linear_model class PassiveAggressiveComponentTest(unittest.TestCase): @@ -37,4 +40,20 @@ def test_default_configuration_digits_iterative_fit(self): dataset='digits') self.assertAlmostEqual(0.91924711596842745, sklearn.metrics.accuracy_score( - predictions, targets)) \ No newline at end of file + predictions, targets)) + + def test_default_configuration_binary(self): + for i in range(10): + predictions, targets = _test_classifier(PassiveAggressive, + make_binary=True) + self.assertAlmostEqual(1.0, + sklearn.metrics.accuracy_score(predictions, + targets)) + + def test_target_algorithm_multioutput_multiclass_support(self): + cls = sklearn.linear_model.PassiveAggressiveClassifier() + + X = np.random.random((10, 10)) + y = np.random.randint(0, 1, size=(10, 10)) + self.assertRaisesRegexp(ValueError, 'bad input shape \(10, 10\)', + cls.fit, X, y) \ No newline at end of file diff --git a/test/test_pipeline/components/classification/test_proj_logit.py b/test/test_pipeline/components/classification/test_proj_logit.py index d9972ea916..7ba2e141ee 100644 --- a/test/test_pipeline/components/classification/test_proj_logit.py +++ b/test/test_pipeline/components/classification/test_proj_logit.py @@ -3,7 +3,9 @@ from autosklearn.pipeline.components.classification.proj_logit import ProjLogitCLassifier from autosklearn.pipeline.util import _test_classifier +import numpy as np import sklearn.metrics +import autosklearn.pipeline.implementations.ProjLogit class ProjLogitComponentTest(unittest.TestCase): @@ -18,4 +20,21 @@ def test_default_configuration_digits(self): predictions, targets = _test_classifier(ProjLogitCLassifier, dataset='digits') self.assertAlmostEqual(0.8986035215543412, - sklearn.metrics.accuracy_score(predictions, targets)) \ No newline at end of file + sklearn.metrics.accuracy_score(predictions, targets)) + + def test_default_configuration_binary(self): + for i in range(10): + predictions, targets = _test_classifier(ProjLogitCLassifier, + make_binary=True) + self.assertAlmostEqual(1.0, + sklearn.metrics.accuracy_score(predictions, + targets)) + + @unittest.skip('Cannot be tested ATM. Wait for Tobias') + def test_target_algorithm_multioutput_multiclass_support(self): + cls = autosklearn.pipeline.implementations.ProjLogit.ProjLogit() + + X = np.random.random((10, 10)) + y = np.random.randint(0, 1, size=(10, 10)) + self.assertRaisesRegex(ValueError, 'bad input shape \(10, 10\)', + cls.fit, X, y) \ No newline at end of file diff --git a/test/test_pipeline/components/classification/test_qda.py b/test/test_pipeline/components/classification/test_qda.py index c8c2c0e2cf..8b9bdddd7d 100644 --- a/test/test_pipeline/components/classification/test_qda.py +++ b/test/test_pipeline/components/classification/test_qda.py @@ -1,9 +1,11 @@ import unittest from autosklearn.pipeline.components.classification.qda import QDA -from autosklearn.pipeline.util import _test_classifier +from autosklearn.pipeline.util import _test_classifier, _test_classifier_predict_proba +import numpy as np import sklearn.metrics +import sklearn.qda class QDAComponentTest(unittest.TestCase): @@ -23,3 +25,58 @@ def test_default_configuration_digits(self): self.assertAlmostEqual(0.18882817243472982, sklearn.metrics.accuracy_score(predictions, targets)) + + def test_default_configuration_binary(self): + for i in range(10): + predictions, targets = \ + _test_classifier(QDA, make_binary=True) + self.assertAlmostEqual(1.0, + sklearn.metrics.accuracy_score(predictions, + targets)) + + def test_produce_zero_scaling(self): + from autosklearn.pipeline.classification import SimpleClassificationPipeline + from autosklearn.pipeline import util as putil + p = SimpleClassificationPipeline(configuration={ + 'balancing:strategy': 'weighting', + 'classifier:__choice__': 'qda', + 'classifier:qda:reg_param': 2.992955287687101, + 'imputation:strategy': 'most_frequent', + 'one_hot_encoding:use_minimum_fraction': 'False', + 'preprocessor:__choice__': 'gem', + 'preprocessor:gem:N': 18, + 'preprocessor:gem:precond': 0.12360249797270745, + 'rescaling:__choice__': 'none'}) + X_train, Y_train, X_test, Y_test = putil.get_dataset('iris') + self.assertRaisesRegexp(ValueError, 'Numerical problems in ' + 'QDA. QDA.scalings_ contains ' + 'values <= 0.0', + p.fit, X_train, Y_train) + # p.fit(X_train, Y_train) + # print(p.pipeline_.steps[-1][1].estimator.scalings_) + # print(p.predict_proba(X_test)) + + def test_default_configuration_multilabel(self): + for i in range(10): + predictions, targets = \ + _test_classifier(QDA, make_multilabel=True) + self.assertAlmostEqual(0.99456140350877187, + sklearn.metrics.average_precision_score( + predictions, targets)) + + def test_default_configuration_predict_proba_multilabel(self): + for i in range(10): + predictions, targets = \ + _test_classifier_predict_proba(QDA, + make_multilabel=True) + self.assertEqual(predictions.shape, ((50, 3))) + self.assertAlmostEqual(1.0, + sklearn.metrics.average_precision_score( + targets, predictions)) + + def test_target_algorithm_multioutput_multiclass_support(self): + cls = sklearn.qda.QDA() + X = np.random.random((10, 10)) + y = np.random.randint(0, 1, size=(10, 10)) + self.assertRaisesRegexp(ValueError, 'bad input shape \(10, 10\)', + cls.fit, X, y) diff --git a/test/test_pipeline/components/classification/test_random_forest.py b/test/test_pipeline/components/classification/test_random_forest.py index 81bd0a4606..df46cc3559 100644 --- a/test/test_pipeline/components/classification/test_random_forest.py +++ b/test/test_pipeline/components/classification/test_random_forest.py @@ -1,8 +1,11 @@ import unittest from autosklearn.pipeline.components.classification.random_forest import RandomForest -from autosklearn.pipeline.util import _test_classifier, _test_classifier_iterative_fit +from autosklearn.pipeline.util import _test_classifier, \ + _test_classifier_iterative_fit, _test_classifier_predict_proba +import numpy as np +import sklearn.ensemble import sklearn.metrics @@ -26,4 +29,38 @@ def test_default_configuration_iterative_fit(self): _test_classifier_iterative_fit(RandomForest) self.assertAlmostEqual(0.95999999999999996, sklearn.metrics.accuracy_score( - predictions, targets)) \ No newline at end of file + predictions, targets)) + + def test_default_configuration_binary(self): + for i in range(10): + predictions, targets = _test_classifier(RandomForest, + make_binary=True) + self.assertAlmostEqual(1.0, + sklearn.metrics.accuracy_score( + predictions, targets)) + + def test_default_configuration_multilabel(self): + for i in range(10): + predictions, targets = _test_classifier(RandomForest, + make_multilabel=True) + self.assertAlmostEqual(0.95999999999999996, + sklearn.metrics.accuracy_score( + predictions, targets)) + + def test_default_configuration_predict_proba_multilabel(self): + for i in range(10): + predictions, targets = \ + _test_classifier_predict_proba(RandomForest, + make_multilabel=True) + self.assertEqual(predictions.shape, ((50, 3))) + self.assertAlmostEqual(0.9943139211500065, + sklearn.metrics.average_precision_score( + targets, predictions)) + + def test_target_algorithm_multioutput_multiclass_support(self): + cls = sklearn.ensemble.RandomForestClassifier() + + X = np.random.random((10, 10)) + y = np.random.randint(0, 1, size=(10, 10)) + # Running this without an exception is the purpose of this test! + cls.fit(X, y) \ No newline at end of file diff --git a/test/test_pipeline/components/classification/test_sgd.py b/test/test_pipeline/components/classification/test_sgd.py index 883cbf7a59..d304283aa1 100644 --- a/test/test_pipeline/components/classification/test_sgd.py +++ b/test/test_pipeline/components/classification/test_sgd.py @@ -1,9 +1,12 @@ import unittest from autosklearn.pipeline.components.classification.sgd import SGD -from autosklearn.pipeline.util import _test_classifier, _test_classifier_iterative_fit +from autosklearn.pipeline.util import _test_classifier, \ + _test_classifier_iterative_fit, _test_classifier_predict_proba +import numpy as np import sklearn.metrics +import sklearn.linear_model class SGDComponentTest(unittest.TestCase): @@ -37,4 +40,18 @@ def test_default_configuration_digits_iterative_fit(self): dataset='digits') self.assertAlmostEqual(0.91438979963570133, sklearn.metrics.accuracy_score( - predictions, targets)) \ No newline at end of file + predictions, targets)) + + def test_default_configuration_binary(self): + for i in range(10): + predictions, targets = _test_classifier(SGD, make_binary=True) + self.assertAlmostEqual(1.0, + sklearn.metrics.accuracy_score(predictions, + targets)) + + def test_target_algorithm_multioutput_multiclass_support(self): + cls = sklearn.linear_model.SGDClassifier() + X = np.random.random((10, 10)) + y = np.random.randint(0, 1, size=(10, 10)) + self.assertRaisesRegexp(ValueError, 'bad input shape \(10, 10\)', + cls.fit, X, y) \ No newline at end of file diff --git a/test/test_pipeline/components/data_preprocessing/test_balancing.py b/test/test_pipeline/components/data_preprocessing/test_balancing.py index 8da740bd53..dfa3e1ba53 100644 --- a/test/test_pipeline/components/data_preprocessing/test_balancing.py +++ b/test/test_pipeline/components/data_preprocessing/test_balancing.py @@ -18,7 +18,7 @@ from autosklearn.pipeline.components.classification.libsvm_svc import LibSVM_SVC from autosklearn.pipeline.components.classification.sgd import SGD from autosklearn.pipeline.components.feature_preprocessing\ - .extra_trees_preproc_for_classification import ExtraTreesPreprocessor + .extra_trees_preproc_for_classification import ExtraTreesPreprocessorClassification from autosklearn.pipeline.components.feature_preprocessing.liblinear_svc_preprocessor import LibLinear_Preprocessor @@ -119,7 +119,7 @@ def test_weighting_effect(self): for name, pre, acc_no_weighting, acc_weighting in \ [('extra_trees_preproc_for_classification', - ExtraTreesPreprocessor, 0.682, 0.634), + ExtraTreesPreprocessorClassification, 0.685, 0.589), ('liblinear_svc_preprocessor', LibLinear_Preprocessor, 0.714, 0.596)]: for strategy, acc in [('none', acc_no_weighting), diff --git a/test/test_pipeline/components/feature_preprocessing/test_choice.py b/test/test_pipeline/components/feature_preprocessing/test_choice.py index 9ae503f82c..6888a7f023 100644 --- a/test/test_pipeline/components/feature_preprocessing/test_choice.py +++ b/test/test_pipeline/components/feature_preprocessing/test_choice.py @@ -9,7 +9,7 @@ class FeatureProcessingTest(unittest.TestCase): def test_get_available_components(self): # Target type for target_type, num_values in [('classification', 16), - ('regression', 12)]: + ('regression', 13)]: data_properties = {'target_type': target_type} available_components = fp.FeaturePreprocessorChoice\ diff --git a/test/test_pipeline/components/feature_preprocessing/test_extra_trees.py b/test/test_pipeline/components/feature_preprocessing/test_extra_trees.py deleted file mode 100644 index b1b9656b17..0000000000 --- a/test/test_pipeline/components/feature_preprocessing/test_extra_trees.py +++ /dev/null @@ -1,39 +0,0 @@ -import unittest - -from sklearn.linear_model import RidgeClassifier -from autosklearn.pipeline.components.feature_preprocessing.extra_trees_preproc_for_classification import \ - ExtraTreesPreprocessor -from autosklearn.pipeline.util import _test_preprocessing, PreprocessingTestCase, \ - get_dataset -import sklearn.metrics - - -class ExtreTreesComponentTest(PreprocessingTestCase): - def test_default_configuration(self): - transformation, original = _test_preprocessing(ExtraTreesPreprocessor) - self.assertEqual(transformation.shape[0], original.shape[0]) - self.assertFalse((transformation == 0).all()) - - def test_default_configuration_classify(self): - for i in range(2): - X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits', - make_sparse=False) - configuration_space = ExtraTreesPreprocessor.get_hyperparameter_search_space() - default = configuration_space.get_default_configuration() - preprocessor = ExtraTreesPreprocessor(random_state=1, - **{hp_name: default[hp_name] - for hp_name in default}) - preprocessor.fit(X_train, Y_train) - X_train_trans = preprocessor.transform(X_train) - X_test_trans = preprocessor.transform(X_test) - - # fit a classifier on top - classifier = RidgeClassifier() - predictor = classifier.fit(X_train_trans, Y_train) - predictions = predictor.predict(X_test_trans) - accuracy = sklearn.metrics.accuracy_score(predictions, Y_test) - self.assertAlmostEqual(accuracy, 0.87310261080752882, places=2) - - def test_preprocessing_dtype(self): - super(ExtreTreesComponentTest, - self)._test_preprocessing_dtype(ExtraTreesPreprocessor) diff --git a/test/test_pipeline/components/feature_preprocessing/test_extra_trees_classification.py b/test/test_pipeline/components/feature_preprocessing/test_extra_trees_classification.py new file mode 100644 index 0000000000..35f135e6f7 --- /dev/null +++ b/test/test_pipeline/components/feature_preprocessing/test_extra_trees_classification.py @@ -0,0 +1,63 @@ +import unittest + +from sklearn.linear_model import RidgeClassifier +from autosklearn.pipeline.components.feature_preprocessing.\ + extra_trees_preproc_for_classification import \ + ExtraTreesPreprocessorClassification +from autosklearn.pipeline.util import _test_preprocessing, \ + PreprocessingTestCase, get_dataset +import sklearn.metrics + + +class ExtreTreesClassificationComponentTest(PreprocessingTestCase): + def test_default_configuration(self): + transformation, original = _test_preprocessing( + ExtraTreesPreprocessorClassification) + self.assertEqual(transformation.shape[0], original.shape[0]) + self.assertFalse((transformation == 0).all()) + + def test_default_configuration_classify(self): + for i in range(2): + X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits', + make_sparse=False) + configuration_space = ExtraTreesPreprocessorClassification.\ + get_hyperparameter_search_space() + default = configuration_space.get_default_configuration() + preprocessor = ExtraTreesPreprocessorClassification( + random_state=1, + **{hp_name: default[hp_name] for hp_name in default}) + preprocessor.fit(X_train, Y_train) + X_train_trans = preprocessor.transform(X_train) + X_test_trans = preprocessor.transform(X_test) + + # fit a classifier on top + classifier = RidgeClassifier() + predictor = classifier.fit(X_train_trans, Y_train) + predictions = predictor.predict(X_test_trans) + accuracy = sklearn.metrics.accuracy_score(predictions, Y_test) + self.assertAlmostEqual(accuracy, 0.87310261080752882, places=2) + + def test_default_configuration_classify_sparse(self): + for i in range(2): + X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits', + make_sparse=True) + configuration_space = ExtraTreesPreprocessorClassification.\ + get_hyperparameter_search_space() + default = configuration_space.get_default_configuration() + preprocessor = ExtraTreesPreprocessorClassification( + random_state=1, + **{hp_name: default[hp_name] for hp_name in default}) + preprocessor.fit(X_train, Y_train) + X_train_trans = preprocessor.transform(X_train) + X_test_trans = preprocessor.transform(X_test) + + # fit a classifier on top + classifier = RidgeClassifier() + predictor = classifier.fit(X_train_trans, Y_train) + predictions = predictor.predict(X_test_trans) + accuracy = sklearn.metrics.accuracy_score(predictions, Y_test) + self.assertAlmostEqual(accuracy, 0.45051608986035213, places=2) + + def test_preprocessing_dtype(self): + super(ExtreTreesClassificationComponentTest, self).\ + _test_preprocessing_dtype(ExtraTreesPreprocessorClassification) diff --git a/test/test_pipeline/components/feature_preprocessing/test_extra_trees_regression.py b/test/test_pipeline/components/feature_preprocessing/test_extra_trees_regression.py new file mode 100644 index 0000000000..d7113eb564 --- /dev/null +++ b/test/test_pipeline/components/feature_preprocessing/test_extra_trees_regression.py @@ -0,0 +1,63 @@ +import unittest + +from sklearn.linear_model import Ridge +from autosklearn.pipeline.components.feature_preprocessing.\ + extra_trees_preproc_for_regression import \ + ExtraTreesPreprocessorRegression +from autosklearn.pipeline.util import _test_preprocessing, \ + PreprocessingTestCase, get_dataset +import sklearn.metrics + + +class ExtreTreesRegressionComponentTest(PreprocessingTestCase): + def test_default_configuration(self): + transformation, original = _test_preprocessing( + ExtraTreesPreprocessorRegression) + self.assertEqual(transformation.shape[0], original.shape[0]) + self.assertFalse((transformation == 0).all()) + + def test_default_configuration_regression(self): + for i in range(2): + X_train, Y_train, X_test, Y_test = get_dataset(dataset='boston', + make_sparse=False) + configuration_space = ExtraTreesPreprocessorRegression.\ + get_hyperparameter_search_space() + default = configuration_space.get_default_configuration() + preprocessor = ExtraTreesPreprocessorRegression( + random_state=1, + **{hp_name: default[hp_name] for hp_name in default}) + preprocessor.fit(X_train, Y_train) + X_train_trans = preprocessor.transform(X_train) + X_test_trans = preprocessor.transform(X_test) + + # fit a regressor on top + regressor = Ridge() + predictor = regressor.fit(X_train_trans, Y_train) + predictions = predictor.predict(X_test_trans) + accuracy = sklearn.metrics.mean_squared_error(predictions, Y_test) + self.assertAlmostEqual(accuracy, 28.596860630944015, places=2) + + def test_default_configuration_classify_sparse(self): + for i in range(2): + X_train, Y_train, X_test, Y_test = get_dataset(dataset='boston', + make_sparse=True) + configuration_space = ExtraTreesPreprocessorRegression.\ + get_hyperparameter_search_space() + default = configuration_space.get_default_configuration() + preprocessor = ExtraTreesPreprocessorRegression( + random_state=1, + **{hp_name: default[hp_name] for hp_name in default}) + preprocessor.fit(X_train, Y_train) + X_train_trans = preprocessor.transform(X_train) + X_test_trans = preprocessor.transform(X_test) + + # fit a regressor on top + regressor = Ridge() + predictor = regressor.fit(X_train_trans, Y_train) + predictions = predictor.predict(X_test_trans) + accuracy = sklearn.metrics.mean_squared_error(predictions, Y_test) + self.assertAlmostEqual(accuracy, 78.854181039533088, places=2) + + def test_preprocessing_dtype(self): + super(ExtreTreesRegressionComponentTest, self).\ + _test_preprocessing_dtype(ExtraTreesPreprocessorRegression) \ No newline at end of file diff --git a/test/test_pipeline/components/regression/test_ard_regression.py b/test/test_pipeline/components/regression/test_ard_regression.py new file mode 100644 index 0000000000..4091ab0495 --- /dev/null +++ b/test/test_pipeline/components/regression/test_ard_regression.py @@ -0,0 +1,17 @@ +import unittest + +from autosklearn.pipeline.components.regression.ard_regression import \ + ARDRegression +from autosklearn.pipeline.util import _test_regressor + +import sklearn.metrics + + +class ARDRegressionComponentTest(unittest.TestCase): + def test_default_configuration(self): + for i in range(10): + predictions, targets = \ + _test_regressor(ARDRegression, dataset='boston') + self.assertAlmostEqual(0.70316694175513961, + sklearn.metrics.r2_score(targets, + predictions)) diff --git a/test/test_pipeline/test_base.py b/test/test_pipeline/test_base.py index bc9663dcf1..0c3771719e 100644 --- a/test/test_pipeline/test_base.py +++ b/test/test_pipeline/test_base.py @@ -14,12 +14,11 @@ def test_get_hyperparameter_configuration_space_3choices(self): dataset_properties = {'target_type': 'classification'} exclude = {} include = {} - pipeline = [('p0', autosklearn.pipeline.components.feature_preprocessing._preprocessors[ - 'preprocessor']), - ('p1', autosklearn.pipeline.components.feature_preprocessing._preprocessors[ - 'preprocessor']), - ('c', autosklearn.pipeline.components.classification._classifiers[ - 'classifier'])] + pipeline = [('p0', + autosklearn.pipeline.components.feature_preprocessing.FeaturePreprocessorChoice), + ('p1', + autosklearn.pipeline.components.feature_preprocessing.FeaturePreprocessorChoice), + ('c', autosklearn.pipeline.components.classification.ClassifierChoice)] cs = base._get_hyperparameter_search_space(cs, dataset_properties, exclude, include, pipeline) diff --git a/test/test_pipeline/test_classification.py b/test/test_pipeline/test_classification.py index 926198d2df..da8b6bba7f 100644 --- a/test/test_pipeline/test_classification.py +++ b/test/test_pipeline/test_classification.py @@ -26,6 +26,44 @@ from autosklearn.pipeline.constants import * +class DummyClassifier(AutoSklearnClassificationAlgorithm): + @staticmethod + def get_properties(dataset_properties=None): + return {'shortname': 'AB', + 'name': 'AdaBoost Classifier', + 'handles_regression': False, + 'handles_classification': True, + 'handles_multiclass': True, + 'handles_multilabel': True, + 'is_deterministic': True, + 'input': (DENSE, SPARSE, UNSIGNED_DATA), + 'output': (PREDICTIONS,)} + + @staticmethod + def get_hyperparameter_search_space(dataset_properties=None): + cs = ConfigurationSpace() + return cs + + +class DummyPreprocessor(AutoSklearnPreprocessingAlgorithm): + @staticmethod + def get_properties(dataset_properties=None): + return {'shortname': 'AB', + 'name': 'AdaBoost Classifier', + 'handles_regression': False, + 'handles_classification': True, + 'handles_multiclass': True, + 'handles_multilabel': True, + 'is_deterministic': True, + 'input': (DENSE, SPARSE, UNSIGNED_DATA), + 'output': (INPUT,)} + + @staticmethod + def get_hyperparameter_search_space(dataset_properties=None): + cs = ConfigurationSpace() + return cs + + class SimpleClassificationPipelineTest(unittest.TestCase): def test_io_dict(self): classifiers = classification_components._classifiers @@ -196,9 +234,13 @@ def test_configurations(self): self.assertIsInstance(predicted_probabiliets, np.ndarray) except ValueError as e: if "Floating-point under-/overflow occurred at epoch" in \ - e.args[0] or \ - "removed all features" in e.args[0] or \ - "all features are discarded" in e.args[0]: + e.args[0]: + continue + elif "removed all features" in e.args[0]: + continue + elif "all features are discarded" in e.args[0]: + continue + elif "Numerical problems in QDA" in e.args[0]: continue else: print(config) @@ -597,17 +639,18 @@ def test_predict_proba_batched(self): # Multilabel cls = SimpleClassificationPipeline(default) X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits') - Y_train = np.array([(y, 26 - y) for y in Y_train]) + Y_train_ = np.zeros((Y_train.shape[0], 10)) + for i, y in enumerate(Y_train): + Y_train_[i][y] = 1 + Y_train = Y_train_ cls.fit(X_train, Y_train) X_test_ = X_test.copy() prediction_ = cls.predict_proba(X_test_) cls_predict = mock.Mock(wraps=cls.pipeline_.steps[-1][1]) cls.pipeline_.steps[-1] = ("estimator", cls_predict) prediction = cls.predict_proba(X_test, batch_size=20) - self.assertIsInstance(prediction, list) - self.assertEqual(2, len(prediction)) - self.assertEqual((1647, 10), prediction[0].shape) - self.assertEqual((1647, 10), prediction[1].shape) + self.assertIsInstance(prediction, np.ndarray) + self.assertEqual(prediction.shape, ((1647, 10))) self.assertEqual(84, cls_predict.predict_proba.call_count) assert_array_almost_equal(prediction_, prediction) @@ -652,17 +695,18 @@ def test_predict_proba_batched_sparse(self): cls = SimpleClassificationPipeline(config) X_train, Y_train, X_test, Y_test = get_dataset(dataset='digits', make_sparse=True) - Y_train = np.array([(y, 26 - y) for y in Y_train]) + Y_train_ = np.zeros((Y_train.shape[0], 10)) + for i, y in enumerate(Y_train): + Y_train_[i][y] = 1 + Y_train = Y_train_ cls.fit(X_train, Y_train) X_test_ = X_test.copy() prediction_ = cls.predict_proba(X_test_) cls_predict = mock.Mock(wraps=cls.pipeline_.steps[-1][1]) cls.pipeline_.steps[-1] = ("estimator", cls_predict) prediction = cls.predict_proba(X_test, batch_size=20) - self.assertIsInstance(prediction, list) - self.assertEqual(2, len(prediction)) - self.assertEqual((1647, 10), prediction[0].shape) - self.assertEqual((1647, 10), prediction[1].shape) + self.assertEqual(prediction.shape, ((1647, 10))) + self.assertIsInstance(prediction, np.ndarray) self.assertEqual(84, cls_predict.predict_proba.call_count) assert_array_almost_equal(prediction_, prediction) @@ -683,3 +727,20 @@ def test_set_params(self): def test_get_params(self): pass + + def test_add_classifier(self): + self.assertEqual(len(classification_components._addons.components), 0) + classification_components.add_classifier(DummyClassifier) + self.assertEqual(len(classification_components._addons.components), 1) + cs = SimpleClassificationPipeline.get_hyperparameter_search_space() + self.assertIn('DummyClassifier', str(cs)) + del classification_components._addons.components['DummyClassifier'] + + def test_add_preprocessor(self): + self.assertEqual(len(preprocessing_components._addons.components), 0) + preprocessing_components.add_preprocessor(DummyPreprocessor) + self.assertEqual(len(preprocessing_components._addons.components), 1) + cs = SimpleClassificationPipeline.get_hyperparameter_search_space() + self.assertIn('DummyPreprocessor', str(cs)) + del preprocessing_components._addons.components['DummyPreprocessor'] + diff --git a/test/test_pipeline/test_regression.py b/test/test_pipeline/test_regression.py index 709191534b..1a2653208a 100644 --- a/test/test_pipeline/test_regression.py +++ b/test/test_pipeline/test_regression.py @@ -98,9 +98,13 @@ def test_configurations(self): self.assertIsInstance(predicted_probabiliets, np.ndarray) except ValueError as e: if "Floating-point under-/overflow occurred at epoch" in \ - e.args[0] or \ - "removed all features" in e.args[0] or \ - "all features are discarded" in e.args[0]: + e.args[0]: + continue + elif "all features are discarded" in e.args[0]: + continue + elif "removed all features" in e.args[0]: + continue + elif "Bug in scikit-learn:" in e.args[0]: continue else: print(config) @@ -155,7 +159,7 @@ def test_get_hyperparameter_search_space(self): self.assertIsInstance(cs, ConfigurationSpace) conditions = cs.get_conditions() hyperparameters = cs.get_hyperparameters() - self.assertEqual(114, len(hyperparameters)) + self.assertEqual(130, len(hyperparameters)) self.assertEqual(len(hyperparameters) - 5, len(conditions)) def test_get_hyperparameter_search_space_include_exclude_models(self): diff --git a/testcommand.sh b/testcommand.sh index 426743ef2a..367a087990 100644 --- a/testcommand.sh +++ b/testcommand.sh @@ -1,2 +1,2 @@ #!/usr/bin/env bash -nosetests --processes=3 --process-timeout=120 -v \ No newline at end of file +nosetests --processes=3 --process-timeout=120 -v $1 \ No newline at end of file