From bef81064c431995792e6294ef99e3e108c30269e Mon Sep 17 00:00:00 2001 From: Rohit Agarwal Date: Wed, 21 Oct 2020 21:30:32 +0530 Subject: [PATCH 01/13] Option to allow users to calculate multiple metrics for a pipeline --- autosklearn/automl.py | 35 ++++++++++++-- autosklearn/ensemble_builder.py | 8 ++-- autosklearn/ensembles/ensemble_selection.py | 8 ++-- autosklearn/estimators.py | 5 +- autosklearn/evaluation/__init__.py | 6 +-- autosklearn/evaluation/abstract_evaluator.py | 20 ++++---- autosklearn/evaluation/test_evaluator.py | 12 ++--- autosklearn/evaluation/train_evaluator.py | 32 ++++++------- autosklearn/metrics/__init__.py | 10 +++- autosklearn/smbo.py | 5 +- ...un_auto-sklearn_for_metadata_generation.py | 6 ++- test/test_evaluation/test_test_evaluator.py | 6 ++- test/test_evaluation/test_train_evaluator.py | 47 +++++++++++-------- 13 files changed, 126 insertions(+), 74 deletions(-) diff --git a/autosklearn/automl.py b/autosklearn/automl.py index 96450062f1..1825c992c2 100644 --- a/autosklearn/automl.py +++ b/autosklearn/automl.py @@ -42,9 +42,9 @@ from autosklearn.ensembles.singlebest_ensemble import SingleBest from autosklearn.smbo import AutoMLSMBO from autosklearn.util.hash import hash_array_or_matrix -from autosklearn.metrics import f1_macro, accuracy, r2 +from autosklearn.metrics import f1_macro, accuracy, r2, CLASSIFICATION_METRICS, REGRESSION_METRICS from autosklearn.constants import MULTILABEL_CLASSIFICATION, MULTICLASS_CLASSIFICATION, \ - REGRESSION_TASKS, REGRESSION, BINARY_CLASSIFICATION, MULTIOUTPUT_REGRESSION + REGRESSION_TASKS, REGRESSION, BINARY_CLASSIFICATION, MULTIOUTPUT_REGRESSION, CLASSIFICATION_TASKS from autosklearn.pipeline.components.classification import ClassifierChoice from autosklearn.pipeline.components.regression import RegressorChoice from autosklearn.pipeline.components.feature_preprocessing import FeaturePreprocessorChoice @@ -119,6 +119,7 @@ def __init__(self, smac_scenario_args=None, logging_config=None, metric=None, + scoring_functions=None ): super(AutoML, self).__init__() self._backend = backend @@ -141,6 +142,7 @@ def __init__(self, self._include_preprocessors = include_preprocessors self._exclude_preprocessors = exclude_preprocessors self._resampling_strategy = resampling_strategy + self._scoring_functions = scoring_functions if scoring_functions is not None else [] self._resampling_strategy_arguments = resampling_strategy_arguments \ if resampling_strategy_arguments is not None else {} if self._resampling_strategy not in ['holdout', @@ -623,6 +625,7 @@ def fit( disable_file_output=self._disable_evaluator_output, get_smac_object_callback=self._get_smac_object_callback, smac_scenario_args=self._smac_scenario_args, + scoring_functions=self._scoring_functions, ) try: @@ -949,7 +952,7 @@ def score(self, X, y): prediction=prediction, task_type=self._task, metric=self._metric, - all_scoring_functions=False) + scoring_functions=None) @property def cv_results_(self): @@ -984,11 +987,21 @@ def cv_results_(self): masks[name] = [] hp_names.append(name) + metric_mask = dict() + metric_dict = dict() + metric_name = [] + + for metric in self._scoring_functions: + metric_name.append(metric) + metric_dict[metric] = [] + metric_mask[metric] = [] + mean_test_score = [] mean_fit_time = [] params = [] status = [] budgets = [] + task_metrics = CLASSIFICATION_METRICS if self._task in CLASSIFICATION_TASKS else REGRESSION_METRICS for run_key in self.runhistory_.data: run_value = self.runhistory_.data[run_key] config_id = run_key.config_id @@ -1031,7 +1044,23 @@ def 
cv_results_(self): parameter_dictionaries[hp_name].append(hp_value) masks[hp_name].append(mask_value) + for name in metric_name: + if name in run_value.additional_info.keys(): + metric = task_metrics[name] + metric_value = metric._optimum - (metric._sign * run_value.additional_info[name]) + mask_value = False + else: + metric_value = np.NaN + mask_value = True + metric_dict[name].append(metric_value) + metric_mask[name].append(mask_value) + results['mean_test_score'] = np.array(mean_test_score) + for name in metric_name: + masked_array = ma.MaskedArray(metric_dict[name], + metric_mask[name]) + results['metric_%s' % name] = masked_array + results['mean_fit_time'] = np.array(mean_fit_time) results['params'] = params results['rank_test_scores'] = scipy.stats.rankdata(1 - results['mean_test_score'], diff --git a/autosklearn/ensemble_builder.py b/autosklearn/ensemble_builder.py index 2d4c85f415..d82f80cd7d 100644 --- a/autosklearn/ensemble_builder.py +++ b/autosklearn/ensemble_builder.py @@ -511,7 +511,7 @@ def score_ensemble_preds(self): prediction=y_ensemble, task_type=self.task_type, metric=self.metric, - all_scoring_functions=False) + scoring_functions=None) if self.read_preds[y_ens_fn]["ens_score"] > -1: self.logger.debug( @@ -993,7 +993,7 @@ def _add_ensemble_trajectory(self, train_pred, valid_pred, test_pred): prediction=train_pred, task_type=self.task_type, metric=self.metric, - all_scoring_functions=False + scoring_functions=None ) } if valid_pred is not None: @@ -1004,7 +1004,7 @@ def _add_ensemble_trajectory(self, train_pred, valid_pred, test_pred): prediction=valid_pred, task_type=self.task_type, metric=self.metric, - all_scoring_functions=False + scoring_functions=None ) # In case test_pred was provided @@ -1014,7 +1014,7 @@ def _add_ensemble_trajectory(self, train_pred, valid_pred, test_pred): prediction=test_pred, task_type=self.task_type, metric=self.metric, - all_scoring_functions=False + scoring_functions=None ) self.queue.put(performance_stamp) diff --git a/autosklearn/ensembles/ensemble_selection.py b/autosklearn/ensembles/ensemble_selection.py index 2c02b9263f..b14d200f6c 100644 --- a/autosklearn/ensembles/ensemble_selection.py +++ b/autosklearn/ensembles/ensemble_selection.py @@ -128,7 +128,7 @@ def _fast( ) # Calculate score is versatile and can return a dict of score - # when all_scoring_functions=False, we know it will be a float + # when scoring_functions=None, we know it will be a float calculated_score = cast( float, calculate_score( @@ -136,7 +136,7 @@ def _fast( prediction=fant_ensemble_prediction, task_type=self.task_type, metric=self.metric, - all_scoring_functions=False + scoring_functions=None ) ) scores[j] = self.metric._optimum - calculated_score @@ -178,7 +178,7 @@ def _slow( ensemble.append(pred) ensemble_prediction = np.mean(np.array(ensemble), axis=0) # Calculate score is versatile and can return a dict of score - # when all_scoring_functions=False, we know it will be a float + # when scoring_functions=None, we know it will be a float calculated_score = cast( float, calculate_score( @@ -186,7 +186,7 @@ def _slow( prediction=ensemble_prediction, task_type=self.task_type, metric=self.metric, - all_scoring_functions=False + scoring_functions=None ) ) scores[j] = self.metric._optimum - calculated_score diff --git a/autosklearn/estimators.py b/autosklearn/estimators.py index c20a7b5109..2856470236 100644 --- a/autosklearn/estimators.py +++ b/autosklearn/estimators.py @@ -43,6 +43,7 @@ def __init__( logging_config=None, metadata_directory=None, metric=None, + 
scoring_functions=None, ): """ Parameters @@ -264,6 +265,7 @@ def __init__( self.logging_config = logging_config self.metadata_directory = metadata_directory self._metric = metric + self._scoring_functions = scoring_functions self.automl_ = None # type: Optional[AutoML] # n_jobs after conversion to a number (b/c default is None) @@ -314,7 +316,8 @@ def build_automl( smac_scenario_args=smac_scenario_args, logging_config=self.logging_config, metadata_directory=self.metadata_directory, - metric=self._metric + metric=self._metric, + scoring_functions=self._scoring_functions ) return automl diff --git a/autosklearn/evaluation/__init__.py b/autosklearn/evaluation/__init__.py index 2c369149fe..56501fe743 100644 --- a/autosklearn/evaluation/__init__.py +++ b/autosklearn/evaluation/__init__.py @@ -88,7 +88,7 @@ class ExecuteTaFuncWithQueue(AbstractTAFunc): def __init__(self, backend, autosklearn_seed, resampling_strategy, metric, logger, cost_for_crash, abort_on_first_run_crash, initial_num_run=1, stats=None, - run_obj='quality', par_factor=1, all_scoring_functions=False, + run_obj='quality', par_factor=1, scoring_functions=None, output_y_hat_optimization=True, include=None, exclude=None, memory_limit=None, disable_file_output=False, init_params=None, budget_type=None, ta=False, **resampling_strategy_args): @@ -142,7 +142,7 @@ def __init__(self, backend, autosklearn_seed, resampling_strategy, metric, self.metric = metric self.resampling_strategy = resampling_strategy self.resampling_strategy_args = resampling_strategy_args - self.all_scoring_functions = all_scoring_functions + self.scoring_functions = scoring_functions # TODO deactivate output_y_hat_optimization and let the respective evaluator decide self.output_y_hat_optimization = output_y_hat_optimization self.include = include @@ -257,7 +257,7 @@ def run(self, config, instance=None, metric=self.metric, seed=self.autosklearn_seed, num_run=num_run, - all_scoring_functions=self.all_scoring_functions, + scoring_functions=self.scoring_functions, output_y_hat_optimization=self.output_y_hat_optimization, include=self.include, exclude=self.exclude, diff --git a/autosklearn/evaluation/abstract_evaluator.py b/autosklearn/evaluation/abstract_evaluator.py index 93b3bde49c..2c5506857d 100644 --- a/autosklearn/evaluation/abstract_evaluator.py +++ b/autosklearn/evaluation/abstract_evaluator.py @@ -115,7 +115,7 @@ def send_warnings_to_log(message, category, filename, lineno, class AbstractEvaluator(object): def __init__(self, backend, queue, metric, configuration=None, - all_scoring_functions=False, + scoring_functions=None, seed=1, output_y_hat_optimization=True, num_run=None, @@ -146,7 +146,7 @@ def __init__(self, backend, queue, metric, self.seed = seed self.output_y_hat_optimization = output_y_hat_optimization - self.all_scoring_functions = all_scoring_functions + self.scoring_functions = scoring_functions if isinstance(disable_file_output, (bool, list)): self.disable_file_output = disable_file_output @@ -226,7 +226,7 @@ def _get_model(self): init_params=self._init_params) return model - def _loss(self, y_true, y_hat, all_scoring_functions=None): + def _loss(self, y_true, y_hat, scoring_functions=None): """Auto-sklearn follows a minimization goal, so the make_scorer sign is used as a guide to obtain the value to reduce. 
@@ -238,20 +238,20 @@ def _loss(self, y_true, y_hat, all_scoring_functions=None): For accuracy for example: optimum(1) - (+1 * actual score) For logloss for example: optimum(0) - (-1 * actual score) """ - all_scoring_functions = ( - self.all_scoring_functions - if all_scoring_functions is None - else all_scoring_functions + scoring_functions = ( + self.scoring_functions + if scoring_functions is None + else scoring_functions ) if not isinstance(self.configuration, Configuration): - if all_scoring_functions: - return {self.metric: 1.0} + if scoring_functions: + return {self.metric.name: 1.0} else: return 1.0 score = calculate_score( y_true, y_hat, self.task_type, self.metric, - all_scoring_functions=all_scoring_functions) + scoring_functions=scoring_functions) if hasattr(score, '__len__'): # TODO: instead of using self.metric, it should use all metrics given by key. diff --git a/autosklearn/evaluation/test_evaluator.py b/autosklearn/evaluation/test_evaluator.py index 2fd84c15d6..5658026bc9 100644 --- a/autosklearn/evaluation/test_evaluator.py +++ b/autosklearn/evaluation/test_evaluator.py @@ -19,7 +19,7 @@ class TestEvaluator(AbstractEvaluator): def __init__(self, backend, queue, metric, configuration=None, - all_scoring_functions=False, + scoring_functions=None, seed=1, include=None, exclude=None, @@ -30,7 +30,7 @@ def __init__(self, backend, queue, metric, queue=queue, configuration=configuration, metric=metric, - all_scoring_functions=all_scoring_functions, + scoring_functions=scoring_functions, seed=seed, output_y_hat_optimization=False, num_run=-1, @@ -74,7 +74,7 @@ def predict_and_loss(self, train=False): prediction=Y_pred, task_type=self.task_type, metric=self.metric, - all_scoring_functions=self.all_scoring_functions) + scoring_functions=self.scoring_functions) else: Y_pred = self.predict_function(self.X_test, self.model, self.task_type, self.Y_train) @@ -83,7 +83,7 @@ def predict_and_loss(self, train=False): prediction=Y_pred, task_type=self.task_type, metric=self.metric, - all_scoring_functions=self.all_scoring_functions) + scoring_functions=self.scoring_functions) if hasattr(score, '__len__'): if self.task_type in CLASSIFICATION_TASKS: @@ -101,13 +101,13 @@ def predict_and_loss(self, train=False): # create closure for evaluating an algorithm # Has a stupid name so pytest doesn't regard it as a test def eval_t(queue, config, backend, metric, seed, num_run, instance, - all_scoring_functions, output_y_hat_optimization, include, + scoring_functions, output_y_hat_optimization, include, exclude, disable_file_output, init_params=None, budget_type=None, budget=None): evaluator = TestEvaluator(configuration=config, backend=backend, metric=metric, seed=seed, queue=queue, - all_scoring_functions=all_scoring_functions, + scoring_functions=scoring_functions, include=include, exclude=exclude, disable_file_output=disable_file_output, init_params=init_params) diff --git a/autosklearn/evaluation/train_evaluator.py b/autosklearn/evaluation/train_evaluator.py index a595f5a6a7..117845bce1 100644 --- a/autosklearn/evaluation/train_evaluator.py +++ b/autosklearn/evaluation/train_evaluator.py @@ -143,7 +143,7 @@ def _fit_with_budget(X_train, Y_train, budget, budget_type, logger, model, train class TrainEvaluator(AbstractEvaluator): def __init__(self, backend, queue, metric, configuration=None, - all_scoring_functions=False, + scoring_functions=None, seed=1, output_y_hat_optimization=True, resampling_strategy=None, @@ -161,7 +161,7 @@ def __init__(self, backend, queue, metric, queue=queue, 
configuration=configuration, metric=metric, - all_scoring_functions=all_scoring_functions, + scoring_functions=scoring_functions, seed=seed, output_y_hat_optimization=output_y_hat_optimization, num_run=num_run, @@ -358,7 +358,7 @@ def fit_predict_and_loss(self, iterative=False): # if all_scoring_function is true, return a dict of opt_loss. # Otherwise, return a scalar. - if self.all_scoring_functions is True: + if self.scoring_functions: opt_loss = {} for metric in opt_losses[0].keys(): opt_loss[metric] = np.average( @@ -534,7 +534,7 @@ def fit_predict_and_loss(self, iterative=False): # if all_scoring_function is true, return a dict of opt_loss. Otherwise, # return a scalar. - if self.all_scoring_functions is True: + if self.scoring_functions: opt_loss = {} for metric in opt_losses[0].keys(): opt_loss[metric] = np.average([opt_losses[i][metric] @@ -1044,7 +1044,7 @@ def eval_holdout( seed, num_run, instance, - all_scoring_functions, + scoring_functions, output_y_hat_optimization, include, exclude, @@ -1063,7 +1063,7 @@ def eval_holdout( configuration=config, seed=seed, num_run=num_run, - all_scoring_functions=all_scoring_functions, + scoring_functions=scoring_functions, output_y_hat_optimization=output_y_hat_optimization, include=include, exclude=exclude, @@ -1085,7 +1085,7 @@ def eval_iterative_holdout( seed, num_run, instance, - all_scoring_functions, + scoring_functions, output_y_hat_optimization, include, exclude, @@ -1103,7 +1103,7 @@ def eval_iterative_holdout( resampling_strategy_args=resampling_strategy_args, seed=seed, num_run=num_run, - all_scoring_functions=all_scoring_functions, + scoring_functions=scoring_functions, output_y_hat_optimization=output_y_hat_optimization, include=include, exclude=exclude, @@ -1126,7 +1126,7 @@ def eval_partial_cv( seed, num_run, instance, - all_scoring_functions, + scoring_functions, output_y_hat_optimization, include, exclude, @@ -1150,7 +1150,7 @@ def eval_partial_cv( resampling_strategy_args=resampling_strategy_args, seed=seed, num_run=num_run, - all_scoring_functions=all_scoring_functions, + scoring_functions=scoring_functions, output_y_hat_optimization=False, include=include, exclude=exclude, @@ -1173,7 +1173,7 @@ def eval_partial_cv_iterative( seed, num_run, instance, - all_scoring_functions, + scoring_functions, output_y_hat_optimization, include, exclude, @@ -1194,7 +1194,7 @@ def eval_partial_cv_iterative( seed=seed, num_run=num_run, instance=instance, - all_scoring_functions=all_scoring_functions, + scoring_functions=scoring_functions, output_y_hat_optimization=output_y_hat_optimization, include=include, exclude=exclude, @@ -1215,7 +1215,7 @@ def eval_cv( seed, num_run, instance, - all_scoring_functions, + scoring_functions, output_y_hat_optimization, include, exclude, @@ -1234,7 +1234,7 @@ def eval_cv( num_run=num_run, resampling_strategy=resampling_strategy, resampling_strategy_args=resampling_strategy_args, - all_scoring_functions=all_scoring_functions, + scoring_functions=scoring_functions, output_y_hat_optimization=output_y_hat_optimization, include=include, exclude=exclude, @@ -1257,7 +1257,7 @@ def eval_iterative_cv( seed, num_run, instance, - all_scoring_functions, + scoring_functions, output_y_hat_optimization, include, exclude, @@ -1276,7 +1276,7 @@ def eval_iterative_cv( num_run=num_run, resampling_strategy=resampling_strategy, resampling_strategy_args=resampling_strategy_args, - all_scoring_functions=all_scoring_functions, + scoring_functions=scoring_functions, output_y_hat_optimization=output_y_hat_optimization, 
include=include, exclude=exclude, diff --git a/autosklearn/metrics/__init__.py b/autosklearn/metrics/__init__.py index 62ed48e799..aa20b968fc 100644 --- a/autosklearn/metrics/__init__.py +++ b/autosklearn/metrics/__init__.py @@ -341,12 +341,12 @@ def calculate_score( prediction: np.ndarray, task_type: int, metric: Scorer, - all_scoring_functions: bool = False + scoring_functions: list = None ) -> Union[float, Dict[str, float]]: if task_type not in TASK_TYPES: raise NotImplementedError(task_type) - if all_scoring_functions: + if scoring_functions: score_dict = dict() if task_type in REGRESSION_TASKS: # TODO put this into the regression metric itself @@ -354,6 +354,9 @@ def calculate_score( metric_dict = copy.copy(REGRESSION_METRICS) metric_dict[metric.name] = metric for metric_ in REGRESSION_METRICS: + if metric_ not in scoring_functions: + continue + func = REGRESSION_METRICS[metric_] try: score_dict[func.name] = func(solution, cprediction) @@ -369,6 +372,9 @@ def calculate_score( metric_dict = copy.copy(CLASSIFICATION_METRICS) metric_dict[metric.name] = metric for metric_ in metric_dict: + if metric_ not in scoring_functions: + continue + func = CLASSIFICATION_METRICS[metric_] # TODO maybe annotate metrics to define which cases they can diff --git a/autosklearn/smbo.py b/autosklearn/smbo.py index 0992cb2383..2964a25ed5 100644 --- a/autosklearn/smbo.py +++ b/autosklearn/smbo.py @@ -218,7 +218,8 @@ def __init__(self, config_space, dataset_name, exclude_preprocessors=None, disable_file_output=False, smac_scenario_args=None, - get_smac_object_callback=None): + get_smac_object_callback=None, + scoring_functions=None): super(AutoMLSMBO, self).__init__() # data related self.dataset_name = dataset_name @@ -259,6 +260,7 @@ def __init__(self, config_space, dataset_name, self.disable_file_output = disable_file_output self.smac_scenario_args = smac_scenario_args self.get_smac_object_callback = get_smac_object_callback + self.scoring_functions = scoring_functions dataset_name_ = "" if dataset_name is None else dataset_name logger_name = '%s(%d):%s' % (self.__class__.__name__, self.seed, ":" + dataset_name_) @@ -433,6 +435,7 @@ def run_smbo(self): metric=self.metric, memory_limit=self.memory_limit, disable_file_output=self.disable_file_output, + scoring_functions=self.scoring_functions, **self.resampling_strategy_args ) ta = ExecuteTaFuncWithQueue diff --git a/scripts/run_auto-sklearn_for_metadata_generation.py b/scripts/run_auto-sklearn_for_metadata_generation.py index c197b2179b..acc6691886 100644 --- a/scripts/run_auto-sklearn_for_metadata_generation.py +++ b/scripts/run_auto-sklearn_for_metadata_generation.py @@ -10,7 +10,7 @@ from autosklearn.regression import AutoSklearnRegressor from autosklearn.evaluation import ExecuteTaFuncWithQueue, get_cost_of_crash from autosklearn.metrics import accuracy, balanced_accuracy, roc_auc, log_loss, r2, \ - mean_squared_error, mean_absolute_error, root_mean_squared_error + mean_squared_error, mean_absolute_error, root_mean_squared_error, CLASSIFICATION_METRICS, REGRESSION_METRICS from smac.runhistory.runhistory import RunInfo from smac.scenario.scenario import Scenario @@ -93,8 +93,10 @@ if task_type == 'classification': automl = AutoSklearnClassifier(**automl_arguments) + scoring_functions = CLASSIFICATION_METRICS elif task_type == 'regression': automl = AutoSklearnRegressor(**automl_arguments) + scoring_functions = REGRESSION_METRICS else: raise ValueError(task_type) @@ -138,7 +140,7 @@ disable_file_output=True, logger=logger, stats=stats, - 
all_scoring_functions=True, + scoring_functions=scoring_functions, include=include, metric=automl_arguments['metric'], cost_for_crash=get_cost_of_crash(automl_arguments['metric']), diff --git a/test/test_evaluation/test_test_evaluator.py b/test/test_evaluation/test_test_evaluator.py index 5a7cfac96d..c35de5d4f3 100644 --- a/test/test_evaluation/test_test_evaluator.py +++ b/test/test_evaluation/test_test_evaluator.py @@ -95,7 +95,7 @@ def test_eval_test(self): config=self.configuration, metric=accuracy, seed=1, num_run=1, - all_scoring_functions=False, + scoring_functions=None, output_y_hat_optimization=False, include=None, exclude=None, @@ -115,7 +115,9 @@ def test_eval_test_all_loss_functions(self): config=self.configuration, metric=accuracy, seed=1, num_run=1, - all_scoring_functions=True, + scoring_functions=['accuracy', 'balanced_accuracy', 'f1_macro', 'f1_micro', 'f1_weighted', + 'log_loss', 'precision_macro', 'precision_micro', 'precision_weighted', + 'recall_macro', 'recall_micro', 'recall_weighted'], output_y_hat_optimization=False, include=None, exclude=None, diff --git a/test/test_evaluation/test_train_evaluator.py b/test/test_evaluation/test_train_evaluator.py index 37db05e329..e7c5bf1aa9 100644 --- a/test/test_evaluation/test_train_evaluator.py +++ b/test/test_evaluation/test_train_evaluator.py @@ -99,7 +99,7 @@ def test_holdout(self, pipeline_mock): configuration=configuration, resampling_strategy='holdout', resampling_strategy_args={'train_size': 0.66}, - all_scoring_functions=False, + scoring_functions=None, output_y_hat_optimization=True, metric=accuracy, ) @@ -166,7 +166,7 @@ def configuration_fully_fitted(self): evaluator = TrainEvaluator(backend_api, queue_, configuration=configuration, resampling_strategy='holdout', - all_scoring_functions=False, + scoring_functions=None, output_y_hat_optimization=True, metric=accuracy, budget=0.0) @@ -265,7 +265,7 @@ def configuration_fully_fitted(self): evaluator = TrainEvaluator(backend_api, queue_, configuration=configuration, resampling_strategy='holdout-iterative-fit', - all_scoring_functions=False, + scoring_functions=None, output_y_hat_optimization=True, metric=accuracy, budget=0.0) @@ -336,7 +336,7 @@ def test_iterative_holdout_not_iterative(self, pipeline_mock): evaluator = TrainEvaluator(backend_api, queue_, configuration=configuration, resampling_strategy='holdout-iterative-fit', - all_scoring_functions=False, + scoring_functions=None, output_y_hat_optimization=True, metric=accuracy) evaluator.file_output = unittest.mock.Mock(spec=evaluator.file_output) @@ -380,7 +380,7 @@ def test_cv(self, pipeline_mock): configuration=configuration, resampling_strategy='cv', resampling_strategy_args={'folds': 5}, - all_scoring_functions=False, + scoring_functions=None, output_y_hat_optimization=True, metric=accuracy) evaluator.file_output = unittest.mock.Mock(spec=evaluator.file_output) @@ -435,7 +435,7 @@ def test_partial_cv(self, pipeline_mock): configuration=configuration, resampling_strategy='partial-cv', resampling_strategy_args={'folds': 5}, - all_scoring_functions=False, + scoring_functions=None, output_y_hat_optimization=True, metric=accuracy) @@ -496,7 +496,7 @@ def configuration_fully_fitted(self): configuration=configuration, resampling_strategy='partial-cv-iterative-fit', resampling_strategy_args={'folds': 5}, - all_scoring_functions=False, + scoring_functions=None, output_y_hat_optimization=True, metric=accuracy, budget=0.0) @@ -563,7 +563,10 @@ def test_file_output(self, loss_mock, makedirs_mock): configuration=configuration, 
resampling_strategy='cv', resampling_strategy_args={'folds': 5}, - all_scoring_functions=True, + scoring_functions=['accuracy', 'balanced_accuracy', 'f1_macro', 'f1_micro', + 'f1_weighted', 'log_loss', 'precision_macro', + 'precision_micro', 'precision_weighted', 'recall_macro', + 'recall_micro', 'recall_weighted'], output_y_hat_optimization=True, metric=accuracy) @@ -2267,7 +2270,7 @@ def test_eval_holdout(self): resampling_strategy_args=None, seed=1, num_run=1, - all_scoring_functions=False, + ascoring_functions=None, output_y_hat_optimization=True, include=None, exclude=None, @@ -2290,7 +2293,9 @@ def test_eval_holdout_all_loss_functions(self): resampling_strategy_args=None, seed=1, num_run=1, - all_scoring_functions=True, + scoring_functions=['accuracy', 'balanced_accuracy', 'f1_macro', 'f1_micro', 'f1_weighted', + 'log_loss', 'precision_macro', 'precision_micro', 'precision_weighted', + 'recall_macro', 'recall_micro', 'recall_weighted'], output_y_hat_optimization=True, include=None, exclude=None, @@ -2340,7 +2345,7 @@ def test_eval_holdout_iterative_fit_no_timeout(self): resampling_strategy_args=None, seed=1, num_run=1, - all_scoring_functions=False, + scoring_functions=None, output_y_hat_optimization=True, include=None, exclude=None, @@ -2363,7 +2368,7 @@ def test_eval_holdout_budget_iterations(self): resampling_strategy_args=None, seed=1, num_run=1, - all_scoring_functions=False, + scoring_functions=None, output_y_hat_optimization=True, include=None, exclude=None, @@ -2392,7 +2397,7 @@ def test_eval_holdout_budget_iterations_converged(self): resampling_strategy_args=None, seed=1, num_run=1, - all_scoring_functions=False, + scoring_functions=None, output_y_hat_optimization=True, include=None, exclude={'classifier': ['random_forest', 'liblinear_svc']}, @@ -2417,7 +2422,7 @@ def test_eval_holdout_budget_subsample(self): resampling_strategy_args=None, seed=1, num_run=1, - all_scoring_functions=False, + scoring_functions=None, output_y_hat_optimization=True, include=None, exclude=None, @@ -2443,7 +2448,7 @@ def test_eval_holdout_budget_mixed_iterations(self): resampling_strategy_args=None, seed=1, num_run=1, - all_scoring_functions=False, + scoring_functions=None, output_y_hat_optimization=True, include=None, exclude=None, @@ -2473,7 +2478,7 @@ def test_eval_holdout_budget_mixed_subsample(self): resampling_strategy_args=None, seed=1, num_run=1, - all_scoring_functions=False, + scoring_functions=None, output_y_hat_optimization=True, include=None, exclude={'classifier': ['random_forest']}, @@ -2498,7 +2503,7 @@ def test_eval_cv(self): num_run=1, resampling_strategy='cv', resampling_strategy_args={'folds': 3}, - all_scoring_functions=False, + scoring_functions=None, output_y_hat_optimization=True, include=None, exclude=None, @@ -2521,7 +2526,9 @@ def test_eval_cv_all_loss_functions(self): num_run=1, resampling_strategy='cv', resampling_strategy_args={'folds': 3}, - all_scoring_functions=True, + scoring_functions=['accuracy', 'balanced_accuracy', 'f1_macro', 'f1_micro', 'f1_weighted', + 'log_loss', 'precision_macro', 'precision_micro', 'precision_weighted', + 'recall_macro', 'recall_micro', 'recall_weighted'], output_y_hat_optimization=True, include=None, exclude=None, @@ -2565,7 +2572,7 @@ def test_eval_cv_all_loss_functions(self): # backend_api = backend.create(self.tmp_dir, self.tmp_dir) # eval_cv(queue=self.queue, config=self.configuration, data=self.data, # backend=backend_api, seed=1, num_run=1, folds=5, subsample=45, - # with_predictions=True, all_scoring_functions=False, + # 
with_predictions=True, scoring_functions=None, # output_y_hat_optimization=True, include=None, exclude=None, # disable_file_output=False) # info = self.queue.get() @@ -2589,7 +2596,7 @@ def test_eval_partial_cv(self): instance=instance, resampling_strategy='partial-cv', resampling_strategy_args={'folds': 5}, - all_scoring_functions=False, + scoring_functions=None, output_y_hat_optimization=True, include=None, exclude=None, From 2b08168b99d718706a944d92beb404554ed01ac8 Mon Sep 17 00:00:00 2001 From: Rohit Agarwal Date: Wed, 28 Oct 2020 15:56:30 +0530 Subject: [PATCH 02/13] Fix: metric score was not calculated when scoring_function passed --- autosklearn/metrics/__init__.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/autosklearn/metrics/__init__.py b/autosklearn/metrics/__init__.py index aa20b968fc..55e653c184 100644 --- a/autosklearn/metrics/__init__.py +++ b/autosklearn/metrics/__init__.py @@ -394,14 +394,20 @@ def calculate_score( continue else: raise e + + if metric and metric.name not in score_dict.keys(): + score_dict[metric.name] = get_metric_score(metric, prediction, solution, task_type) return score_dict else: - if task_type in REGRESSION_TASKS: - # TODO put this into the regression metric itself - cprediction = sanitize_array(prediction) - score = metric(solution, cprediction) - else: - score = metric(solution, prediction) + return get_metric_score(metric, prediction, solution, task_type) + - return score +def get_metric_score(metric, prediction, solution, task_type): + if task_type in REGRESSION_TASKS: + # TODO put this into the regression metric itself + cprediction = sanitize_array(prediction) + score = metric(solution, cprediction) + else: + score = metric(solution, prediction) + return score From e931b1a5c86688f1c01b9e62d2159b39532eeee5 Mon Sep 17 00:00:00 2001 From: Rohit Agarwal Date: Wed, 4 Nov 2020 12:39:57 +0530 Subject: [PATCH 03/13] Fixing code formatting --- autosklearn/automl.py | 11 +++++++--- test/test_evaluation/test_test_evaluator.py | 7 +++--- test/test_evaluation/test_train_evaluator.py | 23 +++++++++++--------- 3 files changed, 25 insertions(+), 16 deletions(-) diff --git a/autosklearn/automl.py b/autosklearn/automl.py index 1825c992c2..233240ddb0 100644 --- a/autosklearn/automl.py +++ b/autosklearn/automl.py @@ -44,7 +44,8 @@ from autosklearn.util.hash import hash_array_or_matrix from autosklearn.metrics import f1_macro, accuracy, r2, CLASSIFICATION_METRICS, REGRESSION_METRICS from autosklearn.constants import MULTILABEL_CLASSIFICATION, MULTICLASS_CLASSIFICATION, \ - REGRESSION_TASKS, REGRESSION, BINARY_CLASSIFICATION, MULTIOUTPUT_REGRESSION, CLASSIFICATION_TASKS + REGRESSION_TASKS, REGRESSION, BINARY_CLASSIFICATION, MULTIOUTPUT_REGRESSION,\ + CLASSIFICATION_TASKS from autosklearn.pipeline.components.classification import ClassifierChoice from autosklearn.pipeline.components.regression import RegressorChoice from autosklearn.pipeline.components.feature_preprocessing import FeaturePreprocessorChoice @@ -1001,7 +1002,10 @@ def cv_results_(self): params = [] status = [] budgets = [] - task_metrics = CLASSIFICATION_METRICS if self._task in CLASSIFICATION_TASKS else REGRESSION_METRICS + if self._task in CLASSIFICATION_TASKS: + task_metrics = CLASSIFICATION_METRICS + else: + task_metrics = REGRESSION_METRICS for run_key in self.runhistory_.data: run_value = self.runhistory_.data[run_key] config_id = run_key.config_id @@ -1047,7 +1051,8 @@ def cv_results_(self): for name in metric_name: if name in 
run_value.additional_info.keys(): metric = task_metrics[name] - metric_value = metric._optimum - (metric._sign * run_value.additional_info[name]) + metric_cost = run_value.additional_info[name] + metric_value = metric._optimum - (metric._sign * metric_cost) mask_value = False else: metric_value = np.NaN diff --git a/test/test_evaluation/test_test_evaluator.py b/test/test_evaluation/test_test_evaluator.py index c35de5d4f3..9affa9bb5c 100644 --- a/test/test_evaluation/test_test_evaluator.py +++ b/test/test_evaluation/test_test_evaluator.py @@ -115,9 +115,10 @@ def test_eval_test_all_loss_functions(self): config=self.configuration, metric=accuracy, seed=1, num_run=1, - scoring_functions=['accuracy', 'balanced_accuracy', 'f1_macro', 'f1_micro', 'f1_weighted', - 'log_loss', 'precision_macro', 'precision_micro', 'precision_weighted', - 'recall_macro', 'recall_micro', 'recall_weighted'], + scoring_functions=['accuracy', 'balanced_accuracy', 'f1_macro', 'f1_micro', + 'f1_weighted', 'log_loss', 'precision_macro', 'precision_micro', + 'precision_weighted', 'recall_macro', 'recall_micro', + 'recall_weighted'], output_y_hat_optimization=False, include=None, exclude=None, diff --git a/test/test_evaluation/test_train_evaluator.py b/test/test_evaluation/test_train_evaluator.py index e7c5bf1aa9..649e69d494 100644 --- a/test/test_evaluation/test_train_evaluator.py +++ b/test/test_evaluation/test_train_evaluator.py @@ -558,15 +558,16 @@ def test_file_output(self, loss_mock, makedirs_mock): configuration = unittest.mock.Mock(spec=Configuration) queue_ = multiprocessing.Queue() loss_mock.return_value = None + scorer_list = ['accuracy', 'balanced_accuracy', 'f1_macro', 'f1_micro', + 'f1_weighted', 'log_loss', 'precision_macro', + 'precision_micro', 'precision_weighted', 'recall_macro', + 'recall_micro', 'recall_weighted'] evaluator = TrainEvaluator(self.backend_mock, queue=queue_, configuration=configuration, resampling_strategy='cv', resampling_strategy_args={'folds': 5}, - scoring_functions=['accuracy', 'balanced_accuracy', 'f1_macro', 'f1_micro', - 'f1_weighted', 'log_loss', 'precision_macro', - 'precision_micro', 'precision_weighted', 'recall_macro', - 'recall_micro', 'recall_weighted'], + scoring_functions=scorer_list, output_y_hat_optimization=True, metric=accuracy) @@ -2293,9 +2294,10 @@ def test_eval_holdout_all_loss_functions(self): resampling_strategy_args=None, seed=1, num_run=1, - scoring_functions=['accuracy', 'balanced_accuracy', 'f1_macro', 'f1_micro', 'f1_weighted', - 'log_loss', 'precision_macro', 'precision_micro', 'precision_weighted', - 'recall_macro', 'recall_micro', 'recall_weighted'], + scoring_functions=['accuracy', 'balanced_accuracy', 'f1_macro', 'f1_micro', + 'f1_weighted', 'log_loss', 'precision_macro', 'precision_micro', + 'precision_weighted', 'recall_macro', 'recall_micro', + 'recall_weighted'], output_y_hat_optimization=True, include=None, exclude=None, @@ -2526,9 +2528,10 @@ def test_eval_cv_all_loss_functions(self): num_run=1, resampling_strategy='cv', resampling_strategy_args={'folds': 3}, - scoring_functions=['accuracy', 'balanced_accuracy', 'f1_macro', 'f1_micro', 'f1_weighted', - 'log_loss', 'precision_macro', 'precision_micro', 'precision_weighted', - 'recall_macro', 'recall_micro', 'recall_weighted'], + scoring_functions=['accuracy', 'balanced_accuracy', 'f1_macro', 'f1_micro', + 'f1_weighted', 'log_loss', 'precision_macro', 'precision_micro', + 'precision_weighted', 'recall_macro', 'recall_micro', + 'recall_weighted'], output_y_hat_optimization=True, include=None, 
exclude=None, From 3ab0ac2a704749e318483b013564c88f502f934f Mon Sep 17 00:00:00 2001 From: Rohit Agarwal Date: Tue, 10 Nov 2020 19:38:49 +0530 Subject: [PATCH 04/13] Incorporating review comments --- autosklearn/automl.py | 24 ++++++++----------- autosklearn/estimators.py | 9 +++++-- autosklearn/metrics/__init__.py | 21 ++++------------ autosklearn/smbo.py | 2 +- ...un_auto-sklearn_for_metadata_generation.py | 6 +++-- test/test_evaluation/evaluation_util.py | 7 ++++++ test/test_evaluation/test_test_evaluator.py | 7 ++---- test/test_evaluation/test_train_evaluator.py | 17 ++++--------- 8 files changed, 40 insertions(+), 53 deletions(-) diff --git a/autosklearn/automl.py b/autosklearn/automl.py index ac639bb8c3..a9112de31b 100644 --- a/autosklearn/automl.py +++ b/autosklearn/automl.py @@ -45,7 +45,7 @@ from autosklearn.ensembles.singlebest_ensemble import SingleBest from autosklearn.smbo import AutoMLSMBO from autosklearn.util.hash import hash_array_or_matrix -from autosklearn.metrics import f1_macro, accuracy, r2, CLASSIFICATION_METRICS, REGRESSION_METRICS +from autosklearn.metrics import f1_macro, accuracy, r2 from autosklearn.constants import MULTILABEL_CLASSIFICATION, MULTICLASS_CLASSIFICATION, \ REGRESSION_TASKS, REGRESSION, BINARY_CLASSIFICATION, MULTIOUTPUT_REGRESSION,\ CLASSIFICATION_TASKS @@ -1020,19 +1020,16 @@ def cv_results_(self): metric_name = [] for metric in self._scoring_functions: - metric_name.append(metric) - metric_dict[metric] = [] - metric_mask[metric] = [] + metric_name.append(metric.name) + metric_dict[metric.name] = [] + metric_mask[metric.name] = [] mean_test_score = [] mean_fit_time = [] params = [] status = [] budgets = [] - if self._task in CLASSIFICATION_TASKS: - task_metrics = CLASSIFICATION_METRICS - else: - task_metrics = REGRESSION_METRICS + for run_key in self.runhistory_.data: run_value = self.runhistory_.data[run_key] config_id = run_key.config_id @@ -1075,17 +1072,16 @@ def cv_results_(self): parameter_dictionaries[hp_name].append(hp_value) masks[hp_name].append(mask_value) - for name in metric_name: - if name in run_value.additional_info.keys(): - metric = task_metrics[name] - metric_cost = run_value.additional_info[name] + for metric in self._scoring_functions: + if metric.name in run_value.additional_info.keys(): + metric_cost = run_value.additional_info[metric.name] metric_value = metric._optimum - (metric._sign * metric_cost) mask_value = False else: metric_value = np.NaN mask_value = True - metric_dict[name].append(metric_value) - metric_mask[name].append(mask_value) + metric_dict[metric.name].append(metric_value) + metric_mask[metric.name].append(mask_value) results['mean_test_score'] = np.array(mean_test_score) for name in metric_name: diff --git a/autosklearn/estimators.py b/autosklearn/estimators.py index 976a5d162c..bc4b035889 100644 --- a/autosklearn/estimators.py +++ b/autosklearn/estimators.py @@ -1,6 +1,6 @@ # -*- encoding: utf-8 -*- -from typing import Optional, Dict +from typing import Optional, Dict, List import dask.distributed import joblib @@ -9,6 +9,7 @@ from sklearn.utils.multiclass import type_of_target from autosklearn.automl import AutoMLClassifier, AutoMLRegressor, AutoML +from autosklearn.metrics import Scorer from autosklearn.util.backend import create @@ -42,7 +43,7 @@ def __init__( logging_config=None, metadata_directory=None, metric=None, - scoring_functions=None, + scoring_functions: List[Scorer] = None, load_models: bool = True, ): """ @@ -219,6 +220,10 @@ def __init__( Metrics`_. 
If None is provided, a default metric is selected depending on the task. + scoring_functions : List[Scorer], optional (None) + List of scorers which will be calculated for each pipeline and results will be + available via ``cv_results`` + load_models : bool, optional (True) Whether to load the models after fitting Auto-sklearn. diff --git a/autosklearn/metrics/__init__.py b/autosklearn/metrics/__init__.py index 55e653c184..6fe08621b8 100644 --- a/autosklearn/metrics/__init__.py +++ b/autosklearn/metrics/__init__.py @@ -341,7 +341,7 @@ def calculate_score( prediction: np.ndarray, task_type: int, metric: Scorer, - scoring_functions: list = None + scoring_functions: List[Scorer] = None ) -> Union[float, Dict[str, float]]: if task_type not in TASK_TYPES: raise NotImplementedError(task_type) @@ -351,15 +351,10 @@ def calculate_score( if task_type in REGRESSION_TASKS: # TODO put this into the regression metric itself cprediction = sanitize_array(prediction) - metric_dict = copy.copy(REGRESSION_METRICS) - metric_dict[metric.name] = metric - for metric_ in REGRESSION_METRICS: - if metric_ not in scoring_functions: - continue + for metric_ in scoring_functions: - func = REGRESSION_METRICS[metric_] try: - score_dict[func.name] = func(solution, cprediction) + score_dict[metric_.name] = metric_(solution, cprediction) except ValueError as e: print(e, e.args[0]) if e.args[0] == "Mean Squared Logarithmic Error cannot be used when " \ @@ -369,19 +364,13 @@ def calculate_score( raise e else: - metric_dict = copy.copy(CLASSIFICATION_METRICS) - metric_dict[metric.name] = metric - for metric_ in metric_dict: - if metric_ not in scoring_functions: - continue - - func = CLASSIFICATION_METRICS[metric_] + for metric_ in scoring_functions: # TODO maybe annotate metrics to define which cases they can # handle? 
try: - score_dict[func.name] = func(solution, prediction) + score_dict[metric_.name] = metric_(solution, prediction) except ValueError as e: if e.args[0] == 'multiclass format is not supported': continue diff --git a/autosklearn/smbo.py b/autosklearn/smbo.py index 91ee89a96e..cfa1ec4731 100644 --- a/autosklearn/smbo.py +++ b/autosklearn/smbo.py @@ -221,7 +221,7 @@ def __init__(self, config_space, dataset_name, disable_file_output=False, smac_scenario_args=None, get_smac_object_callback=None, - scoring_functions=None): + scoring_functions=None, ensemble_callback: typing.Optional[EnsembleBuilderManager] = None, ): super(AutoMLSMBO, self).__init__() diff --git a/scripts/run_auto-sklearn_for_metadata_generation.py b/scripts/run_auto-sklearn_for_metadata_generation.py index 7145545434..56a0d6fadf 100644 --- a/scripts/run_auto-sklearn_for_metadata_generation.py +++ b/scripts/run_auto-sklearn_for_metadata_generation.py @@ -93,13 +93,15 @@ if task_type == 'classification': automl = AutoSklearnClassifier(**automl_arguments) - scoring_functions = CLASSIFICATION_METRICS + scorer_list = CLASSIFICATION_METRICS elif task_type == 'regression': automl = AutoSklearnRegressor(**automl_arguments) - scoring_functions = REGRESSION_METRICS + scorer_list = REGRESSION_METRICS else: raise ValueError(task_type) +scoring_functions = [scorer for name, scorer in scorer_list.items()] + automl.fit(X_train, y_train, dataset_name=str(task_id), feat_type=cat, X_test=X_test, y_test=y_test) trajectory = automl.trajectory_ diff --git a/test/test_evaluation/evaluation_util.py b/test/test_evaluation/evaluation_util.py index 70a7fc1ae2..2ddb1f9438 100644 --- a/test/test_evaluation/evaluation_util.py +++ b/test/test_evaluation/evaluation_util.py @@ -14,6 +14,13 @@ from autosklearn.util.data import convert_to_bin from autosklearn.data.xy_data_manager import XYDataManager from autosklearn.pipeline.util import get_dataset +from autosklearn.metrics import accuracy, balanced_accuracy, f1_macro, f1_micro, f1_weighted, \ + log_loss, precision_macro, precision_micro, precision_weighted, recall_macro, \ + recall_micro, recall_weighted + +SCORER_LIST = [accuracy, balanced_accuracy, f1_macro, f1_micro, f1_weighted, log_loss, + precision_macro, precision_micro, precision_weighted, recall_macro, + recall_micro, recall_weighted] N_TEST_RUNS = 5 diff --git a/test/test_evaluation/test_test_evaluator.py b/test/test_evaluation/test_test_evaluator.py index 89414893d7..db0cbbd6cc 100644 --- a/test/test_evaluation/test_test_evaluator.py +++ b/test/test_evaluation/test_test_evaluator.py @@ -22,7 +22,7 @@ this_directory = os.path.dirname(__file__) sys.path.append(this_directory) from evaluation_util import get_dataset_getters, BaseEvaluatorTest, \ - get_multiclass_classification_datamanager # noqa (E402: module level import not at top of file) + get_multiclass_classification_datamanager, SCORER_LIST # noqa (E402: module level import not at top of file) N_TEST_RUNS = 3 @@ -114,10 +114,7 @@ def test_eval_test_all_loss_functions(self): config=self.configuration, metric=accuracy, seed=1, num_run=1, - scoring_functions=['accuracy', 'balanced_accuracy', 'f1_macro', 'f1_micro', - 'f1_weighted', 'log_loss', 'precision_macro', 'precision_micro', - 'precision_weighted', 'recall_macro', 'recall_micro', - 'recall_weighted'], + scoring_functions=SCORER_LIST, output_y_hat_optimization=False, include=None, exclude=None, diff --git a/test/test_evaluation/test_train_evaluator.py b/test/test_evaluation/test_train_evaluator.py index c9b980ed3f..2c8e499ae2 100644 --- 
a/test/test_evaluation/test_train_evaluator.py +++ b/test/test_evaluation/test_train_evaluator.py @@ -34,7 +34,7 @@ sys.path.append(this_directory) from evaluation_util import get_regression_datamanager, BaseEvaluatorTest, \ get_binary_classification_datamanager, get_dataset_getters, \ - get_multiclass_classification_datamanager # noqa (E402: module level import not at top of file) + get_multiclass_classification_datamanager, SCORER_LIST # noqa (E402: module level import not at top of file) class BackendMock(object): @@ -546,10 +546,7 @@ def test_file_output(self, loss_mock): configuration = unittest.mock.Mock(spec=Configuration) queue_ = multiprocessing.Queue() loss_mock.return_value = None - scorer_list = ['accuracy', 'balanced_accuracy', 'f1_macro', 'f1_micro', - 'f1_weighted', 'log_loss', 'precision_macro', - 'precision_micro', 'precision_weighted', 'recall_macro', - 'recall_micro', 'recall_weighted'] + scorer_list = SCORER_LIST evaluator = TrainEvaluator(self.backend_mock, queue=queue_, configuration=configuration, @@ -2285,10 +2282,7 @@ def test_eval_holdout_all_loss_functions(self): resampling_strategy_args=None, seed=1, num_run=1, - scoring_functions=['accuracy', 'balanced_accuracy', 'f1_macro', 'f1_micro', - 'f1_weighted', 'log_loss', 'precision_macro', 'precision_micro', - 'precision_weighted', 'recall_macro', 'recall_micro', - 'recall_weighted'], + scoring_functions=SCORER_LIST, output_y_hat_optimization=True, include=None, exclude=None, @@ -2519,10 +2513,7 @@ def test_eval_cv_all_loss_functions(self): num_run=1, resampling_strategy='cv', resampling_strategy_args={'folds': 3}, - scoring_functions=['accuracy', 'balanced_accuracy', 'f1_macro', 'f1_micro', - 'f1_weighted', 'log_loss', 'precision_macro', 'precision_micro', - 'precision_weighted', 'recall_macro', 'recall_micro', - 'recall_weighted'], + scoring_functions=SCORER_LIST, output_y_hat_optimization=True, include=None, exclude=None, From b4a72609a258f82207e4b0eaf41dc3aae4046b38 Mon Sep 17 00:00:00 2001 From: ROHIT AGARWAL Date: Tue, 10 Nov 2020 19:43:28 +0530 Subject: [PATCH 05/13] Update run_auto-sklearn_for_metadata_generation.py --- scripts/run_auto-sklearn_for_metadata_generation.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/run_auto-sklearn_for_metadata_generation.py b/scripts/run_auto-sklearn_for_metadata_generation.py index 56a0d6fadf..99ff93af3d 100644 --- a/scripts/run_auto-sklearn_for_metadata_generation.py +++ b/scripts/run_auto-sklearn_for_metadata_generation.py @@ -10,7 +10,8 @@ from autosklearn.regression import AutoSklearnRegressor from autosklearn.evaluation import ExecuteTaFuncWithQueue, get_cost_of_crash from autosklearn.metrics import accuracy, balanced_accuracy, roc_auc, log_loss, r2, \ - mean_squared_error, mean_absolute_error, root_mean_squared_error, CLASSIFICATION_METRICS, REGRESSION_METRICS + mean_squared_error, mean_absolute_error, root_mean_squared_error, CLASSIFICATION_METRICS, \ + REGRESSION_METRICS from smac.runhistory.runhistory import RunInfo from smac.scenario.scenario import Scenario From d94b6ac8cffc3868dff27cef7080adc836f4f4bd Mon Sep 17 00:00:00 2001 From: ROHIT AGARWAL Date: Wed, 11 Nov 2020 15:46:51 +0530 Subject: [PATCH 06/13] Update test_train_evaluator.py --- test/test_evaluation/test_train_evaluator.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/test/test_evaluation/test_train_evaluator.py b/test/test_evaluation/test_train_evaluator.py index 2c8e499ae2..281cdcfaa3 100644 --- a/test/test_evaluation/test_train_evaluator.py +++ 
b/test/test_evaluation/test_train_evaluator.py @@ -546,13 +546,12 @@ def test_file_output(self, loss_mock): configuration = unittest.mock.Mock(spec=Configuration) queue_ = multiprocessing.Queue() loss_mock.return_value = None - scorer_list = SCORER_LIST evaluator = TrainEvaluator(self.backend_mock, queue=queue_, configuration=configuration, resampling_strategy='cv', resampling_strategy_args={'folds': 5}, - scoring_functions=scorer_list, + scoring_functions=SCORER_LIST, output_y_hat_optimization=True, metric=accuracy) From 3d79a3e7b27f5510aebcd0672f7064d045e9d84a Mon Sep 17 00:00:00 2001 From: ROHIT AGARWAL Date: Wed, 11 Nov 2020 16:11:46 +0530 Subject: [PATCH 07/13] Update estimators.py --- autosklearn/estimators.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autosklearn/estimators.py b/autosklearn/estimators.py index bc4b035889..a7936c517a 100644 --- a/autosklearn/estimators.py +++ b/autosklearn/estimators.py @@ -43,7 +43,7 @@ def __init__( logging_config=None, metadata_directory=None, metric=None, - scoring_functions: List[Scorer] = None, + scoring_functions: Optional[List[Scorer]] = None, load_models: bool = True, ): """ From 7d86ddd6f1a68a92f79c10d13ad097cd5713b515 Mon Sep 17 00:00:00 2001 From: ROHIT AGARWAL Date: Wed, 11 Nov 2020 16:12:29 +0530 Subject: [PATCH 08/13] Update __init__.py --- autosklearn/metrics/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autosklearn/metrics/__init__.py b/autosklearn/metrics/__init__.py index 6fe08621b8..1d8c5deb62 100644 --- a/autosklearn/metrics/__init__.py +++ b/autosklearn/metrics/__init__.py @@ -341,7 +341,7 @@ def calculate_score( prediction: np.ndarray, task_type: int, metric: Scorer, - scoring_functions: List[Scorer] = None + scoring_functions: Optional[List[Scorer]] = None ) -> Union[float, Dict[str, float]]: if task_type not in TASK_TYPES: raise NotImplementedError(task_type) From 3bbeda8174c2e4a2643a0323c01ceb0357eb786a Mon Sep 17 00:00:00 2001 From: Rohit Agarwal Date: Wed, 11 Nov 2020 21:12:09 +0530 Subject: [PATCH 09/13] Fix build --- autosklearn/automl.py | 3 +-- autosklearn/metrics/__init__.py | 12 ++++++++---- test/test_evaluation/test_train_evaluator.py | 2 +- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/autosklearn/automl.py b/autosklearn/automl.py index a9112de31b..06ab992a5c 100644 --- a/autosklearn/automl.py +++ b/autosklearn/automl.py @@ -47,8 +47,7 @@ from autosklearn.util.hash import hash_array_or_matrix from autosklearn.metrics import f1_macro, accuracy, r2 from autosklearn.constants import MULTILABEL_CLASSIFICATION, MULTICLASS_CLASSIFICATION, \ - REGRESSION_TASKS, REGRESSION, BINARY_CLASSIFICATION, MULTIOUTPUT_REGRESSION,\ - CLASSIFICATION_TASKS + REGRESSION_TASKS, REGRESSION, BINARY_CLASSIFICATION, MULTIOUTPUT_REGRESSION from autosklearn.pipeline.components.classification import ClassifierChoice from autosklearn.pipeline.components.regression import RegressorChoice from autosklearn.pipeline.components.feature_preprocessing import FeaturePreprocessorChoice diff --git a/autosklearn/metrics/__init__.py b/autosklearn/metrics/__init__.py index 1d8c5deb62..98434a980d 100644 --- a/autosklearn/metrics/__init__.py +++ b/autosklearn/metrics/__init__.py @@ -1,4 +1,3 @@ -import copy from abc import ABCMeta, abstractmethod from functools import partial from typing import Any, Callable, Dict, List, Optional, Union @@ -392,11 +391,16 @@ def calculate_score( return get_metric_score(metric, prediction, solution, task_type) -def get_metric_score(metric, prediction, 
solution, task_type): +def get_metric_score( + metric_: Scorer, + prediction: np.ndarray, + solution: np.ndarray, + task_type: int +) -> float: if task_type in REGRESSION_TASKS: # TODO put this into the regression metric itself cprediction = sanitize_array(prediction) - score = metric(solution, cprediction) + score = metric_(solution, cprediction) else: - score = metric(solution, prediction) + score = metric_(solution, prediction) return score diff --git a/test/test_evaluation/test_train_evaluator.py b/test/test_evaluation/test_train_evaluator.py index 281cdcfaa3..3c8a29cc3e 100644 --- a/test/test_evaluation/test_train_evaluator.py +++ b/test/test_evaluation/test_train_evaluator.py @@ -2258,7 +2258,7 @@ def test_eval_holdout(self): resampling_strategy_args=None, seed=1, num_run=1, - ascoring_functions=None, + scoring_functions=None, output_y_hat_optimization=True, include=None, exclude=None, From 75ac0773db2255400f3cbdc5ea69538e32dcd864 Mon Sep 17 00:00:00 2001 From: Rohit Agarwal Date: Wed, 11 Nov 2020 21:13:45 +0530 Subject: [PATCH 10/13] Removed unnecessary checks --- autosklearn/metrics/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autosklearn/metrics/__init__.py b/autosklearn/metrics/__init__.py index 98434a980d..e85dffc2f2 100644 --- a/autosklearn/metrics/__init__.py +++ b/autosklearn/metrics/__init__.py @@ -383,7 +383,7 @@ def calculate_score( else: raise e - if metric and metric.name not in score_dict.keys(): + if metric.name not in score_dict.keys(): score_dict[metric.name] = get_metric_score(metric, prediction, solution, task_type) return score_dict From fcd4175c84ce0196e49e0fe46eb99d92da621803 Mon Sep 17 00:00:00 2001 From: Rohit Agarwal Date: Wed, 2 Dec 2020 16:50:48 +0530 Subject: [PATCH 11/13] Adding test cases --- test/test_automl/test_estimators.py | 10 +++- test/test_metric/test_metrics.py | 87 +++++++++++++++++++++++++++++ 2 files changed, 96 insertions(+), 1 deletion(-) diff --git a/test/test_automl/test_estimators.py b/test/test_automl/test_estimators.py index 4db354dbe7..1dbcd132bf 100644 --- a/test/test_automl/test_estimators.py +++ b/test/test_automl/test_estimators.py @@ -249,7 +249,9 @@ def test_cv_results(tmp_dir, output_dir): output_folder=output_dir, seed=1, initial_configurations_via_metalearning=0, - ensemble_size=0) + ensemble_size=0, + scoring_functions=[autosklearn.metrics.precision, + autosklearn.metrics.roc_auc]) cls.fit(X_train, Y_train) cv_results = cls.cv_results_ assert isinstance(cv_results, dict), type(cv_results) @@ -262,6 +264,12 @@ def test_cv_results(tmp_dir, output_dir): assert isinstance(cv_results['rank_test_scores'], np.ndarray), type( cv_results['rank_test_scores'] ) + assert isinstance(cv_results['metric_precision'], npma.MaskedArray), type( + cv_results['metric_precision'] + ) + assert isinstance(cv_results['metric_roc_auc'], npma.MaskedArray), type( + cv_results['metric_roc_auc'] + ) cv_result_items = [isinstance(val, npma.MaskedArray) for key, val in cv_results.items() if key.startswith('param_')] assert all(cv_result_items), cv_results.items() diff --git a/test/test_metric/test_metrics.py b/test/test_metric/test_metrics.py index 39aa493539..cbb7b16e17 100644 --- a/test/test_metric/test_metrics.py +++ b/test/test_metric/test_metrics.py @@ -5,6 +5,9 @@ import autosklearn.metrics +from autosklearn.metrics import calculate_score +from autosklearn.constants import BINARY_CLASSIFICATION, REGRESSION + from smac.utils.constants import MAXINT @@ -443,3 +446,87 @@ def test_classification_multilabel(self): 
previous_score = current_score current_score = scorer(y_true, y_pred) self.assertLess(current_score, previous_score) + + +class TestCalculateScore(unittest.TestCase): + + def test_unsupported_task_type(self): + y_true = np.array([1.0, 1.0, 1.0, 0.0, 0.0, 0.0]) + y_pred = \ + np.array([[0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [1.0, 0.0]]) + scorer = autosklearn.metrics.accuracy + + raised = False + try: + calculate_score(y_true, y_pred, 6, scorer) + except NotImplementedError: + raised = True + assert raised + + def test_classification_scoring_functions(self): + + scoring_functions = list(autosklearn.metrics.CLASSIFICATION_METRICS.values()) + scoring_functions.remove(autosklearn.metrics.accuracy) + fail_metrics = ['precision_samples', 'recall_samples', 'f1_samples'] + success_metrics = list(autosklearn.metrics.CLASSIFICATION_METRICS.keys()) + for metric in fail_metrics: + success_metrics.remove(metric) + + y_true = np.array([1.0, 1.0, 1.0, 0.0, 0.0, 0.0]) + y_pred = \ + np.array([[0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [1.0, 0.0]]) + score_dict = calculate_score(y_true, y_pred, BINARY_CLASSIFICATION, + autosklearn.metrics.accuracy, + scoring_functions) + + self.assertIsInstance(score_dict, dict) + self.assertTrue(len(success_metrics), len(score_dict)) + for metric in fail_metrics: + self.assertNotIn(metric, score_dict.keys()) + for metric in success_metrics: + self.assertIn(metric, score_dict.keys()) + self.assertAlmostEqual(autosklearn.metrics.CLASSIFICATION_METRICS[metric]._optimum, + score_dict[metric]) + + def test_regression_scoring_functions(self): + + scoring_functions = list(autosklearn.metrics.REGRESSION_METRICS.values()) + scoring_functions.remove(autosklearn.metrics.root_mean_squared_error) + + metrics = list(autosklearn.metrics.REGRESSION_METRICS.keys()) + metrics.remove('mean_squared_log_error') + + y_true = np.array([1, 2, 3, -4]) + y_pred = y_true.copy() + + score_dict = calculate_score(y_true, y_pred, REGRESSION, + autosklearn.metrics.root_mean_squared_error, + scoring_functions) + + self.assertIsInstance(score_dict, dict) + self.assertTrue(len(metrics), len(score_dict)) + for metric in metrics: + self.assertIn(metric, score_dict.keys()) + self.assertAlmostEqual(autosklearn.metrics.REGRESSION_METRICS[metric]._optimum, + score_dict[metric]) + + def test_classification_only_metric(self): + y_true = np.array([1.0, 1.0, 1.0, 0.0, 0.0, 0.0]) + y_pred = \ + np.array([[0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [1.0, 0.0]]) + scorer = autosklearn.metrics.accuracy + + score = calculate_score(y_true, y_pred, BINARY_CLASSIFICATION, scorer) + + previous_score = scorer._optimum + self.assertAlmostEqual(score, previous_score) + + def test_regression_only_metric(self): + y_true = np.array([1, 2, 3, 4]) + y_pred = y_true.copy() + + scorer = autosklearn.metrics.root_mean_squared_error + + score = calculate_score(y_true, y_pred, REGRESSION, scorer) + previous_score = scorer._optimum + self.assertAlmostEqual(score, previous_score) \ No newline at end of file From 9f955a3082c69c8de4cab1aa1229d4bdb3bc2d38 Mon Sep 17 00:00:00 2001 From: ROHIT AGARWAL Date: Wed, 2 Dec 2020 23:43:17 +0530 Subject: [PATCH 12/13] Update test/test_metric/test_metrics.py Co-authored-by: Matthias Feurer --- test/test_metric/test_metrics.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test_metric/test_metrics.py b/test/test_metric/test_metrics.py index cbb7b16e17..14c4dcfed1 100644 --- a/test/test_metric/test_metrics.py +++ 
b/test/test_metric/test_metrics.py @@ -461,7 +461,7 @@ def test_unsupported_task_type(self): calculate_score(y_true, y_pred, 6, scorer) except NotImplementedError: raised = True - assert raised + self.assertTrue(raised) def test_classification_scoring_functions(self): @@ -529,4 +529,4 @@ def test_regression_only_metric(self): score = calculate_score(y_true, y_pred, REGRESSION, scorer) previous_score = scorer._optimum - self.assertAlmostEqual(score, previous_score) \ No newline at end of file + self.assertAlmostEqual(score, previous_score) From 9023c04ed43f961b866476e60af6132b06393904 Mon Sep 17 00:00:00 2001 From: Rohit Agarwal Date: Wed, 2 Dec 2020 23:45:56 +0530 Subject: [PATCH 13/13] Fixing lint --- test/test_metric/test_metrics.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/test/test_metric/test_metrics.py b/test/test_metric/test_metrics.py index cbb7b16e17..49f6fa00fb 100644 --- a/test/test_metric/test_metrics.py +++ b/test/test_metric/test_metrics.py @@ -524,9 +524,8 @@ def test_classification_only_metric(self): def test_regression_only_metric(self): y_true = np.array([1, 2, 3, 4]) y_pred = y_true.copy() - scorer = autosklearn.metrics.root_mean_squared_error score = calculate_score(y_true, y_pred, REGRESSION, scorer) previous_score = scorer._optimum - self.assertAlmostEqual(score, previous_score) \ No newline at end of file + self.assertAlmostEqual(score, previous_score)
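
Editor's note: the snippet below is a minimal usage sketch of the scoring_functions
option introduced by this patch series; it is not part of the patches themselves.
The parameter name, the metric objects, and the 'metric_<name>' keys in cv_results_
follow the code and tests above (see automl.py and test_estimators.py); the dataset,
the time limits, and the choice of precision/roc_auc as extra scorers are illustrative
assumptions only.

    # Hypothetical usage sketch, assuming the patch series above is applied.
    import sklearn.datasets
    import sklearn.model_selection

    import autosklearn.classification
    import autosklearn.metrics

    X, y = sklearn.datasets.load_breast_cancer(return_X_y=True)
    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
        X, y, random_state=1)

    # Pass additional scorers alongside the optimization metric; each scorer is
    # computed for every pipeline evaluated during the search.
    automl = autosklearn.classification.AutoSklearnClassifier(
        time_left_for_this_task=60,      # illustrative budget
        per_run_time_limit=15,           # illustrative budget
        metric=autosklearn.metrics.accuracy,
        scoring_functions=[autosklearn.metrics.precision,
                           autosklearn.metrics.roc_auc],
    )
    automl.fit(X_train, y_train)

    # The extra metrics are exposed in cv_results_ as masked arrays keyed
    # 'metric_<name>'; entries are masked for runs that did not report the metric.
    cv_results = automl.cv_results_
    print(cv_results['mean_test_score'])
    print(cv_results['metric_precision'])
    print(cv_results['metric_roc_auc'])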