diff --git a/autosklearn/evaluation/abstract_evaluator.py b/autosklearn/evaluation/abstract_evaluator.py
index 2114d72ff3..6496950a11 100644
--- a/autosklearn/evaluation/abstract_evaluator.py
+++ b/autosklearn/evaluation/abstract_evaluator.py
@@ -213,9 +213,9 @@ def _loss(self, y_true, y_hat, all_scoring_functions=None):
             all_scoring_functions=all_scoring_functions)

         if hasattr(score, '__len__'):
-            err = {key: 1 - score[key] for key in score}
+            err = {key: self.metric._optimum - score[key] for key in score}
         else:
-            err = 1 - score
+            err = self.metric._optimum - score

         return err

diff --git a/autosklearn/metrics/__init__.py b/autosklearn/metrics/__init__.py
index 83f324796c..d72aec9e07 100644
--- a/autosklearn/metrics/__init__.py
+++ b/autosklearn/metrics/__init__.py
@@ -11,10 +11,11 @@


 class Scorer(object, metaclass=ABCMeta):
-    def __init__(self, name, score_func, sign, kwargs):
+    def __init__(self, name, score_func, optimum, sign, kwargs):
         self.name = name
         self._kwargs = kwargs
         self._score_func = score_func
+        self._optimum = optimum
         self._sign = sign

     @abstractmethod
@@ -133,8 +134,8 @@ def __call__(self, y_true, y_pred, sample_weight=None):
         return self._sign * self._score_func(y_true, y_pred, **self._kwargs)


-def make_scorer(name, score_func, greater_is_better=True, needs_proba=False,
-                needs_threshold=False, **kwargs):
+def make_scorer(name, score_func, optimum=1, greater_is_better=True,
+                needs_proba=False, needs_threshold=False, **kwargs):
     """Make a scorer from a performance metric or loss function.

     Factory inspired by scikit-learn which wraps scikit-learn scoring functions
@@ -146,6 +147,10 @@ def make_scorer(name, score_func, greater_is_better=True, needs_proba=False,
         Score function (or loss function) with signature
         ``score_func(y, y_pred, **kwargs)``.

+    optimum : int or float, default=1
+        The best score achievable by the score function, i.e. the maximum in
+        case of a score function and the minimum in case of a loss function.
+
     greater_is_better : boolean, default=True
         Whether score_func is a score function (default), meaning high is good,
         or a loss function, meaning low is good. In the latter case, the
@@ -174,41 +179,56 @@ def make_scorer(name, score_func, greater_is_better=True, needs_proba=False,
         cls = _ThresholdScorer
     else:
         cls = _PredictScorer
-    return cls(name, score_func, sign, kwargs)
+    return cls(name, score_func, optimum, sign, kwargs)


 # Standard regression scores
-r2 = make_scorer('r2', sklearn.metrics.r2_score)
+r2 = make_scorer('r2',
+                 sklearn.metrics.r2_score)
 mean_squared_error = make_scorer('mean_squared_error',
                                  sklearn.metrics.mean_squared_error,
+                                 optimum=0,
                                  greater_is_better=False)
 mean_absolute_error = make_scorer('mean_absolute_error',
                                   sklearn.metrics.mean_absolute_error,
+                                  optimum=0,
                                   greater_is_better=False)
 median_absolute_error = make_scorer('median_absolute_error',
                                     sklearn.metrics.median_absolute_error,
+                                    optimum=0,
                                     greater_is_better=False)

 # Standard Classification Scores
-accuracy = make_scorer('accuracy', sklearn.metrics.accuracy_score)
+accuracy = make_scorer('accuracy',
+                       sklearn.metrics.accuracy_score)
 balanced_accuracy = make_scorer('balanced_accuracy',
                                 classification_metrics.balanced_accuracy)
-f1 = make_scorer('f1', sklearn.metrics.f1_score)
+f1 = make_scorer('f1',
+                 sklearn.metrics.f1_score)

 # Score functions that need decision values
-roc_auc = make_scorer('roc_auc', sklearn.metrics.roc_auc_score,
-                      greater_is_better=True, needs_threshold=True)
+roc_auc = make_scorer('roc_auc',
+                      sklearn.metrics.roc_auc_score,
+                      greater_is_better=True,
+                      needs_threshold=True)
 average_precision = make_scorer('average_precision',
                                 sklearn.metrics.average_precision_score,
                                 needs_threshold=True)
-precision = make_scorer('precision', sklearn.metrics.precision_score)
-recall = make_scorer('recall', sklearn.metrics.recall_score)
+precision = make_scorer('precision',
+                        sklearn.metrics.precision_score)
+recall = make_scorer('recall',
+                     sklearn.metrics.recall_score)

 # Score function for probabilistic classification
-log_loss = make_scorer('log_loss', sklearn.metrics.log_loss,
-                       greater_is_better=False, needs_proba=True)
-pac_score = make_scorer('pac_score', classification_metrics.pac_score,
-                        greater_is_better=True, needs_proba=True)
+log_loss = make_scorer('log_loss',
+                       sklearn.metrics.log_loss,
+                       optimum=0,
+                       greater_is_better=False,
+                       needs_proba=True)
+pac_score = make_scorer('pac_score',
+                        classification_metrics.pac_score,
+                        greater_is_better=True,
+                        needs_proba=True)
 # TODO what about mathews correlation coefficient etc?

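Reviewer note on the semantics: a scorer built by `make_scorer` always returns `sign * score_func(...)`, and `_optimum` records the best value that call can return, so `scorer._optimum - scorer(y_true, y_pred)` is a non-negative loss for both score-like and loss-like metrics. That is exactly what the `_loss` change in `abstract_evaluator.py` now computes; the old `1 - score` was only correct for metrics whose optimum is 1. A minimal sketch, not part of the patch, assuming `autosklearn.metrics` is importable with the changes above:

```python
import numpy as np
import autosklearn.metrics

y_true = np.array([0, 0, 1, 1])
y_pred = np.array([[0.9, 0.1], [0.8, 0.2], [0.3, 0.7], [0.4, 0.6]])

# accuracy: optimum=1, sign=+1; log_loss: optimum=0, sign=-1
for scorer in (autosklearn.metrics.accuracy, autosklearn.metrics.log_loss):
    score = scorer(y_true, y_pred)    # sign * score_func(y_true, y_pred)
    loss = scorer._optimum - score    # what _loss() now returns
    print(scorer.name, score, loss)   # loss >= 0 in both cases
```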
diff --git a/example/example_metrics.py b/example/example_metrics.py
index 603b82f946..dd306e6c63 100644
--- a/example/example_metrics.py
+++ b/example/example_metrics.py
@@ -55,6 +55,7 @@ def main():
     accuracy_scorer = autosklearn.metrics.make_scorer(
         name="accu",
         score_func=accuracy,
+        optimum=1,
         greater_is_better=True,
         needs_proba=False,
         needs_threshold=False,
@@ -77,6 +78,7 @@ def main():
     accuracy_scorer = autosklearn.metrics.make_scorer(
         name="accu_add",
         score_func=accuracy_wk,
+        optimum=1,
         greater_is_better=True,
         needs_proba=False,
         needs_threshold=False,
diff --git a/test/test_metric/test_metrics.py b/test/test_metric/test_metrics.py
index 1038e0ab29..4b7da5d438 100644
--- a/test/test_metric/test_metrics.py
+++ b/test/test_metric/test_metrics.py
@@ -13,7 +13,7 @@ def test_predict_scorer_binary(self):
         y_pred = np.array([[1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0]])

         scorer = autosklearn.metrics._PredictScorer(
-            'accuracy', sklearn.metrics.accuracy_score, 1, {})
+            'accuracy', sklearn.metrics.accuracy_score, 1, 1, {})

         score = scorer(y_true, y_pred)
         self.assertAlmostEqual(score, 1.0)
@@ -28,13 +28,13 @@ def test_predict_scorer_binary(self):

         scorer = autosklearn.metrics._PredictScorer(
             'bac', autosklearn.metrics.classification_metrics.balanced_accuracy,
-            1, {})
+            1, 1, {})

         score = scorer(y_true, y_pred)
         self.assertAlmostEqual(score, 0.5)

         scorer = autosklearn.metrics._PredictScorer(
-            'accuracy', sklearn.metrics.accuracy_score, -1, {})
+            'accuracy', sklearn.metrics.accuracy_score, 1, -1, {})

         y_pred = np.array([[1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0]])
         score = scorer(y_true, y_pred)
@@ -45,7 +45,7 @@ def test_predict_scorer_multiclass(self):
         y_pred = np.array([[1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 1.0]])

         scorer = autosklearn.metrics._PredictScorer(
-            'accuracy', sklearn.metrics.accuracy_score, 1, {})
+            'accuracy', sklearn.metrics.accuracy_score, 1, 1, {})

         score = scorer(y_true, y_pred)
         self.assertAlmostEqual(score, 1.0)
@@ -60,13 +60,13 @@ def test_predict_scorer_multiclass(self):

         scorer = autosklearn.metrics._PredictScorer(
             'bac', autosklearn.metrics.classification_metrics.balanced_accuracy,
-            1, {})
+            1, 1, {})

         score = scorer(y_true, y_pred)
         self.assertAlmostEqual(score, 0.333333333)

         scorer = autosklearn.metrics._PredictScorer(
-            'accuracy', sklearn.metrics.accuracy_score, -1, {})
+            'accuracy', sklearn.metrics.accuracy_score, 1, -1, {})

         y_pred = np.array([[1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 1.0]])
         score = scorer(y_true, y_pred)
@@ -77,7 +77,7 @@ def test_predict_scorer_multilabel(self):
         y_pred = np.array([[0.0, 0.0], [0.0, 1.0], [1.0, 0.0], [1.0, 1.0]])

         scorer = autosklearn.metrics._PredictScorer(
-            'accuracy', sklearn.metrics.accuracy_score, 1, {})
+            'accuracy', sklearn.metrics.accuracy_score, 1, 1, {})

         score = scorer(y_true, y_pred)
         self.assertAlmostEqual(score, 1.0)
@@ -92,13 +92,13 @@ def test_predict_scorer_multilabel(self):

         scorer = autosklearn.metrics._PredictScorer(
             'bac', autosklearn.metrics.classification_metrics.balanced_accuracy,
-            1, {})
+            1, 1, {})

         score = scorer(y_true, y_pred)
         self.assertAlmostEqual(score, 0.5)

         scorer = autosklearn.metrics._PredictScorer(
-            'accuracy', sklearn.metrics.accuracy_score, -1, {})
+            'accuracy', sklearn.metrics.accuracy_score, 1, -1, {})

         y_pred = np.array([[0.0, 0.0], [0.0, 1.0], [1.0, 0.0], [1.0, 1.0]])
         score = scorer(y_true, y_pred)
@@ -109,7 +109,7 @@ def test_predict_scorer_regression(self):
         y_pred = y_true.copy()

         scorer = autosklearn.metrics._PredictScorer(
-            'r2', sklearn.metrics.r2_score, 1, {})
+            'r2', sklearn.metrics.r2_score, 1, 1, {})

         score = scorer(y_true, y_pred)
         self.assertAlmostEqual(score, 1.0)
@@ -123,7 +123,7 @@ def test_proba_scorer_binary(self):
         y_pred = [[1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0]]

         scorer = autosklearn.metrics._ProbaScorer(
-            'accuracy', sklearn.metrics.log_loss, 1, {})
+            'accuracy', sklearn.metrics.log_loss, 0, 1, {})

         score = scorer(y_true, y_pred)
         self.assertAlmostEqual(score, 0.0)
@@ -137,7 +137,7 @@ def test_proba_scorer_binary(self):
         self.assertAlmostEqual(score, 0.69314718055994529)

         scorer = autosklearn.metrics._ProbaScorer(
-            'accuracy', sklearn.metrics.log_loss, -1, {})
+            'accuracy', sklearn.metrics.log_loss, 0, -1, {})

         y_pred = [[1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0]]
         score = scorer(y_true, y_pred)
@@ -148,7 +148,7 @@ def test_proba_scorer_multiclass(self):
         y_pred = [[1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 1.0]]

         scorer = autosklearn.metrics._ProbaScorer(
-            'accuracy', sklearn.metrics.log_loss, 1, {})
+            'accuracy', sklearn.metrics.log_loss, 0, 1, {})

         score = scorer(y_true, y_pred)
         self.assertAlmostEqual(score, 0.0)
@@ -162,7 +162,7 @@ def test_proba_scorer_multiclass(self):
         self.assertAlmostEqual(score, 1.0986122886681096)

         scorer = autosklearn.metrics._ProbaScorer(
-            'accuracy', sklearn.metrics.log_loss, -1, {})
+            'accuracy', sklearn.metrics.log_loss, 0, -1, {})

         y_pred = [[1.0, 1.0, 1.0], [1.0, 1.0, 1.0], [1.0, 1.0, 1.0]]
         score = scorer(y_true, y_pred)
@@ -173,7 +173,7 @@ def test_proba_scorer_multilabel(self):
         y_pred = np.array([[0.0, 0.0], [0.0, 1.0], [1.0, 0.0], [1.0, 1.0]])

         scorer = autosklearn.metrics._ProbaScorer(
-            'accuracy', sklearn.metrics.log_loss, 1, {})
+            'accuracy', sklearn.metrics.log_loss, 0, 1, {})

         score = scorer(y_true, y_pred)
         self.assertAlmostEqual(score, 0.34657359027997314)
@@ -187,7 +187,7 @@ def test_proba_scorer_multilabel(self):
         self.assertAlmostEqual(score, 0.69314718055994529)

         scorer = autosklearn.metrics._ProbaScorer(
-            'accuracy', sklearn.metrics.log_loss, -1, {})
+            'accuracy', sklearn.metrics.log_loss, 0, -1, {})

         y_pred = np.array([[0.0, 0.0], [0.0, 1.0], [1.0, 0.0], [1.0, 1.0]])
         score = scorer(y_true, y_pred)
@@ -198,7 +198,7 @@ def test_threshold_scorer_binary(self):
         y_pred = np.array([[1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0]])

         scorer = autosklearn.metrics._ThresholdScorer(
-            'accuracy', sklearn.metrics.roc_auc_score, 1, {})
+            'accuracy', sklearn.metrics.roc_auc_score, 1, 1, {})

         score = scorer(y_true, y_pred)
         self.assertAlmostEqual(score, 1.0)
@@ -212,7 +212,7 @@ def test_threshold_scorer_binary(self):
         self.assertAlmostEqual(score, 0.5)

         scorer = autosklearn.metrics._ThresholdScorer(
-            'accuracy', sklearn.metrics.roc_auc_score, -1, {})
+            'accuracy', sklearn.metrics.roc_auc_score, 1, -1, {})

         y_pred = np.array([[1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0]])
         score = scorer(y_true, y_pred)
@@ -223,7 +223,7 @@ def test_threshold_scorer_multilabel(self):
         y_pred = np.array([[0.0, 0.0], [0.0, 1.0], [1.0, 0.0], [1.0, 1.0]])

         scorer = autosklearn.metrics._ThresholdScorer(
-            'accuracy', sklearn.metrics.roc_auc_score, 1, {})
+            'accuracy', sklearn.metrics.roc_auc_score, 1, 1, {})

         score = scorer(y_true, y_pred)
         self.assertAlmostEqual(score, 1.0)
@@ -237,7 +237,7 @@ def test_threshold_scorer_multilabel(self):
         self.assertAlmostEqual(score, 0.5)

         scorer = autosklearn.metrics._ThresholdScorer(
-            'accuracy', sklearn.metrics.roc_auc_score, -1, {})
+            'accuracy', sklearn.metrics.roc_auc_score, 1, -1, {})

         y_pred = np.array([[0.0, 0.0], [0.0, 1.0], [1.0, 0.0], [1.0, 1.0]])
         score = scorer(y_true, y_pred)
@@ -308,3 +308,128 @@ def test_classification_metrics(self):
                     pass
                 else:
                     raise e
+
+
+class TestMetric(unittest.TestCase):
+
+    def test_regression_all(self):
+
+        for metric, scorer in autosklearn.metrics.REGRESSION_METRICS.items():
+            y_true = np.array([1, 2, 3, 4])
+            y_pred = y_true.copy()
+            previous_score = scorer._optimum
+            current_score = scorer(y_true, y_pred)
+            self.assertAlmostEqual(current_score, previous_score)
+
+            y_pred = np.array([3, 4, 5, 6])
+            current_score = scorer(y_true, y_pred)
+            self.assertLess(current_score, previous_score)
+
+            y_pred = np.array([-1, 0, -1, 0])
+            previous_score = current_score
+            current_score = scorer(y_true, y_pred)
+            self.assertLess(current_score, previous_score)
+
+            y_pred = np.array([-5, 10, 7, -3])
+            previous_score = current_score
+            current_score = scorer(y_true, y_pred)
+            self.assertLess(current_score, previous_score)
+
+
+    def test_classification_binary(self):
+
+        for metric, scorer in autosklearn.metrics.CLASSIFICATION_METRICS.items():
+            # Skip functions not applicable for binary classification.
+            # TODO: Average precision should work for binary classification,
+            # TODO: but its behavior is not right. When y_pred is completely
+            # TODO: wrong, it does return 0.5, but when it is not completely
+            # TODO: wrong, it returns a value smaller than 0.5.
+            if metric in ['average_precision', 'pac_score',
+                          'precision_samples', 'recall_samples', 'f1_samples']:
+                continue
+
+            y_true = np.array([1.0, 1.0, 1.0, 0.0, 0.0, 0.0])
+            y_pred = np.array([[0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0],
+                               [1.0, 0.0], [1.0, 0.0]])
+            previous_score = scorer._optimum
+            current_score = scorer(y_true, y_pred)
+            self.assertAlmostEqual(current_score, previous_score)
+
+            y_pred = np.array([[0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [1.0, 0.0],
+                               [0.0, 1.0], [1.0, 0.0]])
+            previous_score = current_score
+            current_score = scorer(y_true, y_pred)
+            self.assertLess(current_score, previous_score)
+
+            y_pred = np.array([[0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0],
+                               [0.0, 1.0], [0.0, 1.0]])
+            previous_score = current_score
+            current_score = scorer(y_true, y_pred)
+            self.assertLess(current_score, previous_score)
+
+            y_pred = np.array([[1.0, 0.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0],
+                               [0.0, 1.0], [0.0, 1.0]])
+            previous_score = current_score
+            current_score = scorer(y_true, y_pred)
+            self.assertLess(current_score, previous_score)
+
+    def test_classification_multiclass(self):
+
+        for metric, scorer in autosklearn.metrics.CLASSIFICATION_METRICS.items():
+            # Skip functions not applicable for multiclass classification.
+            if metric in ['pac_score', 'roc_auc', 'average_precision',
+                          'precision', 'recall', 'f1', 'precision_samples',
+                          'recall_samples', 'f1_samples']:
+                continue
+            y_true = np.array([0.0, 0.0, 1.0, 1.0, 2.0])
+            y_pred = np.array([[1.0, 0.0, 0.0], [1.0, 0.0, 0.0],
+                               [0.0, 1.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 1.0]])
+            previous_score = scorer._optimum
+            current_score = scorer(y_true, y_pred)
+            self.assertAlmostEqual(current_score, previous_score)
+
+            y_pred = np.array([[1.0, 0.0, 0.0], [1.0, 0.0, 0.0],
+                               [1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 1.0]])
+            previous_score = current_score
+            current_score = scorer(y_true, y_pred)
+            self.assertLess(current_score, previous_score)
+
+            y_pred = np.array([[0.0, 0.0, 1.0], [0.0, 1.0, 0.0],
+                               [1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 1.0, 0.0]])
+            previous_score = current_score
+            current_score = scorer(y_true, y_pred)
+            self.assertLess(current_score, previous_score)
+
+            y_pred = np.array([[0.0, 0.0, 1.0], [0.0, 0.0, 1.0],
+                               [1.0, 0.0, 0.0], [1.0, 0.0, 0.0], [0.0, 1.0, 0.0]])
+            previous_score = current_score
+            current_score = scorer(y_true, y_pred)
+            self.assertLess(current_score, previous_score)
+
+    def test_classification_multilabel(self):
+
+        for metric, scorer in autosklearn.metrics.CLASSIFICATION_METRICS.items():
+            # Skip functions not applicable for multi-label classification.
+            if metric in ['roc_auc', 'log_loss',
+                          'pac_score', 'precision', 'recall', 'f1']:
+                continue
+            y_true = np.array([[1, 0, 0], [1, 1, 0], [0, 1, 1], [1, 1, 1]])
+            y_pred = y_true.copy()
+            previous_score = scorer._optimum
+            current_score = scorer(y_true, y_pred)
+            self.assertAlmostEqual(current_score, previous_score)
+
+            y_pred = np.array([[1, 0, 0], [0, 0, 1], [0, 1, 1], [1, 1, 1]])
+            previous_score = current_score
+            current_score = scorer(y_true, y_pred)
+            self.assertLess(current_score, previous_score)
+
+            y_pred = np.array([[1, 0, 0], [0, 0, 1], [1, 0, 1], [1, 1, 0]])
+            previous_score = current_score
+            current_score = scorer(y_true, y_pred)
+            self.assertLess(current_score, previous_score)
+
+            y_pred = np.array([[0, 1, 1], [0, 0, 1], [1, 0, 0], [0, 0, 0]])
+            previous_score = current_score
+            current_score = scorer(y_true, y_pred)
+            self.assertLess(current_score, previous_score)
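Usage note, not part of the patch: user-defined metrics passed to `make_scorer` should now also supply `optimum`, so that `optimum - score` is zero for perfect predictions, which is the invariant the new `TestMetric` cases assert above. A sketch with a hypothetical loss-style metric (`error_rate` and `error_scorer` are illustrative names, not part of auto-sklearn):

```python
import numpy as np
import sklearn.metrics
import autosklearn.metrics


def error_rate(y_true, y_pred):
    # Hypothetical loss-style metric: fraction of misclassified samples.
    return 1 - sklearn.metrics.accuracy_score(y_true, y_pred)


error_scorer = autosklearn.metrics.make_scorer(
    name='error_rate',
    score_func=error_rate,
    optimum=0,                # best achievable value of error_rate
    greater_is_better=False,  # lower is better, so the scorer sign-flips it
    needs_proba=False,
    needs_threshold=False,
)

y_true = np.array([0, 1, 1, 0])
y_pred = np.array([[1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0]])
# Perfect predictions give zero loss under the new _loss() formula.
assert error_scorer._optimum - error_scorer(y_true, y_pred) == 0
```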