From 12c51645c524d8571d526a83762ddda7cd5cef9e Mon Sep 17 00:00:00 2001 From: Jinu Date: Tue, 17 Apr 2018 14:44:47 +0200 Subject: [PATCH 1/7] Added Unittest for metric functions (new branch). --- test/test_metric/test_metrics.py | 102 +++++++++++++++++++++++++++++++ 1 file changed, 102 insertions(+) diff --git a/test/test_metric/test_metrics.py b/test/test_metric/test_metrics.py index 1038e0ab29..f32cf28950 100644 --- a/test/test_metric/test_metrics.py +++ b/test/test_metric/test_metrics.py @@ -308,3 +308,105 @@ def test_classification_metrics(self): pass else: raise e + + +class TestMetric(unittest.TestCase): + + def test_regression_all(self): + + for metric, scorer in autosklearn.metrics.REGRESSION_METRICS.items(): + y_true = np.array([1, 2, 3, 4]) + y_pred = y_true.copy() + + # the best possible score of r2 loss is 1. + if metric == 'r2': + previous_score = 1 + else: + previous_score = 0 + current_score = scorer(y_true, y_pred) + self.assertAlmostEqual(current_score, previous_score) + + y_pred = np.array([3, 4, 5, 6]) + current_score = scorer(y_true, y_pred) + self.assertLess(current_score, previous_score) + + y_pred = np.array([-1, 0, -1, 0]) + previous_score = current_score + current_score = scorer(y_true, y_pred) + self.assertLess(current_score, previous_score) + + y_pred = np.array([-5, 10, 7, -3]) + previous_score = current_score + current_score = scorer(y_true, y_pred) + self.assertLess(current_score, previous_score) + + + def test_classification_binary(self): + + for metric, scorer in autosklearn.metrics.CLASSIFICATION_METRICS.items(): + # Skip functions not applicable for binary classification. + if metric in ['pac_score', 'precision_samples', 'recall_samples', 'f1_samples']: + continue + + y_true = np.array([1.0, 1.0, 1.0, 0.0, 0.0]) + y_pred = np.array([[0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0]]) + if metric is 'log_loss': + previous_score = 0 # the best value for log loss is 0. + else: + previous_score = 1 # the best value for other losses is 1. + current_score = scorer(y_true, y_pred) + self.assertAlmostEqual(current_score, previous_score) + + if metric is 'recall': + y_pred = np.array([[0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [1.0, 0.0]]) + else: + y_pred = np.array([[0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0]]) + previous_score = current_score + current_score = scorer(y_true, y_pred) + self.assertLess(current_score, previous_score) + + y_pred = np.array([[0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0]]) + previous_score = current_score + current_score = scorer(y_true, y_pred) + self.assertLess(current_score, previous_score) + + def test_classification_multiclass(self): + + for metric, scorer in autosklearn.metrics.CLASSIFICATION_METRICS.items(): + # Skip functions not applicable for multiclass classification. + if metric in ['pac_score', 'roc_auc', 'average_precision', 'precision', 'recall', 'f1', + 'precision_samples', 'recall_samples', 'f1_samples']: + continue + y_true = np.array([0.0, 0.0, 1.0, 1.0, 2.0]) + y_pred = np.array([[1.0, 0.0, 0.0], [1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 1.0]]) + if metric is 'log_loss': # the best possible score for log_loss is 0. + previous_score = 0 + else: + previous_score = 1 # the best value for other losses is 1, and we flip the sign to minimize. 
+ current_score = scorer(y_true, y_pred) + self.assertAlmostEqual(current_score, previous_score) + + y_pred = np.array([[1.0, 0.0, 0.0], [1.0, 0.0, 0.0], [1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 1.0]]) + previous_score = current_score + current_score = scorer(y_true, y_pred) + self.assertLess(current_score, previous_score) + + y_pred = np.array([[0.0, 0.0, 1.0], [0.0, 1.0, 0.0], [1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 1.0, 0.0]]) + previous_score = current_score + current_score = scorer(y_true, y_pred) + self.assertLess(current_score, previous_score) + + + def test_classification_multilabel(self): + + for metric, scorer in autosklearn.metrics.CLASSIFICATION_METRICS.items(): + # Skip functions not applicable for multi-label classification. + if metric in ['accuracy', 'balanced_accuracy', 'roc_auc', 'average_precision', 'log_loss', + 'pac_score', 'precision', 'recall', 'f1']: + continue + y_true = np.array([[1, 0, 0], [1, 1, 0], [0, 1, 1], [1, 1, 1]]) + y_pred = y_true.copy() + previous_score = 1 + current_score = scorer(y_true, y_pred) + self.assertAlmostEqual(current_score, previous_score) + From 6f71d9b79add7c07b387a3a1abe2ee7c034c6108 Mon Sep 17 00:00:00 2001 From: Jinu Date: Tue, 17 Apr 2018 17:05:37 +0200 Subject: [PATCH 2/7] Create and modify unit tests for metric functions. --- autosklearn/evaluation/abstract_evaluator.py | 4 +- autosklearn/metrics/__init__.py | 18 +++-- test/test_metric/test_metrics.py | 79 +++++++++++++------- 3 files changed, 63 insertions(+), 38 deletions(-) diff --git a/autosklearn/evaluation/abstract_evaluator.py b/autosklearn/evaluation/abstract_evaluator.py index 2114d72ff3..6496950a11 100644 --- a/autosklearn/evaluation/abstract_evaluator.py +++ b/autosklearn/evaluation/abstract_evaluator.py @@ -213,9 +213,9 @@ def _loss(self, y_true, y_hat, all_scoring_functions=None): all_scoring_functions=all_scoring_functions) if hasattr(score, '__len__'): - err = {key: 1 - score[key] for key in score} + err = {key: self.metric._optimum - score[key] for key in score} else: - err = 1 - score + err = self.metric._optimum - score return err diff --git a/autosklearn/metrics/__init__.py b/autosklearn/metrics/__init__.py index 83f324796c..a017eb95d5 100644 --- a/autosklearn/metrics/__init__.py +++ b/autosklearn/metrics/__init__.py @@ -11,10 +11,11 @@ class Scorer(object, metaclass=ABCMeta): - def __init__(self, name, score_func, sign, kwargs): + def __init__(self, name, score_func, optimum, sign, kwargs): self.name = name self._kwargs = kwargs self._score_func = score_func + self._optimum = optimum self._sign = sign @abstractmethod @@ -133,7 +134,7 @@ def __call__(self, y_true, y_pred, sample_weight=None): return self._sign * self._score_func(y_true, y_pred, **self._kwargs) -def make_scorer(name, score_func, greater_is_better=True, needs_proba=False, +def make_scorer(name, score_func, optimum=1, greater_is_better=True, needs_proba=False, needs_threshold=False, **kwargs): """Make a scorer from a performance metric or loss function. @@ -146,6 +147,9 @@ def make_scorer(name, score_func, greater_is_better=True, needs_proba=False, Score function (or loss function) with signature ``score_func(y, y_pred, **kwargs)``. + optimum : int or float, default=1 + The best value achievable by the score function. + greater_is_better : boolean, default=True Whether score_func is a score function (default), meaning high is good, or a loss function, meaning low is good. 
In the latter case, the @@ -174,19 +178,19 @@ def make_scorer(name, score_func, greater_is_better=True, needs_proba=False, cls = _ThresholdScorer else: cls = _PredictScorer - return cls(name, score_func, sign, kwargs) + return cls(name, score_func, optimum, sign, kwargs) # Standard regression scores r2 = make_scorer('r2', sklearn.metrics.r2_score) mean_squared_error = make_scorer('mean_squared_error', - sklearn.metrics.mean_squared_error, + sklearn.metrics.mean_squared_error, optimum=0, greater_is_better=False) mean_absolute_error = make_scorer('mean_absolute_error', - sklearn.metrics.mean_absolute_error, + sklearn.metrics.mean_absolute_error, optimum=0, greater_is_better=False) median_absolute_error = make_scorer('median_absolute_error', - sklearn.metrics.median_absolute_error, + sklearn.metrics.median_absolute_error, optimum=0, greater_is_better=False) # Standard Classification Scores @@ -205,7 +209,7 @@ def make_scorer(name, score_func, greater_is_better=True, needs_proba=False, recall = make_scorer('recall', sklearn.metrics.recall_score) # Score function for probabilistic classification -log_loss = make_scorer('log_loss', sklearn.metrics.log_loss, +log_loss = make_scorer('log_loss', sklearn.metrics.log_loss, optimum=0, greater_is_better=False, needs_proba=True) pac_score = make_scorer('pac_score', classification_metrics.pac_score, greater_is_better=True, needs_proba=True) diff --git a/test/test_metric/test_metrics.py b/test/test_metric/test_metrics.py index f32cf28950..7c834902ff 100644 --- a/test/test_metric/test_metrics.py +++ b/test/test_metric/test_metrics.py @@ -13,7 +13,7 @@ def test_predict_scorer_binary(self): y_pred = np.array([[1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0]]) scorer = autosklearn.metrics._PredictScorer( - 'accuracy', sklearn.metrics.accuracy_score, 1, {}) + 'accuracy', sklearn.metrics.accuracy_score, 1, 1, {}) score = scorer(y_true, y_pred) self.assertAlmostEqual(score, 1.0) @@ -28,13 +28,13 @@ def test_predict_scorer_binary(self): scorer = autosklearn.metrics._PredictScorer( 'bac', autosklearn.metrics.classification_metrics.balanced_accuracy, - 1, {}) + 1, 1, {}) score = scorer(y_true, y_pred) self.assertAlmostEqual(score, 0.5) scorer = autosklearn.metrics._PredictScorer( - 'accuracy', sklearn.metrics.accuracy_score, -1, {}) + 'accuracy', sklearn.metrics.accuracy_score, 1, -1, {}) y_pred = np.array([[1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0]]) score = scorer(y_true, y_pred) @@ -45,7 +45,7 @@ def test_predict_scorer_multiclass(self): y_pred = np.array([[1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 1.0]]) scorer = autosklearn.metrics._PredictScorer( - 'accuracy', sklearn.metrics.accuracy_score, 1, {}) + 'accuracy', sklearn.metrics.accuracy_score, 1, 1, {}) score = scorer(y_true, y_pred) self.assertAlmostEqual(score, 1.0) @@ -60,13 +60,13 @@ def test_predict_scorer_multiclass(self): scorer = autosklearn.metrics._PredictScorer( 'bac', autosklearn.metrics.classification_metrics.balanced_accuracy, - 1, {}) + 1, 1, {}) score = scorer(y_true, y_pred) self.assertAlmostEqual(score, 0.333333333) scorer = autosklearn.metrics._PredictScorer( - 'accuracy', sklearn.metrics.accuracy_score, -1, {}) + 'accuracy', sklearn.metrics.accuracy_score, 1, -1, {}) y_pred = np.array([[1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 1.0]]) score = scorer(y_true, y_pred) @@ -77,7 +77,7 @@ def test_predict_scorer_multilabel(self): y_pred = np.array([[0.0, 0.0], [0.0, 1.0], [1.0, 0.0], [1.0, 1.0]]) scorer = autosklearn.metrics._PredictScorer( - 'accuracy', sklearn.metrics.accuracy_score, 1, 
{}) + 'accuracy', sklearn.metrics.accuracy_score, 1, 1, {}) score = scorer(y_true, y_pred) self.assertAlmostEqual(score, 1.0) @@ -92,13 +92,13 @@ def test_predict_scorer_multilabel(self): scorer = autosklearn.metrics._PredictScorer( 'bac', autosklearn.metrics.classification_metrics.balanced_accuracy, - 1, {}) + 1, 1, {}) score = scorer(y_true, y_pred) self.assertAlmostEqual(score, 0.5) scorer = autosklearn.metrics._PredictScorer( - 'accuracy', sklearn.metrics.accuracy_score, -1, {}) + 'accuracy', sklearn.metrics.accuracy_score, 1, -1, {}) y_pred = np.array([[0.0, 0.0], [0.0, 1.0], [1.0, 0.0], [1.0, 1.0]]) score = scorer(y_true, y_pred) @@ -109,7 +109,7 @@ def test_predict_scorer_regression(self): y_pred = y_true.copy() scorer = autosklearn.metrics._PredictScorer( - 'r2', sklearn.metrics.r2_score, 1, {}) + 'r2', sklearn.metrics.r2_score, 1, 1, {}) score = scorer(y_true, y_pred) self.assertAlmostEqual(score, 1.0) @@ -123,7 +123,7 @@ def test_proba_scorer_binary(self): y_pred = [[1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0]] scorer = autosklearn.metrics._ProbaScorer( - 'accuracy', sklearn.metrics.log_loss, 1, {}) + 'accuracy', sklearn.metrics.log_loss, 0, 1, {}) score = scorer(y_true, y_pred) self.assertAlmostEqual(score, 0.0) @@ -137,7 +137,7 @@ def test_proba_scorer_binary(self): self.assertAlmostEqual(score, 0.69314718055994529) scorer = autosklearn.metrics._ProbaScorer( - 'accuracy', sklearn.metrics.log_loss, -1, {}) + 'accuracy', sklearn.metrics.log_loss, 0, -1, {}) y_pred = [[1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0]] score = scorer(y_true, y_pred) @@ -148,7 +148,7 @@ def test_proba_scorer_multiclass(self): y_pred = [[1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 1.0]] scorer = autosklearn.metrics._ProbaScorer( - 'accuracy', sklearn.metrics.log_loss, 1, {}) + 'accuracy', sklearn.metrics.log_loss, 0, 1, {}) score = scorer(y_true, y_pred) self.assertAlmostEqual(score, 0.0) @@ -162,7 +162,7 @@ def test_proba_scorer_multiclass(self): self.assertAlmostEqual(score, 1.0986122886681096) scorer = autosklearn.metrics._ProbaScorer( - 'accuracy', sklearn.metrics.log_loss, -1, {}) + 'accuracy', sklearn.metrics.log_loss, 0, -1, {}) y_pred = [[1.0, 1.0, 1.0], [1.0, 1.0, 1.0], [1.0, 1.0, 1.0]] score = scorer(y_true, y_pred) @@ -173,7 +173,7 @@ def test_proba_scorer_multilabel(self): y_pred = np.array([[0.0, 0.0], [0.0, 1.0], [1.0, 0.0], [1.0, 1.0]]) scorer = autosklearn.metrics._ProbaScorer( - 'accuracy', sklearn.metrics.log_loss, 1, {}) + 'accuracy', sklearn.metrics.log_loss, 0, 1, {}) score = scorer(y_true, y_pred) self.assertAlmostEqual(score, 0.34657359027997314) @@ -187,7 +187,7 @@ def test_proba_scorer_multilabel(self): self.assertAlmostEqual(score, 0.69314718055994529) scorer = autosklearn.metrics._ProbaScorer( - 'accuracy', sklearn.metrics.log_loss, -1, {}) + 'accuracy', sklearn.metrics.log_loss, 0, -1, {}) y_pred = np.array([[0.0, 0.0], [0.0, 1.0], [1.0, 0.0], [1.0, 1.0]]) score = scorer(y_true, y_pred) @@ -198,7 +198,7 @@ def test_threshold_scorer_binary(self): y_pred = np.array([[1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0]]) scorer = autosklearn.metrics._ThresholdScorer( - 'accuracy', sklearn.metrics.roc_auc_score, 1, {}) + 'accuracy', sklearn.metrics.roc_auc_score, 1, 1, {}) score = scorer(y_true, y_pred) self.assertAlmostEqual(score, 1.0) @@ -212,7 +212,7 @@ def test_threshold_scorer_binary(self): self.assertAlmostEqual(score, 0.5) scorer = autosklearn.metrics._ThresholdScorer( - 'accuracy', sklearn.metrics.roc_auc_score, -1, {}) + 'accuracy', sklearn.metrics.roc_auc_score, 1, -1, {}) 
y_pred = np.array([[1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0]]) score = scorer(y_true, y_pred) @@ -223,7 +223,7 @@ def test_threshold_scorer_multilabel(self): y_pred = np.array([[0.0, 0.0], [0.0, 1.0], [1.0, 0.0], [1.0, 1.0]]) scorer = autosklearn.metrics._ThresholdScorer( - 'accuracy', sklearn.metrics.roc_auc_score, 1, {}) + 'accuracy', sklearn.metrics.roc_auc_score, 1, 1, {}) score = scorer(y_true, y_pred) self.assertAlmostEqual(score, 1.0) @@ -237,7 +237,7 @@ def test_threshold_scorer_multilabel(self): self.assertAlmostEqual(score, 0.5) scorer = autosklearn.metrics._ThresholdScorer( - 'accuracy', sklearn.metrics.roc_auc_score, -1, {}) + 'accuracy', sklearn.metrics.roc_auc_score, 1, -1, {}) y_pred = np.array([[0.0, 0.0], [0.0, 1.0], [1.0, 0.0], [1.0, 1.0]]) score = scorer(y_true, y_pred) @@ -345,11 +345,12 @@ def test_classification_binary(self): for metric, scorer in autosklearn.metrics.CLASSIFICATION_METRICS.items(): # Skip functions not applicable for binary classification. - if metric in ['pac_score', 'precision_samples', 'recall_samples', 'f1_samples']: + # TODO: Average precision should work for binary classification, but its behavior is not right. + if metric in ['average_precision', 'pac_score', 'precision_samples', 'recall_samples', 'f1_samples']: continue - y_true = np.array([1.0, 1.0, 1.0, 0.0, 0.0]) - y_pred = np.array([[0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0]]) + y_true = np.array([1.0, 1.0, 1.0, 0.0, 0.0, 0.0]) + y_pred = np.array([[0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [1.0, 0.0]]) if metric is 'log_loss': previous_score = 0 # the best value for log loss is 0. else: @@ -357,15 +358,17 @@ def test_classification_binary(self): current_score = scorer(y_true, y_pred) self.assertAlmostEqual(current_score, previous_score) - if metric is 'recall': - y_pred = np.array([[0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [1.0, 0.0]]) - else: - y_pred = np.array([[0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0]]) + y_pred = np.array([[0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [1.0, 0.0]]) + previous_score = current_score + current_score = scorer(y_true, y_pred) + self.assertLess(current_score, previous_score) + + y_pred = np.array([[0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0]]) previous_score = current_score current_score = scorer(y_true, y_pred) self.assertLess(current_score, previous_score) - y_pred = np.array([[0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0]]) + y_pred = np.array([[1.0, 0.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0]]) previous_score = current_score current_score = scorer(y_true, y_pred) self.assertLess(current_score, previous_score) @@ -396,12 +399,16 @@ def test_classification_multiclass(self): current_score = scorer(y_true, y_pred) self.assertLess(current_score, previous_score) + y_pred = np.array([[0.0, 0.0, 1.0], [0.0, 0.0, 1.0], [1.0, 0.0, 0.0], [1.0, 0.0, 0.0], [0.0, 1.0, 0.0]]) + previous_score = current_score + current_score = scorer(y_true, y_pred) + self.assertLess(current_score, previous_score) def test_classification_multilabel(self): for metric, scorer in autosklearn.metrics.CLASSIFICATION_METRICS.items(): # Skip functions not applicable for multi-label classification. 
- if metric in ['accuracy', 'balanced_accuracy', 'roc_auc', 'average_precision', 'log_loss', + if metric in ['roc_auc', 'log_loss', 'pac_score', 'precision', 'recall', 'f1']: continue y_true = np.array([[1, 0, 0], [1, 1, 0], [0, 1, 1], [1, 1, 1]]) @@ -410,3 +417,17 @@ def test_classification_multilabel(self): current_score = scorer(y_true, y_pred) self.assertAlmostEqual(current_score, previous_score) + y_pred = np.array([[1, 0, 0], [0, 0, 1], [0, 1, 1], [1, 1, 1]]) + previous_score = current_score + current_score = scorer(y_true, y_pred) + self.assertLess(current_score, previous_score) + + y_pred = np.array([[1, 0, 0], [0, 0, 1], [1, 0, 1], [1, 1, 0]]) + previous_score = current_score + current_score = scorer(y_true, y_pred) + self.assertLess(current_score, previous_score) + + y_pred = np.array([[0, 1, 1], [0, 0, 1], [1, 0, 0], [0, 0, 0]]) + previous_score = current_score + current_score = scorer(y_true, y_pred) + self.assertLess(current_score, previous_score) From 6ff5cffaf882f151986c2aa13534d2608972dad8 Mon Sep 17 00:00:00 2001 From: Jinu Date: Tue, 17 Apr 2018 17:48:47 +0200 Subject: [PATCH 3/7] Add optimum in example_metrics.py and minor changes. --- autosklearn/metrics/__init__.py | 12 +++++------ example/example_metrics.py | 2 ++ test/test_metric/test_metrics.py | 37 +++++++++++++++++++++----------- 3 files changed, 32 insertions(+), 19 deletions(-) diff --git a/autosklearn/metrics/__init__.py b/autosklearn/metrics/__init__.py index a017eb95d5..52a9d63e12 100644 --- a/autosklearn/metrics/__init__.py +++ b/autosklearn/metrics/__init__.py @@ -134,8 +134,8 @@ def __call__(self, y_true, y_pred, sample_weight=None): return self._sign * self._score_func(y_true, y_pred, **self._kwargs) -def make_scorer(name, score_func, optimum=1, greater_is_better=True, needs_proba=False, - needs_threshold=False, **kwargs): +def make_scorer(name, score_func, optimum=1, greater_is_better=True, + needs_proba=False, needs_threshold=False, **kwargs): """Make a scorer from a performance metric or loss function. 
Factory inspired by scikit-learn which wraps scikit-learn scoring functions @@ -187,11 +187,11 @@ def make_scorer(name, score_func, optimum=1, greater_is_better=True, needs_proba sklearn.metrics.mean_squared_error, optimum=0, greater_is_better=False) mean_absolute_error = make_scorer('mean_absolute_error', - sklearn.metrics.mean_absolute_error, optimum=0, - greater_is_better=False) + sklearn.metrics.mean_absolute_error, + optimum=0, greater_is_better=False) median_absolute_error = make_scorer('median_absolute_error', - sklearn.metrics.median_absolute_error, optimum=0, - greater_is_better=False) + sklearn.metrics.median_absolute_error, + optimum=0, greater_is_better=False) # Standard Classification Scores accuracy = make_scorer('accuracy', sklearn.metrics.accuracy_score) diff --git a/example/example_metrics.py b/example/example_metrics.py index 603b82f946..dd306e6c63 100644 --- a/example/example_metrics.py +++ b/example/example_metrics.py @@ -55,6 +55,7 @@ def main(): accuracy_scorer = autosklearn.metrics.make_scorer( name="accu", score_func=accuracy, + optimum=1, greater_is_better=True, needs_proba=False, needs_threshold=False, @@ -77,6 +78,7 @@ def main(): accuracy_scorer = autosklearn.metrics.make_scorer( name="accu_add", score_func=accuracy_wk, + optimum=1, greater_is_better=True, needs_proba=False, needs_threshold=False, diff --git a/test/test_metric/test_metrics.py b/test/test_metric/test_metrics.py index 7c834902ff..4d3c601aaf 100644 --- a/test/test_metric/test_metrics.py +++ b/test/test_metric/test_metrics.py @@ -345,12 +345,15 @@ def test_classification_binary(self): for metric, scorer in autosklearn.metrics.CLASSIFICATION_METRICS.items(): # Skip functions not applicable for binary classification. - # TODO: Average precision should work for binary classification, but its behavior is not right. - if metric in ['average_precision', 'pac_score', 'precision_samples', 'recall_samples', 'f1_samples']: + # TODO: Average precision should work for binary classification, + # TODO: but its behavior is not right. + if metric in ['average_precision', 'pac_score', + 'precision_samples', 'recall_samples', 'f1_samples']: continue y_true = np.array([1.0, 1.0, 1.0, 0.0, 0.0, 0.0]) - y_pred = np.array([[0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [1.0, 0.0]]) + y_pred = np.array([[0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], + [1.0, 0.0], [1.0, 0.0]]) if metric is 'log_loss': previous_score = 0 # the best value for log loss is 0. 
else: @@ -358,17 +361,20 @@ def test_classification_binary(self): current_score = scorer(y_true, y_pred) self.assertAlmostEqual(current_score, previous_score) - y_pred = np.array([[0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [1.0, 0.0]]) + y_pred = np.array([[0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [1.0, 0.0], + [0.0, 1.0], [1.0, 0.0]]) previous_score = current_score current_score = scorer(y_true, y_pred) self.assertLess(current_score, previous_score) - y_pred = np.array([[0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0]]) + y_pred = np.array([[0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0], + [0.0, 1.0], [0.0, 1.0]]) previous_score = current_score current_score = scorer(y_true, y_pred) self.assertLess(current_score, previous_score) - y_pred = np.array([[1.0, 0.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0]]) + y_pred = np.array([[1.0, 0.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0], + [0.0, 1.0], [0.0, 1.0]]) previous_score = current_score current_score = scorer(y_true, y_pred) self.assertLess(current_score, previous_score) @@ -377,29 +383,34 @@ def test_classification_multiclass(self): for metric, scorer in autosklearn.metrics.CLASSIFICATION_METRICS.items(): # Skip functions not applicable for multiclass classification. - if metric in ['pac_score', 'roc_auc', 'average_precision', 'precision', 'recall', 'f1', - 'precision_samples', 'recall_samples', 'f1_samples']: + if metric in ['pac_score', 'roc_auc', 'average_precision', + 'precision', 'recall', 'f1','precision_samples', + 'recall_samples', 'f1_samples']: continue y_true = np.array([0.0, 0.0, 1.0, 1.0, 2.0]) - y_pred = np.array([[1.0, 0.0, 0.0], [1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 1.0]]) + y_pred = np.array([[1.0, 0.0, 0.0], [1.0, 0.0, 0.0], + [0.0, 1.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 1.0]]) if metric is 'log_loss': # the best possible score for log_loss is 0. previous_score = 0 else: - previous_score = 1 # the best value for other losses is 1, and we flip the sign to minimize. + previous_score = 1 # the best value for other losses is 1. current_score = scorer(y_true, y_pred) self.assertAlmostEqual(current_score, previous_score) - y_pred = np.array([[1.0, 0.0, 0.0], [1.0, 0.0, 0.0], [1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 1.0]]) + y_pred = np.array([[1.0, 0.0, 0.0], [1.0, 0.0, 0.0], + [1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 1.0]]) previous_score = current_score current_score = scorer(y_true, y_pred) self.assertLess(current_score, previous_score) - y_pred = np.array([[0.0, 0.0, 1.0], [0.0, 1.0, 0.0], [1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 1.0, 0.0]]) + y_pred = np.array([[0.0, 0.0, 1.0], [0.0, 1.0, 0.0], + [1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 1.0, 0.0]]) previous_score = current_score current_score = scorer(y_true, y_pred) self.assertLess(current_score, previous_score) - y_pred = np.array([[0.0, 0.0, 1.0], [0.0, 0.0, 1.0], [1.0, 0.0, 0.0], [1.0, 0.0, 0.0], [0.0, 1.0, 0.0]]) + y_pred = np.array([[0.0, 0.0, 1.0], [0.0, 0.0, 1.0], + [1.0, 0.0, 0.0], [1.0, 0.0, 0.0], [0.0, 1.0, 0.0]]) previous_score = current_score current_score = scorer(y_true, y_pred) self.assertLess(current_score, previous_score) From 1d56ebe015af30c1cf07586208d2d55de8833ce2 Mon Sep 17 00:00:00 2001 From: Jinu Date: Tue, 17 Apr 2018 22:24:07 +0200 Subject: [PATCH 4/7] . 
--- autosklearn/metrics/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autosklearn/metrics/__init__.py b/autosklearn/metrics/__init__.py index 52a9d63e12..652966a1b2 100644 --- a/autosklearn/metrics/__init__.py +++ b/autosklearn/metrics/__init__.py @@ -148,7 +148,7 @@ def make_scorer(name, score_func, optimum=1, greater_is_better=True, ``score_func(y, y_pred, **kwargs)``. optimum : int or float, default=1 - The best value achievable by the score function. + The best score achievable by the score function. greater_is_better : boolean, default=True Whether score_func is a score function (default), meaning high is good, From f5086fa22b5598825675e04307fbc4b66701b677 Mon Sep 17 00:00:00 2001 From: Jinu Date: Tue, 17 Apr 2018 22:30:12 +0200 Subject: [PATCH 5/7] . --- autosklearn/metrics/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/autosklearn/metrics/__init__.py b/autosklearn/metrics/__init__.py index 652966a1b2..9e6bc77a1a 100644 --- a/autosklearn/metrics/__init__.py +++ b/autosklearn/metrics/__init__.py @@ -148,7 +148,8 @@ def make_scorer(name, score_func, optimum=1, greater_is_better=True, ``score_func(y, y_pred, **kwargs)``. optimum : int or float, default=1 - The best score achievable by the score function. + The best score achievable by the score function, i.e. maximum in case of + scorer function and minimum in case of loss function. greater_is_better : boolean, default=True Whether score_func is a score function (default), meaning high is good, From a535d2957ee2705d88ed366c275f9bbfa6510cf8 Mon Sep 17 00:00:00 2001 From: Jinu Date: Fri, 20 Apr 2018 13:35:11 +0200 Subject: [PATCH 6/7] . --- autosklearn/metrics/__init__.py | 40 +++++++++++++++++++++----------- test/test_metric/test_metrics.py | 23 ++++++------------ 2 files changed, 34 insertions(+), 29 deletions(-) diff --git a/autosklearn/metrics/__init__.py b/autosklearn/metrics/__init__.py index 9e6bc77a1a..1acfb50080 100644 --- a/autosklearn/metrics/__init__.py +++ b/autosklearn/metrics/__init__.py @@ -185,35 +185,49 @@ def make_scorer(name, score_func, optimum=1, greater_is_better=True, # Standard regression scores r2 = make_scorer('r2', sklearn.metrics.r2_score) mean_squared_error = make_scorer('mean_squared_error', - sklearn.metrics.mean_squared_error, optimum=0, + sklearn.metrics.mean_squared_error, + optimum=0, greater_is_better=False) mean_absolute_error = make_scorer('mean_absolute_error', sklearn.metrics.mean_absolute_error, - optimum=0, greater_is_better=False) + optimum=0, + greater_is_better=False) median_absolute_error = make_scorer('median_absolute_error', sklearn.metrics.median_absolute_error, - optimum=0, greater_is_better=False) + optimum=0, + greater_is_better=False) # Standard Classification Scores -accuracy = make_scorer('accuracy', sklearn.metrics.accuracy_score) +accuracy = make_scorer('accuracy', + sklearn.metrics.accuracy_score) balanced_accuracy = make_scorer('balanced_accuracy', classification_metrics.balanced_accuracy) -f1 = make_scorer('f1', sklearn.metrics.f1_score) +f1 = make_scorer('f1', + sklearn.metrics.f1_score) # Score functions that need decision values -roc_auc = make_scorer('roc_auc', sklearn.metrics.roc_auc_score, - greater_is_better=True, needs_threshold=True) +roc_auc = make_scorer('roc_auc', + sklearn.metrics.roc_auc_score, + greater_is_better=True, + needs_threshold=True) average_precision = make_scorer('average_precision', sklearn.metrics.average_precision_score, needs_threshold=True) -precision = make_scorer('precision', 
sklearn.metrics.precision_score) -recall = make_scorer('recall', sklearn.metrics.recall_score) +precision = make_scorer('precision', + sklearn.metrics.precision_score) +recall = make_scorer('recall', + sklearn.metrics.recall_score) # Score function for probabilistic classification -log_loss = make_scorer('log_loss', sklearn.metrics.log_loss, optimum=0, - greater_is_better=False, needs_proba=True) -pac_score = make_scorer('pac_score', classification_metrics.pac_score, - greater_is_better=True, needs_proba=True) +log_loss = make_scorer('log_loss', + sklearn.metrics.log_loss, + optimum=0, + greater_is_better=False, + needs_proba=True) +pac_score = make_scorer('pac_score', + classification_metrics.pac_score, + greater_is_better=True, + needs_proba=True) # TODO what about mathews correlation coefficient etc? diff --git a/test/test_metric/test_metrics.py b/test/test_metric/test_metrics.py index 4d3c601aaf..4b7da5d438 100644 --- a/test/test_metric/test_metrics.py +++ b/test/test_metric/test_metrics.py @@ -317,12 +317,7 @@ def test_regression_all(self): for metric, scorer in autosklearn.metrics.REGRESSION_METRICS.items(): y_true = np.array([1, 2, 3, 4]) y_pred = y_true.copy() - - # the best possible score of r2 loss is 1. - if metric == 'r2': - previous_score = 1 - else: - previous_score = 0 + previous_score = scorer._optimum current_score = scorer(y_true, y_pred) self.assertAlmostEqual(current_score, previous_score) @@ -346,7 +341,9 @@ def test_classification_binary(self): for metric, scorer in autosklearn.metrics.CLASSIFICATION_METRICS.items(): # Skip functions not applicable for binary classification. # TODO: Average precision should work for binary classification, - # TODO: but its behavior is not right. + # TODO: but its behavior is not right. When y_pred is completely + # TODO: wrong, it does return 0.5, but when it is not completely + # TODO: wrong, it returns value smaller than 0.5. if metric in ['average_precision', 'pac_score', 'precision_samples', 'recall_samples', 'f1_samples']: continue @@ -354,10 +351,7 @@ def test_classification_binary(self): y_true = np.array([1.0, 1.0, 1.0, 0.0, 0.0, 0.0]) y_pred = np.array([[0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [1.0, 0.0]]) - if metric is 'log_loss': - previous_score = 0 # the best value for log loss is 0. - else: - previous_score = 1 # the best value for other losses is 1. + previous_score = scorer._optimum current_score = scorer(y_true, y_pred) self.assertAlmostEqual(current_score, previous_score) @@ -390,10 +384,7 @@ def test_classification_multiclass(self): y_true = np.array([0.0, 0.0, 1.0, 1.0, 2.0]) y_pred = np.array([[1.0, 0.0, 0.0], [1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 1.0]]) - if metric is 'log_loss': # the best possible score for log_loss is 0. - previous_score = 0 - else: - previous_score = 1 # the best value for other losses is 1. + previous_score = scorer._optimum current_score = scorer(y_true, y_pred) self.assertAlmostEqual(current_score, previous_score) @@ -424,7 +415,7 @@ def test_classification_multilabel(self): continue y_true = np.array([[1, 0, 0], [1, 1, 0], [0, 1, 1], [1, 1, 1]]) y_pred = y_true.copy() - previous_score = 1 + previous_score = scorer._optimum current_score = scorer(y_true, y_pred) self.assertAlmostEqual(current_score, previous_score) From a99d1d33a35fad42d4ec796209161a1ec57bf1bd Mon Sep 17 00:00:00 2001 From: Jinu Date: Fri, 20 Apr 2018 13:36:51 +0200 Subject: [PATCH 7/7] . 
--- autosklearn/metrics/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/autosklearn/metrics/__init__.py b/autosklearn/metrics/__init__.py index 1acfb50080..d72aec9e07 100644 --- a/autosklearn/metrics/__init__.py +++ b/autosklearn/metrics/__init__.py @@ -183,7 +183,8 @@ def make_scorer(name, score_func, optimum=1, greater_is_better=True, # Standard regression scores -r2 = make_scorer('r2', sklearn.metrics.r2_score) +r2 = make_scorer('r2', + sklearn.metrics.r2_score) mean_squared_error = make_scorer('mean_squared_error', sklearn.metrics.mean_squared_error, optimum=0,
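
The common thread of this patch series is that a Scorer now carries the best value its sign-adjusted score can reach, and the evaluator derives the loss as optimum - score instead of the previously hard-coded 1 - score; the tests can then use scorer._optimum as the expected score for perfect predictions instead of special-casing 'r2' or 'log_loss'. Below is a minimal, self-contained sketch of that convention, not the auto-sklearn implementation itself: ToyScorer and loss() are hypothetical stand-ins for autosklearn.metrics.Scorer / _PredictScorer and for AbstractEvaluator._loss as changed in PATCH 2/7.

# Minimal sketch (illustrative only): a scorer stores its best achievable
# signed value ("optimum") next to its sign, and the loss is derived as
# optimum - sign * score_func(...), which is 0 for perfect predictions
# whether the underlying metric is a score or a loss.
import numpy as np
import sklearn.metrics


class ToyScorer:
    """Hypothetical stand-in for autosklearn.metrics.Scorer."""

    def __init__(self, name, score_func, optimum, sign, **kwargs):
        self.name = name
        self._score_func = score_func
        self._optimum = optimum   # best value the *signed* score can reach
        self._sign = sign         # +1 for score functions, -1 for loss functions
        self._kwargs = kwargs

    def __call__(self, y_true, y_pred):
        return self._sign * self._score_func(y_true, y_pred, **self._kwargs)


def loss(scorer, y_true, y_pred):
    # Mirrors the idea of AbstractEvaluator._loss after the patch:
    # err = metric._optimum - score (previously hard-coded as 1 - score).
    return scorer._optimum - scorer(y_true, y_pred)


if __name__ == '__main__':
    # A score where higher is better and the optimum is 1.
    acc = ToyScorer('accuracy', sklearn.metrics.accuracy_score,
                    optimum=1, sign=1)
    # A loss where lower is better: the sign flips it, so the signed optimum is 0.
    mse = ToyScorer('mean_squared_error', sklearn.metrics.mean_squared_error,
                    optimum=0, sign=-1)

    y_true = np.array([1.0, 2.0, 3.0, 4.0])
    print(loss(mse, y_true, y_true))                           # 0.0 for perfect predictions
    print(loss(mse, y_true, np.array([3.0, 4.0, 5.0, 6.0])))   # 4.0, grows as predictions worsen

    y_true_cls = np.array([0, 1, 1, 0])
    print(loss(acc, y_true_cls, np.array([0, 1, 1, 0])))       # 0.0 for perfect predictions
    print(loss(acc, y_true_cls, np.array([1, 1, 1, 0])))       # 0.25 (one of four labels wrong)

With make_scorer's optimum defaulting to 1 and set to 0 for mean_squared_error, mean_absolute_error, median_absolute_error and log_loss, as done in the diffs above, this single loss definition evaluates to 0 at perfect predictions for every built-in metric.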