From 12c51645c524d8571d526a83762ddda7cd5cef9e Mon Sep 17 00:00:00 2001 From: Jinu Date: Tue, 17 Apr 2018 14:44:47 +0200 Subject: [PATCH 1/7] Added Unittest for metric functions (new branch). --- test/test_metric/test_metrics.py | 102 +++++++++++++++++++++++++++++++ 1 file changed, 102 insertions(+) diff --git a/test/test_metric/test_metrics.py b/test/test_metric/test_metrics.py index 1038e0ab29..f32cf28950 100644 --- a/test/test_metric/test_metrics.py +++ b/test/test_metric/test_metrics.py @@ -308,3 +308,105 @@ def test_classification_metrics(self): pass else: raise e + + +class TestMetric(unittest.TestCase): + + def test_regression_all(self): + + for metric, scorer in autosklearn.metrics.REGRESSION_METRICS.items(): + y_true = np.array([1, 2, 3, 4]) + y_pred = y_true.copy() + + # the best possible score of r2 loss is 1. + if metric == 'r2': + previous_score = 1 + else: + previous_score = 0 + current_score = scorer(y_true, y_pred) + self.assertAlmostEqual(current_score, previous_score) + + y_pred = np.array([3, 4, 5, 6]) + current_score = scorer(y_true, y_pred) + self.assertLess(current_score, previous_score) + + y_pred = np.array([-1, 0, -1, 0]) + previous_score = current_score + current_score = scorer(y_true, y_pred) + self.assertLess(current_score, previous_score) + + y_pred = np.array([-5, 10, 7, -3]) + previous_score = current_score + current_score = scorer(y_true, y_pred) + self.assertLess(current_score, previous_score) + + + def test_classification_binary(self): + + for metric, scorer in autosklearn.metrics.CLASSIFICATION_METRICS.items(): + # Skip functions not applicable for binary classification. + if metric in ['pac_score', 'precision_samples', 'recall_samples', 'f1_samples']: + continue + + y_true = np.array([1.0, 1.0, 1.0, 0.0, 0.0]) + y_pred = np.array([[0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0]]) + if metric is 'log_loss': + previous_score = 0 # the best value for log loss is 0. + else: + previous_score = 1 # the best value for other losses is 1. + current_score = scorer(y_true, y_pred) + self.assertAlmostEqual(current_score, previous_score) + + if metric is 'recall': + y_pred = np.array([[0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [1.0, 0.0]]) + else: + y_pred = np.array([[0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0]]) + previous_score = current_score + current_score = scorer(y_true, y_pred) + self.assertLess(current_score, previous_score) + + y_pred = np.array([[0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0]]) + previous_score = current_score + current_score = scorer(y_true, y_pred) + self.assertLess(current_score, previous_score) + + def test_classification_multiclass(self): + + for metric, scorer in autosklearn.metrics.CLASSIFICATION_METRICS.items(): + # Skip functions not applicable for multiclass classification. + if metric in ['pac_score', 'roc_auc', 'average_precision', 'precision', 'recall', 'f1', + 'precision_samples', 'recall_samples', 'f1_samples']: + continue + y_true = np.array([0.0, 0.0, 1.0, 1.0, 2.0]) + y_pred = np.array([[1.0, 0.0, 0.0], [1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 1.0]]) + if metric is 'log_loss': # the best possible score for log_loss is 0. + previous_score = 0 + else: + previous_score = 1 # the best value for other losses is 1, and we flip the sign to minimize. 
+ current_score = scorer(y_true, y_pred) + self.assertAlmostEqual(current_score, previous_score) + + y_pred = np.array([[1.0, 0.0, 0.0], [1.0, 0.0, 0.0], [1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 1.0]]) + previous_score = current_score + current_score = scorer(y_true, y_pred) + self.assertLess(current_score, previous_score) + + y_pred = np.array([[0.0, 0.0, 1.0], [0.0, 1.0, 0.0], [1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 1.0, 0.0]]) + previous_score = current_score + current_score = scorer(y_true, y_pred) + self.assertLess(current_score, previous_score) + + + def test_classification_multilabel(self): + + for metric, scorer in autosklearn.metrics.CLASSIFICATION_METRICS.items(): + # Skip functions not applicable for multi-label classification. + if metric in ['accuracy', 'balanced_accuracy', 'roc_auc', 'average_precision', 'log_loss', + 'pac_score', 'precision', 'recall', 'f1']: + continue + y_true = np.array([[1, 0, 0], [1, 1, 0], [0, 1, 1], [1, 1, 1]]) + y_pred = y_true.copy() + previous_score = 1 + current_score = scorer(y_true, y_pred) + self.assertAlmostEqual(current_score, previous_score) + From 6f71d9b79add7c07b387a3a1abe2ee7c034c6108 Mon Sep 17 00:00:00 2001 From: Jinu Date: Tue, 17 Apr 2018 17:05:37 +0200 Subject: [PATCH 2/7] Create and modify unit tests for metric functions. --- autosklearn/evaluation/abstract_evaluator.py | 4 +- autosklearn/metrics/__init__.py | 18 +++-- test/test_metric/test_metrics.py | 79 +++++++++++++------- 3 files changed, 63 insertions(+), 38 deletions(-) diff --git a/autosklearn/evaluation/abstract_evaluator.py b/autosklearn/evaluation/abstract_evaluator.py index 2114d72ff3..6496950a11 100644 --- a/autosklearn/evaluation/abstract_evaluator.py +++ b/autosklearn/evaluation/abstract_evaluator.py @@ -213,9 +213,9 @@ def _loss(self, y_true, y_hat, all_scoring_functions=None): all_scoring_functions=all_scoring_functions) if hasattr(score, '__len__'): - err = {key: 1 - score[key] for key in score} + err = {key: self.metric._optimum - score[key] for key in score} else: - err = 1 - score + err = self.metric._optimum - score return err diff --git a/autosklearn/metrics/__init__.py b/autosklearn/metrics/__init__.py index 83f324796c..a017eb95d5 100644 --- a/autosklearn/metrics/__init__.py +++ b/autosklearn/metrics/__init__.py @@ -11,10 +11,11 @@ class Scorer(object, metaclass=ABCMeta): - def __init__(self, name, score_func, sign, kwargs): + def __init__(self, name, score_func, optimum, sign, kwargs): self.name = name self._kwargs = kwargs self._score_func = score_func + self._optimum = optimum self._sign = sign @abstractmethod @@ -133,7 +134,7 @@ def __call__(self, y_true, y_pred, sample_weight=None): return self._sign * self._score_func(y_true, y_pred, **self._kwargs) -def make_scorer(name, score_func, greater_is_better=True, needs_proba=False, +def make_scorer(name, score_func, optimum=1, greater_is_better=True, needs_proba=False, needs_threshold=False, **kwargs): """Make a scorer from a performance metric or loss function. @@ -146,6 +147,9 @@ def make_scorer(name, score_func, greater_is_better=True, needs_proba=False, Score function (or loss function) with signature ``score_func(y, y_pred, **kwargs)``. + optimum : int or float, default=1 + The best value achievable by the score function. + greater_is_better : boolean, default=True Whether score_func is a score function (default), meaning high is good, or a loss function, meaning low is good. 
In the latter case, the @@ -174,19 +178,19 @@ def make_scorer(name, score_func, greater_is_better=True, needs_proba=False, cls = _ThresholdScorer else: cls = _PredictScorer - return cls(name, score_func, sign, kwargs) + return cls(name, score_func, optimum, sign, kwargs) # Standard regression scores r2 = make_scorer('r2', sklearn.metrics.r2_score) mean_squared_error = make_scorer('mean_squared_error', - sklearn.metrics.mean_squared_error, + sklearn.metrics.mean_squared_error, optimum=0, greater_is_better=False) mean_absolute_error = make_scorer('mean_absolute_error', - sklearn.metrics.mean_absolute_error, + sklearn.metrics.mean_absolute_error, optimum=0, greater_is_better=False) median_absolute_error = make_scorer('median_absolute_error', - sklearn.metrics.median_absolute_error, + sklearn.metrics.median_absolute_error, optimum=0, greater_is_better=False) # Standard Classification Scores @@ -205,7 +209,7 @@ def make_scorer(name, score_func, greater_is_better=True, needs_proba=False, recall = make_scorer('recall', sklearn.metrics.recall_score) # Score function for probabilistic classification -log_loss = make_scorer('log_loss', sklearn.metrics.log_loss, +log_loss = make_scorer('log_loss', sklearn.metrics.log_loss, optimum=0, greater_is_better=False, needs_proba=True) pac_score = make_scorer('pac_score', classification_metrics.pac_score, greater_is_better=True, needs_proba=True) diff --git a/test/test_metric/test_metrics.py b/test/test_metric/test_metrics.py index f32cf28950..7c834902ff 100644 --- a/test/test_metric/test_metrics.py +++ b/test/test_metric/test_metrics.py @@ -13,7 +13,7 @@ def test_predict_scorer_binary(self): y_pred = np.array([[1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0]]) scorer = autosklearn.metrics._PredictScorer( - 'accuracy', sklearn.metrics.accuracy_score, 1, {}) + 'accuracy', sklearn.metrics.accuracy_score, 1, 1, {}) score = scorer(y_true, y_pred) self.assertAlmostEqual(score, 1.0) @@ -28,13 +28,13 @@ def test_predict_scorer_binary(self): scorer = autosklearn.metrics._PredictScorer( 'bac', autosklearn.metrics.classification_metrics.balanced_accuracy, - 1, {}) + 1, 1, {}) score = scorer(y_true, y_pred) self.assertAlmostEqual(score, 0.5) scorer = autosklearn.metrics._PredictScorer( - 'accuracy', sklearn.metrics.accuracy_score, -1, {}) + 'accuracy', sklearn.metrics.accuracy_score, 1, -1, {}) y_pred = np.array([[1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0]]) score = scorer(y_true, y_pred) @@ -45,7 +45,7 @@ def test_predict_scorer_multiclass(self): y_pred = np.array([[1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 1.0]]) scorer = autosklearn.metrics._PredictScorer( - 'accuracy', sklearn.metrics.accuracy_score, 1, {}) + 'accuracy', sklearn.metrics.accuracy_score, 1, 1, {}) score = scorer(y_true, y_pred) self.assertAlmostEqual(score, 1.0) @@ -60,13 +60,13 @@ def test_predict_scorer_multiclass(self): scorer = autosklearn.metrics._PredictScorer( 'bac', autosklearn.metrics.classification_metrics.balanced_accuracy, - 1, {}) + 1, 1, {}) score = scorer(y_true, y_pred) self.assertAlmostEqual(score, 0.333333333) scorer = autosklearn.metrics._PredictScorer( - 'accuracy', sklearn.metrics.accuracy_score, -1, {}) + 'accuracy', sklearn.metrics.accuracy_score, 1, -1, {}) y_pred = np.array([[1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 1.0]]) score = scorer(y_true, y_pred) @@ -77,7 +77,7 @@ def test_predict_scorer_multilabel(self): y_pred = np.array([[0.0, 0.0], [0.0, 1.0], [1.0, 0.0], [1.0, 1.0]]) scorer = autosklearn.metrics._PredictScorer( - 'accuracy', sklearn.metrics.accuracy_score, 1, 
{}) + 'accuracy', sklearn.metrics.accuracy_score, 1, 1, {}) score = scorer(y_true, y_pred) self.assertAlmostEqual(score, 1.0) @@ -92,13 +92,13 @@ def test_predict_scorer_multilabel(self): scorer = autosklearn.metrics._PredictScorer( 'bac', autosklearn.metrics.classification_metrics.balanced_accuracy, - 1, {}) + 1, 1, {}) score = scorer(y_true, y_pred) self.assertAlmostEqual(score, 0.5) scorer = autosklearn.metrics._PredictScorer( - 'accuracy', sklearn.metrics.accuracy_score, -1, {}) + 'accuracy', sklearn.metrics.accuracy_score, 1, -1, {}) y_pred = np.array([[0.0, 0.0], [0.0, 1.0], [1.0, 0.0], [1.0, 1.0]]) score = scorer(y_true, y_pred) @@ -109,7 +109,7 @@ def test_predict_scorer_regression(self): y_pred = y_true.copy() scorer = autosklearn.metrics._PredictScorer( - 'r2', sklearn.metrics.r2_score, 1, {}) + 'r2', sklearn.metrics.r2_score, 1, 1, {}) score = scorer(y_true, y_pred) self.assertAlmostEqual(score, 1.0) @@ -123,7 +123,7 @@ def test_proba_scorer_binary(self): y_pred = [[1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0]] scorer = autosklearn.metrics._ProbaScorer( - 'accuracy', sklearn.metrics.log_loss, 1, {}) + 'accuracy', sklearn.metrics.log_loss, 0, 1, {}) score = scorer(y_true, y_pred) self.assertAlmostEqual(score, 0.0) @@ -137,7 +137,7 @@ def test_proba_scorer_binary(self): self.assertAlmostEqual(score, 0.69314718055994529) scorer = autosklearn.metrics._ProbaScorer( - 'accuracy', sklearn.metrics.log_loss, -1, {}) + 'accuracy', sklearn.metrics.log_loss, 0, -1, {}) y_pred = [[1.0, 1.0], [1.0, 1.0], [1.0, 1.0], [1.0, 1.0]] score = scorer(y_true, y_pred) @@ -148,7 +148,7 @@ def test_proba_scorer_multiclass(self): y_pred = [[1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 1.0]] scorer = autosklearn.metrics._ProbaScorer( - 'accuracy', sklearn.metrics.log_loss, 1, {}) + 'accuracy', sklearn.metrics.log_loss, 0, 1, {}) score = scorer(y_true, y_pred) self.assertAlmostEqual(score, 0.0) @@ -162,7 +162,7 @@ def test_proba_scorer_multiclass(self): self.assertAlmostEqual(score, 1.0986122886681096) scorer = autosklearn.metrics._ProbaScorer( - 'accuracy', sklearn.metrics.log_loss, -1, {}) + 'accuracy', sklearn.metrics.log_loss, 0, -1, {}) y_pred = [[1.0, 1.0, 1.0], [1.0, 1.0, 1.0], [1.0, 1.0, 1.0]] score = scorer(y_true, y_pred) @@ -173,7 +173,7 @@ def test_proba_scorer_multilabel(self): y_pred = np.array([[0.0, 0.0], [0.0, 1.0], [1.0, 0.0], [1.0, 1.0]]) scorer = autosklearn.metrics._ProbaScorer( - 'accuracy', sklearn.metrics.log_loss, 1, {}) + 'accuracy', sklearn.metrics.log_loss, 0, 1, {}) score = scorer(y_true, y_pred) self.assertAlmostEqual(score, 0.34657359027997314) @@ -187,7 +187,7 @@ def test_proba_scorer_multilabel(self): self.assertAlmostEqual(score, 0.69314718055994529) scorer = autosklearn.metrics._ProbaScorer( - 'accuracy', sklearn.metrics.log_loss, -1, {}) + 'accuracy', sklearn.metrics.log_loss, 0, -1, {}) y_pred = np.array([[0.0, 0.0], [0.0, 1.0], [1.0, 0.0], [1.0, 1.0]]) score = scorer(y_true, y_pred) @@ -198,7 +198,7 @@ def test_threshold_scorer_binary(self): y_pred = np.array([[1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0]]) scorer = autosklearn.metrics._ThresholdScorer( - 'accuracy', sklearn.metrics.roc_auc_score, 1, {}) + 'accuracy', sklearn.metrics.roc_auc_score, 1, 1, {}) score = scorer(y_true, y_pred) self.assertAlmostEqual(score, 1.0) @@ -212,7 +212,7 @@ def test_threshold_scorer_binary(self): self.assertAlmostEqual(score, 0.5) scorer = autosklearn.metrics._ThresholdScorer( - 'accuracy', sklearn.metrics.roc_auc_score, -1, {}) + 'accuracy', sklearn.metrics.roc_auc_score, 1, -1, {}) 
y_pred = np.array([[1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0]]) score = scorer(y_true, y_pred) @@ -223,7 +223,7 @@ def test_threshold_scorer_multilabel(self): y_pred = np.array([[0.0, 0.0], [0.0, 1.0], [1.0, 0.0], [1.0, 1.0]]) scorer = autosklearn.metrics._ThresholdScorer( - 'accuracy', sklearn.metrics.roc_auc_score, 1, {}) + 'accuracy', sklearn.metrics.roc_auc_score, 1, 1, {}) score = scorer(y_true, y_pred) self.assertAlmostEqual(score, 1.0) @@ -237,7 +237,7 @@ def test_threshold_scorer_multilabel(self): self.assertAlmostEqual(score, 0.5) scorer = autosklearn.metrics._ThresholdScorer( - 'accuracy', sklearn.metrics.roc_auc_score, -1, {}) + 'accuracy', sklearn.metrics.roc_auc_score, 1, -1, {}) y_pred = np.array([[0.0, 0.0], [0.0, 1.0], [1.0, 0.0], [1.0, 1.0]]) score = scorer(y_true, y_pred) @@ -345,11 +345,12 @@ def test_classification_binary(self): for metric, scorer in autosklearn.metrics.CLASSIFICATION_METRICS.items(): # Skip functions not applicable for binary classification. - if metric in ['pac_score', 'precision_samples', 'recall_samples', 'f1_samples']: + # TODO: Average precision should work for binary classification, but its behavior is not right. + if metric in ['average_precision', 'pac_score', 'precision_samples', 'recall_samples', 'f1_samples']: continue - y_true = np.array([1.0, 1.0, 1.0, 0.0, 0.0]) - y_pred = np.array([[0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0]]) + y_true = np.array([1.0, 1.0, 1.0, 0.0, 0.0, 0.0]) + y_pred = np.array([[0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [1.0, 0.0]]) if metric is 'log_loss': previous_score = 0 # the best value for log loss is 0. else: @@ -357,15 +358,17 @@ def test_classification_binary(self): current_score = scorer(y_true, y_pred) self.assertAlmostEqual(current_score, previous_score) - if metric is 'recall': - y_pred = np.array([[0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [1.0, 0.0]]) - else: - y_pred = np.array([[0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0]]) + y_pred = np.array([[0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [1.0, 0.0]]) + previous_score = current_score + current_score = scorer(y_true, y_pred) + self.assertLess(current_score, previous_score) + + y_pred = np.array([[0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0]]) previous_score = current_score current_score = scorer(y_true, y_pred) self.assertLess(current_score, previous_score) - y_pred = np.array([[0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0]]) + y_pred = np.array([[1.0, 0.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0]]) previous_score = current_score current_score = scorer(y_true, y_pred) self.assertLess(current_score, previous_score) @@ -396,12 +399,16 @@ def test_classification_multiclass(self): current_score = scorer(y_true, y_pred) self.assertLess(current_score, previous_score) + y_pred = np.array([[0.0, 0.0, 1.0], [0.0, 0.0, 1.0], [1.0, 0.0, 0.0], [1.0, 0.0, 0.0], [0.0, 1.0, 0.0]]) + previous_score = current_score + current_score = scorer(y_true, y_pred) + self.assertLess(current_score, previous_score) def test_classification_multilabel(self): for metric, scorer in autosklearn.metrics.CLASSIFICATION_METRICS.items(): # Skip functions not applicable for multi-label classification. 
- if metric in ['accuracy', 'balanced_accuracy', 'roc_auc', 'average_precision', 'log_loss', + if metric in ['roc_auc', 'log_loss', 'pac_score', 'precision', 'recall', 'f1']: continue y_true = np.array([[1, 0, 0], [1, 1, 0], [0, 1, 1], [1, 1, 1]]) @@ -410,3 +417,17 @@ def test_classification_multilabel(self): current_score = scorer(y_true, y_pred) self.assertAlmostEqual(current_score, previous_score) + y_pred = np.array([[1, 0, 0], [0, 0, 1], [0, 1, 1], [1, 1, 1]]) + previous_score = current_score + current_score = scorer(y_true, y_pred) + self.assertLess(current_score, previous_score) + + y_pred = np.array([[1, 0, 0], [0, 0, 1], [1, 0, 1], [1, 1, 0]]) + previous_score = current_score + current_score = scorer(y_true, y_pred) + self.assertLess(current_score, previous_score) + + y_pred = np.array([[0, 1, 1], [0, 0, 1], [1, 0, 0], [0, 0, 0]]) + previous_score = current_score + current_score = scorer(y_true, y_pred) + self.assertLess(current_score, previous_score) From 6ff5cffaf882f151986c2aa13534d2608972dad8 Mon Sep 17 00:00:00 2001 From: Jinu Date: Tue, 17 Apr 2018 17:48:47 +0200 Subject: [PATCH 3/7] Add optimum in example_metrics.py and minor changes. --- autosklearn/metrics/__init__.py | 12 +++++------ example/example_metrics.py | 2 ++ test/test_metric/test_metrics.py | 37 +++++++++++++++++++++----------- 3 files changed, 32 insertions(+), 19 deletions(-) diff --git a/autosklearn/metrics/__init__.py b/autosklearn/metrics/__init__.py index a017eb95d5..52a9d63e12 100644 --- a/autosklearn/metrics/__init__.py +++ b/autosklearn/metrics/__init__.py @@ -134,8 +134,8 @@ def __call__(self, y_true, y_pred, sample_weight=None): return self._sign * self._score_func(y_true, y_pred, **self._kwargs) -def make_scorer(name, score_func, optimum=1, greater_is_better=True, needs_proba=False, - needs_threshold=False, **kwargs): +def make_scorer(name, score_func, optimum=1, greater_is_better=True, + needs_proba=False, needs_threshold=False, **kwargs): """Make a scorer from a performance metric or loss function. 
Factory inspired by scikit-learn which wraps scikit-learn scoring functions @@ -187,11 +187,11 @@ def make_scorer(name, score_func, optimum=1, greater_is_better=True, needs_proba sklearn.metrics.mean_squared_error, optimum=0, greater_is_better=False) mean_absolute_error = make_scorer('mean_absolute_error', - sklearn.metrics.mean_absolute_error, optimum=0, - greater_is_better=False) + sklearn.metrics.mean_absolute_error, + optimum=0, greater_is_better=False) median_absolute_error = make_scorer('median_absolute_error', - sklearn.metrics.median_absolute_error, optimum=0, - greater_is_better=False) + sklearn.metrics.median_absolute_error, + optimum=0, greater_is_better=False) # Standard Classification Scores accuracy = make_scorer('accuracy', sklearn.metrics.accuracy_score) diff --git a/example/example_metrics.py b/example/example_metrics.py index 603b82f946..dd306e6c63 100644 --- a/example/example_metrics.py +++ b/example/example_metrics.py @@ -55,6 +55,7 @@ def main(): accuracy_scorer = autosklearn.metrics.make_scorer( name="accu", score_func=accuracy, + optimum=1, greater_is_better=True, needs_proba=False, needs_threshold=False, @@ -77,6 +78,7 @@ def main(): accuracy_scorer = autosklearn.metrics.make_scorer( name="accu_add", score_func=accuracy_wk, + optimum=1, greater_is_better=True, needs_proba=False, needs_threshold=False, diff --git a/test/test_metric/test_metrics.py b/test/test_metric/test_metrics.py index 7c834902ff..4d3c601aaf 100644 --- a/test/test_metric/test_metrics.py +++ b/test/test_metric/test_metrics.py @@ -345,12 +345,15 @@ def test_classification_binary(self): for metric, scorer in autosklearn.metrics.CLASSIFICATION_METRICS.items(): # Skip functions not applicable for binary classification. - # TODO: Average precision should work for binary classification, but its behavior is not right. - if metric in ['average_precision', 'pac_score', 'precision_samples', 'recall_samples', 'f1_samples']: + # TODO: Average precision should work for binary classification, + # TODO: but its behavior is not right. + if metric in ['average_precision', 'pac_score', + 'precision_samples', 'recall_samples', 'f1_samples']: continue y_true = np.array([1.0, 1.0, 1.0, 0.0, 0.0, 0.0]) - y_pred = np.array([[0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [1.0, 0.0]]) + y_pred = np.array([[0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], + [1.0, 0.0], [1.0, 0.0]]) if metric is 'log_loss': previous_score = 0 # the best value for log loss is 0. 
else: @@ -358,17 +361,20 @@ def test_classification_binary(self): current_score = scorer(y_true, y_pred) self.assertAlmostEqual(current_score, previous_score) - y_pred = np.array([[0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [1.0, 0.0]]) + y_pred = np.array([[0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [1.0, 0.0], + [0.0, 1.0], [1.0, 0.0]]) previous_score = current_score current_score = scorer(y_true, y_pred) self.assertLess(current_score, previous_score) - y_pred = np.array([[0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0]]) + y_pred = np.array([[0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0], + [0.0, 1.0], [0.0, 1.0]]) previous_score = current_score current_score = scorer(y_true, y_pred) self.assertLess(current_score, previous_score) - y_pred = np.array([[1.0, 0.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0]]) + y_pred = np.array([[1.0, 0.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0], + [0.0, 1.0], [0.0, 1.0]]) previous_score = current_score current_score = scorer(y_true, y_pred) self.assertLess(current_score, previous_score) @@ -377,29 +383,34 @@ def test_classification_multiclass(self): for metric, scorer in autosklearn.metrics.CLASSIFICATION_METRICS.items(): # Skip functions not applicable for multiclass classification. - if metric in ['pac_score', 'roc_auc', 'average_precision', 'precision', 'recall', 'f1', - 'precision_samples', 'recall_samples', 'f1_samples']: + if metric in ['pac_score', 'roc_auc', 'average_precision', + 'precision', 'recall', 'f1','precision_samples', + 'recall_samples', 'f1_samples']: continue y_true = np.array([0.0, 0.0, 1.0, 1.0, 2.0]) - y_pred = np.array([[1.0, 0.0, 0.0], [1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 1.0]]) + y_pred = np.array([[1.0, 0.0, 0.0], [1.0, 0.0, 0.0], + [0.0, 1.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 1.0]]) if metric is 'log_loss': # the best possible score for log_loss is 0. previous_score = 0 else: - previous_score = 1 # the best value for other losses is 1, and we flip the sign to minimize. + previous_score = 1 # the best value for other losses is 1. current_score = scorer(y_true, y_pred) self.assertAlmostEqual(current_score, previous_score) - y_pred = np.array([[1.0, 0.0, 0.0], [1.0, 0.0, 0.0], [1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 1.0]]) + y_pred = np.array([[1.0, 0.0, 0.0], [1.0, 0.0, 0.0], + [1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 1.0]]) previous_score = current_score current_score = scorer(y_true, y_pred) self.assertLess(current_score, previous_score) - y_pred = np.array([[0.0, 0.0, 1.0], [0.0, 1.0, 0.0], [1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 1.0, 0.0]]) + y_pred = np.array([[0.0, 0.0, 1.0], [0.0, 1.0, 0.0], + [1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 1.0, 0.0]]) previous_score = current_score current_score = scorer(y_true, y_pred) self.assertLess(current_score, previous_score) - y_pred = np.array([[0.0, 0.0, 1.0], [0.0, 0.0, 1.0], [1.0, 0.0, 0.0], [1.0, 0.0, 0.0], [0.0, 1.0, 0.0]]) + y_pred = np.array([[0.0, 0.0, 1.0], [0.0, 0.0, 1.0], + [1.0, 0.0, 0.0], [1.0, 0.0, 0.0], [0.0, 1.0, 0.0]]) previous_score = current_score current_score = scorer(y_true, y_pred) self.assertLess(current_score, previous_score) From 1d56ebe015af30c1cf07586208d2d55de8833ce2 Mon Sep 17 00:00:00 2001 From: Jinu Date: Tue, 17 Apr 2018 22:24:07 +0200 Subject: [PATCH 4/7] . 
--- autosklearn/metrics/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autosklearn/metrics/__init__.py b/autosklearn/metrics/__init__.py index 52a9d63e12..652966a1b2 100644 --- a/autosklearn/metrics/__init__.py +++ b/autosklearn/metrics/__init__.py @@ -148,7 +148,7 @@ def make_scorer(name, score_func, optimum=1, greater_is_better=True, ``score_func(y, y_pred, **kwargs)``. optimum : int or float, default=1 - The best value achievable by the score function. + The best score achievable by the score function. greater_is_better : boolean, default=True Whether score_func is a score function (default), meaning high is good, From f5086fa22b5598825675e04307fbc4b66701b677 Mon Sep 17 00:00:00 2001 From: Jinu Date: Tue, 17 Apr 2018 22:30:12 +0200 Subject: [PATCH 5/7] . --- autosklearn/metrics/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/autosklearn/metrics/__init__.py b/autosklearn/metrics/__init__.py index 652966a1b2..9e6bc77a1a 100644 --- a/autosklearn/metrics/__init__.py +++ b/autosklearn/metrics/__init__.py @@ -148,7 +148,8 @@ def make_scorer(name, score_func, optimum=1, greater_is_better=True, ``score_func(y, y_pred, **kwargs)``. optimum : int or float, default=1 - The best score achievable by the score function. + The best score achievable by the score function, i.e. maximum in case of + scorer function and minimum in case of loss function. greater_is_better : boolean, default=True Whether score_func is a score function (default), meaning high is good, From a535d2957ee2705d88ed366c275f9bbfa6510cf8 Mon Sep 17 00:00:00 2001 From: Jinu Date: Fri, 20 Apr 2018 13:35:11 +0200 Subject: [PATCH 6/7] . --- autosklearn/metrics/__init__.py | 40 +++++++++++++++++++++----------- test/test_metric/test_metrics.py | 23 ++++++------------ 2 files changed, 34 insertions(+), 29 deletions(-) diff --git a/autosklearn/metrics/__init__.py b/autosklearn/metrics/__init__.py index 9e6bc77a1a..1acfb50080 100644 --- a/autosklearn/metrics/__init__.py +++ b/autosklearn/metrics/__init__.py @@ -185,35 +185,49 @@ def make_scorer(name, score_func, optimum=1, greater_is_better=True, # Standard regression scores r2 = make_scorer('r2', sklearn.metrics.r2_score) mean_squared_error = make_scorer('mean_squared_error', - sklearn.metrics.mean_squared_error, optimum=0, + sklearn.metrics.mean_squared_error, + optimum=0, greater_is_better=False) mean_absolute_error = make_scorer('mean_absolute_error', sklearn.metrics.mean_absolute_error, - optimum=0, greater_is_better=False) + optimum=0, + greater_is_better=False) median_absolute_error = make_scorer('median_absolute_error', sklearn.metrics.median_absolute_error, - optimum=0, greater_is_better=False) + optimum=0, + greater_is_better=False) # Standard Classification Scores -accuracy = make_scorer('accuracy', sklearn.metrics.accuracy_score) +accuracy = make_scorer('accuracy', + sklearn.metrics.accuracy_score) balanced_accuracy = make_scorer('balanced_accuracy', classification_metrics.balanced_accuracy) -f1 = make_scorer('f1', sklearn.metrics.f1_score) +f1 = make_scorer('f1', + sklearn.metrics.f1_score) # Score functions that need decision values -roc_auc = make_scorer('roc_auc', sklearn.metrics.roc_auc_score, - greater_is_better=True, needs_threshold=True) +roc_auc = make_scorer('roc_auc', + sklearn.metrics.roc_auc_score, + greater_is_better=True, + needs_threshold=True) average_precision = make_scorer('average_precision', sklearn.metrics.average_precision_score, needs_threshold=True) -precision = make_scorer('precision', 
sklearn.metrics.precision_score) -recall = make_scorer('recall', sklearn.metrics.recall_score) +precision = make_scorer('precision', + sklearn.metrics.precision_score) +recall = make_scorer('recall', + sklearn.metrics.recall_score) # Score function for probabilistic classification -log_loss = make_scorer('log_loss', sklearn.metrics.log_loss, optimum=0, - greater_is_better=False, needs_proba=True) -pac_score = make_scorer('pac_score', classification_metrics.pac_score, - greater_is_better=True, needs_proba=True) +log_loss = make_scorer('log_loss', + sklearn.metrics.log_loss, + optimum=0, + greater_is_better=False, + needs_proba=True) +pac_score = make_scorer('pac_score', + classification_metrics.pac_score, + greater_is_better=True, + needs_proba=True) # TODO what about mathews correlation coefficient etc? diff --git a/test/test_metric/test_metrics.py b/test/test_metric/test_metrics.py index 4d3c601aaf..4b7da5d438 100644 --- a/test/test_metric/test_metrics.py +++ b/test/test_metric/test_metrics.py @@ -317,12 +317,7 @@ def test_regression_all(self): for metric, scorer in autosklearn.metrics.REGRESSION_METRICS.items(): y_true = np.array([1, 2, 3, 4]) y_pred = y_true.copy() - - # the best possible score of r2 loss is 1. - if metric == 'r2': - previous_score = 1 - else: - previous_score = 0 + previous_score = scorer._optimum current_score = scorer(y_true, y_pred) self.assertAlmostEqual(current_score, previous_score) @@ -346,7 +341,9 @@ def test_classification_binary(self): for metric, scorer in autosklearn.metrics.CLASSIFICATION_METRICS.items(): # Skip functions not applicable for binary classification. # TODO: Average precision should work for binary classification, - # TODO: but its behavior is not right. + # TODO: but its behavior is not right. When y_pred is completely + # TODO: wrong, it does return 0.5, but when it is not completely + # TODO: wrong, it returns value smaller than 0.5. if metric in ['average_precision', 'pac_score', 'precision_samples', 'recall_samples', 'f1_samples']: continue @@ -354,10 +351,7 @@ def test_classification_binary(self): y_true = np.array([1.0, 1.0, 1.0, 0.0, 0.0, 0.0]) y_pred = np.array([[0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [1.0, 0.0]]) - if metric is 'log_loss': - previous_score = 0 # the best value for log loss is 0. - else: - previous_score = 1 # the best value for other losses is 1. + previous_score = scorer._optimum current_score = scorer(y_true, y_pred) self.assertAlmostEqual(current_score, previous_score) @@ -390,10 +384,7 @@ def test_classification_multiclass(self): y_true = np.array([0.0, 0.0, 1.0, 1.0, 2.0]) y_pred = np.array([[1.0, 0.0, 0.0], [1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 1.0]]) - if metric is 'log_loss': # the best possible score for log_loss is 0. - previous_score = 0 - else: - previous_score = 1 # the best value for other losses is 1. + previous_score = scorer._optimum current_score = scorer(y_true, y_pred) self.assertAlmostEqual(current_score, previous_score) @@ -424,7 +415,7 @@ def test_classification_multilabel(self): continue y_true = np.array([[1, 0, 0], [1, 1, 0], [0, 1, 1], [1, 1, 1]]) y_pred = y_true.copy() - previous_score = 1 + previous_score = scorer._optimum current_score = scorer(y_true, y_pred) self.assertAlmostEqual(current_score, previous_score) From a99d1d33a35fad42d4ec796209161a1ec57bf1bd Mon Sep 17 00:00:00 2001 From: Jinu Date: Fri, 20 Apr 2018 13:36:51 +0200 Subject: [PATCH 7/7] . 
--- autosklearn/metrics/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/autosklearn/metrics/__init__.py b/autosklearn/metrics/__init__.py index 1acfb50080..d72aec9e07 100644 --- a/autosklearn/metrics/__init__.py +++ b/autosklearn/metrics/__init__.py @@ -183,7 +183,8 @@ def make_scorer(name, score_func, optimum=1, greater_is_better=True, # Standard regression scores -r2 = make_scorer('r2', sklearn.metrics.r2_score) +r2 = make_scorer('r2', + sklearn.metrics.r2_score) mean_squared_error = make_scorer('mean_squared_error', sklearn.metrics.mean_squared_error, optimum=0,
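
The common thread of this patch series is that a Scorer now carries the best value its sign-adjusted score can reach, and the evaluator derives the loss as optimum - score instead of the previously hard-coded 1 - score; the tests can then use scorer._optimum as the expected score for perfect predictions instead of special-casing 'r2' or 'log_loss'. Below is a minimal, self-contained sketch of that convention, not the auto-sklearn implementation itself: ToyScorer and loss() are hypothetical stand-ins for autosklearn.metrics.Scorer / _PredictScorer and for AbstractEvaluator._loss as changed in PATCH 2/7.

# Minimal sketch (illustrative only): a scorer stores its best achievable
# signed value ("optimum") next to its sign, and the loss is derived as
# optimum - sign * score_func(...), which is 0 for perfect predictions
# whether the underlying metric is a score or a loss.
import numpy as np
import sklearn.metrics


class ToyScorer:
    """Hypothetical stand-in for autosklearn.metrics.Scorer."""

    def __init__(self, name, score_func, optimum, sign, **kwargs):
        self.name = name
        self._score_func = score_func
        self._optimum = optimum   # best value the *signed* score can reach
        self._sign = sign         # +1 for score functions, -1 for loss functions
        self._kwargs = kwargs

    def __call__(self, y_true, y_pred):
        return self._sign * self._score_func(y_true, y_pred, **self._kwargs)


def loss(scorer, y_true, y_pred):
    # Mirrors the idea of AbstractEvaluator._loss after the patch:
    # err = metric._optimum - score (previously hard-coded as 1 - score).
    return scorer._optimum - scorer(y_true, y_pred)


if __name__ == '__main__':
    # A score where higher is better and the optimum is 1.
    acc = ToyScorer('accuracy', sklearn.metrics.accuracy_score,
                    optimum=1, sign=1)
    # A loss where lower is better: the sign flips it, so the signed optimum is 0.
    mse = ToyScorer('mean_squared_error', sklearn.metrics.mean_squared_error,
                    optimum=0, sign=-1)

    y_true = np.array([1.0, 2.0, 3.0, 4.0])
    print(loss(mse, y_true, y_true))                           # 0.0 for perfect predictions
    print(loss(mse, y_true, np.array([3.0, 4.0, 5.0, 6.0])))   # 4.0, grows as predictions worsen

    y_true_cls = np.array([0, 1, 1, 0])
    print(loss(acc, y_true_cls, np.array([0, 1, 1, 0])))       # 0.0 for perfect predictions
    print(loss(acc, y_true_cls, np.array([1, 1, 1, 0])))       # 0.25 (one of four labels wrong)

With make_scorer's optimum defaulting to 1 and set to 0 for mean_squared_error, mean_absolute_error, median_absolute_error and log_loss, as done in the diffs above, this single loss definition evaluates to 0 at perfect predictions for every built-in metric.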