Skip to content

Commit

Permalink
[SPARK-29960][ML][PYSPARK] MulticlassClassificationEvaluator support …
Browse files Browse the repository at this point in the history
…hammingLoss

### What changes were proposed in this pull request?
MulticlassClassificationEvaluator support hammingLoss

### Why are the changes needed?
1. It is easy to compute the Hamming loss from the confusion matrix.
2. scikit-learn supports it.

### Does this PR introduce any user-facing change?
yes

### How was this patch tested?
Added test suites.

Closes #26597 from zhengruifeng/multi_class_hamming_loss.

Authored-by: zhengruifeng <ruifengz@foxmail.com>
Signed-off-by: zhengruifeng <ruifengz@foxmail.com>
  • Loading branch information
zhengruifeng committed Nov 21, 2019
1 parent 297cbab commit 0f40d2a
Show file tree
Hide file tree
Showing 4 changed files with 58 additions and 4 deletions.
Expand Up @@ -46,7 +46,7 @@ class MulticlassClassificationEvaluator @Since("1.5.0") (@Since("1.5.0") overrid
* `"weightedPrecision"`, `"weightedRecall"`, `"weightedTruePositiveRate"`,
* `"weightedFalsePositiveRate"`, `"weightedFMeasure"`, `"truePositiveRateByLabel"`,
* `"falsePositiveRateByLabel"`, `"precisionByLabel"`, `"recallByLabel"`,
* `"fMeasureByLabel"`, `"logLoss"`)
* `"fMeasureByLabel"`, `"logLoss"`, `"hammingLoss"`)
*
* @group param
*/
Expand Down Expand Up @@ -172,13 +172,15 @@ class MulticlassClassificationEvaluator @Since("1.5.0") (@Since("1.5.0") overrid
case "precisionByLabel" => metrics.precision($(metricLabel))
case "recallByLabel" => metrics.recall($(metricLabel))
case "fMeasureByLabel" => metrics.fMeasure($(metricLabel), $(beta))
case "hammingLoss" => metrics.hammingLoss
case "logLoss" => metrics.logLoss($(eps))
}
}

@Since("1.5.0")
override def isLargerBetter: Boolean = $(metricName) match {
case "weightedFalsePositiveRate" | "falsePositiveRateByLabel" | "logLoss" => false
case "weightedFalsePositiveRate" | "falsePositiveRateByLabel" | "logLoss" | "hammingLoss" =>
false
case _ => true
}

Expand All @@ -199,7 +201,7 @@ object MulticlassClassificationEvaluator
private val supportedMetricNames = Array("f1", "accuracy", "weightedPrecision", "weightedRecall",
"weightedTruePositiveRate", "weightedFalsePositiveRate", "weightedFMeasure",
"truePositiveRateByLabel", "falsePositiveRateByLabel", "precisionByLabel", "recallByLabel",
"fMeasureByLabel", "logLoss")
"fMeasureByLabel", "logLoss", "hammingLoss")

@Since("1.6.0")
override def load(path: String): MulticlassClassificationEvaluator = super.load(path)
Expand Down
Expand Up @@ -240,6 +240,23 @@ class MulticlassMetrics @Since("1.1.0") (predictionAndLabels: RDD[_ <: Product])
@Since("1.1.0")
lazy val labels: Array[Double] = tpByClass.keys.toArray.sorted

/**
 * Returns the Hamming loss: the weight of misclassified examples divided by
 * the total example weight.
 */
@Since("3.0.0")
lazy val hammingLoss: Double = {
  // Single pass over the confusion entries ((label, prediction) -> weight),
  // accumulating (misclassified weight, total weight) without mutable state.
  val (misclassifiedWeight, totalWeight) =
    confusions.iterator.foldLeft((0.0, 0.0)) {
      case ((wrong, total), ((label, prediction), weight)) =>
        val updatedWrong = if (label != prediction) wrong + weight else wrong
        (updatedWrong, total + weight)
    }
  misclassifiedWeight / totalWeight
}

/**
* Returns the log-loss, aka logistic loss or cross-entropy loss.
* @param eps log-loss is undefined for p=0 or p=1, so probabilities are
Expand Down
Expand Up @@ -254,4 +254,35 @@ class MulticlassMetricsSuite extends SparkFunSuite with MLlibTestSparkContext {
val metrics2 = new MulticlassMetrics(rdd2)
assert(metrics2.logLoss() ~== 0.9682005730687164 relTol delta)
}

test("MulticlassMetrics supports hammingLoss") {
/*
Using the following Python code to verify the correctness.
from sklearn.metrics import hamming_loss
y_true = [2, 2, 3, 4]
y_pred = [1, 2, 3, 4]
weights = [1.5, 2.0, 1.0, 0.5]
>>> hamming_loss(y_true, y_pred)
0.25
>>> hamming_loss(y_true, y_pred, sample_weight=weights)
0.3
*/

val preds = Seq(1.0, 2.0, 3.0, 4.0)
val labels = Seq(2.0, 2.0, 3.0, 4.0)
val weights = Seq(1.5, 2.0, 1.0, 0.5)

val rdd = sc.parallelize(preds.zip(labels))
val metrics = new MulticlassMetrics(rdd)
assert(metrics.hammingLoss ~== 0.25 relTol delta)

val rdd2 = sc.parallelize(preds.zip(labels).zip(weights))
.map { case ((pred, label), weight) =>
(pred, label, weight)
}
val metrics2 = new MulticlassMetrics(rdd2)
assert(metrics2.hammingLoss ~== 0.3 relTol delta)
}
}
6 changes: 5 additions & 1 deletion python/pyspark/ml/evaluation.py
Expand Up @@ -374,6 +374,10 @@ class MulticlassClassificationEvaluator(JavaEvaluator, HasLabelCol, HasPredictio
>>> evaluator.evaluate(dataset, {evaluator.metricName: "truePositiveRateByLabel",
... evaluator.metricLabel: 1.0})
0.75...
>>> evaluator.setMetricName("hammingLoss")
MulticlassClassificationEvaluator...
>>> evaluator.evaluate(dataset)
0.33...
>>> mce_path = temp_path + "/mce"
>>> evaluator.save(mce_path)
>>> evaluator2 = MulticlassClassificationEvaluator.load(mce_path)
Expand Down Expand Up @@ -408,7 +412,7 @@ class MulticlassClassificationEvaluator(JavaEvaluator, HasLabelCol, HasPredictio
"(f1|accuracy|weightedPrecision|weightedRecall|weightedTruePositiveRate|"
"weightedFalsePositiveRate|weightedFMeasure|truePositiveRateByLabel|"
"falsePositiveRateByLabel|precisionByLabel|recallByLabel|fMeasureByLabel|"
"logLoss)",
"logLoss|hammingLoss)",
typeConverter=TypeConverters.toString)
metricLabel = Param(Params._dummy(), "metricLabel",
"The class whose metric will be computed in truePositiveRateByLabel|"
Expand Down

0 comments on commit 0f40d2a

Please sign in to comment.