From 11de7ea38261b99e2cf870fccb714fb8c8b42657 Mon Sep 17 00:00:00 2001 From: Zheng RuiFeng Date: Mon, 22 Feb 2016 17:00:33 +0800 Subject: [PATCH 01/15] add kappa --- .../mllib/evaluation/MulticlassMetrics.scala | 191 ++++++++++++------ 1 file changed, 131 insertions(+), 60 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala index 3029b15f588a4..aae93beefb44c 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala @@ -25,18 +25,19 @@ import org.apache.spark.rdd.RDD import org.apache.spark.sql.DataFrame /** - * ::Experimental:: - * Evaluator for multiclass classification. - * - * @param predictionAndLabels an RDD of (prediction, label) pairs. - */ + * ::Experimental:: + * Evaluator for multiclass classification. + * + * @param predictionAndLabels an RDD of (prediction, label) pairs. + */ @Since("1.1.0") -class MulticlassMetrics @Since("1.1.0") (predictionAndLabels: RDD[(Double, Double)]) { +class MulticlassMetrics @Since("1.1.0")(predictionAndLabels: RDD[(Double, Double)]) { /** - * An auxiliary constructor taking a DataFrame. - * @param predictionAndLabels a DataFrame with two double columns: prediction and label - */ + * An auxiliary constructor taking a DataFrame. + * + * @param predictionAndLabels a DataFrame with two double columns: prediction and label + */ private[mllib] def this(predictionAndLabels: DataFrame) = this(predictionAndLabels.rdd.map(r => (r.getDouble(0), r.getDouble(1)))) @@ -59,11 +60,11 @@ class MulticlassMetrics @Since("1.1.0") (predictionAndLabels: RDD[(Double, Doubl .collectAsMap() /** - * Returns confusion matrix: - * predicted classes are in columns, - * they are ordered by class label ascending, - * as in "labels" - */ + * Returns confusion matrix: + * predicted classes are in columns, + * they are ordered by class label ascending, + * as in "labels" + */ @Since("1.1.0") def confusionMatrix: Matrix = { val n = labels.size @@ -81,16 +82,18 @@ class MulticlassMetrics @Since("1.1.0") (predictionAndLabels: RDD[(Double, Doubl } /** - * Returns true positive rate for a given label (category) - * @param label the label. - */ + * Returns true positive rate for a given label (category) + * + * @param label the label. + */ @Since("1.1.0") def truePositiveRate(label: Double): Double = recall(label) /** - * Returns false positive rate for a given label (category) - * @param label the label. - */ + * Returns false positive rate for a given label (category) + * + * @param label the label. + */ @Since("1.1.0") def falsePositiveRate(label: Double): Double = { val fp = fpByClass.getOrElse(label, 0) @@ -98,9 +101,10 @@ class MulticlassMetrics @Since("1.1.0") (predictionAndLabels: RDD[(Double, Doubl } /** - * Returns precision for a given label (category) - * @param label the label. - */ + * Returns precision for a given label (category) + * + * @param label the label. + */ @Since("1.1.0") def precision(label: Double): Double = { val tp = tpByClass(label) @@ -109,17 +113,19 @@ class MulticlassMetrics @Since("1.1.0") (predictionAndLabels: RDD[(Double, Doubl } /** - * Returns recall for a given label (category) - * @param label the label. - */ + * Returns recall for a given label (category) + * + * @param label the label. 
+ */ @Since("1.1.0") def recall(label: Double): Double = tpByClass(label).toDouble / labelCountByClass(label) /** - * Returns f-measure for a given label (category) - * @param label the label. - * @param beta the beta parameter. - */ + * Returns f-measure for a given label (category) + * + * @param label the label. + * @param beta the beta parameter. + */ @Since("1.1.0") def fMeasure(label: Double, beta: Double): Double = { val p = precision(label) @@ -129,86 +135,151 @@ class MulticlassMetrics @Since("1.1.0") (predictionAndLabels: RDD[(Double, Doubl } /** - * Returns f1-measure for a given label (category) - * @param label the label. - */ + * Returns f1-measure for a given label (category) + * + * @param label the label. + */ @Since("1.1.0") def fMeasure(label: Double): Double = fMeasure(label, 1.0) /** - * Returns precision - */ + * Returns precision + */ @Since("1.1.0") lazy val precision: Double = tpByClass.values.sum.toDouble / labelCount /** - * Returns recall - * (equals to precision for multiclass classifier - * because sum of all false positives is equal to sum - * of all false negatives) - */ + * Returns recall + * (equals to precision for multiclass classifier + * because sum of all false positives is equal to sum + * of all false negatives) + */ @Since("1.1.0") lazy val recall: Double = precision /** - * Returns f-measure - * (equals to precision and recall because precision equals recall) - */ + * Returns f-measure + * (equals to precision and recall because precision equals recall) + */ @Since("1.1.0") lazy val fMeasure: Double = precision /** - * Returns weighted true positive rate - * (equals to precision, recall and f-measure) - */ + * Returns weighted true positive rate + * (equals to precision, recall and f-measure) + */ @Since("1.1.0") lazy val weightedTruePositiveRate: Double = weightedRecall /** - * Returns weighted false positive rate - */ + * Returns weighted false positive rate + */ @Since("1.1.0") lazy val weightedFalsePositiveRate: Double = labelCountByClass.map { case (category, count) => falsePositiveRate(category) * count.toDouble / labelCount }.sum /** - * Returns weighted averaged recall - * (equals to precision, recall and f-measure) - */ + * Returns weighted averaged recall + * (equals to precision, recall and f-measure) + */ @Since("1.1.0") lazy val weightedRecall: Double = labelCountByClass.map { case (category, count) => recall(category) * count.toDouble / labelCount }.sum /** - * Returns weighted averaged precision - */ + * Returns weighted averaged precision + */ @Since("1.1.0") lazy val weightedPrecision: Double = labelCountByClass.map { case (category, count) => precision(category) * count.toDouble / labelCount }.sum /** - * Returns weighted averaged f-measure - * @param beta the beta parameter. - */ + * Returns weighted averaged f-measure + * + * @param beta the beta parameter. 
+ */ @Since("1.1.0") def weightedFMeasure(beta: Double): Double = labelCountByClass.map { case (category, count) => fMeasure(category, beta) * count.toDouble / labelCount }.sum /** - * Returns weighted averaged f1-measure - */ + * Returns weighted averaged f1-measure + */ @Since("1.1.0") lazy val weightedFMeasure: Double = labelCountByClass.map { case (category, count) => fMeasure(category, 1.0) * count.toDouble / labelCount }.sum /** - * Returns the sequence of labels in ascending order - */ + * Returns the sequence of labels in ascending order + */ @Since("1.1.0") lazy val labels: Array[Double] = tpByClass.keys.toArray.sorted + + + /** + * Returns the sequence of labels in ascending order + */ + @Since("1.6.0") + def kappa(weights: Matrix): Double = { + val n = labels.size + require(weights.numRows == n) + require(weights.numCols == n) + + weights.foreachActive { + case (i, j, w) => + require(w >= 0, s"weight for (${i}, ${j}) must be no less than 0 but got ${w}") + } + + val f = (i:Double, j:Double) => { + weights.index(i, j) + } + + + 1.0 + } + + /** + * Returns the sequence of labels in ascending order + */ + @Since("1.6.0") + def weightedKappa(weights: (Double, Double) => Double): Double = { + val mat = confusionMatrix + + val sumByRows = collection.mutable.Map[Int, Double]() + val sumByCols = collection.mutable.Map[Int, Double]() + var sum = 0.0 + + mat.foreachActive { + case (i, j, v) => + val vRow = sumByRows.getOrElse(i, 0.0) + sumByRows.update(i, vRow + v) + val vCol = sumByCols.getOrElse(j, 0.0) + sumByCols.update(j, vCol + v) + sum += v + } + + // weighted observed proportional agreement + var po = 0.0 + // weighted proportional agreement expected just by chance + var pe = 0.0 + + mat.foreachActive { + case (i, j, v) => + val w = weights(i, j) + require(w >= 0, s"weight for (${i}, ${j}) must be no less than 0 but got ${w}") + + po += w * v + pe += w * sumByRows.getOrElse(i, 0.0) * sumByCols.getOrElse(j, 0.0) + } + + po /= sum + pe /= sum * sum + + (po - pe) / (1 - pe) + } } From 4211e0b7624bf1cfe78d55a11a1e902dc6b006a1 Mon Sep 17 00:00:00 2001 From: Zheng RuiFeng Date: Mon, 22 Feb 2016 17:12:30 +0800 Subject: [PATCH 02/15] update kappa --- .../mllib/evaluation/MulticlassMetrics.scala | 47 ++++++++++++++++--- 1 file changed, 40 insertions(+), 7 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala index aae93beefb44c..58870ea257a5b 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala @@ -225,7 +225,43 @@ class MulticlassMetrics @Since("1.1.0")(predictionAndLabels: RDD[(Double, Double * Returns the sequence of labels in ascending order */ @Since("1.6.0") - def kappa(weights: Matrix): Double = { + def Kappa(weights: String): Double = { + weightedKappa("default") + } + + + /** + * Returns the sequence of labels in ascending order + */ + @Since("1.6.0") + def weightedKappa(weights: String): Double = { + val wFunc = weights match { + case "linear" => + (i: Int, j:Int) => Math.abs(i - j).toDouble + case "quadratic" => + (i: Int, j:Int) => (i - j) * (i - j) + case "default" => + (i: Int, j:Int) => { + if(i == j) { + 0 + } else { + 0 + } + } + case t => + throw new IllegalArgumentException( + s"weightedKappa only supports {linear, quadratic, default} but got type ${t}.") + } + + weightedKappa(wFunc) + } + + + /** + * Returns the 
sequence of labels in ascending order + */ + @Since("1.6.0") + def weightedKappa(weights: Matrix): Double = { val n = labels.size require(weights.numRows == n) require(weights.numCols == n) @@ -235,19 +271,16 @@ class MulticlassMetrics @Since("1.1.0")(predictionAndLabels: RDD[(Double, Double require(w >= 0, s"weight for (${i}, ${j}) must be no less than 0 but got ${w}") } - val f = (i:Double, j:Double) => { - weights.index(i, j) - } - + val wFunc = (i: Int, j:Int) => weights(i, j) - 1.0 + weightedKappa(wFunc) } /** * Returns the sequence of labels in ascending order */ @Since("1.6.0") - def weightedKappa(weights: (Double, Double) => Double): Double = { + def weightedKappa(weights: (Int, Int) => Double): Double = { val mat = confusionMatrix val sumByRows = collection.mutable.Map[Int, Double]() From 7c6de2a0b756767db4209682cf4acf127f8c9e56 Mon Sep 17 00:00:00 2001 From: Zheng RuiFeng Date: Mon, 22 Feb 2016 17:27:22 +0800 Subject: [PATCH 03/15] update kappa --- .../mllib/evaluation/MulticlassMetrics.scala | 22 +++++++++---------- 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala index 58870ea257a5b..545a8e8bfc8c8 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala @@ -229,7 +229,6 @@ class MulticlassMetrics @Since("1.1.0")(predictionAndLabels: RDD[(Double, Double weightedKappa("default") } - /** * Returns the sequence of labels in ascending order */ @@ -239,13 +238,13 @@ class MulticlassMetrics @Since("1.1.0")(predictionAndLabels: RDD[(Double, Double case "linear" => (i: Int, j:Int) => Math.abs(i - j).toDouble case "quadratic" => - (i: Int, j:Int) => (i - j) * (i - j) + (i: Int, j:Int) => (i - j) * (i - j).toDouble case "default" => (i: Int, j:Int) => { if(i == j) { - 0 + 0.0 } else { - 0 + 1.0 } } case t => @@ -271,7 +270,7 @@ class MulticlassMetrics @Since("1.1.0")(predictionAndLabels: RDD[(Double, Double require(w >= 0, s"weight for (${i}, ${j}) must be no less than 0 but got ${w}") } - val wFunc = (i: Int, j:Int) => weights(i, j) + val wFunc = (i: Int, j: Int) => weights(i, j) weightedKappa(wFunc) } @@ -282,17 +281,16 @@ class MulticlassMetrics @Since("1.1.0")(predictionAndLabels: RDD[(Double, Double @Since("1.6.0") def weightedKappa(weights: (Int, Int) => Double): Double = { val mat = confusionMatrix + val n = mat.numRows - val sumByRows = collection.mutable.Map[Int, Double]() - val sumByCols = collection.mutable.Map[Int, Double]() + val sumByRows = Array.fill(n)(0.0) + val sumByCols = Array.fill(n)(0.0) var sum = 0.0 mat.foreachActive { case (i, j, v) => - val vRow = sumByRows.getOrElse(i, 0.0) - sumByRows.update(i, vRow + v) - val vCol = sumByCols.getOrElse(j, 0.0) - sumByCols.update(j, vCol + v) + sumByRows(i) += v + sumByCols(j) += v sum += v } @@ -307,7 +305,7 @@ class MulticlassMetrics @Since("1.1.0")(predictionAndLabels: RDD[(Double, Double require(w >= 0, s"weight for (${i}, ${j}) must be no less than 0 but got ${w}") po += w * v - pe += w * sumByRows.getOrElse(i, 0.0) * sumByCols.getOrElse(j, 0.0) + pe += w * sumByRows(i) * sumByCols(j) } po /= sum From d67592a390687fb69794b09104c737d5602620b0 Mon Sep 17 00:00:00 2001 From: Zheng RuiFeng Date: Mon, 22 Feb 2016 18:57:44 +0800 Subject: [PATCH 04/15] update kappa --- .../mllib/evaluation/MulticlassMetrics.scala | 43 +++++++++---------- 1 
file changed, 21 insertions(+), 22 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala index 545a8e8bfc8c8..1c86b6c1f743a 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala @@ -225,23 +225,24 @@ class MulticlassMetrics @Since("1.1.0")(predictionAndLabels: RDD[(Double, Double * Returns the sequence of labels in ascending order */ @Since("1.6.0") - def Kappa(weights: String): Double = { - weightedKappa("default") + def Kappa(): Double = { + kappa("default") } /** * Returns the sequence of labels in ascending order */ @Since("1.6.0") - def weightedKappa(weights: String): Double = { + def kappa(weights: String): Double = { + val wFunc = weights match { case "linear" => - (i: Int, j:Int) => Math.abs(i - j).toDouble + (i: Int, j: Int) => Math.abs(i - j).toDouble case "quadratic" => - (i: Int, j:Int) => (i - j) * (i - j).toDouble + (i: Int, j: Int) => (i - j).toDouble * (i - j) case "default" => - (i: Int, j:Int) => { - if(i == j) { + (i: Int, j: Int) => { + if (i == j) { 0.0 } else { 1.0 @@ -252,7 +253,7 @@ class MulticlassMetrics @Since("1.1.0")(predictionAndLabels: RDD[(Double, Double s"weightedKappa only supports {linear, quadratic, default} but got type ${t}.") } - weightedKappa(wFunc) + kappa(wFunc) } @@ -260,7 +261,7 @@ class MulticlassMetrics @Since("1.1.0")(predictionAndLabels: RDD[(Double, Double * Returns the sequence of labels in ascending order */ @Since("1.6.0") - def weightedKappa(weights: Matrix): Double = { + def kappa(weights: Matrix): Double = { val n = labels.size require(weights.numRows == n) require(weights.numCols == n) @@ -272,14 +273,14 @@ class MulticlassMetrics @Since("1.1.0")(predictionAndLabels: RDD[(Double, Double val wFunc = (i: Int, j: Int) => weights(i, j) - weightedKappa(wFunc) + kappa(wFunc) } /** * Returns the sequence of labels in ascending order */ @Since("1.6.0") - def weightedKappa(weights: (Int, Int) => Double): Double = { + def kappa(weights: (Int, Int) => Double): Double = { val mat = confusionMatrix val n = mat.numRows @@ -294,23 +295,21 @@ class MulticlassMetrics @Since("1.1.0")(predictionAndLabels: RDD[(Double, Double sum += v } - // weighted observed proportional agreement - var po = 0.0 - // weighted proportional agreement expected just by chance - var pe = 0.0 + var numerator = 0.0 + var denominator = 0.0 mat.foreachActive { case (i, j, v) => val w = weights(i, j) require(w >= 0, s"weight for (${i}, ${j}) must be no less than 0 but got ${w}") - - po += w * v - pe += w * sumByRows(i) * sumByCols(j) + numerator += w * v + denominator += w * sumByRows(i) * sumByCols(j) / sum } - po /= sum - pe /= sum * sum - - (po - pe) / (1 - pe) + if (denominator > 0) { + 1 - numerator / denominator + } else { + 1.0 + } } } From 18cee5902640e4e4ed13af9d18d3b51b78565da1 Mon Sep 17 00:00:00 2001 From: Zheng RuiFeng Date: Mon, 22 Feb 2016 19:24:30 +0800 Subject: [PATCH 05/15] update kappa --- .../org/apache/spark/mllib/evaluation/MulticlassMetrics.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala index 1c86b6c1f743a..d384cc019360e 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala +++ 
b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala @@ -250,7 +250,7 @@ class MulticlassMetrics @Since("1.1.0")(predictionAndLabels: RDD[(Double, Double } case t => throw new IllegalArgumentException( - s"weightedKappa only supports {linear, quadratic, default} but got type ${t}.") + s"kappa only supports {linear, quadratic, default} but got type ${t}.") } kappa(wFunc) From 1ed06b461cf1e5e212da33973779743239a1e5ed Mon Sep 17 00:00:00 2001 From: Zheng RuiFeng Date: Mon, 22 Feb 2016 21:25:57 +0800 Subject: [PATCH 06/15] update kappa --- .../mllib/evaluation/MulticlassMetrics.scala | 54 ++++++++++++------- .../evaluation/MulticlassMetricsSuite.scala | 7 +++ 2 files changed, 43 insertions(+), 18 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala index d384cc019360e..bc549a6d31543 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala @@ -222,24 +222,31 @@ class MulticlassMetrics @Since("1.1.0")(predictionAndLabels: RDD[(Double, Double /** - * Returns the sequence of labels in ascending order + * Returns unweighted Cohen's Kappa + * Cohen's kappa coefficient is a statistic which measures inter-rater + * agreement for qualitative (categorical) items. It is generally thought + * to be a more robust measure than simple percent agreement calculation, + * since kappa takes into account the agreement occurring by chance. + * The kappa score is a number between -1 and 1. Scores above 0.8 are + * generally considered good agreement; zero or lower means no agreement + * (practically random labels). */ @Since("1.6.0") - def Kappa(): Double = { + def kappa(): Double = { kappa("default") } /** - * Returns the sequence of labels in ascending order + * Returns Cohen's Kappa with built-in weighted types + * + * @param weights the weighted type. "default" means no weighted; + * "linear" means linear weighted; + * "quadratic" means quadratic weighted. */ @Since("1.6.0") def kappa(weights: String): Double = { - val wFunc = weights match { - case "linear" => - (i: Int, j: Int) => Math.abs(i - j).toDouble - case "quadratic" => - (i: Int, j: Int) => (i - j).toDouble * (i - j) + val func = weights match { case "default" => (i: Int, j: Int) => { if (i == j) { @@ -248,17 +255,24 @@ class MulticlassMetrics @Since("1.1.0")(predictionAndLabels: RDD[(Double, Double 1.0 } } + case "linear" => + (i: Int, j: Int) => Math.abs(i - j).toDouble + case "quadratic" => + (i: Int, j: Int) => (i - j).toDouble * (i - j) case t => throw new IllegalArgumentException( s"kappa only supports {linear, quadratic, default} but got type ${t}.") } - kappa(wFunc) + kappa(func) } /** - * Returns the sequence of labels in ascending order + * Returns Cohen's Kappa with user-defined weight matrix + * + * @param weights the weight matrix, must be of the same shape with Confusion Matrix. + * Note: Each Element in it must be no less than zero. 
*/ @Since("1.6.0") def kappa(weights: Matrix): Double = { @@ -271,13 +285,17 @@ class MulticlassMetrics @Since("1.1.0")(predictionAndLabels: RDD[(Double, Double require(w >= 0, s"weight for (${i}, ${j}) must be no less than 0 but got ${w}") } - val wFunc = (i: Int, j: Int) => weights(i, j) + val func = (i: Int, j: Int) => weights(i, j) - kappa(wFunc) + kappa(func) } /** - * Returns the sequence of labels in ascending order + * Returns Cohen's Kappa with user-defined weight calculation function + * + * @param weights the weight calculation function. It takes two number as inputs, + * and return a number no less than zero as the corresponding weight. + * Note: Each return must not be negative. */ @Since("1.6.0") def kappa(weights: (Int, Int) => Double): Double = { @@ -288,11 +306,11 @@ class MulticlassMetrics @Since("1.1.0")(predictionAndLabels: RDD[(Double, Double val sumByCols = Array.fill(n)(0.0) var sum = 0.0 - mat.foreachActive { - case (i, j, v) => - sumByRows(i) += v - sumByCols(j) += v - sum += v + for (i <- 0 until n; j <- 0 until n) { + val v = mat(i, j) + sumByRows(i) += v + sumByCols(j) += v + sum += v } var numerator = 0.0 diff --git a/mllib/src/test/scala/org/apache/spark/mllib/evaluation/MulticlassMetricsSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/evaluation/MulticlassMetricsSuite.scala index d55bc8c3ec09f..6b485acc29557 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/evaluation/MulticlassMetricsSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/evaluation/MulticlassMetricsSuite.scala @@ -51,6 +51,9 @@ class MulticlassMetricsSuite extends SparkFunSuite with MLlibTestSparkContext { val f2measure0 = (1 + 2 * 2) * precision0 * recall0 / (2 * 2 * precision0 + recall0) val f2measure1 = (1 + 2 * 2) * precision1 * recall1 / (2 * 2 * precision1 + recall1) val f2measure2 = (1 + 2 * 2) * precision2 * recall2 / (2 * 2 * precision2 + recall2) + val unweighted_kappa = 0.47058823529411764 + val linear_weighted_kappa = 0.4193548387096774 + val quadratic_weighted_kappa = 0.3571428571428571 assert(metrics.confusionMatrix.toArray.sameElements(confusionMatrix.toArray)) assert(math.abs(metrics.falsePositiveRate(0.0) - fpRate0) < delta) @@ -85,5 +88,9 @@ class MulticlassMetricsSuite extends SparkFunSuite with MLlibTestSparkContext { assert(math.abs(metrics.weightedFMeasure(2.0) - ((4.0 / 9) * f2measure0 + (4.0 / 9) * f2measure1 + (1.0 / 9) * f2measure2)) < delta) assert(metrics.labels.sameElements(labels)) + + assert(math.abs(metrics.kappa - unweighted_kappa) < delta) + assert(math.abs(metrics.kappa("linear") - linear_weighted_kappa) < delta) + assert(math.abs(metrics.kappa("quadratic") - quadratic_weighted_kappa) < delta) } } From 7244d9fb48de0a3d4add2bcd9b34169b41d6dc40 Mon Sep 17 00:00:00 2001 From: Zheng RuiFeng Date: Tue, 23 Feb 2016 13:16:33 +0800 Subject: [PATCH 07/15] fix some nits --- .../spark/mllib/evaluation/MulticlassMetrics.scala | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala index bc549a6d31543..fe7241df1d9f6 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala @@ -247,6 +247,7 @@ class MulticlassMetrics @Since("1.1.0")(predictionAndLabels: RDD[(Double, Double def kappa(weights: String): Double = { val func = weights match { + // 
standard kappa without weighting case "default" => (i: Int, j: Int) => { if (i == j) { @@ -255,13 +256,20 @@ class MulticlassMetrics @Since("1.1.0")(predictionAndLabels: RDD[(Double, Double 1.0 } } + // linear weighted kappa case "linear" => - (i: Int, j: Int) => Math.abs(i - j).toDouble + (i: Int, j: Int) => + math.abs(i - j).toDouble + // quadratic weighted kappa case "quadratic" => - (i: Int, j: Int) => (i - j).toDouble * (i - j) + (i: Int, j: Int) => { + val d = i - j + d.toDouble * d + } + // unknown weighting type case t => throw new IllegalArgumentException( - s"kappa only supports {linear, quadratic, default} but got type ${t}.") + s"kappa only supports weighting type {linear, quadratic, default} but got type ${t}.") } kappa(func) From 6869a72d8eeaeb12bada0c093d9c0e411cdc3786 Mon Sep 17 00:00:00 2001 From: Zheng RuiFeng Date: Tue, 23 Feb 2016 13:20:27 +0800 Subject: [PATCH 08/15] reformat --- .../spark/mllib/evaluation/MulticlassMetrics.scala | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala index fe7241df1d9f6..bfd74fcd84036 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala @@ -24,12 +24,12 @@ import org.apache.spark.mllib.linalg.{Matrices, Matrix} import org.apache.spark.rdd.RDD import org.apache.spark.sql.DataFrame -/** - * ::Experimental:: - * Evaluator for multiclass classification. - * - * @param predictionAndLabels an RDD of (prediction, label) pairs. - */ + /** + * ::Experimental:: + * Evaluator for multiclass classification. + * + * @param predictionAndLabels an RDD of (prediction, label) pairs. + */ @Since("1.1.0") class MulticlassMetrics @Since("1.1.0")(predictionAndLabels: RDD[(Double, Double)]) { From e6806d81ff3aad6f49aff0339738d6b9b43270e3 Mon Sep 17 00:00:00 2001 From: Zheng RuiFeng Date: Tue, 23 Feb 2016 13:22:05 +0800 Subject: [PATCH 09/15] reformat --- .../mllib/evaluation/MulticlassMetrics.scala | 32 +++++++------------ 1 file changed, 12 insertions(+), 20 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala index bfd74fcd84036..e9c8fa239f252 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala @@ -24,18 +24,17 @@ import org.apache.spark.mllib.linalg.{Matrices, Matrix} import org.apache.spark.rdd.RDD import org.apache.spark.sql.DataFrame - /** - * ::Experimental:: - * Evaluator for multiclass classification. - * - * @param predictionAndLabels an RDD of (prediction, label) pairs. - */ +/** + * ::Experimental:: + * Evaluator for multiclass classification. + * + * @param predictionAndLabels an RDD of (prediction, label) pairs. + */ @Since("1.1.0") -class MulticlassMetrics @Since("1.1.0")(predictionAndLabels: RDD[(Double, Double)]) { +class MulticlassMetrics @Since("1.1.0") (predictionAndLabels: RDD[(Double, Double)]) { /** * An auxiliary constructor taking a DataFrame. 
- * * @param predictionAndLabels a DataFrame with two double columns: prediction and label */ private[mllib] def this(predictionAndLabels: DataFrame) = @@ -83,7 +82,6 @@ class MulticlassMetrics @Since("1.1.0")(predictionAndLabels: RDD[(Double, Double /** * Returns true positive rate for a given label (category) - * * @param label the label. */ @Since("1.1.0") @@ -91,7 +89,6 @@ class MulticlassMetrics @Since("1.1.0")(predictionAndLabels: RDD[(Double, Double /** * Returns false positive rate for a given label (category) - * * @param label the label. */ @Since("1.1.0") @@ -102,7 +99,6 @@ class MulticlassMetrics @Since("1.1.0")(predictionAndLabels: RDD[(Double, Double /** * Returns precision for a given label (category) - * * @param label the label. */ @Since("1.1.0") @@ -114,7 +110,6 @@ class MulticlassMetrics @Since("1.1.0")(predictionAndLabels: RDD[(Double, Double /** * Returns recall for a given label (category) - * * @param label the label. */ @Since("1.1.0") @@ -122,9 +117,8 @@ class MulticlassMetrics @Since("1.1.0")(predictionAndLabels: RDD[(Double, Double /** * Returns f-measure for a given label (category) - * * @param label the label. - * @param beta the beta parameter. + * @param beta the beta parameter. */ @Since("1.1.0") def fMeasure(label: Double, beta: Double): Double = { @@ -136,7 +130,6 @@ class MulticlassMetrics @Since("1.1.0")(predictionAndLabels: RDD[(Double, Double /** * Returns f1-measure for a given label (category) - * * @param label the label. */ @Since("1.1.0") @@ -198,7 +191,6 @@ class MulticlassMetrics @Since("1.1.0")(predictionAndLabels: RDD[(Double, Double /** * Returns weighted averaged f-measure - * * @param beta the beta parameter. */ @Since("1.1.0") @@ -231,7 +223,7 @@ class MulticlassMetrics @Since("1.1.0")(predictionAndLabels: RDD[(Double, Double * generally considered good agreement; zero or lower means no agreement * (practically random labels). */ - @Since("1.6.0") + @Since("2.0.0") def kappa(): Double = { kappa("default") } @@ -243,7 +235,7 @@ class MulticlassMetrics @Since("1.1.0")(predictionAndLabels: RDD[(Double, Double * "linear" means linear weighted; * "quadratic" means quadratic weighted. */ - @Since("1.6.0") + @Since("2.0.0") def kappa(weights: String): Double = { val func = weights match { @@ -282,7 +274,7 @@ class MulticlassMetrics @Since("1.1.0")(predictionAndLabels: RDD[(Double, Double * @param weights the weight matrix, must be of the same shape with Confusion Matrix. * Note: Each Element in it must be no less than zero. */ - @Since("1.6.0") + @Since("2.0.0") def kappa(weights: Matrix): Double = { val n = labels.size require(weights.numRows == n) @@ -305,7 +297,7 @@ class MulticlassMetrics @Since("1.1.0")(predictionAndLabels: RDD[(Double, Double * and return a number no less than zero as the corresponding weight. * Note: Each return must not be negative. 
*/ - @Since("1.6.0") + @Since("2.0.0") def kappa(weights: (Int, Int) => Double): Double = { val mat = confusionMatrix val n = mat.numRows From d65e60ab64cb68cec6083acd6792afe6a7c5b9e9 Mon Sep 17 00:00:00 2001 From: Zheng RuiFeng Date: Tue, 23 Feb 2016 13:34:31 +0800 Subject: [PATCH 10/15] reformat --- .../mllib/evaluation/MulticlassMetrics.scala | 24 ++++++++++++------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala index e9c8fa239f252..500da8af3614a 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala @@ -24,17 +24,18 @@ import org.apache.spark.mllib.linalg.{Matrices, Matrix} import org.apache.spark.rdd.RDD import org.apache.spark.sql.DataFrame -/** - * ::Experimental:: - * Evaluator for multiclass classification. - * - * @param predictionAndLabels an RDD of (prediction, label) pairs. - */ + /** + * ::Experimental:: + * Evaluator for multiclass classification. + * + * @param predictionAndLabels an RDD of (prediction, label) pairs. + */ @Since("1.1.0") -class MulticlassMetrics @Since("1.1.0") (predictionAndLabels: RDD[(Double, Double)]) { +class MulticlassMetrics @Since("1.1.0")(predictionAndLabels: RDD[(Double, Double)]) { /** * An auxiliary constructor taking a DataFrame. + * * @param predictionAndLabels a DataFrame with two double columns: prediction and label */ private[mllib] def this(predictionAndLabels: DataFrame) = @@ -82,6 +83,7 @@ class MulticlassMetrics @Since("1.1.0") (predictionAndLabels: RDD[(Double, Doubl /** * Returns true positive rate for a given label (category) + * * @param label the label. */ @Since("1.1.0") @@ -89,6 +91,7 @@ class MulticlassMetrics @Since("1.1.0") (predictionAndLabels: RDD[(Double, Doubl /** * Returns false positive rate for a given label (category) + * * @param label the label. */ @Since("1.1.0") @@ -99,6 +102,7 @@ class MulticlassMetrics @Since("1.1.0") (predictionAndLabels: RDD[(Double, Doubl /** * Returns precision for a given label (category) + * * @param label the label. */ @Since("1.1.0") @@ -110,6 +114,7 @@ class MulticlassMetrics @Since("1.1.0") (predictionAndLabels: RDD[(Double, Doubl /** * Returns recall for a given label (category) + * * @param label the label. */ @Since("1.1.0") @@ -117,8 +122,9 @@ class MulticlassMetrics @Since("1.1.0") (predictionAndLabels: RDD[(Double, Doubl /** * Returns f-measure for a given label (category) + * * @param label the label. - * @param beta the beta parameter. + * @param beta the beta parameter. */ @Since("1.1.0") def fMeasure(label: Double, beta: Double): Double = { @@ -130,6 +136,7 @@ class MulticlassMetrics @Since("1.1.0") (predictionAndLabels: RDD[(Double, Doubl /** * Returns f1-measure for a given label (category) + * * @param label the label. */ @Since("1.1.0") @@ -191,6 +198,7 @@ class MulticlassMetrics @Since("1.1.0") (predictionAndLabels: RDD[(Double, Doubl /** * Returns weighted averaged f-measure + * * @param beta the beta parameter. 
*/ @Since("1.1.0") From 3ec57f1eec567dd80b30140a8ff7e3f5c91c4d6d Mon Sep 17 00:00:00 2001 From: Zheng RuiFeng Date: Tue, 23 Feb 2016 13:57:40 +0800 Subject: [PATCH 11/15] reformat by copy and paste --- .../mllib/evaluation/MulticlassMetrics.scala | 26 +++++++------------ 1 file changed, 9 insertions(+), 17 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala index 500da8af3614a..ec3a41463f80b 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala @@ -1,3 +1,4 @@ + /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with @@ -24,18 +25,17 @@ import org.apache.spark.mllib.linalg.{Matrices, Matrix} import org.apache.spark.rdd.RDD import org.apache.spark.sql.DataFrame - /** - * ::Experimental:: - * Evaluator for multiclass classification. - * - * @param predictionAndLabels an RDD of (prediction, label) pairs. - */ +/** + * ::Experimental:: + * Evaluator for multiclass classification. + * + * @param predictionAndLabels an RDD of (prediction, label) pairs. + */ @Since("1.1.0") -class MulticlassMetrics @Since("1.1.0")(predictionAndLabels: RDD[(Double, Double)]) { +class MulticlassMetrics @Since("1.1.0") (predictionAndLabels: RDD[(Double, Double)]) { /** * An auxiliary constructor taking a DataFrame. - * * @param predictionAndLabels a DataFrame with two double columns: prediction and label */ private[mllib] def this(predictionAndLabels: DataFrame) = @@ -83,7 +83,6 @@ class MulticlassMetrics @Since("1.1.0")(predictionAndLabels: RDD[(Double, Double /** * Returns true positive rate for a given label (category) - * * @param label the label. */ @Since("1.1.0") @@ -91,7 +90,6 @@ class MulticlassMetrics @Since("1.1.0")(predictionAndLabels: RDD[(Double, Double /** * Returns false positive rate for a given label (category) - * * @param label the label. */ @Since("1.1.0") @@ -102,7 +100,6 @@ class MulticlassMetrics @Since("1.1.0")(predictionAndLabels: RDD[(Double, Double /** * Returns precision for a given label (category) - * * @param label the label. */ @Since("1.1.0") @@ -114,7 +111,6 @@ class MulticlassMetrics @Since("1.1.0")(predictionAndLabels: RDD[(Double, Double /** * Returns recall for a given label (category) - * * @param label the label. */ @Since("1.1.0") @@ -122,9 +118,8 @@ class MulticlassMetrics @Since("1.1.0")(predictionAndLabels: RDD[(Double, Double /** * Returns f-measure for a given label (category) - * * @param label the label. - * @param beta the beta parameter. + * @param beta the beta parameter. */ @Since("1.1.0") def fMeasure(label: Double, beta: Double): Double = { @@ -136,7 +131,6 @@ class MulticlassMetrics @Since("1.1.0")(predictionAndLabels: RDD[(Double, Double /** * Returns f1-measure for a given label (category) - * * @param label the label. */ @Since("1.1.0") @@ -198,7 +192,6 @@ class MulticlassMetrics @Since("1.1.0")(predictionAndLabels: RDD[(Double, Double /** * Returns weighted averaged f-measure - * * @param beta the beta parameter. 
*/ @Since("1.1.0") @@ -220,7 +213,6 @@ class MulticlassMetrics @Since("1.1.0")(predictionAndLabels: RDD[(Double, Double @Since("1.1.0") lazy val labels: Array[Double] = tpByClass.keys.toArray.sorted - /** * Returns unweighted Cohen's Kappa * Cohen's kappa coefficient is a statistic which measures inter-rater From 3f770e925df2df5a3f89e1f6798a1bb95b8bfd7b Mon Sep 17 00:00:00 2001 From: Zheng RuiFeng Date: Tue, 23 Feb 2016 14:04:36 +0800 Subject: [PATCH 12/15] reformat --- .../org/apache/spark/mllib/evaluation/MulticlassMetrics.scala | 1 - 1 file changed, 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala index ec3a41463f80b..19b3a095addbf 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala @@ -1,4 +1,3 @@ - /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with From b3d75b6f6bc4f23f34fd4cbdeb2c0d2253445ba7 Mon Sep 17 00:00:00 2001 From: Zheng RuiFeng Date: Tue, 23 Feb 2016 14:08:04 +0800 Subject: [PATCH 13/15] use Matrix.foreachActive --- .../spark/mllib/evaluation/MulticlassMetrics.scala | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala index 19b3a095addbf..aada286232942 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala @@ -305,11 +305,11 @@ class MulticlassMetrics @Since("1.1.0") (predictionAndLabels: RDD[(Double, Doubl val sumByCols = Array.fill(n)(0.0) var sum = 0.0 - for (i <- 0 until n; j <- 0 until n) { - val v = mat(i, j) - sumByRows(i) += v - sumByCols(j) += v - sum += v + mat.foreachActive { + case (i, j, v) => + sumByRows(i) += v + sumByCols(j) += v + sum += v } var numerator = 0.0 From 144b984017badfff04b38f71ffc471f8d9646a27 Mon Sep 17 00:00:00 2001 From: Zheng RuiFeng Date: Tue, 23 Feb 2016 14:20:32 +0800 Subject: [PATCH 14/15] del space before * --- .../mllib/evaluation/MulticlassMetrics.scala | 167 +++++++++--------- 1 file changed, 82 insertions(+), 85 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala index aada286232942..973402d1e67e2 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala @@ -25,18 +25,18 @@ import org.apache.spark.rdd.RDD import org.apache.spark.sql.DataFrame /** - * ::Experimental:: - * Evaluator for multiclass classification. - * - * @param predictionAndLabels an RDD of (prediction, label) pairs. - */ + * ::Experimental:: + * Evaluator for multiclass classification. + * + * @param predictionAndLabels an RDD of (prediction, label) pairs. + */ @Since("1.1.0") class MulticlassMetrics @Since("1.1.0") (predictionAndLabels: RDD[(Double, Double)]) { /** - * An auxiliary constructor taking a DataFrame. - * @param predictionAndLabels a DataFrame with two double columns: prediction and label - */ + * An auxiliary constructor taking a DataFrame. 
+ * @param predictionAndLabels a DataFrame with two double columns: prediction and label + */ private[mllib] def this(predictionAndLabels: DataFrame) = this(predictionAndLabels.rdd.map(r => (r.getDouble(0), r.getDouble(1)))) @@ -59,11 +59,11 @@ class MulticlassMetrics @Since("1.1.0") (predictionAndLabels: RDD[(Double, Doubl .collectAsMap() /** - * Returns confusion matrix: - * predicted classes are in columns, - * they are ordered by class label ascending, - * as in "labels" - */ + * Returns confusion matrix: + * predicted classes are in columns, + * they are ordered by class label ascending, + * as in "labels" + */ @Since("1.1.0") def confusionMatrix: Matrix = { val n = labels.size @@ -81,16 +81,16 @@ class MulticlassMetrics @Since("1.1.0") (predictionAndLabels: RDD[(Double, Doubl } /** - * Returns true positive rate for a given label (category) - * @param label the label. - */ + * Returns true positive rate for a given label (category) + * @param label the label. + */ @Since("1.1.0") def truePositiveRate(label: Double): Double = recall(label) /** - * Returns false positive rate for a given label (category) - * @param label the label. - */ + * Returns false positive rate for a given label (category) + * @param label the label. + */ @Since("1.1.0") def falsePositiveRate(label: Double): Double = { val fp = fpByClass.getOrElse(label, 0) @@ -98,9 +98,9 @@ class MulticlassMetrics @Since("1.1.0") (predictionAndLabels: RDD[(Double, Doubl } /** - * Returns precision for a given label (category) - * @param label the label. - */ + * Returns precision for a given label (category) + * @param label the label. + */ @Since("1.1.0") def precision(label: Double): Double = { val tp = tpByClass(label) @@ -109,17 +109,17 @@ class MulticlassMetrics @Since("1.1.0") (predictionAndLabels: RDD[(Double, Doubl } /** - * Returns recall for a given label (category) - * @param label the label. - */ + * Returns recall for a given label (category) + * @param label the label. + */ @Since("1.1.0") def recall(label: Double): Double = tpByClass(label).toDouble / labelCountByClass(label) /** - * Returns f-measure for a given label (category) - * @param label the label. - * @param beta the beta parameter. - */ + * Returns f-measure for a given label (category) + * @param label the label. + * @param beta the beta parameter. + */ @Since("1.1.0") def fMeasure(label: Double, beta: Double): Double = { val p = precision(label) @@ -129,111 +129,110 @@ class MulticlassMetrics @Since("1.1.0") (predictionAndLabels: RDD[(Double, Doubl } /** - * Returns f1-measure for a given label (category) - * @param label the label. - */ + * Returns f1-measure for a given label (category) + * @param label the label. 
+ */ @Since("1.1.0") def fMeasure(label: Double): Double = fMeasure(label, 1.0) /** - * Returns precision - */ + * Returns precision + */ @Since("1.1.0") lazy val precision: Double = tpByClass.values.sum.toDouble / labelCount /** - * Returns recall - * (equals to precision for multiclass classifier - * because sum of all false positives is equal to sum - * of all false negatives) - */ + * Returns recall + * (equals to precision for multiclass classifier + * because sum of all false positives is equal to sum + * of all false negatives) + */ @Since("1.1.0") lazy val recall: Double = precision /** - * Returns f-measure - * (equals to precision and recall because precision equals recall) - */ + * Returns f-measure + * (equals to precision and recall because precision equals recall) + */ @Since("1.1.0") lazy val fMeasure: Double = precision /** - * Returns weighted true positive rate - * (equals to precision, recall and f-measure) - */ + * Returns weighted true positive rate + * (equals to precision, recall and f-measure) + */ @Since("1.1.0") lazy val weightedTruePositiveRate: Double = weightedRecall /** - * Returns weighted false positive rate - */ + * Returns weighted false positive rate + */ @Since("1.1.0") lazy val weightedFalsePositiveRate: Double = labelCountByClass.map { case (category, count) => falsePositiveRate(category) * count.toDouble / labelCount }.sum /** - * Returns weighted averaged recall - * (equals to precision, recall and f-measure) - */ + * Returns weighted averaged recall + * (equals to precision, recall and f-measure) + */ @Since("1.1.0") lazy val weightedRecall: Double = labelCountByClass.map { case (category, count) => recall(category) * count.toDouble / labelCount }.sum /** - * Returns weighted averaged precision - */ + * Returns weighted averaged precision + */ @Since("1.1.0") lazy val weightedPrecision: Double = labelCountByClass.map { case (category, count) => precision(category) * count.toDouble / labelCount }.sum /** - * Returns weighted averaged f-measure - * @param beta the beta parameter. - */ + * Returns weighted averaged f-measure + * @param beta the beta parameter. + */ @Since("1.1.0") def weightedFMeasure(beta: Double): Double = labelCountByClass.map { case (category, count) => fMeasure(category, beta) * count.toDouble / labelCount }.sum /** - * Returns weighted averaged f1-measure - */ + * Returns weighted averaged f1-measure + */ @Since("1.1.0") lazy val weightedFMeasure: Double = labelCountByClass.map { case (category, count) => fMeasure(category, 1.0) * count.toDouble / labelCount }.sum /** - * Returns the sequence of labels in ascending order - */ + * Returns the sequence of labels in ascending order + */ @Since("1.1.0") lazy val labels: Array[Double] = tpByClass.keys.toArray.sorted /** - * Returns unweighted Cohen's Kappa - * Cohen's kappa coefficient is a statistic which measures inter-rater - * agreement for qualitative (categorical) items. It is generally thought - * to be a more robust measure than simple percent agreement calculation, - * since kappa takes into account the agreement occurring by chance. - * The kappa score is a number between -1 and 1. Scores above 0.8 are - * generally considered good agreement; zero or lower means no agreement - * (practically random labels). - */ + * Returns unweighted Cohen's Kappa + * Cohen's kappa coefficient is a statistic which measures inter-rater + * agreement for qualitative (categorical) items. 
It is generally thought + * to be a more robust measure than simple percent agreement calculation, + * since kappa takes into account the agreement occurring by chance. + * The kappa score is a number between -1 and 1. Scores above 0.8 are + * generally considered good agreement; zero or lower means no agreement + * (practically random labels). + */ @Since("2.0.0") def kappa(): Double = { kappa("default") } /** - * Returns Cohen's Kappa with built-in weighted types - * - * @param weights the weighted type. "default" means no weighted; - * "linear" means linear weighted; - * "quadratic" means quadratic weighted. - */ + * Returns Cohen's Kappa with built-in weighted types + * @param weights the weighted type. "default" means no weighted; + * "linear" means linear weighted; + * "quadratic" means quadratic weighted. + */ @Since("2.0.0") def kappa(weights: String): Double = { @@ -268,11 +267,10 @@ class MulticlassMetrics @Since("1.1.0") (predictionAndLabels: RDD[(Double, Doubl /** - * Returns Cohen's Kappa with user-defined weight matrix - * - * @param weights the weight matrix, must be of the same shape with Confusion Matrix. - * Note: Each Element in it must be no less than zero. - */ + * Returns Cohen's Kappa with user-defined weight matrix + * @param weights the weight matrix, must be of the same shape with Confusion Matrix. + * Note: Each Element in it must be no less than zero. + */ @Since("2.0.0") def kappa(weights: Matrix): Double = { val n = labels.size @@ -290,12 +288,11 @@ class MulticlassMetrics @Since("1.1.0") (predictionAndLabels: RDD[(Double, Doubl } /** - * Returns Cohen's Kappa with user-defined weight calculation function - * - * @param weights the weight calculation function. It takes two number as inputs, - * and return a number no less than zero as the corresponding weight. - * Note: Each return must not be negative. - */ + * Returns Cohen's Kappa with user-defined weight calculation function + * @param weights the weight calculation function. It takes two number as inputs, + * and return a number no less than zero as the corresponding weight. + * Note: Each return must not be negative. 
+   */
   @Since("2.0.0")
   def kappa(weights: (Int, Int) => Double): Double = {
     val mat = confusionMatrix
     val n = mat.numRows

From 9e789205276503817a08c42681bd74a8f3fbffee Mon Sep 17 00:00:00 2001
From: Zheng RuiFeng
Date: Tue, 1 Mar 2016 11:23:14 +0800
Subject: [PATCH 15/15] add verify python code

---
 .../mllib/evaluation/MulticlassMetricsSuite.scala | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/mllib/src/test/scala/org/apache/spark/mllib/evaluation/MulticlassMetricsSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/evaluation/MulticlassMetricsSuite.scala
index 6b485acc29557..087dd3411a68f 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/evaluation/MulticlassMetricsSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/evaluation/MulticlassMetricsSuite.scala
@@ -51,6 +51,21 @@ class MulticlassMetricsSuite extends SparkFunSuite with MLlibTestSparkContext {
     val f2measure0 = (1 + 2 * 2) * precision0 * recall0 / (2 * 2 * precision0 + recall0)
     val f2measure1 = (1 + 2 * 2) * precision1 * recall1 / (2 * 2 * precision1 + recall1)
     val f2measure2 = (1 + 2 * 2) * precision2 * recall2 / (2 * 2 * precision2 + recall2)
+
+    /* Verify results using the `Python` code:
+      from sklearn.metrics import cohen_kappa_score
+      from ml_metrics import quadratic_weighted_kappa, linear_weighted_kappa, kappa
+      preds = [0, 0, 0, 1, 1, 1, 1, 2, 2]
+      labels = [0, 1, 0, 0, 1, 1, 1, 2, 0]
+      cohen_kappa_score(preds, labels)
+      > 0.47058823529411781
+      quadratic_weighted_kappa(preds, labels)
+      > 0.3571428571428571
+      linear_weighted_kappa(preds, labels)
+      > 0.4193548387096774
+      kappa(preds, labels)
+      > 0.47058823529411764
+     */
     val unweighted_kappa = 0.47058823529411764
     val linear_weighted_kappa = 0.4193548387096774
     val quadratic_weighted_kappa = 0.3571428571428571
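
The expected values hard-coded in the suite can be reproduced without Spark. Below is a minimal standalone Scala sketch; the object name KappaCheck and the helper cohensKappa are illustrative and not part of the patch. It recomputes unweighted Cohen's kappa from the same (prediction, label) pairs listed in the Python comment, using the observed-versus-chance form (po - pe) / (1 - pe), which is equivalent to the 1 - numerator/denominator form used in kappa(weights: (Int, Int) => Double) when the weight is 0 on the diagonal and 1 elsewhere.

object KappaCheck {
  // Unweighted Cohen's kappa: po is the observed agreement, pe is the agreement
  // expected by chance from the prediction/label marginal counts.
  def cohensKappa(pairs: Seq[(Int, Int)]): Double = {
    val n = pairs.size.toDouble
    val po = pairs.count { case (p, l) => p == l } / n
    val predCounts = pairs.groupBy(_._1).map { case (c, xs) => c -> xs.size.toDouble }
    val labelCounts = pairs.groupBy(_._2).map { case (c, xs) => c -> xs.size.toDouble }
    val classes = (predCounts.keySet ++ labelCounts.keySet).toSeq
    val pe = classes.map { c =>
      predCounts.getOrElse(c, 0.0) * labelCounts.getOrElse(c, 0.0) / (n * n)
    }.sum
    (po - pe) / (1 - pe)
  }

  def main(args: Array[String]): Unit = {
    // Same data as the preds/labels in the Python comment above.
    val preds  = Seq(0, 0, 0, 1, 1, 1, 1, 2, 2)
    val labels = Seq(0, 1, 0, 0, 1, 1, 1, 2, 0)
    println(cohensKappa(preds.zip(labels)))  // ~0.4706, matching unweighted_kappa
  }
}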
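
With the API introduced by this series, the same numbers come from MulticlassMetrics itself. A usage sketch, assuming a live SparkContext named sc (the suite gets one from MLlibTestSparkContext); the pairs match the preds/labels above, and the commented values are the ones asserted in the test.

import org.apache.spark.mllib.evaluation.MulticlassMetrics

// (prediction, label) pairs as Doubles, as expected by the RDD constructor.
val predictionAndLabels = sc.parallelize(Seq(
  (0.0, 0.0), (0.0, 1.0), (0.0, 0.0), (1.0, 0.0), (1.0, 1.0),
  (1.0, 1.0), (1.0, 1.0), (2.0, 2.0), (2.0, 0.0)))
val metrics = new MulticlassMetrics(predictionAndLabels)

metrics.kappa()             // unweighted, ~0.4706
metrics.kappa("linear")     // linear weights |i - j|, ~0.4194
metrics.kappa("quadratic")  // quadratic weights (i - j)^2, ~0.3571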