From 972d5aceb7247c79606b208733fca3c364a7bda8 Mon Sep 17 00:00:00 2001
From: Yanbo Liang
Date: Sat, 9 May 2015 00:22:30 +0800
Subject: [PATCH 1/3] Add MulticlassMetrics in PySpark/MLlib

---
 .../mllib/evaluation/MulticlassMetrics.scala |   8 ++
 python/pyspark/mllib/evaluation.py           | 127 ++++++++++++++++++
 2 files changed, 135 insertions(+)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala
index 666362ae6739a..4628dc5690913 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala
@@ -23,6 +23,7 @@ import org.apache.spark.SparkContext._
 import org.apache.spark.annotation.Experimental
 import org.apache.spark.mllib.linalg.{Matrices, Matrix}
 import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.DataFrame
 
 /**
  * ::Experimental::
@@ -33,6 +34,13 @@ import org.apache.spark.rdd.RDD
 @Experimental
 class MulticlassMetrics(predictionAndLabels: RDD[(Double, Double)]) {
 
+  /**
+   * An auxiliary constructor taking a DataFrame.
+   * @param predictionAndLabels a DataFrame with two double columns: prediction and label
+   */
+  private[mllib] def this(predictionAndLabels: DataFrame) =
+    this(predictionAndLabels.map(r => (r.getDouble(0), r.getDouble(1))))
+
   private lazy val labelCountByClass: Map[Double, Long] = predictionAndLabels.values.countByValue()
   private lazy val labelCount: Long = labelCountByClass.values.sum
   private lazy val tpByClass: Map[Double, Int] = predictionAndLabels
diff --git a/python/pyspark/mllib/evaluation.py b/python/pyspark/mllib/evaluation.py
index 3e11df09da6b1..a766cdd1d5b36 100644
--- a/python/pyspark/mllib/evaluation.py
+++ b/python/pyspark/mllib/evaluation.py
@@ -141,6 +141,133 @@ def r2(self):
         return self.call("r2")
 
 
+class MulticlassMetrics(JavaModelWrapper):
+    """
+    Evaluator for multiclass classification.
+
+    >>> predictionAndLabels = sc.parallelize([(0.0, 0.0), (0.0, 1.0), (0.0, 0.0),
+    ...     (1.0, 0.0), (1.0, 1.0), (1.0, 1.0), (1.0, 1.0), (2.0, 2.0), (2.0, 0.0)])
+    >>> metrics = MulticlassMetrics(predictionAndLabels)
+    >>> metrics.falsePositiveRate(0.0)
+    0.2
+    >>> metrics.precision(1.0)
+    0.75
+    >>> metrics.recall(2.0)
+    1.0
+    >>> metrics.fMeasure(0.0, 2.0)
+    0.52...
+    >>> metrics.precision()
+    0.66...
+    >>> metrics.recall()
+    0.66...
+    >>> metrics.weightedFalsePositiveRate
+    0.19...
+    >>> metrics.weightedPrecision
+    0.68...
+    >>> metrics.weightedRecall
+    0.66...
+    >>> metrics.weightedFMeasure()
+    0.66...
+    """
+
+    def __init__(self, predictionAndLabels):
+        """
+        :param predictionAndLabels: an RDD of (prediction, label) pairs.
+        """
+        sc = predictionAndLabels.ctx
+        sql_ctx = SQLContext(sc)
+        df = sql_ctx.createDataFrame(predictionAndLabels, schema=StructType([
+            StructField("prediction", DoubleType(), nullable=False),
+            StructField("label", DoubleType(), nullable=False)]))
+        java_class = sc._jvm.org.apache.spark.mllib.evaluation.MulticlassMetrics
+        java_model = java_class(df._jdf)
+        super(MulticlassMetrics, self).__init__(java_model)
+
+    def truePositiveRate(self, label):
+        """
+        Returns true positive rate for a given label (category).
+        """
+        return self.call("truePositiveRate", label)
+
+    def falsePositiveRate(self, label):
+        """
+        Returns false positive rate for a given label (category).
+        """
+        return self.call("falsePositiveRate", label)
+
+    def precision(self, label=None):
+        """
+        Returns precision or precision for a given label (category) if specified.
+        """
+        if label is None:
+            return self.call("precision")
+        else:
+            return self.call("precision", float(label))
+
+    def recall(self, label=None):
+        """
+        Returns recall or recall for a given label (category) if specified.
+ """ + if label is None: + return self.call("recall") + else: + return self.call("recall", float(label)) + + def fMeasure(self, label=None, beta=None): + """ + Returns f-measure or f-measure for a given label (category) if specified. + """ + if beta is None: + if label is None: + return self.call("fMeasure") + else: + return self.call("fMeasure", label) + else: + if label is None: + raise Exception("If the beta parameter is specified, label can not be none") + else: + return self.call("fMeasure", label, beta) + + @property + def weightedTruePositiveRate(self): + """ + Returns weighted true positive rate. + (equals to precision, recall and f-measure) + """ + return self.call("weightedTruePositiveRate") + + @property + def weightedFalsePositiveRate(self): + """ + Returns weighted false positive rate. + """ + return self.call("weightedFalsePositiveRate") + + @property + def weightedRecall(self): + """ + Returns weighted averaged recall. + (equals to precision, recall and f-measure) + """ + return self.call("weightedRecall") + + @property + def weightedPrecision(self): + """ + Returns weighted averaged precision. + """ + return self.call("weightedPrecision") + + def weightedFMeasure(self, beta=None): + """ + Returns weighted averaged f-measure. 
+ """ + if beta is None: + return self.call("weightedFMeasure") + else: + return self.call("weightedFMeasure", beta) + + def _test(): import doctest from pyspark import SparkContext From 53c045db9f53891fc981a411071308bafc001e73 Mon Sep 17 00:00:00 2001 From: Yanbo Liang Date: Sat, 9 May 2015 21:40:53 +0800 Subject: [PATCH 2/3] keep compatibility for python 2.6 --- python/pyspark/mllib/evaluation.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/pyspark/mllib/evaluation.py b/python/pyspark/mllib/evaluation.py index a766cdd1d5b36..56b0e0ecab679 100644 --- a/python/pyspark/mllib/evaluation.py +++ b/python/pyspark/mllib/evaluation.py @@ -149,11 +149,11 @@ class MulticlassMetrics(JavaModelWrapper): ... (1.0, 0.0), (1.0, 1.0), (1.0, 1.0), (1.0, 1.0), (2.0, 2.0), (2.0, 0.0)]) >>> metrics = MulticlassMetrics(predictionAndLabels) >>> metrics.falsePositiveRate(0.0) - 0.2 + 0.2... >>> metrics.precision(1.0) - 0.75 + 0.75... >>> metrics.recall(2.0) - 1.0 + 1.0... >>> metrics.fMeasure(0.0, 2.0) 0.52... >>> metrics.precision() From bb3e4ba9b1ef9cf0cc1ef5e3f41437471b30e89e Mon Sep 17 00:00:00 2001 From: Yanbo Liang Date: Sun, 10 May 2015 12:40:32 +0800 Subject: [PATCH 3/3] trigger jenkins --- python/pyspark/mllib/evaluation.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/pyspark/mllib/evaluation.py b/python/pyspark/mllib/evaluation.py index 56b0e0ecab679..36914597de228 100644 --- a/python/pyspark/mllib/evaluation.py +++ b/python/pyspark/mllib/evaluation.py @@ -168,6 +168,8 @@ class MulticlassMetrics(JavaModelWrapper): 0.66... >>> metrics.weightedFMeasure() 0.66... + >>> metrics.weightedFMeasure(2.0) + 0.65... """ def __init__(self, predictionAndLabels):