From 972d5aceb7247c79606b208733fca3c364a7bda8 Mon Sep 17 00:00:00 2001
From: Yanbo Liang
Date: Sat, 9 May 2015 00:22:30 +0800
Subject: [PATCH 1/3] Add MulticlassMetrics in PySpark/MLlib

---
 .../mllib/evaluation/MulticlassMetrics.scala |   8 ++
 python/pyspark/mllib/evaluation.py           | 127 ++++++++++++++++++
 2 files changed, 135 insertions(+)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala
index 666362ae6739a..4628dc5690913 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala
@@ -23,6 +23,7 @@ import org.apache.spark.SparkContext._
 import org.apache.spark.annotation.Experimental
 import org.apache.spark.mllib.linalg.{Matrices, Matrix}
 import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.DataFrame
 
 /**
  * ::Experimental::
@@ -33,6 +34,13 @@ import org.apache.spark.rdd.RDD
 @Experimental
 class MulticlassMetrics(predictionAndLabels: RDD[(Double, Double)]) {
 
+  /**
+   * An auxiliary constructor taking a DataFrame.
+   * @param predictionAndLabels a DataFrame with two double columns: prediction and label
+   */
+  private[mllib] def this(predictionAndLabels: DataFrame) =
+    this(predictionAndLabels.map(r => (r.getDouble(0), r.getDouble(1))))
+
   private lazy val labelCountByClass: Map[Double, Long] = predictionAndLabels.values.countByValue()
   private lazy val labelCount: Long = labelCountByClass.values.sum
   private lazy val tpByClass: Map[Double, Int] = predictionAndLabels
diff --git a/python/pyspark/mllib/evaluation.py b/python/pyspark/mllib/evaluation.py
index 3e11df09da6b1..a766cdd1d5b36 100644
--- a/python/pyspark/mllib/evaluation.py
+++ b/python/pyspark/mllib/evaluation.py
@@ -141,6 +141,133 @@ def r2(self):
         return self.call("r2")
 
 
+class MulticlassMetrics(JavaModelWrapper):
+    """
+    Evaluator for multiclass classification.
+
+    >>> predictionAndLabels = sc.parallelize([(0.0, 0.0), (0.0, 1.0), (0.0, 0.0),
+    ...     (1.0, 0.0), (1.0, 1.0), (1.0, 1.0), (1.0, 1.0), (2.0, 2.0), (2.0, 0.0)])
+    >>> metrics = MulticlassMetrics(predictionAndLabels)
+    >>> metrics.falsePositiveRate(0.0)
+    0.2
+    >>> metrics.precision(1.0)
+    0.75
+    >>> metrics.recall(2.0)
+    1.0
+    >>> metrics.fMeasure(0.0, 2.0)
+    0.52...
+    >>> metrics.precision()
+    0.66...
+    >>> metrics.recall()
+    0.66...
+    >>> metrics.weightedFalsePositiveRate
+    0.19...
+    >>> metrics.weightedPrecision
+    0.68...
+    >>> metrics.weightedRecall
+    0.66...
+    >>> metrics.weightedFMeasure()
+    0.66...
+    """
+
+    def __init__(self, predictionAndLabels):
+        """
+        :param predictionAndLabels: an RDD of (prediction, label) pairs.
+        """
+        sc = predictionAndLabels.ctx
+        sql_ctx = SQLContext(sc)
+        df = sql_ctx.createDataFrame(predictionAndLabels, schema=StructType([
+            StructField("prediction", DoubleType(), nullable=False),
+            StructField("label", DoubleType(), nullable=False)]))
+        java_class = sc._jvm.org.apache.spark.mllib.evaluation.MulticlassMetrics
+        java_model = java_class(df._jdf)
+        super(MulticlassMetrics, self).__init__(java_model)
+
+    def truePositiveRate(self, label):
+        """
+        Returns true positive rate for a given label (category).
+        """
+        return self.call("truePositiveRate", label)
+
+    def falsePositiveRate(self, label):
+        """
+        Returns false positive rate for a given label (category).
+        """
+        return self.call("falsePositiveRate", label)
+
+    def precision(self, label=None):
+        """
+        Returns precision or precision for a given label (category) if specified.
+        """
+        if label is None:
+            return self.call("precision")
+        else:
+            return self.call("precision", float(label))
+
+    def recall(self, label=None):
+        """
+        Returns recall or recall for a given label (category) if specified.
+ """ + if label is None: + return self.call("recall") + else: + return self.call("recall", float(label)) + + def fMeasure(self, label=None, beta=None): + """ + Returns f-measure or f-measure for a given label (category) if specified. + """ + if beta is None: + if label is None: + return self.call("fMeasure") + else: + return self.call("fMeasure", label) + else: + if label is None: + raise Exception("If the beta parameter is specified, label can not be none") + else: + return self.call("fMeasure", label, beta) + + @property + def weightedTruePositiveRate(self): + """ + Returns weighted true positive rate. + (equals to precision, recall and f-measure) + """ + return self.call("weightedTruePositiveRate") + + @property + def weightedFalsePositiveRate(self): + """ + Returns weighted false positive rate. + """ + return self.call("weightedFalsePositiveRate") + + @property + def weightedRecall(self): + """ + Returns weighted averaged recall. + (equals to precision, recall and f-measure) + """ + return self.call("weightedRecall") + + @property + def weightedPrecision(self): + """ + Returns weighted averaged precision. + """ + return self.call("weightedPrecision") + + def weightedFMeasure(self, beta=None): + """ + Returns weighted averaged f-measure. 
+ """ + if beta is None: + return self.call("weightedFMeasure") + else: + return self.call("weightedFMeasure", beta) + + def _test(): import doctest from pyspark import SparkContext From 53c045db9f53891fc981a411071308bafc001e73 Mon Sep 17 00:00:00 2001 From: Yanbo Liang Date: Sat, 9 May 2015 21:40:53 +0800 Subject: [PATCH 2/3] keep compatibility for python 2.6 --- python/pyspark/mllib/evaluation.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/pyspark/mllib/evaluation.py b/python/pyspark/mllib/evaluation.py index a766cdd1d5b36..56b0e0ecab679 100644 --- a/python/pyspark/mllib/evaluation.py +++ b/python/pyspark/mllib/evaluation.py @@ -149,11 +149,11 @@ class MulticlassMetrics(JavaModelWrapper): ... (1.0, 0.0), (1.0, 1.0), (1.0, 1.0), (1.0, 1.0), (2.0, 2.0), (2.0, 0.0)]) >>> metrics = MulticlassMetrics(predictionAndLabels) >>> metrics.falsePositiveRate(0.0) - 0.2 + 0.2... >>> metrics.precision(1.0) - 0.75 + 0.75... >>> metrics.recall(2.0) - 1.0 + 1.0... >>> metrics.fMeasure(0.0, 2.0) 0.52... >>> metrics.precision() From bb3e4ba9b1ef9cf0cc1ef5e3f41437471b30e89e Mon Sep 17 00:00:00 2001 From: Yanbo Liang Date: Sun, 10 May 2015 12:40:32 +0800 Subject: [PATCH 3/3] trigger jenkins --- python/pyspark/mllib/evaluation.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/pyspark/mllib/evaluation.py b/python/pyspark/mllib/evaluation.py index 56b0e0ecab679..36914597de228 100644 --- a/python/pyspark/mllib/evaluation.py +++ b/python/pyspark/mllib/evaluation.py @@ -168,6 +168,8 @@ class MulticlassMetrics(JavaModelWrapper): 0.66... >>> metrics.weightedFMeasure() 0.66... + >>> metrics.weightedFMeasure(2.0) + 0.65... """ def __init__(self, predictionAndLabels):