From aac3bc50156f4a6a6ec058cc8bc247f6b49401e1 Mon Sep 17 00:00:00 2001
From: Yanbo Liang
Date: Wed, 6 May 2015 21:54:55 +0800
Subject: [PATCH 1/2] Add RegressionMetrics in PySpark/MLlib

---
 .../mllib/evaluation/RegressionMetrics.scala |  9 +++
 python/pyspark/mllib/evaluation.py           | 67 +++++++++++++++++++
 2 files changed, 76 insertions(+)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RegressionMetrics.scala b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RegressionMetrics.scala
index 693117d820580..e577bf87f885e 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RegressionMetrics.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RegressionMetrics.scala
@@ -22,6 +22,7 @@ import org.apache.spark.rdd.RDD
 import org.apache.spark.Logging
 import org.apache.spark.mllib.linalg.Vectors
 import org.apache.spark.mllib.stat.{MultivariateStatisticalSummary, MultivariateOnlineSummarizer}
+import org.apache.spark.sql.DataFrame
 
 /**
  * :: Experimental ::
@@ -32,6 +33,14 @@ import org.apache.spark.mllib.stat.{MultivariateStatisticalSummary, Multivariate
 @Experimental
 class RegressionMetrics(predictionAndObservations: RDD[(Double, Double)]) extends Logging {
 
+  /**
+   * An auxiliary constructor taking a DataFrame.
+   * @param predictionAndObservations a DataFrame with two double columns:
+   *                                  prediction and observation
+   */
+  private[mllib] def this(predictionAndObservations: DataFrame) =
+    this(predictionAndObservations.map(r => (r.getDouble(0), r.getDouble(1))))
+
   /**
    * Use MultivariateOnlineSummarizer to calculate summary statistics of observations and errors.
    */
diff --git a/python/pyspark/mllib/evaluation.py b/python/pyspark/mllib/evaluation.py
index 16cb49cc0cfff..3e25262c30e8d 100644
--- a/python/pyspark/mllib/evaluation.py
+++ b/python/pyspark/mllib/evaluation.py
@@ -67,6 +67,73 @@ def unpersist(self):
         self.call("unpersist")
 
 
+class RegressionMetrics(JavaModelWrapper):
+    """
+    Evaluator for regression.
+
+    >>> predictionAndObservations = sc.parallelize([
+    ...     (2.5, 3.0), (0.0, -0.5), (2.0, 2.0), (8.0, 7.0)])
+    >>> metrics = RegressionMetrics(predictionAndObservations)
+    >>> metrics.explainedVariance()
+    0.95...
+    >>> metrics.meanAbsoluteError()
+    0.5...
+    >>> metrics.meanSquaredError()
+    0.37...
+    >>> metrics.rootMeanSquaredError()
+    0.61...
+    >>> metrics.r2()
+    0.94...
+    """
+
+    def __init__(self, predictionAndObservations):
+        """
+        :param predictionAndObservations: an RDD of (prediction, observation) pairs.
+        """
+        sc = predictionAndObservations.ctx
+        sql_ctx = SQLContext(sc)
+        df = sql_ctx.createDataFrame(predictionAndObservations, schema=StructType([
+            StructField("prediction", DoubleType(), nullable=False),
+            StructField("observation", DoubleType(), nullable=False)]))
+        java_class = sc._jvm.org.apache.spark.mllib.evaluation.RegressionMetrics
+        java_model = java_class(df._jdf)
+        super(RegressionMetrics, self).__init__(java_model)
+
+    def explainedVariance(self):
+        """
+        Returns the explained variance regression score.
+        explainedVariance = 1 - variance(y - \hat{y}) / variance(y)
+        """
+        return self.call("explainedVariance")
+
+    def meanAbsoluteError(self):
+        """
+        Returns the mean absolute error, which is a risk function corresponding to the
+        expected value of the absolute error loss or l1-norm loss.
+        """
+        return self.call("meanAbsoluteError")
+
+    def meanSquaredError(self):
+        """
+        Returns the mean squared error, which is a risk function corresponding to the
+        expected value of the squared error loss or quadratic loss.
+        """
+        return self.call("meanSquaredError")
+
+    def rootMeanSquaredError(self):
+        """
+        Returns the root mean squared error, which is defined as the square root of
+        the mean squared error.
+        """
+        return self.call("rootMeanSquaredError")
+
+    def r2(self):
+        """
+        Returns R^2^, the coefficient of determination.
+        """
+        return self.call("r2")
+
+
 def _test():
     import doctest
     from pyspark import SparkContext

From 6934af3a9c66e7e1cf7e9ba504cbc704386b1afe Mon Sep 17 00:00:00 2001
From: Yanbo Liang
Date: Fri, 8 May 2015 00:20:46 +0800
Subject: [PATCH 2/2] change to @property

---
 python/pyspark/mllib/evaluation.py | 21 ++++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

diff --git a/python/pyspark/mllib/evaluation.py b/python/pyspark/mllib/evaluation.py
index 3e25262c30e8d..3e11df09da6b1 100644
--- a/python/pyspark/mllib/evaluation.py
+++ b/python/pyspark/mllib/evaluation.py
@@ -27,9 +27,9 @@ class BinaryClassificationMetrics(JavaModelWrapper):
     >>> scoreAndLabels = sc.parallelize([
     ...     (0.1, 0.0), (0.1, 1.0), (0.4, 0.0), (0.6, 0.0), (0.6, 1.0), (0.6, 1.0), (0.8, 1.0)], 2)
     >>> metrics = BinaryClassificationMetrics(scoreAndLabels)
-    >>> metrics.areaUnderROC()
+    >>> metrics.areaUnderROC
     0.70...
-    >>> metrics.areaUnderPR()
+    >>> metrics.areaUnderPR
     0.83...
     >>> metrics.unpersist()
     """
@@ -47,6 +47,7 @@ def __init__(self, scoreAndLabels):
         java_model = java_class(df._jdf)
         super(BinaryClassificationMetrics, self).__init__(java_model)
 
+    @property
     def areaUnderROC(self):
         """
         Computes the area under the receiver operating characteristic
@@ -54,6 +55,7 @@ def areaUnderROC(self):
         (ROC) curve.
         """
         return self.call("areaUnderROC")
 
+    @property
     def areaUnderPR(self):
         """
         Computes the area under the precision-recall curve.
@@ -74,15 +76,15 @@ class RegressionMetrics(JavaModelWrapper):
     >>> predictionAndObservations = sc.parallelize([
     ...     (2.5, 3.0), (0.0, -0.5), (2.0, 2.0), (8.0, 7.0)])
     >>> metrics = RegressionMetrics(predictionAndObservations)
-    >>> metrics.explainedVariance()
+    >>> metrics.explainedVariance
     0.95...
-    >>> metrics.meanAbsoluteError()
+    >>> metrics.meanAbsoluteError
     0.5...
-    >>> metrics.meanSquaredError()
+    >>> metrics.meanSquaredError
     0.37...
-    >>> metrics.rootMeanSquaredError()
+    >>> metrics.rootMeanSquaredError
     0.61...
-    >>> metrics.r2()
+    >>> metrics.r2
     0.94...
     """
@@ -99,6 +101,7 @@ def __init__(self, predictionAndObservations):
         java_model = java_class(df._jdf)
         super(RegressionMetrics, self).__init__(java_model)
 
+    @property
     def explainedVariance(self):
         """
         Returns the explained variance regression score.
@@ -106,6 +109,7 @@ def explainedVariance(self):
         explainedVariance = 1 - variance(y - \hat{y}) / variance(y)
         """
         return self.call("explainedVariance")
 
+    @property
     def meanAbsoluteError(self):
         """
         Returns the mean absolute error, which is a risk function corresponding to the
@@ -113,6 +117,7 @@ def meanAbsoluteError(self):
         expected value of the absolute error loss or l1-norm loss.
         """
         return self.call("meanAbsoluteError")
 
+    @property
     def meanSquaredError(self):
         """
         Returns the mean squared error, which is a risk function corresponding to the
@@ -120,6 +125,7 @@ def meanSquaredError(self):
         expected value of the squared error loss or quadratic loss.
         """
         return self.call("meanSquaredError")
 
+    @property
     def rootMeanSquaredError(self):
         """
         Returns the root mean squared error, which is defined as the square root of
@@ -127,6 +133,7 @@ def rootMeanSquaredError(self):
         the mean squared error.
         """
         return self.call("rootMeanSquaredError")
 
+    @property
     def r2(self):
         """
         Returns R^2^, the coefficient of determination.