From ed66bb09eddf776e932b29a7e4889128aa775946 Mon Sep 17 00:00:00 2001
From: Danilo Ascione
Date: Tue, 20 Dec 2016 17:23:28 +0100
Subject: [PATCH] [SPARK-18948][MLlib] Add Mean Percentile Rank metric for ranking algorithms

---
 .../mllib/JavaRankingMetricsExample.java    | 20 +++++++++-----
 .../mllib/RankingMetricsExample.scala       |  3 +++
 .../mllib/evaluation/RankingMetrics.scala   | 27 +++++++++++++++++++
 .../evaluation/JavaRankingMetricsSuite.java | 17 ++++++------
 .../evaluation/RankingMetricsSuite.scala    |  8 ++++--
 5 files changed, 57 insertions(+), 18 deletions(-)

diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaRankingMetricsExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaRankingMetricsExample.java
index 54dfc404ca6e9..74b638917bced 100644
--- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaRankingMetricsExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaRankingMetricsExample.java
@@ -18,19 +18,22 @@
 package org.apache.spark.examples.mllib;
 
 // $example on$
-import java.util.*;
-
-import scala.Tuple2;
-
-import org.apache.spark.api.java.*;
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaPairRDD;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
 import org.apache.spark.api.java.function.Function;
-import org.apache.spark.mllib.evaluation.RegressionMetrics;
 import org.apache.spark.mllib.evaluation.RankingMetrics;
+import org.apache.spark.mllib.evaluation.RegressionMetrics;
 import org.apache.spark.mllib.recommendation.ALS;
 import org.apache.spark.mllib.recommendation.MatrixFactorizationModel;
 import org.apache.spark.mllib.recommendation.Rating;
+import scala.Tuple2;
+
+import java.util.ArrayList;
+import java.util.List;
+
 // $example off$
-import org.apache.spark.SparkConf;
 
 public class JavaRankingMetricsExample {
   public static void main(String[] args) {
@@ -142,6 +145,9 @@ public List<Integer> call(Rating[] docs) {
     // Mean average precision
     System.out.format("Mean average precision = %f\n", metrics.meanAveragePrecision());
 
+    // Mean percentile rank
+    System.out.format("Mean percentile rank = %f\n", metrics.meanPercentileRank());
+
     // Evaluate the model using numerical ratings and regression metrics
     JavaRDD<Tuple2<Object, Object>> userProducts = ratings.map(
       new Function<Rating, Tuple2<Object, Object>>() {
diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/RankingMetricsExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/RankingMetricsExample.scala
index d514891da78fc..beffeb0c93f95 100644
--- a/examples/src/main/scala/org/apache/spark/examples/mllib/RankingMetricsExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/RankingMetricsExample.scala
@@ -84,6 +84,9 @@ object RankingMetricsExample {
     // Mean average precision
     println(s"Mean average precision = ${metrics.meanAveragePrecision}")
 
+    // Mean percentile rank
+    println(s"Mean percentile rank = ${metrics.meanPercentileRank}")
+
     // Normalized discounted cumulative gain
     Array(1, 3, 5).foreach { k =>
       println(s"NDCG at $k = ${metrics.ndcgAt(k)}")
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RankingMetrics.scala b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RankingMetrics.scala
index b98aa0534152b..047ee84ad0a2d 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RankingMetrics.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RankingMetrics.scala
@@ -108,6 +108,33 @@ class RankingMetrics[T: ClassTag](predictionAndLabels: RDD[(Array[T], Array[T])])
     }.mean()
   }
 
+  /**
+   * Compute the mean percentile rank (MPR) of all the queries.
+   *
+   * See the following paper for details:
+   * Hu, Y., Y. Koren, and C. Volinsky. "Collaborative Filtering for Implicit Feedback Datasets."
+   * In 2008 Eighth IEEE International Conference on Data Mining, 263-72, 2008.
+   * doi:10.1109/ICDM.2008.22.
+   */
+  @Since("2.2.0")
+  lazy val meanPercentileRank: Double = {
+    val rank_ui: (Seq[T], T) => Double = (pred: Seq[T], label: T) => {
+      val l_i = pred.indexOf(label)
+
+      if (l_i == -1) {
+        1
+      } else {
+        l_i.toDouble / pred.size
+      }
+    }
+
+    val ranks = predictionAndLabels.map { case (pred, lab) =>
+      lab.map(label => rank_ui(pred, label))
+    }
+    ranks.flatMap(rank => rank)
+      .mean()
+  }
+
   /**
    * Compute the average NDCG value of all the queries, truncated at ranking position k.
    * The discounted cumulative gain at position k is computed as:
diff --git a/mllib/src/test/java/org/apache/spark/mllib/evaluation/JavaRankingMetricsSuite.java b/mllib/src/test/java/org/apache/spark/mllib/evaluation/JavaRankingMetricsSuite.java
index e9d7e4fdbe8ce..0b0e0530887c3 100644
--- a/mllib/src/test/java/org/apache/spark/mllib/evaluation/JavaRankingMetricsSuite.java
+++ b/mllib/src/test/java/org/apache/spark/mllib/evaluation/JavaRankingMetricsSuite.java
@@ -17,18 +17,16 @@
 
 package org.apache.spark.mllib.evaluation;
 
-import java.io.IOException;
-import java.util.Arrays;
-import java.util.List;
-
-import scala.Tuple2;
-import scala.Tuple2$;
-
+import org.apache.spark.SharedSparkSession;
+import org.apache.spark.api.java.JavaRDD;
 import org.junit.Assert;
 import org.junit.Test;
+import scala.Tuple2;
+import scala.Tuple2$;
 
-import org.apache.spark.SharedSparkSession;
-import org.apache.spark.api.java.JavaRDD;
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.List;
 
 public class JavaRankingMetricsSuite extends SharedSparkSession {
   private transient JavaRDD<Tuple2<List<Integer>, List<Integer>>> predictionAndLabels;
@@ -50,6 +48,7 @@ public void rankingMetrics() {
     @SuppressWarnings("unchecked")
     RankingMetrics<?> metrics = RankingMetrics.of(predictionAndLabels);
     Assert.assertEquals(0.355026, metrics.meanAveragePrecision(), 1e-5);
+    Assert.assertEquals(0.4375, metrics.meanPercentileRank(), 1e-5);
     Assert.assertEquals(0.75 / 3.0, metrics.precisionAt(4), 1e-5);
   }
 }
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/evaluation/RankingMetricsSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/evaluation/RankingMetricsSuite.scala
index f334be2c2ba83..05d2c436090c7 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/evaluation/RankingMetricsSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/evaluation/RankingMetricsSuite.scala
@@ -23,7 +23,7 @@
 
 class RankingMetricsSuite extends SparkFunSuite with MLlibTestSparkContext {
 
-  test("Ranking metrics: MAP, NDCG") {
+  test("Ranking metrics: MAP, MPR, NDCG") {
     val predictionAndLabels = sc.parallelize(
       Seq(
         (Array(1, 6, 2, 7, 8, 3, 9, 10, 4, 5), Array(1, 2, 3, 4, 5)),
@@ -34,6 +34,7 @@ class RankingMetricsSuite extends SparkFunSuite with MLlibTestSparkContext {
 
     val metrics = new RankingMetrics(predictionAndLabels)
     val map = metrics.meanAveragePrecision
+    val mpr = metrics.meanPercentileRank
 
     assert(metrics.precisionAt(1) ~== 1.0/3 absTol eps)
     assert(metrics.precisionAt(2) ~== 1.0/3 absTol eps)
@@ -45,13 +46,15 @@ class RankingMetricsSuite extends SparkFunSuite with MLlibTestSparkContext {
 
     assert(map ~== 0.355026 absTol eps)
 
+    assert(mpr ~== 0.4375 absTol eps)
+
     assert(metrics.ndcgAt(3) ~== 1.0/3 absTol eps)
     assert(metrics.ndcgAt(5) ~== 0.328788 absTol eps)
     assert(metrics.ndcgAt(10) ~== 0.487913 absTol eps)
     assert(metrics.ndcgAt(15) ~== metrics.ndcgAt(10) absTol eps)
   }
 
-  test("MAP, NDCG with few predictions (SPARK-14886)") {
+  test("MAP, MPR, NDCG with few predictions (SPARK-14886)") {
     val predictionAndLabels = sc.parallelize(
       Seq(
         (Array(1, 6, 2), Array(1, 2, 3, 4, 5)),
@@ -64,6 +67,7 @@ class RankingMetricsSuite extends SparkFunSuite with MLlibTestSparkContext {
     assert(metrics.precisionAt(2) ~== 0.25 absTol eps)
     assert(metrics.ndcgAt(1) ~== 0.5 absTol eps)
     assert(metrics.ndcgAt(2) ~== 0.30657 absTol eps)
+    assert(metrics.meanPercentileRank ~== 0.83333 absTol eps)
   }
 
 }
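
A quick way to sanity-check the new metric is to rerun its arithmetic by hand. The sketch below re-implements the patch's rank_ui formula on plain Scala collections rather than an RDD; the object and helper names are invented for the illustration and are not part of the patched RankingMetrics API. As in the Hu, Koren, and Volinsky paper cited in the scaladoc, lower MPR is better: 0.0 means every relevant item is ranked first, roughly 0.5 is what random ordering yields, and an item missing from the prediction list is penalized with the worst possible rank, 1.0.

object MprSketch {
  // Percentile rank of one relevant item within the ranked predictions:
  // 0.0 = ranked first, values approaching 1.0 = ranked near the bottom,
  // and a missing item gets the worst possible rank, 1.0 (mirrors rank_ui).
  def percentileRank[T](pred: Seq[T], label: T): Double = {
    val i = pred.indexOf(label)
    if (i == -1) 1.0 else i.toDouble / pred.size
  }

  // Mean of the per-item percentile ranks across all (prediction, labels) queries.
  def meanPercentileRank[T](data: Seq[(Seq[T], Seq[T])]): Double = {
    val ranks = data.flatMap { case (pred, labels) =>
      labels.map(percentileRank(pred, _))
    }
    ranks.sum / ranks.size
  }

  def main(args: Array[String]): Unit = {
    // First query from the Scala test suite in the patch above.
    val query = (Seq(1, 6, 2, 7, 8, 3, 9, 10, 4, 5), Seq(1, 2, 3, 4, 5))
    println(meanPercentileRank(Seq(query))) // prints 0.48
  }
}

On that first query the relevant items sit at indices 0, 2, 5, 8, and 9 of a 10-item list, so the result is (0.0 + 0.2 + 0.5 + 0.8 + 0.9) / 5 = 0.48; the values 0.4375 and 0.83333 asserted in the suites come out of the same arithmetic applied to their full test datasets.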