From cb5dce1565edca67a3763b7610137b48545ea998 Mon Sep 17 00:00:00 2001
From: Sandeep Kumar Choudhary
Date: Thu, 1 Mar 2018 21:45:12 +0530
Subject: [PATCH] Adding more binary classification evaluation metrics

---
Reviewer notes (text in this section is not part of the commit):

* FalseDiscoveryRate now divides false positives by all predicted positives
  (true positives + false positives) and guards on that same denominator.
  The previous body computed numFalsePositives / numNegatives, which is the
  false positive rate. NOTE(review): c.numTruePositives is assumed to be a
  member of BinaryConfusionMatrix; confirm.
* FalseOmissionRate and NegativePredictiveValue now guard on the value they
  divide by (numTrueNegatives + numFalseNegatives) instead of numNegatives.
* FalseNegativeRate now guards on numPositives, the value it divides by.
* Each computer's Scaladoc names the condition that actually yields 0.0.
* The new public methods carry @Since. NOTE(review): "2.4.0" is assumed
  from the patch date; confirm the target release.
* Examples: fixed the "falsePositiveate" typo, the mislabelled false
  negative rate output, and an over-long Java line.
* Dropped whitespace-only changes (extra blank lines, re-indented
  "private lazy val (").
* The blob ids on the "index" lines are carried over from the original
  submission and have not been recomputed.

 ...avaBinaryClassificationMetricsExample.java | 25 +++++++
 .../BinaryClassificationMetricsExample.scala  | 36 +++++++++++
 .../BinaryClassificationMetrics.scala         | 36 +++++++++++
 .../BinaryClassificationMetricComputers.scala | 58 +++++++++++++++++
 4 files changed, 155 insertions(+)

diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaBinaryClassificationMetricsExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaBinaryClassificationMetricsExample.java
index b9d0313c6bb56..e068e24f051e0 100644
--- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaBinaryClassificationMetricsExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaBinaryClassificationMetricsExample.java
@@ -68,6 +68,31 @@ public static void main(String[] args) {
     JavaRDD recall = metrics.recallByThreshold().toJavaRDD();
     System.out.println("Recall by threshold: " + recall.collect());
 
+    // False omission rate
+    JavaRDD falseOmissionRate = metrics.forByThreshold().toJavaRDD();
+    System.out.println("False omission rate by threshold: " + falseOmissionRate.collect());
+
+    // False discovery rate
+    JavaRDD falseDiscoveryRate = metrics.fdrByThreshold().toJavaRDD();
+    System.out.println("False discovery rate by threshold: " + falseDiscoveryRate.collect());
+
+    // Negative predictive value
+    JavaRDD negativePredictiveValue = metrics.npvByThreshold().toJavaRDD();
+    System.out.println(
+      "Negative predictive value by threshold: " + negativePredictiveValue.collect());
+
+    // False negative rate
+    JavaRDD falseNegativeRate = metrics.fnrByThreshold().toJavaRDD();
+    System.out.println("False negative rate by threshold: " + falseNegativeRate.collect());
+
+    // True negative rate
+    JavaRDD trueNegativeRate = metrics.specificityByThreshold().toJavaRDD();
+    System.out.println("True negative rate by threshold: " + trueNegativeRate.collect());
+
+    // False positive rate
+    JavaRDD falsePositiveRate = metrics.fprByThreshold().toJavaRDD();
+    System.out.println("False positive rate by threshold: " + falsePositiveRate.collect());
+
     // F Score by threshold
     JavaRDD f1Score = metrics.fMeasureByThreshold().toJavaRDD();
     System.out.println("F1 Score by threshold: " + f1Score.collect());
diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/BinaryClassificationMetricsExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/BinaryClassificationMetricsExample.scala
index b9263ac6fcff6..5b204664ff367 100644
--- a/examples/src/main/scala/org/apache/spark/examples/mllib/BinaryClassificationMetricsExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/BinaryClassificationMetricsExample.scala
@@ -69,6 +69,42 @@ object BinaryClassificationMetricsExample {
       println(s"Threshold: $t, Recall: $r")
     }
 
+    // False omission rate
+    val falseOmissionRate = metrics.forByThreshold()
+    falseOmissionRate.foreach { case (t, r) =>
+      println(s"Threshold: $t, False omission rate: $r")
+    }
+
+    // False discovery rate
+    val falseDiscoveryRate = metrics.fdrByThreshold()
+    falseDiscoveryRate.foreach { case (t, r) =>
+      println(s"Threshold: $t, False discovery rate: $r")
+    }
+
+    // Negative predictive value
+    val negativePredictiveValue = metrics.npvByThreshold()
+    negativePredictiveValue.foreach { case (t, r) =>
+      println(s"Threshold: $t, Negative predictive value: $r")
+    }
+
+    // False negative rate
+    val falseNegativeRate = metrics.fnrByThreshold()
+    falseNegativeRate.foreach { case (t, r) =>
+      println(s"Threshold: $t, False negative rate: $r")
+    }
+
+    // True negative rate
+    val trueNegativeRate = metrics.specificityByThreshold()
+    trueNegativeRate.foreach { case (t, r) =>
+      println(s"Threshold: $t, True negative rate: $r")
+    }
+
+    // False positive rate
+    val falsePositiveRate = metrics.fprByThreshold()
+    falsePositiveRate.foreach { case (t, r) =>
+      println(s"Threshold: $t, False positive rate: $r")
+    }
+
     // Precision-Recall Curve
     val PRC = metrics.pr
 
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/BinaryClassificationMetrics.scala b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/BinaryClassificationMetrics.scala
index 2cfcf38eb4ca8..7fd4c74ccaee8 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/BinaryClassificationMetrics.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/BinaryClassificationMetrics.scala
@@ -143,6 +143,42 @@ class BinaryClassificationMetrics @Since("1.3.0") (
   @Since("1.0.0")
   def recallByThreshold(): RDD[(Double, Double)] = createCurve(Recall)
 
+  /**
+   * Returns the (threshold, false omission rate) curve.
+   */
+  @Since("2.4.0")
+  def forByThreshold(): RDD[(Double, Double)] = createCurve(FalseOmissionRate)
+
+  /**
+   * Returns the (threshold, false discovery rate) curve.
+   */
+  @Since("2.4.0")
+  def fdrByThreshold(): RDD[(Double, Double)] = createCurve(FalseDiscoveryRate)
+
+  /**
+   * Returns the (threshold, negative predictive value) curve.
+   */
+  @Since("2.4.0")
+  def npvByThreshold(): RDD[(Double, Double)] = createCurve(NegativePredictiveValue)
+
+  /**
+   * Returns the (threshold, false negative rate) curve.
+   */
+  @Since("2.4.0")
+  def fnrByThreshold(): RDD[(Double, Double)] = createCurve(FalseNegativeRate)
+
+  /**
+   * Returns the (threshold, true negative rate) curve.
+   */
+  @Since("2.4.0")
+  def specificityByThreshold(): RDD[(Double, Double)] = createCurve(TrueNegativeRate)
+
+  /**
+   * Returns the (threshold, false positive rate) curve.
+   */
+  @Since("2.4.0")
+  def fprByThreshold(): RDD[(Double, Double)] = createCurve(FalsePositiveRate)
+
   private lazy val (
     cumulativeCounts: RDD[(Double, BinaryLabelCounter)],
     confusions: RDD[(Double, BinaryConfusionMatrix)]) = {
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/binary/BinaryClassificationMetricComputers.scala b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/binary/BinaryClassificationMetricComputers.scala
index 5a4c6aef50b7b..bfda826c08d57 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/binary/BinaryClassificationMetricComputers.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/binary/BinaryClassificationMetricComputers.scala
@@ -58,6 +58,64 @@ private[evaluation] object Recall extends BinaryClassificationMetricComputer {
   }
 }
 
+/** False omission rate. Defined as 0.0 when no example is predicted negative. */
+private[evaluation] object FalseOmissionRate extends BinaryClassificationMetricComputer {
+  override def apply(c: BinaryConfusionMatrix): Double = {
+    val totalPredictedNegatives = c.numTrueNegatives + c.numFalseNegatives
+    if (totalPredictedNegatives == 0) {
+      0.0
+    } else {
+      c.numFalseNegatives.toDouble / totalPredictedNegatives
+    }
+  }
+}
+
+/** False discovery rate. Defined as 0.0 when no example is predicted positive. */
+private[evaluation] object FalseDiscoveryRate extends BinaryClassificationMetricComputer {
+  override def apply(c: BinaryConfusionMatrix): Double = {
+    val totalPredictedPositives = c.numTruePositives + c.numFalsePositives
+    if (totalPredictedPositives == 0) {
+      0.0
+    } else {
+      c.numFalsePositives.toDouble / totalPredictedPositives
+    }
+  }
+}
+
+/** Negative predictive value. Defined as 0.0 when no example is predicted negative. */
+private[evaluation] object NegativePredictiveValue extends BinaryClassificationMetricComputer {
+  override def apply(c: BinaryConfusionMatrix): Double = {
+    val totalPredictedNegatives = c.numTrueNegatives + c.numFalseNegatives
+    if (totalPredictedNegatives == 0) {
+      0.0
+    } else {
+      c.numTrueNegatives.toDouble / totalPredictedNegatives
+    }
+  }
+}
+
+/** False negative rate. Defined as 0.0 when there are no positive examples. */
+private[evaluation] object FalseNegativeRate extends BinaryClassificationMetricComputer {
+  override def apply(c: BinaryConfusionMatrix): Double = {
+    if (c.numPositives == 0) {
+      0.0
+    } else {
+      c.numFalseNegatives.toDouble / c.numPositives
+    }
+  }
+}
+
+/** True negative rate. Defined as 0.0 when there are no negative examples. */
+private[evaluation] object TrueNegativeRate extends BinaryClassificationMetricComputer {
+  override def apply(c: BinaryConfusionMatrix): Double = {
+    if (c.numNegatives == 0) {
+      0.0
+    } else {
+      c.numTrueNegatives.toDouble / c.numNegatives
+    }
+  }
+}
+
 /**
  * F-Measure. Defined as 0 if both precision and recall are 0. EG in the case that all examples
  * are false positives.