Skip to content

Commit

Permalink
[SPARK-29960][ML][PYSPARK] MulticlassClassificationEvaluator support …
Browse files Browse the repository at this point in the history
…hammingLoss

### What changes were proposed in this pull request?
MulticlassClassificationEvaluator support hammingLoss

### Why are the changes needed?
1. It is easy to compute the Hamming loss from the confusion matrix.
2. scikit-learn supports it.

### Does this PR introduce any user-facing change?
yes

### How was this patch tested?
Added test suites.

Closes #26597 from zhengruifeng/multi_class_hamming_loss.

Authored-by: zhengruifeng <ruifengz@foxmail.com>
Signed-off-by: zhengruifeng <ruifengz@foxmail.com>
  • Loading branch information
zhengruifeng committed Nov 21, 2019
1 parent 297cbab commit 0f40d2a
Show file tree
Hide file tree
Showing 4 changed files with 58 additions and 4 deletions.
Expand Up @@ -46,7 +46,7 @@ class MulticlassClassificationEvaluator @Since("1.5.0") (@Since("1.5.0") overrid
* `"weightedPrecision"`, `"weightedRecall"`, `"weightedTruePositiveRate"`,
* `"weightedFalsePositiveRate"`, `"weightedFMeasure"`, `"truePositiveRateByLabel"`,
* `"falsePositiveRateByLabel"`, `"precisionByLabel"`, `"recallByLabel"`,
* `"fMeasureByLabel"`, `"logLoss"`)
* `"fMeasureByLabel"`, `"logLoss"`, `"hammingLoss"`)
*
* @group param
*/
Expand Down Expand Up @@ -172,13 +172,15 @@ class MulticlassClassificationEvaluator @Since("1.5.0") (@Since("1.5.0") overrid
case "precisionByLabel" => metrics.precision($(metricLabel))
case "recallByLabel" => metrics.recall($(metricLabel))
case "fMeasureByLabel" => metrics.fMeasure($(metricLabel), $(beta))
case "hammingLoss" => metrics.hammingLoss
case "logLoss" => metrics.logLoss($(eps))
}
}

@Since("1.5.0")
override def isLargerBetter: Boolean = $(metricName) match {
case "weightedFalsePositiveRate" | "falsePositiveRateByLabel" | "logLoss" => false
case "weightedFalsePositiveRate" | "falsePositiveRateByLabel" | "logLoss" | "hammingLoss" =>
false
case _ => true
}

Expand All @@ -199,7 +201,7 @@ object MulticlassClassificationEvaluator
private val supportedMetricNames = Array("f1", "accuracy", "weightedPrecision", "weightedRecall",
"weightedTruePositiveRate", "weightedFalsePositiveRate", "weightedFMeasure",
"truePositiveRateByLabel", "falsePositiveRateByLabel", "precisionByLabel", "recallByLabel",
"fMeasureByLabel", "logLoss")
"fMeasureByLabel", "logLoss", "hammingLoss")

@Since("1.6.0")
override def load(path: String): MulticlassClassificationEvaluator = super.load(path)
Expand Down
Expand Up @@ -240,6 +240,23 @@ class MulticlassMetrics @Since("1.1.0") (predictionAndLabels: RDD[_ <: Product])
@Since("1.1.0")
lazy val labels: Array[Double] = tpByClass.keys.toArray.sorted

/**
 * Returns the Hamming loss: the weight of misclassified examples divided by
 * the total example weight.
 */
@Since("3.0.0")
lazy val hammingLoss: Double = {
  // Single pass over the confusion entries ((label, prediction) -> weight),
  // accumulating (misclassified weight, total weight) without mutable state.
  val (misclassifiedWeight, totalWeight) =
    confusions.iterator.foldLeft((0.0, 0.0)) {
      case ((wrong, total), ((label, prediction), weight)) =>
        val updatedWrong = if (label != prediction) wrong + weight else wrong
        (updatedWrong, total + weight)
    }
  misclassifiedWeight / totalWeight
}

/**
* Returns the log-loss, aka logistic loss or cross-entropy loss.
* @param eps log-loss is undefined for p=0 or p=1, so probabilities are
Expand Down
Expand Up @@ -254,4 +254,35 @@ class MulticlassMetricsSuite extends SparkFunSuite with MLlibTestSparkContext {
val metrics2 = new MulticlassMetrics(rdd2)
assert(metrics2.logLoss() ~== 0.9682005730687164 relTol delta)
}

test("MulticlassMetrics supports hammingLoss") {
/*
Using the following Python code to verify the correctness.
from sklearn.metrics import hamming_loss
y_true = [2, 2, 3, 4]
y_pred = [1, 2, 3, 4]
weights = [1.5, 2.0, 1.0, 0.5]
>>> hamming_loss(y_true, y_pred)
0.25
>>> hamming_loss(y_true, y_pred, sample_weight=weights)
0.3
*/

val preds = Seq(1.0, 2.0, 3.0, 4.0)
val labels = Seq(2.0, 2.0, 3.0, 4.0)
val weights = Seq(1.5, 2.0, 1.0, 0.5)

val rdd = sc.parallelize(preds.zip(labels))
val metrics = new MulticlassMetrics(rdd)
assert(metrics.hammingLoss ~== 0.25 relTol delta)

val rdd2 = sc.parallelize(preds.zip(labels).zip(weights))
.map { case ((pred, label), weight) =>
(pred, label, weight)
}
val metrics2 = new MulticlassMetrics(rdd2)
assert(metrics2.hammingLoss ~== 0.3 relTol delta)
}
}
6 changes: 5 additions & 1 deletion python/pyspark/ml/evaluation.py
Expand Up @@ -374,6 +374,10 @@ class MulticlassClassificationEvaluator(JavaEvaluator, HasLabelCol, HasPredictio
>>> evaluator.evaluate(dataset, {evaluator.metricName: "truePositiveRateByLabel",
... evaluator.metricLabel: 1.0})
0.75...
>>> evaluator.setMetricName("hammingLoss")
MulticlassClassificationEvaluator...
>>> evaluator.evaluate(dataset)
0.33...
>>> mce_path = temp_path + "/mce"
>>> evaluator.save(mce_path)
>>> evaluator2 = MulticlassClassificationEvaluator.load(mce_path)
Expand Down Expand Up @@ -408,7 +412,7 @@ class MulticlassClassificationEvaluator(JavaEvaluator, HasLabelCol, HasPredictio
"(f1|accuracy|weightedPrecision|weightedRecall|weightedTruePositiveRate|"
"weightedFalsePositiveRate|weightedFMeasure|truePositiveRateByLabel|"
"falsePositiveRateByLabel|precisionByLabel|recallByLabel|fMeasureByLabel|"
"logLoss)",
"logLoss|hammingLoss)",
typeConverter=TypeConverters.toString)
metricLabel = Param(Params._dummy(), "metricLabel",
"The class whose metric will be computed in truePositiveRateByLabel|"
Expand Down

0 comments on commit 0f40d2a

Please sign in to comment.