Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SPARK-14682][ML] Provide evaluateEachIteration method or equivalent for spark.ml GBTs #21097

Closed
wants to merge 5 commits into from
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
Expand Up @@ -273,6 +273,7 @@ class GBTRegressionModel private[ml](
* Method to compute error or loss for every iteration of gradient boosting.
*
* @param dataset Dataset for validation.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Add doc for "loss" arg, including what the options are

* @param loss The loss function used to compute error. Supported options: "squared", "absolute"
*/
@Since("2.4.0")
def evaluateEachIteration(dataset: Dataset[_], loss: String): Array[Double] = {
Expand Down
Expand Up @@ -367,11 +367,31 @@ class GBTClassifierSuite extends MLTest with DefaultReadWriteTest {

test("model evaluateEachIteration") {
  // Train a 3-iteration GBT classifier, then build prefix models holding the
  // first 1 and 2 trees so each boosting iteration's error can be checked
  // independently against evaluateEachIteration's per-iteration output.
  val gbt = new GBTClassifier()
    .setSeed(1L)
    .setMaxDepth(2)
    .setMaxIter(3)
    .setLossType("logistic")
  val model3 = gbt.fit(trainData.toDF)
  val model1 = new GBTClassificationModel("gbt-cls-model-test1",
    model3.trees.take(1), model3.treeWeights.take(1), model3.numFeatures, model3.numClasses)
  val model2 = new GBTClassificationModel("gbt-cls-model-test2",
    model3.trees.take(2), model3.treeWeights.take(2), model3.numFeatures, model3.numClasses)

  val evalArr = model3.evaluateEachIteration(validationData.toDF)
  // GBT classification internally trains on {-1, +1} labels; remap the
  // {0, 1} labels so the old-API loss computation sees the same encoding.
  val remappedValidationData = validationData.map(
    x => new LabeledPoint((x.label * 2) - 1, x.features))
  val lossErr1 = GradientBoostedTrees.computeError(remappedValidationData,
    model1.trees, model1.treeWeights, model1.getOldLossType)
  val lossErr2 = GradientBoostedTrees.computeError(remappedValidationData,
    model2.trees, model2.treeWeights, model2.getOldLossType)
  val lossErr3 = GradientBoostedTrees.computeError(remappedValidationData,
    model3.trees, model3.treeWeights, model3.getOldLossType)

  // Error after i iterations must equal the error of the i-tree prefix model.
  assert(evalArr(0) ~== lossErr1 relTol 1E-3)
  assert(evalArr(1) ~== lossErr2 relTol 1E-3)
  assert(evalArr(2) ~== lossErr3 relTol 1E-3)
}

/////////////////////////////////////////////////////////////////////////////
Expand Down
Expand Up @@ -20,7 +20,7 @@ package org.apache.spark.ml.regression
import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.feature.LabeledPoint
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.ml.tree.impl.TreeTests
import org.apache.spark.ml.tree.impl.{GradientBoostedTrees, TreeTests}
import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTest, MLTestingUtils}
import org.apache.spark.ml.util.TestingUtils._
import org.apache.spark.mllib.regression.{LabeledPoint => OldLabeledPoint}
Expand Down Expand Up @@ -203,16 +203,32 @@ class GBTRegressorSuite extends MLTest with DefaultReadWriteTest {
}

test("model evaluateEachIteration") {
  // For every supported training loss, fit a 3-iteration model, build prefix
  // models holding the first 1 and 2 trees, and verify evaluateEachIteration
  // against the old-API per-iteration error for every supported evaluation
  // loss (training loss and evaluation loss may differ).
  for (lossType <- GBTRegressor.supportedLossTypes) {
    val gbt = new GBTRegressor()
      .setSeed(1L)
      .setMaxDepth(2)
      .setMaxIter(3)
      .setLossType(lossType)
    val model3 = gbt.fit(trainData.toDF)
    val model1 = new GBTRegressionModel("gbt-reg-model-test1",
      model3.trees.take(1), model3.treeWeights.take(1), model3.numFeatures)
    val model2 = new GBTRegressionModel("gbt-reg-model-test2",
      model3.trees.take(2), model3.treeWeights.take(2), model3.numFeatures)

    for (evalLossType <- GBTRegressor.supportedLossTypes) {
      val evalArr = model3.evaluateEachIteration(validationData.toDF, evalLossType)
      val lossErr1 = GradientBoostedTrees.computeError(validationData,
        model1.trees, model1.treeWeights, model1.convertToOldLossType(evalLossType))
      val lossErr2 = GradientBoostedTrees.computeError(validationData,
        model2.trees, model2.treeWeights, model2.convertToOldLossType(evalLossType))
      val lossErr3 = GradientBoostedTrees.computeError(validationData,
        model3.trees, model3.treeWeights, model3.convertToOldLossType(evalLossType))

      // Error after i iterations must equal the error of the i-tree prefix model.
      assert(evalArr(0) ~== lossErr1 relTol 1E-3)
      assert(evalArr(1) ~== lossErr2 relTol 1E-3)
      assert(evalArr(2) ~== lossErr3 relTol 1E-3)
    }
  }
}

/////////////////////////////////////////////////////////////////////////////
Expand Down