[SPARK-14682][ML] Provide evaluateEachIteration method or equivalent for spark.ml GBTs #21097
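
Summary: adds an `evaluateEachIteration` method to `GBTClassificationModel` and `GBTRegressionModel` in spark.ml, mirroring the spark.mllib `GradientBoostedTreesModel.evaluateEachIteration` API, so users can compute the validation error or loss of the ensemble after each boosting iteration.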

Closed · wants to merge 5 commits (changes shown from all commits)
@@ -334,6 +334,21 @@ class GBTClassificationModel private[ml](
// hard-coded loss, which is not meant to be changed in the model
private val loss = getOldLossType

/**
* Method to compute error or loss for every iteration of gradient boosting.
*
* @param dataset Dataset for validation.
* @return an array with index i holding the error or loss of the ensemble containing the first i+1 trees
*/
@Since("2.4.0")
def evaluateEachIteration(dataset: Dataset[_]): Array[Double] = {
val data = dataset.select(col($(labelCol)), col($(featuresCol))).rdd.map {
case Row(label: Double, features: Vector) => LabeledPoint(label, features)
}
GradientBoostedTrees.evaluateEachIteration(data, trees, treeWeights, loss,
  OldAlgo.Classification)
}

@Since("2.0.0")
override def write: MLWriter = new GBTClassificationModel.GBTClassificationModelWriter(this)
}
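
For context, a minimal usage sketch of the new classifier API (the `train` and `validation` DataFrames and the parameter values are illustrative, not part of this diff):

```scala
import org.apache.spark.ml.classification.GBTClassifier

// Fit a small boosted ensemble, then track how the validation loss
// evolves as trees are added.
val gbt = new GBTClassifier().setMaxIter(10).setMaxDepth(2)
val model = gbt.fit(train) // train: DataFrame with "label"/"features" columns
val lossPerIter = model.evaluateEachIteration(validation)
lossPerIter.zipWithIndex.foreach { case (loss, i) =>
  println(s"trees = ${i + 1}, validation loss = $loss")
}
```

One natural use is choosing an iteration count: stop adding trees where the validation loss stops improving.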
@@ -34,7 +34,7 @@ import org.apache.spark.ml.util.DefaultParamsReader.Metadata
import org.apache.spark.mllib.tree.configuration.{Algo => OldAlgo}
import org.apache.spark.mllib.tree.model.{GradientBoostedTreesModel => OldGBTModel}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Dataset, Row}
import org.apache.spark.sql.functions._

/**
@@ -269,6 +269,21 @@ class GBTRegressionModel private[ml](
new OldGBTModel(OldAlgo.Regression, _trees.map(_.toOld), _treeWeights)
}

/**
* Method to compute error or loss for every iteration of gradient boosting.
*
* @param dataset Dataset for validation.
> Review comment (Member): Add doc for "loss" arg, including what the options are
* @param loss The loss function used to compute error. Supported options: "squared", "absolute"
* @return an array with index i holding the error or loss of the ensemble containing the first i+1 trees
*/
@Since("2.4.0")
def evaluateEachIteration(dataset: Dataset[_], loss: String): Array[Double] = {
val data = dataset.select(col($(labelCol)), col($(featuresCol))).rdd.map {
case Row(label: Double, features: Vector) => LabeledPoint(label, features)
}
GradientBoostedTrees.evaluateEachIteration(data, trees, treeWeights,
convertToOldLossType(loss), OldAlgo.Regression)
}

@Since("2.0.0")
override def write: MLWriter = new GBTRegressionModel.GBTRegressionModelWriter(this)
}
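
A similar sketch for the regressor, which additionally takes the evaluation loss as an argument (again, `train` and `validation` are illustrative DataFrames):

```scala
import org.apache.spark.ml.regression.GBTRegressor

val gbt = new GBTRegressor().setMaxIter(10).setLossType("squared")
val model = gbt.fit(train)

// The evaluation loss is independent of the loss used for training.
val squaredErr  = model.evaluateEachIteration(validation, "squared")
val absoluteErr = model.evaluateEachIteration(validation, "absolute")
```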
@@ -579,7 +579,11 @@ private[ml] trait GBTRegressorParams extends GBTParams with TreeRegressorParams

/** (private[ml]) Convert new loss to old loss. */
override private[ml] def getOldLossType: OldLoss = {
convertToOldLossType(getLossType)
}

private[ml] def convertToOldLossType(loss: String): OldLoss = {
loss match {
case "squared" => OldSquaredError
case "absolute" => OldAbsoluteError
case _ =>
@@ -25,7 +25,7 @@ import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.ml.param.ParamsSuite
import org.apache.spark.ml.regression.DecisionTreeRegressionModel
import org.apache.spark.ml.tree.RegressionLeafNode
import org.apache.spark.ml.tree.impl.{GradientBoostedTrees, TreeTests}
import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTest, MLTestingUtils}
import org.apache.spark.ml.util.TestingUtils._
import org.apache.spark.mllib.regression.{LabeledPoint => OldLabeledPoint}
@@ -365,6 +365,33 @@ class GBTClassifierSuite extends MLTest with DefaultReadWriteTest {
assert(mostImportantFeature !== mostIF)
}

test("model evaluateEachIteration") {
val gbt = new GBTClassifier()
.setSeed(1L)
.setMaxDepth(2)
.setMaxIter(3)
.setLossType("logistic")
val model3 = gbt.fit(trainData.toDF)
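// model1 and model2 are truncated copies of model3 (its first 1 and
// first 2 trees), giving reference ensembles for each boosting iteration.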
val model1 = new GBTClassificationModel("gbt-cls-model-test1",
model3.trees.take(1), model3.treeWeights.take(1), model3.numFeatures, model3.numClasses)
val model2 = new GBTClassificationModel("gbt-cls-model-test2",
model3.trees.take(2), model3.treeWeights.take(2), model3.numFeatures, model3.numClasses)

val evalArr = model3.evaluateEachIteration(validationData.toDF)
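// GBT classification trains on labels remapped from {0, 1} to {-1, +1}
// for the logistic loss, so remap before computing the reference error.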
val remappedValidationData = validationData.map(
x => new LabeledPoint((x.label * 2) - 1, x.features))
val lossErr1 = GradientBoostedTrees.computeError(remappedValidationData,
model1.trees, model1.treeWeights, model1.getOldLossType)
val lossErr2 = GradientBoostedTrees.computeError(remappedValidationData,
model2.trees, model2.treeWeights, model2.getOldLossType)
val lossErr3 = GradientBoostedTrees.computeError(remappedValidationData,
model3.trees, model3.treeWeights, model3.getOldLossType)

assert(evalArr(0) ~== lossErr1 relTol 1E-3)
assert(evalArr(1) ~== lossErr2 relTol 1E-3)
assert(evalArr(2) ~== lossErr3 relTol 1E-3)
}

/////////////////////////////////////////////////////////////////////////////
// Tests of model save/load
/////////////////////////////////////////////////////////////////////////////
@@ -20,8 +20,9 @@ package org.apache.spark.ml.regression
import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.feature.LabeledPoint
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.ml.tree.impl.{GradientBoostedTrees, TreeTests}
import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTest, MLTestingUtils}
import org.apache.spark.ml.util.TestingUtils._
import org.apache.spark.mllib.regression.{LabeledPoint => OldLabeledPoint}
import org.apache.spark.mllib.tree.{EnsembleTestHelper, GradientBoostedTrees => OldGBT}
import org.apache.spark.mllib.tree.configuration.{Algo => OldAlgo}
Expand Down Expand Up @@ -201,7 +202,34 @@ class GBTRegressorSuite extends MLTest with DefaultReadWriteTest {
assert(mostImportantFeature !== mostIF)
}

test("model evaluateEachIteration") {
for (lossType <- GBTRegressor.supportedLossTypes) {
val gbt = new GBTRegressor()
.setSeed(1L)
.setMaxDepth(2)
.setMaxIter(3)
.setLossType(lossType)
val model3 = gbt.fit(trainData.toDF)
val model1 = new GBTRegressionModel("gbt-reg-model-test1",
model3.trees.take(1), model3.treeWeights.take(1), model3.numFeatures)
val model2 = new GBTRegressionModel("gbt-reg-model-test2",
model3.trees.take(2), model3.treeWeights.take(2), model3.numFeatures)

for (evalLossType <- GBTRegressor.supportedLossTypes) {
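// evaluateEachIteration need not use the loss the model was trained with;
// check every supported evaluation loss against every training loss.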
val evalArr = model3.evaluateEachIteration(validationData.toDF, evalLossType)
val lossErr1 = GradientBoostedTrees.computeError(validationData,
model1.trees, model1.treeWeights, model1.convertToOldLossType(evalLossType))
val lossErr2 = GradientBoostedTrees.computeError(validationData,
model2.trees, model2.treeWeights, model2.convertToOldLossType(evalLossType))
val lossErr3 = GradientBoostedTrees.computeError(validationData,
model3.trees, model3.treeWeights, model3.convertToOldLossType(evalLossType))

assert(evalArr(0) ~== lossErr1 relTol 1E-3)
assert(evalArr(1) ~== lossErr2 relTol 1E-3)
assert(evalArr(2) ~== lossErr3 relTol 1E-3)
}
}
}

/////////////////////////////////////////////////////////////////////////////
// Tests of model save/load