From 52ec9cd1c9920650cb588d15e1301e998a036371 Mon Sep 17 00:00:00 2001
From: sethah
Date: Wed, 24 Aug 2016 17:13:33 -0700
Subject: [PATCH 01/24] first pass at merging MLOR with LOR
---
.../classification/LogisticRegression.scala | 422 ++++++++++++++----
.../classification/LogisticRegression.scala | 5 +-
.../MultinomialLogisticRegressionSuite.scala | 210 ++++-----
3 files changed, 454 insertions(+), 183 deletions(-)
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
index 757d52052d87f..c8c06a4d7752b 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
@@ -50,6 +50,8 @@ private[classification] trait LogisticRegressionParams extends ProbabilisticClas
with HasRegParam with HasElasticNetParam with HasMaxIter with HasFitIntercept with HasTol
with HasStandardization with HasWeightCol with HasThreshold with HasAggregationDepth {
+ import LogisticRegression._
+
/**
* Set threshold in binary classification, in range [0, 1].
*
@@ -71,6 +73,25 @@ private[classification] trait LogisticRegressionParams extends ProbabilisticClas
set(threshold, value)
}
+
+ /**
+ * Param for the name of family which is a description of the error distribution
+ * to be used in the model.
+ * Supported options: "multinomial", "binomial".
+ * Default is "multinomial".
+ *
+ * @group param
+ */
+ @Since("2.0.0")
+ final val family: Param[String] = new Param(this, "family",
+ "The name of family which is a description of the error distribution to be used in the " +
+ s"model. Supported options: ${supportedFamilyNames.mkString(", ")}.",
+ ParamValidators.inArray[String](supportedFamilyNames))
+
+ /** @group getParam */
+ @Since("2.0.0")
+ def getFamily: String = $(family)
+
/**
* Get threshold for binary classification.
*
@@ -220,6 +241,17 @@ class LogisticRegression @Since("1.2.0") (
def setFitIntercept(value: Boolean): this.type = set(fitIntercept, value)
setDefault(fitIntercept -> true)
+ /**
+ * Sets the value of param [[family]].
+ * Default is "multinomial".
+ *
+ * @group setParam
+ */
+ // TODO: don't use strings?
+ @Since("2.0.0")
+ def setFamily(value: String): this.type = set(family, value)
+ setDefault(family -> "multinomial")
+
/**
* Whether to standardize the training features before fitting the model.
* The coefficients of models will be always returned on the original scale,
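
A quick usage sketch of the new family param (hedged: `df` is an assumed DataFrame with the usual "label"/"features" columns; at this point in the series the supported names are "binomial" and "multinomial", and the default is "multinomial"):

    // Hypothetical usage; df is not part of this patch.
    val lor = new org.apache.spark.ml.classification.LogisticRegression()
      .setFamily("binomial") // validated against supportedFamilyNames
    val model = lor.fit(df)
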
@@ -311,8 +343,25 @@ class LogisticRegression @Since("1.2.0") (
val histogram = labelSummarizer.histogram
val numInvalid = labelSummarizer.countInvalid
- val numClasses = histogram.length
val numFeatures = summarizer.mean.size
+ val numFeaturesPlusIntercept = if (getFitIntercept) numFeatures + 1 else numFeatures
+
+ val numClasses = MetadataUtils.getNumClasses(dataset.schema($(labelCol))) match {
+ case Some(n: Int) =>
+ require(n >= histogram.length, s"Specified number of classes $n was " +
+ s"less than the number of unique labels ${histogram.length}")
+ n
+ case None => histogram.length
+ }
+ val isBinaryClassification = numClasses == 1 || numClasses == 2
+ val isMultinomial = !((!isSet(family) && isBinaryClassification) || $(family) == "binomial")
+ val numCoefficientSets = if (isMultinomial) numClasses else 1
+
+ // TODO: use enumeration or similar
+ if (!isMultinomial) {
+ require(isBinaryClassification, s"Binomial family only supports 1 or 2" +
+ s"outcome classes but found $numClasses")
+ }
if (isDefined(thresholds)) {
require($(thresholds).length == numClasses, this.getClass.getSimpleName +
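
The numClasses resolution above prefers the class count recorded in the label column's ML metadata and falls back to the observed label histogram. A minimal sketch of that logic as a standalone helper (hypothetical, for illustration):

    // Prefer metadata; otherwise infer from the number of histogram bins.
    def resolveNumClasses(fromMetadata: Option[Int], histogram: Array[Double]): Int =
      fromMetadata match {
        case Some(n) =>
          require(n >= histogram.length, s"Specified number of classes $n was " +
            s"less than the number of unique labels ${histogram.length}")
          n
        case None => histogram.length
      }
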
@@ -333,22 +382,18 @@ class LogisticRegression @Since("1.2.0") (
val isConstantLabel = histogram.count(_ != 0) == 1
- if (numClasses > 2) {
- val msg = s"LogisticRegression with ElasticNet in ML package only supports " +
- s"binary classification. Found $numClasses in the input dataset. Consider using " +
- s"MultinomialLogisticRegression instead."
- logError(msg)
- throw new SparkException(msg)
- } else if ($(fitIntercept) && numClasses == 2 && isConstantLabel) {
- logWarning(s"All labels are one and fitIntercept=true, so the coefficients will be " +
- s"zeros and the intercept will be positive infinity; as a result, " +
- s"training is not needed.")
- (Vectors.sparse(numFeatures, Seq()), Double.PositiveInfinity, Array.empty[Double])
- } else if ($(fitIntercept) && numClasses == 1) {
- logWarning(s"All labels are zero and fitIntercept=true, so the coefficients will be " +
- s"zeros and the intercept will be negative infinity; as a result, " +
- s"training is not needed.")
- (Vectors.sparse(numFeatures, Seq()), Double.NegativeInfinity, Array.empty[Double])
+ if ($(fitIntercept) && isConstantLabel) {
+ logWarning(s"All labels are the same value and fitIntercept=true, so the coefficients " +
+ s"will be zeros. Training is not needed.")
+ val constantLabelIndex = Vectors.dense(histogram).argmax
+ val coefficientMatrix = Matrices.sparse(numCoefficientSets, numFeatures,
+ Array.fill(numFeatures + 1)(0), Array.empty[Int], Array.empty[Double])
+ val interceptVector = if (isMultinomial) {
+ Vectors.sparse(numClasses, Seq((constantLabelIndex, Double.PositiveInfinity)))
+ } else {
+ Vectors.dense(if (numClasses == 2) Double.PositiveInfinity else Double.NegativeInfinity)
+ }
+ (coefficientMatrix, interceptVector, Array.empty[Double])
} else {
if (!$(fitIntercept) && isConstantLabel) {
logWarning(s"All labels belong to a single class and fitIntercept=false. It's a " +
@@ -370,35 +415,52 @@ class LogisticRegression @Since("1.2.0") (
val bcFeaturesStd = instances.context.broadcast(featuresStd)
val costFun = new LogisticCostFun(instances, numClasses, $(fitIntercept),
- $(standardization), bcFeaturesStd, regParamL2, multinomial = false, $(aggregationDepth))
+ $(standardization), bcFeaturesStd, regParamL2, multinomial = isMultinomial,
+ $(aggregationDepth))
val optimizer = if ($(elasticNetParam) == 0.0 || $(regParam) == 0.0) {
new BreezeLBFGS[BDV[Double]]($(maxIter), 10, $(tol))
} else {
val standardizationParam = $(standardization)
+ // TODO: check this works in both cases
def regParamL1Fun = (index: Int) => {
// Remove the L1 penalization on the intercept
- if (index == numFeatures) {
+ val isIntercept = $(fitIntercept) && ((index + 1) % numFeaturesPlusIntercept == 0)
+ if (isIntercept) {
0.0
} else {
if (standardizationParam) {
regParamL1
} else {
+ val featureIndex = if ($(fitIntercept)) {
+ index % numFeaturesPlusIntercept
+ } else {
+ index % numFeatures
+ }
// If `standardization` is false, we still standardize the data
// to improve the rate of convergence; as a result, we have to
// perform this reverse standardization by penalizing each component
// differently to get effectively the same objective function when
// the training dataset is not standardized.
- if (featuresStd(index) != 0.0) regParamL1 / featuresStd(index) else 0.0
+ if (featuresStd(featureIndex) != 0.0) {
+ regParamL1 / featuresStd(featureIndex)
+ } else {
+ 0.0
+ }
}
}
}
new BreezeOWLQN[Int, BDV[Double]]($(maxIter), 10, regParamL1Fun, $(tol))
}
- val initialCoefficientsWithIntercept =
- Vectors.zeros(if ($(fitIntercept)) numFeatures + 1 else numFeatures)
+ // TODO: double check this
+ val initialCoefficientsWithIntercept = if (isMultinomial) {
+ Vectors.zeros(numClasses * numFeaturesPlusIntercept)
+ } else {
+ Vectors.zeros(numFeaturesPlusIntercept)
+ }
+ // TODO: need to add this for multinomial case
if (optInitialModel.isDefined && optInitialModel.get.coefficients.size != numFeatures) {
val vecSize = optInitialModel.get.coefficients.size
logWarning(
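
The regParamL1Fun above depends on the flattened coefficient layout: numCoefficientSets consecutive blocks, each numFeatures long plus one trailing intercept slot when fitIntercept is true. A sketch of the index arithmetic (hypothetical helpers, extracted for illustration):

    // The intercept sits last in each block, so intercept indices are exactly
    // those where (index + 1) is a multiple of the block length.
    def isInterceptIndex(index: Int, numFeatures: Int, fitIntercept: Boolean): Boolean =
      fitIntercept && (index + 1) % (numFeatures + 1) == 0

    // Map a flat index back to its feature column (used to undo standardization).
    def featureIndexOf(index: Int, numFeatures: Int, fitIntercept: Boolean): Int =
      if (fitIntercept) index % (numFeatures + 1) else index % numFeatures
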
@@ -406,13 +468,46 @@ class LogisticRegression @Since("1.2.0") (
s"expected size $numFeatures")
}
- if (optInitialModel.isDefined && optInitialModel.get.coefficients.size == numFeatures) {
- val initialCoefficientsWithInterceptArray = initialCoefficientsWithIntercept.toArray
- optInitialModel.get.coefficients.foreachActive { case (index, value) =>
- initialCoefficientsWithInterceptArray(index) = value
- }
- if ($(fitIntercept)) {
- initialCoefficientsWithInterceptArray(numFeatures) == optInitialModel.get.intercept
+ // TODO: removing initial model for now
+// if (optInitialModel.isDefined && optInitialModel.get.coefficients.size == numFeatures) {
+// val initialCoefficientsWithInterceptArray = initialCoefficientsWithIntercept.toArray
+// optInitialModel.get.coefficients.foreachActive { case (index, value) =>
+// initialCoefficientsWithInterceptArray(index) = value
+// }
+// if ($(fitIntercept)) {
+// initialCoefficientsWithInterceptArray(numFeatures) == optInitialModel.get.intercept
+// }
+// }
+ if ($(fitIntercept) && isMultinomial) {
+ // TODO: can we merge the logic or something here?
+ /*
+ For multinomial logistic regression, when we initialize the coefficients as zeros,
+ it will converge faster if we initialize the intercepts such that
+ it follows the distribution of the labels.
+ {{{
+ P(1) = \exp(b_1) / Z
+ ...
+ P(K) = \exp(b_K) / Z
+ where Z = \sum_{k=1}^{K} \exp(b_k)
+ }}}
+ Since this doesn't have a unique solution, one of the solutions that satisfies the
+ above equations is
+ {{{
+ \exp(b_k) = count_k * \exp(\lambda)
+ b_k = \log(count_k) + \lambda
+ }}}
+ \lambda is a free parameter, so choose \lambda such that the intercepts
+ are mean-centered. This yields
+ {{{
+ b_k = \log(count_k)
+ b_k' = b_k - \mean(b_k)
+ }}}
+ */
+ val rawIntercepts = histogram.map(c => math.log(c + 1)) // add 1 for smoothing
+ val rawMean = rawIntercepts.sum / rawIntercepts.length
+ rawIntercepts.indices.foreach { i =>
+ initialCoefficientsWithIntercept.toArray(i * numFeaturesPlusIntercept + numFeatures) =
+ rawIntercepts(i) - rawMean
}
} else if ($(fitIntercept)) {
/*
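
A compact sketch of the prior-based intercept initialization derived in the comment above (assumes a dense label histogram and the same add-one smoothing as the patch):

    // b_k = log(count_k + 1), then mean-center so the free parameter lambda drops out.
    def initialIntercepts(histogram: Array[Double]): Array[Double] = {
      val raw = histogram.map(c => math.log(c + 1)) // add 1 for smoothing
      val mean = raw.sum / raw.length
      raw.map(_ - mean)
    }
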
@@ -452,6 +547,7 @@ class LogisticRegression @Since("1.2.0") (
logError(msg)
throw new SparkException(msg)
}
+ bcFeaturesStd.destroy(blocking = false)
/*
The coefficients are trained in the scaled space; we're converting them back to
@@ -460,25 +556,62 @@ class LogisticRegression @Since("1.2.0") (
as a result, no scaling is needed.
*/
val rawCoefficients = state.x.toArray.clone()
- var i = 0
- while (i < numFeatures) {
- rawCoefficients(i) *= { if (featuresStd(i) != 0.0) 1.0 / featuresStd(i) else 0.0 }
- i += 1
+ // TODO: I think this will work for both binomial and multinomial
+ val coefficientArray = Array.tabulate(numCoefficientSets * numFeatures) { i =>
+ // flatIndex will loop through rawCoefficients, and skip the intercept terms.
+ val flatIndex = if ($(fitIntercept)) i + i / numFeatures else i
+ val featureIndex = i % numFeatures
+ if (featuresStd(featureIndex) != 0.0) {
+ rawCoefficients(flatIndex) / featuresStd(featureIndex)
+ } else {
+ 0.0
+ }
}
- bcFeaturesStd.destroy(blocking = false)
+ val coefficientMatrix =
+ new DenseMatrix(numCoefficientSets, numFeatures, coefficientArray, isTransposed = true)
- if ($(fitIntercept)) {
- (Vectors.dense(rawCoefficients.dropRight(1)).compressed, rawCoefficients.last,
- arrayBuilder.result())
+ if ($(regParam) == 0.0 && isMultinomial) {
+ /*
+ When no regularization is applied, the coefficients lack identifiability because
+ we do not use a pivot class. We can add any constant value to the coefficients and
+ get the same likelihood. So here, we choose the mean centered coefficients for
+ reproducibility. This method follows the approach in glmnet, described here:
+
+ Friedman, et al. "Regularization Paths for Generalized Linear Models via
+ Coordinate Descent," https://core.ac.uk/download/files/153/6287975.pdf
+ */
+ val coefficientMean = coefficientMatrix.values.sum / coefficientMatrix.values.length
+ coefficientMatrix.update(_ - coefficientMean)
+ }
+
+ val interceptsArray: Array[Double] = if ($(fitIntercept)) {
+ Array.tabulate(numCoefficientSets) { i =>
+ val coefIndex = (i + 1) * numFeaturesPlusIntercept - 1
+ rawCoefficients(coefIndex)
+ }
+ } else {
+ Array[Double]()
+ }
+ /*
+ The intercepts are never regularized, so we always center the mean.
+ */
+ val interceptVector = if (interceptsArray.nonEmpty && isMultinomial) {
+ val interceptMean = interceptsArray.sum / numClasses
+ interceptsArray.indices.foreach { i => interceptsArray(i) -= interceptMean }
+ Vectors.dense(interceptsArray)
+ } else if (interceptsArray.nonEmpty) {
+ Vectors.dense(interceptsArray)
} else {
- (Vectors.dense(rawCoefficients).compressed, 0.0, arrayBuilder.result())
+ Vectors.sparse(numClasses, Seq())
}
+ (coefficientMatrix, interceptVector, arrayBuilder.result())
}
}
if (handlePersistence) instances.unpersist()
- val model = copyValues(new LogisticRegressionModel(uid, coefficients, intercept))
+ val model = copyValues(new LogisticRegressionModel(uid, coefficients, intercept, numClasses,
+ isMultinomial))
val (summaryModel, probabilityColName) = model.findSummaryModelAndProbabilityCol()
val logRegSummary = new BinaryLogisticRegressionTrainingSummary(
summaryModel.transform(dataset),
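
The mean-centering used for the unregularized multinomial fit can be sketched in isolation (hypothetical helper; the patch applies the same idea in place through coefficientMatrix.update):

    // Softmax is invariant to adding a constant to every coefficient, so without
    // a pivot class the solution is only identified up to a constant; centering
    // at zero picks one reproducible representative, as in glmnet.
    def meanCentered(values: Array[Double]): Array[Double] = {
      val mean = values.sum / values.length
      values.map(_ - mean)
    }
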
@@ -500,6 +633,8 @@ object LogisticRegression extends DefaultParamsReadable[LogisticRegression] {
@Since("1.6.0")
override def load(path: String): LogisticRegression = super.load(path)
+
+ private[classification] lazy val supportedFamilyNames = Array("binomial", "multinomial")
}
/**
@@ -508,11 +643,34 @@ object LogisticRegression extends DefaultParamsReadable[LogisticRegression] {
@Since("1.4.0")
class LogisticRegressionModel private[spark] (
@Since("1.4.0") override val uid: String,
- @Since("2.0.0") val coefficients: Vector,
- @Since("1.3.0") val intercept: Double)
+ @Since("2.1.0") val coefficientMatrix: Matrix,
+ @Since("2.1.0") val interceptVector: Vector,
+ @Since("1.3.0") override val numClasses: Int,
+ private val isMultinomial: Boolean)
extends ProbabilisticClassificationModel[Vector, LogisticRegressionModel]
with LogisticRegressionParams with MLWritable {
+ def this(uid: String, coefficients: Vector, intercept: Double) {
+ this(uid,
+ new DenseMatrix(1, coefficients.size, coefficients.toArray, isTransposed = true),
+ Vectors.dense(intercept), 2, false)
+ }
+
+ @Since("2.0.0")
+ // TODO: this should convert sparse to sparse and dense to dense
+ val coefficients: Vector = Vectors.dense(coefficientMatrix.toArray)
+
+ @Since("1.3.0")
+ def intercept: Double = {
+ if (isMultinomial) {
+ logWarning("Multiclass model contains an vector of intercepts, use interceptVector instead." +
+ "Returning 0.0 as placeholder.")
+ }
+ _intercept
+ }
+
+ private val _intercept = if (!isMultinomial) interceptVector.toArray.head else 0.0
+
@Since("1.5.0")
override def setThreshold(value: Double): this.type = super.setThreshold(value)
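
A hedged sketch of how the old binary API maps onto the new representation through the auxiliary constructor above (hypothetical values; assumes the constructor is accessible from the caller's scope):

    // One binary coefficient set becomes a 1 x numFeatures row matrix and the
    // scalar intercept a length-1 vector; numClasses = 2, isMultinomial = false.
    val binaryModel = new LogisticRegressionModel("lr-binary", Vectors.dense(0.5, -0.25), 1.0)
    // binaryModel.intercept == 1.0; on a multinomial model the same accessor only
    // logs a warning and returns the 0.0 placeholder.
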
@@ -527,7 +685,14 @@ class LogisticRegressionModel private[spark] (
/** Margin (rawPrediction) for class label 1. For binary classification only. */
private val margin: Vector => Double = (features) => {
- BLAS.dot(features, coefficients) + intercept
+ BLAS.dot(features, coefficients) + _intercept
+ }
+
+ /** Margin (rawPrediction) for each class label. */
+ private val margins: Vector => Vector = (features) => {
+ val m = interceptVector.toDense.copy
+ BLAS.gemv(1.0, coefficientMatrix, features, 1.0, m)
+ m
}
/** Score (probability) for class label 1. For binary classification only. */
@@ -536,11 +701,36 @@ class LogisticRegressionModel private[spark] (
1.0 / (1.0 + math.exp(-m))
}
- @Since("1.6.0")
- override val numFeatures: Int = coefficients.size
+ /** Score (probability) for each class label. */
+ private val scores: Vector => Vector = (features) => {
+ val m = margins(features)
+ val maxMarginIndex = m.argmax
+ val marginArray = m.toArray
+ val maxMargin = marginArray(maxMarginIndex)
- @Since("1.3.0")
- override val numClasses: Int = 2
+ // adjust margins for overflow
+ val sum = {
+ var temp = 0.0
+ var k = 0
+ while (k < numClasses) {
+ marginArray(k) = if (maxMargin > 0) {
+ math.exp(marginArray(k) - maxMargin)
+ } else {
+ math.exp(marginArray(k))
+ }
+ temp += marginArray(k)
+ k += 1
+ }
+ temp
+ }
+
+ val scores = Vectors.dense(marginArray)
+ BLAS.scal(1 / sum, scores)
+ scores
+ }
+
+ @Since("1.6.0")
+ override val numFeatures: Int = coefficientMatrix.numCols
private var trainingSummary: Option[LogisticRegressionTrainingSummary] = None
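
The scores function above is a numerically stabilized softmax. The same trick as a standalone sketch:

    // Subtracting the max margin before exponentiating prevents overflow; when
    // the max is non-positive no overflow is possible, matching the code above.
    def stableSoftmax(margins: Array[Double]): Array[Double] = {
      val maxMargin = margins.max
      val shift = if (maxMargin > 0) maxMargin else 0.0
      val exps = margins.map(m => math.exp(m - shift))
      val sum = exps.sum
      exps.map(_ / sum)
    }
    // e.g. stableSoftmax(Array(1000.0, 0.0)) ~= Array(1.0, 0.0) instead of Inf/Inf = NaN.
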
@@ -597,19 +787,80 @@ class LogisticRegressionModel private[spark] (
*/
override protected def predict(features: Vector): Double = {
// Note: We should use getThreshold instead of $(threshold) since getThreshold is overridden.
- if (score(features) > getThreshold) 1 else 0
+ if (isMultinomial) {
+ if (isDefined(thresholds)) {
+ val thresholds: Array[Double] = getThresholds
+ val probabilities = scores(features).toArray
+ var argMax = 0
+ var max = Double.NegativeInfinity
+ var i = 0
+ while (i < numClasses) {
+ if (thresholds(i) == 0.0) {
+ max = Double.PositiveInfinity
+ argMax = i
+ } else {
+ val scaled = probabilities(i) / thresholds(i)
+ if (scaled > max) {
+ max = scaled
+ argMax = i
+ }
+ }
+ i += 1
+ }
+ argMax
+ } else {
+ scores(features).argmax
+ }
+ } else {
+ if (score(features) > getThreshold) 1 else 0
+ }
}
override protected def raw2probabilityInPlace(rawPrediction: Vector): Vector = {
rawPrediction match {
case dv: DenseVector =>
- var i = 0
- val size = dv.size
- while (i < size) {
- dv.values(i) = 1.0 / (1.0 + math.exp(-dv.values(i)))
- i += 1
+ if (isMultinomial) {
+ val size = dv.size
+ val values = dv.values
+
+ // get the maximum margin
+ val maxMarginIndex = rawPrediction.argmax
+ val maxMargin = rawPrediction(maxMarginIndex)
+
+ if (maxMargin == Double.PositiveInfinity) {
+ var k = 0
+ while (k < size) {
+ values(k) = if (k == maxMarginIndex) 1.0 else 0.0
+ k += 1
+ }
+ } else {
+ val sum = {
+ var temp = 0.0
+ var k = 0
+ while (k < numClasses) {
+ values(k) = if (maxMargin > 0) {
+ math.exp(values(k) - maxMargin)
+ } else {
+ math.exp(values(k))
+ }
+ temp += values(k)
+ k += 1
+ }
+ temp
+ }
+ BLAS.scal(1 / sum, dv)
+ }
+ dv
+ } else {
+ var i = 0
+ val size = dv.size
+ while (i < size) {
+ dv.values(i) = 1.0 / (1.0 + math.exp(-dv.values(i)))
+ i += 1
+ }
+ dv
}
- dv
case sv: SparseVector =>
throw new RuntimeException("Unexpected error in LogisticRegressionModel:" +
" raw2probabilitiesInPlace encountered SparseVector")
@@ -617,33 +868,46 @@ class LogisticRegressionModel private[spark] (
}
override protected def predictRaw(features: Vector): Vector = {
- val m = margin(features)
- Vectors.dense(-m, m)
+ if (isMultinomial) {
+ margins(features)
+ } else {
+ val m = margin(features)
+ Vectors.dense(-m, m)
+ }
}
@Since("1.4.0")
override def copy(extra: ParamMap): LogisticRegressionModel = {
- val newModel = copyValues(new LogisticRegressionModel(uid, coefficients, intercept), extra)
+ val newModel = copyValues(new LogisticRegressionModel(uid, coefficientMatrix, interceptVector,
+ numClasses, isMultinomial), extra)
if (trainingSummary.isDefined) newModel.setSummary(trainingSummary.get)
newModel.setParent(parent)
}
-
+ // TODO: basically check all these methods
override protected def raw2prediction(rawPrediction: Vector): Double = {
- // Note: We should use getThreshold instead of $(threshold) since getThreshold is overridden.
- val t = getThreshold
- val rawThreshold = if (t == 0.0) {
- Double.NegativeInfinity
- } else if (t == 1.0) {
- Double.PositiveInfinity
+ if (isMultinomial) {
+ super.raw2prediction(rawPrediction)
} else {
- math.log(t / (1.0 - t))
+ // Note: We should use getThreshold instead of $(threshold) since getThreshold is overridden.
+ val t = getThreshold
+ val rawThreshold = if (t == 0.0) {
+ Double.NegativeInfinity
+ } else if (t == 1.0) {
+ Double.PositiveInfinity
+ } else {
+ math.log(t / (1.0 - t))
+ }
+ if (rawPrediction(1) > rawThreshold) 1 else 0
}
- if (rawPrediction(1) > rawThreshold) 1 else 0
}
override protected def probability2prediction(probability: Vector): Double = {
// Note: We should use getThreshold instead of $(threshold) since getThreshold is overridden.
- if (probability(1) > getThreshold) 1 else 0
+ if (isMultinomial) {
+ super.probability2prediction(probability)
+ } else {
+ if (probability(1) > getThreshold) 1 else 0
+ }
}
/**
@@ -676,15 +940,16 @@ object LogisticRegressionModel extends MLReadable[LogisticRegressionModel] {
private case class Data(
numClasses: Int,
numFeatures: Int,
- intercept: Double,
- coefficients: Vector)
+ interceptVector: Vector,
+ coefficientMatrix: Matrix,
+ isMultinomial: Boolean)
override protected def saveImpl(path: String): Unit = {
// Save metadata and Params
DefaultParamsWriter.saveMetadata(instance, path, sc)
// Save model data: numClasses, numFeatures, interceptVector, coefficientMatrix, isMultinomial
- val data = Data(instance.numClasses, instance.numFeatures, instance.intercept,
- instance.coefficients)
+ val data = Data(instance.numClasses, instance.numFeatures, instance.interceptVector,
+ instance.coefficientMatrix, instance.isMultinomial)
val dataPath = new Path(path, "data").toString
sparkSession.createDataFrame(Seq(data)).repartition(1).write.parquet(dataPath)
}
@@ -702,13 +967,15 @@ object LogisticRegressionModel extends MLReadable[LogisticRegressionModel] {
val dataPath = new Path(path, "data").toString
val data = sparkSession.read.format("parquet").load(dataPath)
- // We will need numClasses, numFeatures in the future for multinomial logreg support.
- // TODO: remove numClasses and numFeatures fields?
- val Row(numClasses: Int, numFeatures: Int, intercept: Double, coefficients: Vector) =
- MLUtils.convertVectorColumnsToML(data, "coefficients")
- .select("numClasses", "numFeatures", "intercept", "coefficients")
- .head()
- val model = new LogisticRegressionModel(metadata.uid, coefficients, intercept)
+ val convertedCoefs = MLUtils.convertMatrixColumnsToML(data, "coefficientMatrix")
+ val converted = MLUtils.convertVectorColumnsToML(convertedCoefs, "interceptVector")
+ .select("numClasses", "numFeatures", "interceptVector", "coefficientMatrix",
+ "isMultinomial")
+ // TODO: numFeatures not needed?
+ val Row(numClasses: Int, numFeatures: Int, interceptVector: Vector,
+ coefficientMatrix: Matrix, isMultinomial: Boolean) = converted.head()
+ val model = new LogisticRegressionModel(metadata.uid, coefficientMatrix, interceptVector,
+ numClasses, isMultinomial)
DefaultParamsReader.getAndSetParams(model, metadata)
model
@@ -1103,6 +1370,7 @@ class BinaryLogisticRegressionSummary private[classification] (
* $$
*
*
+ *
* @param bcCoefficients The broadcast coefficients corresponding to the features.
* @param bcFeaturesStd The broadcast standard deviation values of the features.
* @param numClasses the number of possible outcomes for k classes classification problem in
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala
index e4cbf5acbc11d..ad3dab33d2909 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala
@@ -19,6 +19,7 @@ package org.apache.spark.mllib.classification
import org.apache.spark.SparkContext
import org.apache.spark.annotation.Since
+import org.apache.spark.ml.linalg.DenseMatrix
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.mllib.classification.impl.GLMClassificationModel
import org.apache.spark.mllib.linalg.{DenseVector, Vector, Vectors}
@@ -429,9 +430,11 @@ class LogisticRegressionWithLBFGS
lr.setElasticNetParam(elasticNetParam)
lr.setStandardization(useFeatureScaling)
if (userSuppliedWeights) {
+ // TODO: check this
val uid = Identifiable.randomUID("logreg-static")
lr.setInitialModel(new org.apache.spark.ml.classification.LogisticRegressionModel(
- uid, initialWeights.asML, 1.0))
+ uid, new DenseMatrix(1, initialWeights.size, initialWeights.toArray, isTransposed = true),
+ Vectors.dense(0.0).asML, 2, false))
}
lr.setFitIntercept(addIntercept)
lr.setMaxIter(optimizer.getNumIterations())
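
A hedged sketch of the adapter used above for user-supplied mllib weights (hypothetical values): the weights vector is viewed as a single row-major coefficient set with a zero intercept slot:

    import org.apache.spark.ml.linalg.{DenseMatrix, Vectors}
    val weights = Array(0.1, -0.2, 0.3) // assumed values
    val coefMatrix = new DenseMatrix(1, weights.length, weights, isTransposed = true)
    val interceptVec = Vectors.dense(0.0) // placeholder intercept
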
diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/MultinomialLogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/MultinomialLogisticRegressionSuite.scala
index 0913fe559c562..9c7e08820d93b 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/classification/MultinomialLogisticRegressionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/classification/MultinomialLogisticRegressionSuite.scala
@@ -87,14 +87,14 @@ class MultinomialLogisticRegressionSuite
}
test("params") {
- ParamsSuite.checkParams(new MultinomialLogisticRegression)
- val model = new MultinomialLogisticRegressionModel("mLogReg",
- Matrices.dense(2, 1, Array(0.0, 0.0)), Vectors.dense(0.0, 0.0), 2)
+ ParamsSuite.checkParams(new LogisticRegression)
+ val model = new LogisticRegressionModel("mLogReg",
+ Matrices.dense(2, 1, Array(0.0, 0.0)), Vectors.dense(0.0, 0.0), 2, true)
ParamsSuite.checkParams(model)
}
test("multinomial logistic regression: default params") {
- val mlr = new MultinomialLogisticRegression
+ val mlr = new LogisticRegression
assert(mlr.getLabelCol === "label")
assert(mlr.getFeaturesCol === "features")
assert(mlr.getPredictionCol === "prediction")
@@ -112,15 +112,15 @@ class MultinomialLogisticRegressionSuite
assert(model.getPredictionCol === "prediction")
assert(model.getRawPredictionCol === "rawPrediction")
assert(model.getProbabilityCol === "probability")
- assert(model.intercepts !== Vectors.dense(0.0, 0.0))
+ assert(model.interceptVector !== Vectors.dense(0.0, 0.0))
assert(model.hasParent)
}
test("multinomial logistic regression with intercept without regularization") {
- val trainer1 = (new MultinomialLogisticRegression).setFitIntercept(true)
+ val trainer1 = (new LogisticRegression).setFitIntercept(true)
.setElasticNetParam(0.0).setRegParam(0.0).setStandardization(true).setMaxIter(100)
- val trainer2 = (new MultinomialLogisticRegression).setFitIntercept(true)
+ val trainer2 = (new LogisticRegression).setFitIntercept(true)
.setElasticNetParam(0.0).setRegParam(0.0).setStandardization(false)
val model1 = trainer1.fit(multinomialDataset)
@@ -166,21 +166,21 @@ class MultinomialLogisticRegressionSuite
0.0817812, -0.8502072, 0.0830284, 0.0761248), isTransposed = true)
val interceptsR = Vectors.dense(-2.2449338, 0.3778931, 1.8670407)
- assert(model1.coefficients ~== coefficientsR relTol 0.05)
- assert(model1.coefficients.toArray.sum ~== 0.0 absTol eps)
- assert(model1.intercepts ~== interceptsR relTol 0.05)
- assert(model1.intercepts.toArray.sum ~== 0.0 absTol eps)
- assert(model2.coefficients ~== coefficientsR relTol 0.05)
- assert(model2.coefficients.toArray.sum ~== 0.0 absTol eps)
- assert(model2.intercepts ~== interceptsR relTol 0.05)
- assert(model2.intercepts.toArray.sum ~== 0.0 absTol eps)
+ assert(model1.coefficientMatrix ~== coefficientsR relTol 0.05)
+ assert(model1.coefficientMatrix.toArray.sum ~== 0.0 absTol eps)
+ assert(model1.interceptVector ~== interceptsR relTol 0.05)
+ assert(model1.interceptVector.toArray.sum ~== 0.0 absTol eps)
+ assert(model2.coefficientMatrix ~== coefficientsR relTol 0.05)
+ assert(model2.coefficientMatrix.toArray.sum ~== 0.0 absTol eps)
+ assert(model2.interceptVector ~== interceptsR relTol 0.05)
+ assert(model2.interceptVector.toArray.sum ~== 0.0 absTol eps)
}
test("multinomial logistic regression without intercept without regularization") {
- val trainer1 = (new MultinomialLogisticRegression).setFitIntercept(false)
+ val trainer1 = (new LogisticRegression).setFitIntercept(false)
.setElasticNetParam(0.0).setRegParam(0.0).setStandardization(true)
- val trainer2 = (new MultinomialLogisticRegression).setFitIntercept(false)
+ val trainer2 = (new LogisticRegression).setFitIntercept(false)
.setElasticNetParam(0.0).setRegParam(0.0).setStandardization(false)
val model1 = trainer1.fit(multinomialDataset)
@@ -226,23 +226,23 @@ class MultinomialLogisticRegressionSuite
-0.3036269, 0.9449630, -0.2271038, -0.4364839,
0.2337022, -0.5793351, 0.1056770, 0.1159618), isTransposed = true)
- assert(model1.coefficients ~== coefficientsR relTol 0.05)
- assert(model1.coefficients.toArray.sum ~== 0.0 absTol eps)
- assert(model1.intercepts.toArray === Array.fill(3)(0.0))
- assert(model1.intercepts.toArray.sum ~== 0.0 absTol eps)
- assert(model2.coefficients ~== coefficientsR relTol 0.05)
- assert(model2.coefficients.toArray.sum ~== 0.0 absTol eps)
- assert(model2.intercepts.toArray === Array.fill(3)(0.0))
- assert(model2.intercepts.toArray.sum ~== 0.0 absTol eps)
+ assert(model1.coefficientMatrix ~== coefficientsR relTol 0.05)
+ assert(model1.coefficientMatrix.toArray.sum ~== 0.0 absTol eps)
+ assert(model1.interceptVector.toArray === Array.fill(3)(0.0))
+ assert(model1.interceptVector.toArray.sum ~== 0.0 absTol eps)
+ assert(model2.coefficientMatrix ~== coefficientsR relTol 0.05)
+ assert(model2.coefficientMatrix.toArray.sum ~== 0.0 absTol eps)
+ assert(model2.interceptVector.toArray === Array.fill(3)(0.0))
+ assert(model2.interceptVector.toArray.sum ~== 0.0 absTol eps)
}
test("multinomial logistic regression with intercept with L1 regularization") {
// use tighter constraints because OWL-QN solver takes longer to converge
- val trainer1 = (new MultinomialLogisticRegression).setFitIntercept(true)
+ val trainer1 = (new LogisticRegression).setFitIntercept(true)
.setElasticNetParam(1.0).setRegParam(0.05).setStandardization(true)
.setMaxIter(300).setTol(1e-10)
- val trainer2 = (new MultinomialLogisticRegression).setFitIntercept(true)
+ val trainer2 = (new LogisticRegression).setFitIntercept(true)
.setElasticNetParam(1.0).setRegParam(0.05).setStandardization(false)
.setMaxIter(300).setTol(1e-10)
@@ -328,18 +328,18 @@ class MultinomialLogisticRegressionSuite
0.0, 0.0, 0.0, 0.0), isTransposed = true)
val interceptsR = Vectors.dense(-0.44893320, 0.7376760, -0.2887428)
- assert(model1.coefficients ~== coefficientsRStd absTol 0.02)
- assert(model1.intercepts ~== interceptsRStd relTol 0.1)
- assert(model1.intercepts.toArray.sum ~== 0.0 absTol eps)
- assert(model2.coefficients ~== coefficientsR absTol 0.02)
- assert(model2.intercepts ~== interceptsR relTol 0.1)
- assert(model2.intercepts.toArray.sum ~== 0.0 absTol eps)
+ assert(model1.coefficientMatrix ~== coefficientsRStd absTol 0.02)
+ assert(model1.interceptVector ~== interceptsRStd relTol 0.1)
+ assert(model1.interceptVector.toArray.sum ~== 0.0 absTol eps)
+ assert(model2.coefficientMatrix ~== coefficientsR absTol 0.02)
+ assert(model2.interceptVector ~== interceptsR relTol 0.1)
+ assert(model2.interceptVector.toArray.sum ~== 0.0 absTol eps)
}
test("multinomial logistic regression without intercept with L1 regularization") {
- val trainer1 = (new MultinomialLogisticRegression).setFitIntercept(false)
+ val trainer1 = (new LogisticRegression).setFitIntercept(false)
.setElasticNetParam(1.0).setRegParam(0.05).setStandardization(true)
- val trainer2 = (new MultinomialLogisticRegression).setFitIntercept(false)
+ val trainer2 = (new LogisticRegression).setFitIntercept(false)
.setElasticNetParam(1.0).setRegParam(0.05).setStandardization(false)
val model1 = trainer1.fit(multinomialDataset)
@@ -421,18 +421,18 @@ class MultinomialLogisticRegressionSuite
0.0, 0.1943624, -0.1902577, -0.1028789,
0.0, 0.0, 0.0, 0.0), isTransposed = true)
- assert(model1.coefficients ~== coefficientsRStd absTol 0.01)
- assert(model1.intercepts.toArray === Array.fill(3)(0.0))
- assert(model1.intercepts.toArray.sum ~== 0.0 absTol eps)
- assert(model2.coefficients ~== coefficientsR absTol 0.01)
- assert(model2.intercepts.toArray === Array.fill(3)(0.0))
- assert(model2.intercepts.toArray.sum ~== 0.0 absTol eps)
+ assert(model1.coefficientMatrix ~== coefficientsRStd absTol 0.01)
+ assert(model1.interceptVector.toArray === Array.fill(3)(0.0))
+ assert(model1.interceptVector.toArray.sum ~== 0.0 absTol eps)
+ assert(model2.coefficientMatrix ~== coefficientsR absTol 0.01)
+ assert(model2.interceptVector.toArray === Array.fill(3)(0.0))
+ assert(model2.interceptVector.toArray.sum ~== 0.0 absTol eps)
}
test("multinomial logistic regression with intercept with L2 regularization") {
- val trainer1 = (new MultinomialLogisticRegression).setFitIntercept(true)
+ val trainer1 = (new LogisticRegression).setFitIntercept(true)
.setElasticNetParam(0.0).setRegParam(0.1).setStandardization(true)
- val trainer2 = (new MultinomialLogisticRegression).setFitIntercept(true)
+ val trainer2 = (new LogisticRegression).setFitIntercept(true)
.setElasticNetParam(0.0).setRegParam(0.1).setStandardization(false)
val model1 = trainer1.fit(multinomialDataset)
@@ -516,18 +516,18 @@ class MultinomialLogisticRegressionSuite
0.04032627, -0.29756637, 0.06265594, 0.02972883), isTransposed = true)
val interceptsR = Vectors.dense(-1.65488543, 1.1297533, 0.52513212)
- assert(model1.coefficients ~== coefficientsRStd relTol 0.05)
- assert(model1.intercepts ~== interceptsRStd relTol 0.05)
- assert(model1.intercepts.toArray.sum ~== 0.0 absTol eps)
- assert(model2.coefficients ~== coefficientsR relTol 0.05)
- assert(model2.intercepts ~== interceptsR relTol 0.05)
- assert(model2.intercepts.toArray.sum ~== 0.0 absTol eps)
+ assert(model1.coefficientMatrix ~== coefficientsRStd relTol 0.05)
+ assert(model1.interceptVector ~== interceptsRStd relTol 0.05)
+ assert(model1.interceptVector.toArray.sum ~== 0.0 absTol eps)
+ assert(model2.coefficientMatrix ~== coefficientsR relTol 0.05)
+ assert(model2.interceptVector ~== interceptsR relTol 0.05)
+ assert(model2.interceptVector.toArray.sum ~== 0.0 absTol eps)
}
test("multinomial logistic regression without intercept with L2 regularization") {
- val trainer1 = (new MultinomialLogisticRegression).setFitIntercept(false)
+ val trainer1 = (new LogisticRegression).setFitIntercept(false)
.setElasticNetParam(0.0).setRegParam(0.1).setStandardization(true)
- val trainer2 = (new MultinomialLogisticRegression).setFitIntercept(false)
+ val trainer2 = (new LogisticRegression).setFitIntercept(false)
.setElasticNetParam(0.0).setRegParam(0.1).setStandardization(false)
val model1 = trainer1.fit(multinomialDataset)
@@ -607,19 +607,19 @@ class MultinomialLogisticRegressionSuite
-0.08469036, 0.38996748, -0.16468436, -0.22522976,
0.0903949, -0.24550107, 0.07260362, 0.0423021), isTransposed = true)
- assert(model1.coefficients ~== coefficientsRStd absTol 0.01)
- assert(model1.intercepts.toArray === Array.fill(3)(0.0))
- assert(model1.intercepts.toArray.sum ~== 0.0 absTol eps)
- assert(model2.coefficients ~== coefficientsR absTol 0.01)
- assert(model2.intercepts.toArray === Array.fill(3)(0.0))
- assert(model2.intercepts.toArray.sum ~== 0.0 absTol eps)
+ assert(model1.coefficientMatrix ~== coefficientsRStd absTol 0.01)
+ assert(model1.interceptVector.toArray === Array.fill(3)(0.0))
+ assert(model1.interceptVector.toArray.sum ~== 0.0 absTol eps)
+ assert(model2.coefficientMatrix ~== coefficientsR absTol 0.01)
+ assert(model2.interceptVector.toArray === Array.fill(3)(0.0))
+ assert(model2.interceptVector.toArray.sum ~== 0.0 absTol eps)
}
test("multinomial logistic regression with intercept with elasticnet regularization") {
- val trainer1 = (new MultinomialLogisticRegression).setFitIntercept(true)
+ val trainer1 = (new LogisticRegression).setFitIntercept(true)
.setElasticNetParam(0.5).setRegParam(0.1).setStandardization(true)
.setMaxIter(300).setTol(1e-10)
- val trainer2 = (new MultinomialLogisticRegression).setFitIntercept(true)
+ val trainer2 = (new LogisticRegression).setFitIntercept(true)
.setElasticNetParam(0.5).setRegParam(0.1).setStandardization(false)
.setMaxIter(300).setTol(1e-10)
@@ -704,19 +704,19 @@ class MultinomialLogisticRegressionSuite
0.0, 0.0, 0.0, 0.0), isTransposed = true)
val interceptsR = Vectors.dense(-0.39876213, 0.61089869, -0.2121366)
- assert(model1.coefficients ~== coefficientsRStd absTol 0.01)
- assert(model1.intercepts ~== interceptsRStd absTol 0.01)
- assert(model1.intercepts.toArray.sum ~== 0.0 absTol eps)
- assert(model2.coefficients ~== coefficientsR absTol 0.01)
- assert(model2.intercepts ~== interceptsR absTol 0.01)
- assert(model2.intercepts.toArray.sum ~== 0.0 absTol eps)
+ assert(model1.coefficientMatrix ~== coefficientsRStd absTol 0.01)
+ assert(model1.interceptVector ~== interceptsRStd absTol 0.01)
+ assert(model1.interceptVector.toArray.sum ~== 0.0 absTol eps)
+ assert(model2.coefficientMatrix ~== coefficientsR absTol 0.01)
+ assert(model2.interceptVector ~== interceptsR absTol 0.01)
+ assert(model2.interceptVector.toArray.sum ~== 0.0 absTol eps)
}
test("multinomial logistic regression without intercept with elasticnet regularization") {
- val trainer1 = (new MultinomialLogisticRegression).setFitIntercept(false)
+ val trainer1 = (new LogisticRegression).setFitIntercept(false)
.setElasticNetParam(0.5).setRegParam(0.1).setStandardization(true)
.setMaxIter(300).setTol(1e-10)
- val trainer2 = (new MultinomialLogisticRegression).setFitIntercept(false)
+ val trainer2 = (new LogisticRegression).setFitIntercept(false)
.setElasticNetParam(0.5).setRegParam(0.1).setStandardization(false)
.setMaxIter(300).setTol(1e-10)
@@ -798,12 +798,12 @@ class MultinomialLogisticRegressionSuite
0.0, 0.14666497, -0.16570638, -0.05982875,
0.0, 0.0, 0.0, 0.0), isTransposed = true)
- assert(model1.coefficients ~== coefficientsRStd absTol 0.01)
- assert(model1.intercepts.toArray === Array.fill(3)(0.0))
- assert(model1.intercepts.toArray.sum ~== 0.0 absTol eps)
- assert(model2.coefficients ~== coefficientsR absTol 0.01)
- assert(model2.intercepts.toArray === Array.fill(3)(0.0))
- assert(model2.intercepts.toArray.sum ~== 0.0 absTol eps)
+ assert(model1.coefficientMatrix ~== coefficientsRStd absTol 0.01)
+ assert(model1.interceptVector.toArray === Array.fill(3)(0.0))
+ assert(model1.interceptVector.toArray.sum ~== 0.0 absTol eps)
+ assert(model2.coefficientMatrix ~== coefficientsR absTol 0.01)
+ assert(model2.interceptVector.toArray === Array.fill(3)(0.0))
+ assert(model2.interceptVector.toArray.sum ~== 0.0 absTol eps)
}
/*
@@ -814,9 +814,9 @@ class MultinomialLogisticRegressionSuite
*/
test("prediction") {
- val model = new MultinomialLogisticRegressionModel("mLogReg",
+ val model = new LogisticRegressionModel("mLogReg",
Matrices.dense(3, 2, Array(0.0, 0.0, 0.0, 1.0, 2.0, 3.0)),
- Vectors.dense(0.0, 0.0, 0.0), 3)
+ Vectors.dense(0.0, 0.0, 0.0), 3, true)
val overFlowData = spark.createDataFrame(Seq(
LabeledPoint(1.0, Vectors.dense(0.0, 1000.0)),
LabeledPoint(1.0, Vectors.dense(0.0, -1.0))
@@ -837,7 +837,7 @@ class MultinomialLogisticRegressionSuite
}
test("multinomial logistic regression: Predictor, Classifier methods") {
- val mlr = new MultinomialLogisticRegression
+ val mlr = new LogisticRegression
val model = mlr.fit(dataset)
assert(model.numClasses === 3)
@@ -852,9 +852,9 @@ class MultinomialLogisticRegressionSuite
val margins = Array.tabulate(3) { k =>
var margin = 0.0
features.foreachActive { (index, value) =>
- margin += value * model.coefficients(k, index)
+ margin += value * model.coefficientMatrix(k, index)
}
- margin += model.intercepts(k)
+ margin += model.interceptVector(k)
margin
}
assert(raw ~== Vectors.dense(margins) relTol eps)
@@ -884,21 +884,21 @@ class MultinomialLogisticRegressionSuite
}
test("multinomial logistic regression coefficients should be centered") {
- val mlr = new MultinomialLogisticRegression().setMaxIter(1)
+ val mlr = new LogisticRegression().setMaxIter(1)
val model = mlr.fit(dataset)
- assert(model.intercepts.toArray.sum ~== 0.0 absTol 1e-6)
- assert(model.coefficients.toArray.sum ~== 0.0 absTol 1e-6)
+ assert(model.interceptVector.toArray.sum ~== 0.0 absTol 1e-6)
+ assert(model.coefficientMatrix.toArray.sum ~== 0.0 absTol 1e-6)
}
test("numClasses specified in metadata/inferred") {
- val mlr = new MultinomialLogisticRegression().setMaxIter(1)
+ val mlr = new LogisticRegression().setMaxIter(1)
// specify more classes than unique label values
val labelMeta = NominalAttribute.defaultAttr.withName("label").withNumValues(4).toMetadata()
val df = dataset.select(dataset("label").as("label", labelMeta), dataset("features"))
val model1 = mlr.fit(df)
assert(model1.numClasses === 4)
- assert(model1.intercepts.size === 4)
+ assert(model1.interceptVector.size === 4)
// specify two classes when there are really three
val labelMeta1 = NominalAttribute.defaultAttr.withName("label").withNumValues(2).toMetadata()
@@ -919,7 +919,7 @@ class MultinomialLogisticRegressionSuite
LabeledPoint(4.0, Vectors.dense(1.0)),
LabeledPoint(4.0, Vectors.dense(2.0)))
)
- val mlr = new MultinomialLogisticRegression
+ val mlr = new LogisticRegression().setFamily("multinomial")
val model = mlr.fit(constantData)
val results = model.transform(constantData)
results.select("rawPrediction", "probability", "prediction").collect().foreach {
@@ -966,7 +966,7 @@ class MultinomialLogisticRegressionSuite
val testData = spark.createDataFrame(Array.tabulate[LabeledPoint](numClasses) { i =>
LabeledPoint(i.toDouble, Vectors.dense(i.toDouble))
})
- val mlr = new MultinomialLogisticRegression().setWeightCol("weight")
+ val mlr = new LogisticRegression().setWeightCol("weight")
val model = mlr.fit(outlierData)
val results = model.transform(testData).select("label", "prediction").collect()
@@ -979,11 +979,11 @@ class MultinomialLogisticRegressionSuite
42L)
val weightedModel = mlr.fit(weightedData)
val overSampledModel = mlr.setWeightCol("").fit(overSampledData)
- assert(weightedModel.coefficients ~== overSampledModel.coefficients relTol 0.01)
+ assert(weightedModel.coefficientMatrix ~== overSampledModel.coefficientMatrix relTol 0.01)
}
test("thresholds prediction") {
- val mlr = new MultinomialLogisticRegression
+ val mlr = new LogisticRegression
val model = mlr.fit(dataset)
val basePredictions = model.transform(dataset).select("prediction").collect()
@@ -1010,28 +1010,28 @@ class MultinomialLogisticRegressionSuite
})
}
- test("read/write") {
- def checkModelData(
- model: MultinomialLogisticRegressionModel,
- model2: MultinomialLogisticRegressionModel): Unit = {
- assert(model.intercepts === model2.intercepts)
- assert(model.coefficients.toArray === model2.coefficients.toArray)
- assert(model.numClasses === model2.numClasses)
- assert(model.numFeatures === model2.numFeatures)
- }
- val mlr = new MultinomialLogisticRegression()
- testEstimatorAndModelReadWrite(mlr, dataset,
- MultinomialLogisticRegressionSuite.allParamSettings,
- checkModelData)
- }
+// test("read/write") {
+// def checkModelData(
+// model: LogisticRegressionModel,
+// model2: LogisticRegressionModel): Unit = {
+// assert(model.interceptVector === model2.interceptVector)
+// assert(model.coefficientMatrix.toArray === model2.coefficients.toArray)
+// assert(model.numClasses === model2.numClasses)
+// assert(model.numFeatures === model2.numFeatures)
+// }
+// val mlr = new LogisticRegression()
+// testEstimatorAndModelReadWrite(mlr, dataset,
+// MultinomialLogisticRegressionSuite.allParamSettings,
+// checkModelData)
+// }
test("should support all NumericType labels and not support other types") {
- val mlr = new MultinomialLogisticRegression().setMaxIter(1)
+ val mlr = new LogisticRegression().setMaxIter(1)
MLTestingUtils
- .checkNumericTypes[MultinomialLogisticRegressionModel, MultinomialLogisticRegression](
+ .checkNumericTypes[LogisticRegressionModel, LogisticRegression](
mlr, spark) { (expected, actual) =>
- assert(expected.intercepts === actual.intercepts)
- assert(expected.coefficients.toArray === actual.coefficients.toArray)
+ assert(expected.interceptVector === actual.interceptVector)
+ assert(expected.coefficientMatrix.toArray === actual.coefficientMatrix.toArray)
}
}
}
From d4675bea0c531a786381adb9c4763f97ae8bcb9e Mon Sep 17 00:00:00 2001
From: sethah
Date: Wed, 24 Aug 2016 22:05:46 -0700
Subject: [PATCH 02/24] add initial model
---
.../classification/LogisticRegression.scala | 46 +++++++++++--------
.../LogisticRegressionSuite.scala | 36 ++++++++++++++-
2 files changed, 61 insertions(+), 21 deletions(-)
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
index c8c06a4d7752b..15a2450f464de 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
@@ -354,10 +354,10 @@ class LogisticRegression @Since("1.2.0") (
case None => histogram.length
}
val isBinaryClassification = numClasses == 1 || numClasses == 2
+ // TODO: use enumeration or similar
val isMultinomial = !((!isSet(family) && isBinaryClassification) || $(family) == "binomial")
val numCoefficientSets = if (isMultinomial) numClasses else 1
- // TODO: use enumeration or similar
if (!isMultinomial) {
require(isBinaryClassification, s"Binomial family only supports 1 or 2" +
s"outcome classes but found $numClasses")
@@ -461,25 +461,33 @@ class LogisticRegression @Since("1.2.0") (
}
// TODO: need to add this for multinomial case
- if (optInitialModel.isDefined && optInitialModel.get.coefficients.size != numFeatures) {
- val vecSize = optInitialModel.get.coefficients.size
- logWarning(
- s"Initial coefficients will be ignored!! As its size $vecSize did not match the " +
- s"expected size $numFeatures")
+ val initialModelIsValid = optInitialModel.exists { model =>
+ val providedCoefs = model.coefficientMatrix
+ val modelValid = (providedCoefs.numRows == numCoefficientSets) &&
+ (providedCoefs.numCols == numFeatures) &&
+ (model.interceptVector.size == numCoefficientSets)
+ if (!modelValid) {
+ logWarning(s"Initial coefficients will be ignored! Its dimensions " +
+ s"(${providedCoefs.numRows}, ${providedCoefs.numCols}) did not match the expected " +
+ s"size ($numCoefficientSets, $numFeatures)")
+ }
+ modelValid
}
- // TODO: removing initial model for now
-// if (optInitialModel.isDefined && optInitialModel.get.coefficients.size == numFeatures) {
-// val initialCoefficientsWithInterceptArray = initialCoefficientsWithIntercept.toArray
-// optInitialModel.get.coefficients.foreachActive { case (index, value) =>
-// initialCoefficientsWithInterceptArray(index) = value
-// }
-// if ($(fitIntercept)) {
-// initialCoefficientsWithInterceptArray(numFeatures) == optInitialModel.get.intercept
-// }
-// }
- if ($(fitIntercept) && isMultinomial) {
- // TODO: can we merge the logic or something here?
+ if (initialModelIsValid) {
+ val initialCoefArray = initialCoefficientsWithIntercept.toArray
+ val providedCoefArray = optInitialModel.get.coefficientMatrix.toArray
+ providedCoefArray.indices.foreach { i =>
+ val flatIndex = if ($(fitIntercept)) i + i / numFeatures else i
+ initialCoefArray(flatIndex) = providedCoefArray(i)
+ }
+ if ($(fitIntercept)) {
+ optInitialModel.get.interceptVector.foreachActive { (index, value) =>
+ val coefIndex = (index + 1) * numFeaturesPlusIntercept - 1
+ initialCoefArray(coefIndex) = value
+ }
+ }
+ } else if ($(fitIntercept) && isMultinomial) {
/*
For multinomial logistic regression, when we initialize the coefficients as zeros,
it will converge faster if we initialize the intercepts such that
@@ -556,7 +564,6 @@ class LogisticRegression @Since("1.2.0") (
as a result, no scaling is needed.
*/
val rawCoefficients = state.x.toArray.clone()
- // TODO: I think this will work for both binomial and multinomial
val coefficientArray = Array.tabulate(numCoefficientSets * numFeatures) { i =>
// flatIndex will loop through rawCoefficients, and skip the intercept terms.
val flatIndex = if ($(fitIntercept)) i + i / numFeatures else i
@@ -612,6 +619,7 @@ class LogisticRegression @Since("1.2.0") (
val model = copyValues(new LogisticRegressionModel(uid, coefficients, intercept, numClasses,
isMultinomial))
+ // TODO: need to implement model summary for MLOR... probably best to do it in another JIRA
val (summaryModel, probabilityColName) = model.findSummaryModelAndProbabilityCol()
val logRegSummary = new BinaryLogisticRegressionTrainingSummary(
summaryModel.transform(dataset),
diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
index a1b48539c46e0..a0af82c2ea42c 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
@@ -25,7 +25,7 @@ import scala.util.control.Breaks._
import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.classification.LogisticRegressionSuite._
import org.apache.spark.ml.feature.{Instance, LabeledPoint}
-import org.apache.spark.ml.linalg.{Vector, Vectors}
+import org.apache.spark.ml.linalg.{DenseMatrix, Vector, Vectors}
import org.apache.spark.ml.param.ParamsSuite
import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils}
import org.apache.spark.ml.util.TestingUtils._
@@ -37,7 +37,8 @@ class LogisticRegressionSuite
extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {
@transient var dataset: Dataset[_] = _
- @transient var binaryDataset: DataFrame = _
+ @transient var binaryDataset: Dataset[_] = _
+ @transient var multinomialDataset: Dataset[_] = _
private val eps: Double = 1e-5
override def beforeAll(): Unit = {
@@ -57,6 +58,23 @@ class LogisticRegressionSuite
spark.createDataFrame(sc.parallelize(testData, 4))
}
+
+ multinomialDataset = {
+ val nPoints = 10000
+ val coefficients = Array(
+ -0.57997, 0.912083, -0.371077, -0.819866, 2.688191,
+ -0.16624, -0.84355, -0.048509, -0.301789, 4.170682)
+
+ val xMean = Array(5.843, 3.057, 3.758, 1.199)
+ val xVariance = Array(0.6856, 0.1899, 3.116, 0.581)
+
+ val testData = generateMultinomialLogisticInput(
+ coefficients, xMean, xVariance, addIntercept = true, nPoints, 42)
+
+ val df = spark.createDataFrame(sc.parallelize(testData, 4))
+ df.cache()
+ df
+ }
}
/**
@@ -886,6 +904,20 @@ class LogisticRegressionSuite
assert(model1a0.intercept ~== model1b.intercept absTol 1E-3)
}
+ test("set initial model") {
+ // TODO: the binary one doesn't converge any faster
+ // TODO: should they converge after one or two iterations?
+ val lr = new LogisticRegression()
+ val model1 = lr.fit(binaryDataset)
+ val lr2 = new LogisticRegression().setInitialModel(model1)
+ val model2 = lr2.fit(binaryDataset)
+
+ val lr3 = new LogisticRegression()
+ val model3 = lr3.fit(multinomialDataset)
+ val lr4 = new LogisticRegression().setInitialModel(model3)
+ val model4 = lr4.fit(multinomialDataset)
+ }
+
test("logistic regression with all labels the same") {
val sameLabels = dataset
.withColumn("zeroLabel", lit(0.0))
From a399ef3ab4b9720f081b2e234f993eef61c5587b Mon Sep 17 00:00:00 2001
From: sethah
Date: Thu, 25 Aug 2016 09:16:33 -0700
Subject: [PATCH 03/24] fixing some todos, added dual support for weighted
tests
---
.../classification/LogisticRegression.scala | 65 +++++----
.../LogisticRegressionSuite.scala | 138 +++++++++++-------
2 files changed, 120 insertions(+), 83 deletions(-)
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
index 15a2450f464de..aca96aa3ba3a1 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
@@ -75,16 +75,22 @@ private[classification] trait LogisticRegressionParams extends ProbabilisticClas
/**
- * Param for the name of family which is a description of the error distribution
+ * Param for the name of family which is a description of the label distribution
* to be used in the model.
- * Supported options: "multinomial", "binomial".
- * Default is "multinomial".
+ * Supported options: "auto", "multinomial", "binomial".
+ * Supported options:
+ * - "auto": Automatically select the family based on the number of classes:
+ * If numClasses == 1 || numClasses == 2, set to "binomial".
+ * Else, set to "multinomial"
+ * - "binomial": Binary logistic regression with pivoting.
+ * - "multinomial": Multinomial (softmax) regression without pivoting.
+ * Default is "auto".
*
* @group param
*/
@Since("2.0.0")
final val family: Param[String] = new Param(this, "family",
- "The name of family which is a description of the error distribution to be used in the " +
+ "The name of family which is a description of the label distribution to be used in the " +
s"model. Supported options: ${supportedFamilyNames.mkString(", ")}.",
ParamValidators.inArray[String](supportedFamilyNames))
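
With "auto" in the mix, family resolution can be sketched as a small total function (hypothetical helper matching the doc above; the real code derives isMultinomial inline in train):

    def resolveIsMultinomial(family: String, numClasses: Int): Boolean =
      family match {
        case "binomial"    => false          // train requires numClasses <= 2
        case "multinomial" => true
        case "auto"        => numClasses > 2 // 1 or 2 classes fall back to binomial
        case other => throw new IllegalArgumentException(s"Unknown family: $other")
      }
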
@@ -243,14 +249,13 @@ class LogisticRegression @Since("1.2.0") (
/**
* Sets the value of param [[family]].
- * Default is "multinomial".
+ * Default is "auto".
*
* @group setParam
*/
- // TODO: don't use strings?
@Since("2.0.0")
def setFamily(value: String): this.type = set(family, value)
- setDefault(family -> "multinomial")
+ setDefault(family -> "auto")
/**
* Whether to standardize the training features before fitting the model.
@@ -267,6 +272,7 @@ class LogisticRegression @Since("1.2.0") (
setDefault(standardization -> true)
@Since("1.5.0")
+ // TODO: Check this behavior
override def setThreshold(value: Double): this.type = super.setThreshold(value)
@Since("1.5.0")
@@ -354,12 +360,12 @@ class LogisticRegression @Since("1.2.0") (
case None => histogram.length
}
val isBinaryClassification = numClasses == 1 || numClasses == 2
- // TODO: use enumeration or similar
- val isMultinomial = !((!isSet(family) && isBinaryClassification) || $(family) == "binomial")
+ val isMultinomial = ($(family) == LogisticRegression.auto && !isBinaryClassification) ||
+ ($(family) == LogisticRegression.multinomial)
val numCoefficientSets = if (isMultinomial) numClasses else 1
if (!isMultinomial) {
- require(isBinaryClassification, s"Binomial family only supports 1 or 2" +
+ require(isBinaryClassification, s"Binomial family only supports 1 or 2 " +
s"outcome classes but found $numClasses")
}
@@ -422,7 +428,6 @@ class LogisticRegression @Since("1.2.0") (
new BreezeLBFGS[BDV[Double]]($(maxIter), 10, $(tol))
} else {
val standardizationParam = $(standardization)
- // TODO: check this works in both cases
def regParamL1Fun = (index: Int) => {
// Remove the L1 penalization on the intercept
val isIntercept = $(fitIntercept) && ((index + 1) % numFeaturesPlusIntercept == 0)
@@ -453,14 +458,8 @@ class LogisticRegression @Since("1.2.0") (
new BreezeOWLQN[Int, BDV[Double]]($(maxIter), 10, regParamL1Fun, $(tol))
}
- // TODO: double check this
- val initialCoefficientsWithIntercept = if (isMultinomial) {
- Vectors.zeros(numClasses * numFeaturesPlusIntercept)
- } else {
- Vectors.zeros(numFeaturesPlusIntercept)
- }
+ val initialCoefficientsWithIntercept = Vectors.zeros(numCoefficientSets * numFeaturesPlusIntercept)
- // TODO: need to add this for multinomial case
val initialModelIsValid = optInitialModel.exists { model =>
val providedCoefs = model.coefficientMatrix
val modelValid = (providedCoefs.numRows == numCoefficientSets) &&
@@ -619,15 +618,19 @@ class LogisticRegression @Since("1.2.0") (
val model = copyValues(new LogisticRegressionModel(uid, coefficients, intercept, numClasses,
isMultinomial))
- // TODO: need to implement model summary for MLOR... probably best to do it in another JIRA
- val (summaryModel, probabilityColName) = model.findSummaryModelAndProbabilityCol()
- val logRegSummary = new BinaryLogisticRegressionTrainingSummary(
- summaryModel.transform(dataset),
- probabilityColName,
- $(labelCol),
- $(featuresCol),
- objectiveHistory)
- val m = model.setSummary(logRegSummary)
+ // TODO: implement summary model for multinomial case
+ val m = if (!isMultinomial) {
+ val (summaryModel, probabilityColName) = model.findSummaryModelAndProbabilityCol()
+ val logRegSummary = new BinaryLogisticRegressionTrainingSummary(
+ summaryModel.transform(dataset),
+ probabilityColName,
+ $(labelCol),
+ $(featuresCol),
+ objectiveHistory)
+ model.setSummary(logRegSummary)
+ } else {
+ model
+ }
instr.logSuccess(m)
m
}
@@ -642,7 +645,11 @@ object LogisticRegression extends DefaultParamsReadable[LogisticRegression] {
@Since("1.6.0")
override def load(path: String): LogisticRegression = super.load(path)
- private[classification] lazy val supportedFamilyNames = Array("binomial", "multinomial")
+ private val multinomial = "multinomial"
+ private val binomial = "binomial"
+ private val auto = "auto"
+
+ private[classification] lazy val supportedFamilyNames = Array(auto, binomial, multinomial)
}
/**
@@ -891,7 +898,7 @@ class LogisticRegressionModel private[spark] (
if (trainingSummary.isDefined) newModel.setSummary(trainingSummary.get)
newModel.setParent(parent)
}
- // TODO: basically check all these methods
+
override protected def raw2prediction(rawPrediction: Vector): Double = {
if (isMultinomial) {
super.raw2prediction(rawPrediction)
diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
index a0af82c2ea42c..899158e45954a 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
@@ -85,6 +85,9 @@ class LogisticRegressionSuite
binaryDataset.rdd.map { case Row(label: Double, features: Vector) =>
label + "," + features.toArray.mkString(",")
}.repartition(1).saveAsTextFile("target/tmp/LogisticRegressionSuite/binaryDataset")
+ multinomialDataset.rdd.map { case Row(label: Double, features: Vector) =>
+ label + "," + features.toArray.mkString(",")
+ }.repartition(1).saveAsTextFile("target/tmp/LogisticRegressionSuite/multinomialDataset")
}
test("params") {
@@ -100,6 +103,7 @@ class LogisticRegressionSuite
assert(lr.getPredictionCol === "prediction")
assert(lr.getRawPredictionCol === "rawPrediction")
assert(lr.getProbabilityCol === "probability")
+ assert(lr.getFamily === "multinomial")
assert(!lr.isDefined(lr.weightCol))
assert(lr.getFitIntercept)
assert(lr.getStandardization)
@@ -221,7 +225,6 @@ class LogisticRegressionSuite
}
test("logistic regression: Predictor, Classifier methods") {
- val spark = this.spark
val lr = new LogisticRegression
val model = lr.fit(dataset)
@@ -811,6 +814,7 @@ class LogisticRegressionSuite
}
test("evaluate on test set") {
+ // TODO: add for multiclass
// Evaluate on test set should be same as that of the transformed training data.
val lr = new LogisticRegression()
.setMaxIter(10)
@@ -845,63 +849,89 @@ class LogisticRegressionSuite
}
- test("binary logistic regression with weighted samples") {
- val (dataset, weightedDataset) = {
- val nPoints = 1000
- val coefficients = Array(-0.57997, 0.912083, -0.371077, -0.819866, 2.688191)
- val xMean = Array(5.843, 3.057, 3.758, 1.199)
- val xVariance = Array(0.6856, 0.1899, 3.116, 0.581)
- val testData =
- generateMultinomialLogisticInput(coefficients, xMean, xVariance, true, nPoints, 42)
-
- // Let's over-sample the positive samples twice.
- val data1 = testData.flatMap { case labeledPoint: LabeledPoint =>
- if (labeledPoint.label == 1.0) {
- Iterator(labeledPoint, labeledPoint)
- } else {
- Iterator(labeledPoint)
- }
- }
+ test("binary logistic regression with weighted data") {
+ val numClasses = 2
+ val numPoints = 40
+ val outlierData = MLTestingUtils.genClassificationInstancesWithWeightedOutliers(spark,
+ numClasses, numPoints)
+ val testData = spark.createDataFrame(Array.tabulate[LabeledPoint](numClasses) { i =>
+ LabeledPoint(i.toDouble, Vectors.dense(i.toDouble))
+ })
+ val lr = new LogisticRegression().setWeightCol("weight")
+ val model = lr.fit(outlierData)
+ val results = model.transform(testData).select("label", "prediction").collect()
+
+ // check that the predictions recover the one-to-one mapping from feature value to label
+ results.foreach { case Row(label: Double, pred: Double) =>
+ assert(label === pred)
+ }
+ val (overSampledData, weightedData) =
+ MLTestingUtils.genEquivalentOversampledAndWeightedInstances(outlierData, "label", "features",
+ 42L)
+ val weightedModel = lr.fit(weightedData)
+ val overSampledModel = lr.setWeightCol("").fit(overSampledData)
+ assert(weightedModel.coefficientMatrix ~== overSampledModel.coefficientMatrix relTol 0.01)
+ }
- val rnd = new Random(8392)
- val data2 = testData.flatMap { case LabeledPoint(label: Double, features: Vector) =>
- if (rnd.nextGaussian() > 0.0) {
- if (label == 1.0) {
- Iterator(
- Instance(label, 1.2, features),
- Instance(label, 0.8, features),
- Instance(0.0, 0.0, features))
- } else {
- Iterator(
- Instance(label, 0.3, features),
- Instance(1.0, 0.0, features),
- Instance(label, 0.1, features),
- Instance(label, 0.6, features))
- }
- } else {
- if (label == 1.0) {
- Iterator(Instance(label, 2.0, features))
- } else {
- Iterator(Instance(label, 1.0, features))
- }
- }
- }
+ test("multinomial logistic regression with weighted data") {
+ val numClasses = 5
+ val numPoints = 40
+ val outlierData = MLTestingUtils.genClassificationInstancesWithWeightedOutliers(spark,
+ numClasses, numPoints)
+ val testData = spark.createDataFrame(Array.tabulate[LabeledPoint](numClasses) { i =>
+ LabeledPoint(i.toDouble, Vectors.dense(i.toDouble))
+ })
+ val mlr = new LogisticRegression().setWeightCol("weight")
+ val model = mlr.fit(outlierData)
+ val results = model.transform(testData).select("label", "prediction").collect()
+
+ // check that the predictions recover the one-to-one mapping from feature value to label
+ results.foreach { case Row(label: Double, pred: Double) =>
+ assert(label === pred)
+ }
+ val (overSampledData, weightedData) =
+ MLTestingUtils.genEquivalentOversampledAndWeightedInstances(outlierData, "label", "features",
+ 42L)
+ val weightedModel = mlr.fit(weightedData)
+ val overSampledModel = mlr.setWeightCol("").fit(overSampledData)
+ assert(weightedModel.coefficientMatrix ~== overSampledModel.coefficientMatrix relTol 0.01)
+ }
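Both weighted-data tests lean on the same identity: an instance with integer weight w contributes to the log-likelihood exactly like w unweighted copies of it,
{{{
  \sum_i w_i * \log L(x_i, y_i) = \sum_i \sum_{j=1}^{w_i} \log L(x_i, y_i)
}}}
so the weighted and oversampled fits should agree up to optimizer tolerance, hence the relTol 0.01 comparison of the coefficient matrices.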
- (spark.createDataFrame(sc.parallelize(data1, 4)),
- spark.createDataFrame(sc.parallelize(data2, 4)))
+ test("set family") {
+ val lr = new LogisticRegression().setMaxIter(1)
+ // don't set anything for binary classification
+ val model1 = lr.fit(binaryDataset)
+ assert(model1.coefficientMatrix.numRows === 1 && model1.coefficientMatrix.numCols === 4)
+ assert(model1.interceptVector.size === 1)
+
+ // set to multinomial for binary classification
+ val model2 = lr.setFamily("multinomial").fit(binaryDataset)
+ assert(model2.coefficientMatrix.numRows === 2 && model2.coefficientMatrix.numCols === 4)
+ assert(model2.interceptVector.size === 2)
+
+ // set to binary for binary classification
+ val model3 = lr.setFamily("binomial").fit(binaryDataset)
+ assert(model3.coefficientMatrix.numRows === 1 && model3.coefficientMatrix.numCols === 4)
+ assert(model3.interceptVector.size === 1)
+
+ // don't set anything for multiclass classification
+ val mlr = new LogisticRegression().setMaxIter(1)
+ val model4 = mlr.fit(multinomialDataset)
+ assert(model4.coefficientMatrix.numRows === 3 && model4.coefficientMatrix.numCols === 4)
+ assert(model4.interceptVector.size === 3)
+
+ // set to binary for multiclass classification
+ mlr.setFamily("binomial")
+ val thrown = intercept[IllegalArgumentException] {
+ mlr.fit(multinomialDataset)
}
+ assert(thrown.getMessage.contains("Binomial family only supports 1 or 2 outcome classes"))
- val trainer1a = (new LogisticRegression).setFitIntercept(true)
- .setRegParam(0.0).setStandardization(true)
- val trainer1b = (new LogisticRegression).setFitIntercept(true).setWeightCol("weight")
- .setRegParam(0.0).setStandardization(true)
- val model1a0 = trainer1a.fit(dataset)
- val model1a1 = trainer1a.fit(weightedDataset)
- val model1b = trainer1b.fit(weightedDataset)
- assert(model1a0.coefficients !~= model1a1.coefficients absTol 1E-3)
- assert(model1a0.intercept !~= model1a1.intercept absTol 1E-3)
- assert(model1a0.coefficients ~== model1b.coefficients absTol 1E-3)
- assert(model1a0.intercept ~== model1b.intercept absTol 1E-3)
+ // set to multinomial for multiclass
+ mlr.setFamily("multinomial")
+ val model5 = mlr.fit(multinomialDataset)
+ assert(model5.coefficientMatrix.numRows === 3 && model5.coefficientMatrix.numCols === 4)
+ assert(model5.interceptVector.size === 3)
}
test("set initial model") {
From a35469019ba6ca0cb0fd9877c28ae02aba46d337 Mon Sep 17 00:00:00 2001
From: sethah
Date: Thu, 25 Aug 2016 13:11:44 -0700
Subject: [PATCH 04/24] all auxiliary tests are merged to LOR, and added
initial model test
---
.../classification/LogisticRegression.scala | 6 +-
.../LogisticRegressionSuite.scala | 315 ++++++++++++++++--
.../MultinomialLogisticRegressionSuite.scala | 264 ++-------------
3 files changed, 322 insertions(+), 263 deletions(-)
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
index aca96aa3ba3a1..9b1845eaef98a 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
@@ -458,7 +458,8 @@ class LogisticRegression @Since("1.2.0") (
new BreezeOWLQN[Int, BDV[Double]]($(maxIter), 10, regParamL1Fun, $(tol))
}
- val initialCoefficientsWithIntercept = Vectors.zeros(numCoefficientSets * numFeatures)
+ val initialCoefficientsWithIntercept =
+ Vectors.zeros(numCoefficientSets * numFeaturesPlusIntercept)
val initialModelIsValid = optInitialModel.exists { model =>
val providedCoefs = model.coefficientMatrix
@@ -678,7 +679,7 @@ class LogisticRegressionModel private[spark] (
@Since("1.3.0")
def intercept: Double = {
if (isMultinomial) {
- logWarning("Multiclass model contains an vector of intercepts, use interceptVector instead." +
+ logWarning("Multiclass model contains a vector of intercepts, use interceptVector instead." +
"Returning 0.0 as placeholder.")
}
_intercept
@@ -940,6 +941,7 @@ class LogisticRegressionModel private[spark] (
@Since("1.6.0")
object LogisticRegressionModel extends MLReadable[LogisticRegressionModel] {
+ // TODO: we need to be able to load old models as well
@Since("1.6.0")
override def read: MLReader[LogisticRegressionModel] = new LogisticRegressionModelReader
diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
index 899158e45954a..a8e94fafa50ed 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
@@ -17,6 +17,8 @@
package org.apache.spark.ml.classification
+import org.apache.spark.ml.attribute.NominalAttribute
+
import scala.collection.JavaConverters._
import scala.language.existentials
import scala.util.Random
@@ -25,7 +27,7 @@ import scala.util.control.Breaks._
import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.classification.LogisticRegressionSuite._
import org.apache.spark.ml.feature.{Instance, LabeledPoint}
-import org.apache.spark.ml.linalg.{DenseMatrix, Vector, Vectors}
+import org.apache.spark.ml.linalg.{DenseMatrix, Matrices, Vector, Vectors}
import org.apache.spark.ml.param.ParamsSuite
import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils}
import org.apache.spark.ml.util.TestingUtils._
@@ -36,7 +38,8 @@ import org.apache.spark.sql.functions.lit
class LogisticRegressionSuite
extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {
- @transient var dataset: Dataset[_] = _
+ @transient var smallBinaryDataset: Dataset[_] = _
+ @transient var smallMultinomialDataset: Dataset[_] = _
@transient var binaryDataset: Dataset[_] = _
@transient var multinomialDataset: Dataset[_] = _
private val eps: Double = 1e-5
@@ -44,7 +47,25 @@ class LogisticRegressionSuite
override def beforeAll(): Unit = {
super.beforeAll()
- dataset = spark.createDataFrame(generateLogisticInput(1.0, 1.0, nPoints = 100, seed = 42))
+ smallBinaryDataset =
+ spark.createDataFrame(generateLogisticInput(1.0, 1.0, nPoints = 100, seed = 42))
+
+ smallMultinomialDataset = {
+ val nPoints = 100
+ val coefficients = Array(
+ -0.57997, 0.912083, -0.371077,
+ -0.16624, -0.84355, -0.048509)
+
+ val xMean = Array(5.843, 3.057)
+ val xVariance = Array(0.6856, 0.1899)
+
+ val testData = generateMultinomialLogisticInput(
+ coefficients, xMean, xVariance, addIntercept = true, nPoints, 42)
+
+ val df = spark.createDataFrame(sc.parallelize(testData, 4))
+ df.cache()
+ df
+ }
binaryDataset = {
val nPoints = 10000
@@ -78,7 +99,7 @@ class LogisticRegressionSuite
}
/**
- * Enable the ignored test to export the dataset into CSV format,
+ * Enable the ignored test to export the binary and multinomial datasets into CSV format,
* so we can validate the training accuracy compared with R's glmnet package.
*/
ignore("export test data into CSV format") {
@@ -103,12 +124,12 @@ class LogisticRegressionSuite
assert(lr.getPredictionCol === "prediction")
assert(lr.getRawPredictionCol === "rawPrediction")
assert(lr.getProbabilityCol === "probability")
- assert(lr.getFamily === "multinomial")
+ assert(lr.getFamily === "auto")
assert(!lr.isDefined(lr.weightCol))
assert(lr.getFitIntercept)
assert(lr.getStandardization)
- val model = lr.fit(dataset)
- model.transform(dataset)
+ val model = lr.fit(smallBinaryDataset)
+ model.transform(smallBinaryDataset)
.select("label", "probability", "prediction", "rawPrediction")
.collect()
assert(model.getThreshold === 0.5)
@@ -122,11 +143,11 @@ class LogisticRegressionSuite
test("empty probabilityCol") {
val lr = new LogisticRegression().setProbabilityCol("")
- val model = lr.fit(dataset)
+ val model = lr.fit(smallBinaryDataset)
assert(model.hasSummary)
// Validate that we re-insert a probability column for evaluation
val fieldNames = model.summary.predictions.schema.fieldNames
- assert(dataset.schema.fieldNames.toSet.subsetOf(
+ assert(smallBinaryDataset.schema.fieldNames.toSet.subsetOf(
fieldNames.toSet))
assert(fieldNames.exists(s => s.startsWith("probability_")))
}
@@ -163,17 +184,59 @@ class LogisticRegressionSuite
// thresholds and threshold must be consistent: values
withClue("fit with ParamMap should throw error if threshold, thresholds do not match.") {
intercept[IllegalArgumentException] {
- val lr2model = lr2.fit(dataset,
+ val lr2model = lr2.fit(smallBinaryDataset,
lr2.thresholds -> Array(0.3, 0.7), lr2.threshold -> (expectedThreshold / 2.0))
lr2model.getThreshold
}
}
}
+ test("thresholds prediction") {
+ val blr = new LogisticRegression().setFamily("binomial")
+ val binaryModel = blr.fit(smallBinaryDataset)
+
+ binaryModel.setThreshold(1.0)
+ val binaryZeroPredictions =
+ binaryModel.transform(smallBinaryDataset).select("prediction").collect()
+ assert(binaryZeroPredictions.forall(_.getDouble(0) === 0.0))
+
+ binaryModel.setThreshold(0.0)
+ val binaryOnePredictions =
+ binaryModel.transform(smallBinaryDataset).select("prediction").collect()
+ assert(binaryOnePredictions.forall(_.getDouble(0) === 1.0))
+
+ val mlr = new LogisticRegression().setFamily("multinomial")
+ val model = mlr.fit(smallMultinomialDataset)
+ val basePredictions = model.transform(smallMultinomialDataset).select("prediction").collect()
+
+ // should predict all zeros
+ model.setThresholds(Array(1, 1000, 1000))
+ val zeroPredictions = model.transform(smallMultinomialDataset).select("prediction").collect()
+ assert(zeroPredictions.forall(_.getDouble(0) === 0.0))
+
+ // should predict all ones
+ model.setThresholds(Array(1000, 1, 1000))
+ val onePredictions = model.transform(smallMultinomialDataset).select("prediction").collect()
+ assert(onePredictions.forall(_.getDouble(0) === 1.0))
+
+ // should predict all twos
+ model.setThresholds(Array(1000, 1000, 1))
+ val twoPredictions = model.transform(smallMultinomialDataset).select("prediction").collect()
+ assert(twoPredictions.forall(_.getDouble(0) === 2.0))
+
+ // constant threshold scaling is the same as no thresholds
+ model.setThresholds(Array(1000, 1000, 1000))
+ val scaledPredictions = model.transform(smallMultinomialDataset).select("prediction").collect()
+ assert(scaledPredictions.zip(basePredictions).forall { case (scaled, base) =>
+ scaled.getDouble(0) === base.getDouble(0)
+ })
+ }
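The final assertion holds because prediction is the argmax over p(k) / t(k), so a constant scale applied to every threshold cancels out of the argmax. A compact reference version of that rule (a sketch only; tie-breaking may differ from the model's internal loop):

    def thresholdedPrediction(prob: Array[Double], thresholds: Array[Double]): Int =
      prob.zip(thresholds).zipWithIndex.maxBy { case ((p, t), _) =>
        if (t == 0.0) Double.PositiveInfinity else p / t
      }._2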
+
test("logistic regression doesn't fit intercept when fitIntercept is off") {
val lr = new LogisticRegression
lr.setFitIntercept(false)
- val model = lr.fit(dataset)
+ val model = lr.fit(smallBinaryDataset)
assert(model.intercept === 0.0)
// copied model must have the same parent.
@@ -187,7 +250,7 @@ class LogisticRegressionSuite
.setRegParam(1.0)
.setThreshold(0.6)
.setProbabilityCol("myProbability")
- val model = lr.fit(dataset)
+ val model = lr.fit(smallBinaryDataset)
val parent = model.parent.asInstanceOf[LogisticRegression]
assert(parent.getMaxIter === 10)
assert(parent.getRegParam === 1.0)
@@ -196,16 +259,16 @@ class LogisticRegressionSuite
// Modify model params, and check that the params worked.
model.setThreshold(1.0)
- val predAllZero = model.transform(dataset)
+ val predAllZero = model.transform(smallBinaryDataset)
.select("prediction", "myProbability")
.collect()
.map { case Row(pred: Double, prob: Vector) => pred }
assert(predAllZero.forall(_ === 0),
s"With threshold=1.0, expected predictions to be all 0, but only" +
- s" ${predAllZero.count(_ === 0)} of ${dataset.count()} were 0.")
+ s" ${predAllZero.count(_ === 0)} of ${smallBinaryDataset.count()} were 0.")
// Call transform with params, and check that the params worked.
val predNotAllZero =
- model.transform(dataset, model.threshold -> 0.0,
+ model.transform(smallBinaryDataset, model.threshold -> 0.0,
model.probabilityCol -> "myProb")
.select("prediction", "myProb")
.collect()
@@ -214,7 +277,7 @@ class LogisticRegressionSuite
// Call fit() with new params, and check as many params as we can.
lr.setThresholds(Array(0.6, 0.4))
- val model2 = lr.fit(dataset, lr.maxIter -> 5, lr.regParam -> 0.1,
+ val model2 = lr.fit(smallBinaryDataset, lr.maxIter -> 5, lr.regParam -> 0.1,
lr.probabilityCol -> "theProb")
val parent2 = model2.parent.asInstanceOf[LogisticRegression]
assert(parent2.getMaxIter === 5)
@@ -224,16 +287,63 @@ class LogisticRegressionSuite
assert(model2.getProbabilityCol === "theProb")
}
- test("logistic regression: Predictor, Classifier methods") {
+ test("multinomial logistic regression: Predictor, Classifier methods") {
+ val mlr = new LogisticRegression
+
+ val model = mlr.fit(smallMultinomialDataset)
+ assert(model.numClasses === 3)
+ val numFeatures = smallMultinomialDataset.select("features").first().getAs[Vector](0).size
+ assert(model.numFeatures === numFeatures)
+
+ val results = model.transform(smallMultinomialDataset)
+ // check that raw prediction is coefficients dot features + intercept
+ results.select("rawPrediction", "features").collect().foreach {
+ case Row(raw: Vector, features: Vector) =>
+ assert(raw.size === 3)
+ val margins = Array.tabulate(3) { k =>
+ var margin = 0.0
+ features.foreachActive { (index, value) =>
+ margin += value * model.coefficientMatrix(k, index)
+ }
+ margin += model.interceptVector(k)
+ margin
+ }
+ assert(raw ~== Vectors.dense(margins) relTol eps)
+ }
+
+ // Compare rawPrediction with probability
+ results.select("rawPrediction", "probability").collect().foreach {
+ case Row(raw: Vector, prob: Vector) =>
+ assert(raw.size === 3)
+ assert(prob.size === 3)
+ val max = raw.toArray.max
+ val subtract = if (max > 0) max else 0.0
+ val sum = raw.toArray.map(x => math.exp(x - subtract)).sum
+ val probFromRaw0 = math.exp(raw(0) - subtract) / sum
+ val probFromRaw1 = math.exp(raw(1) - subtract) / sum
+ assert(prob(0) ~== probFromRaw0 relTol eps)
+ assert(prob(1) ~== probFromRaw1 relTol eps)
+ assert(prob(2) ~== 1.0 - probFromRaw1 - probFromRaw0 relTol eps)
+ }
+
+ // Compare prediction with probability
+ results.select("prediction", "probability").collect().foreach {
+ case Row(pred: Double, prob: Vector) =>
+ val predFromProb = prob.toArray.zipWithIndex.maxBy(_._1)._2
+ assert(pred == predFromProb)
+ }
+ }
+
+ test("binary logistic regression: Predictor, Classifier methods") {
val lr = new LogisticRegression
- val model = lr.fit(dataset)
+ val model = lr.fit(smallBinaryDataset)
assert(model.numClasses === 2)
- val numFeatures = dataset.select("features").first().getAs[Vector](0).size
+ val numFeatures = smallBinaryDataset.select("features").first().getAs[Vector](0).size
assert(model.numFeatures === numFeatures)
val threshold = model.getThreshold
- val results = model.transform(dataset)
+ val results = model.transform(smallBinaryDataset)
// Compare rawPrediction with probability
results.select("rawPrediction", "probability").collect().foreach {
@@ -253,6 +363,29 @@ class LogisticRegressionSuite
}
}
+ test("overflow prediction for multiclass") {
+ val model = new LogisticRegressionModel("mLogReg",
+ Matrices.dense(3, 2, Array(0.0, 0.0, 0.0, 1.0, 2.0, 3.0)),
+ Vectors.dense(0.0, 0.0, 0.0), 3, true)
+ val overFlowData = spark.createDataFrame(Seq(
+ LabeledPoint(1.0, Vectors.dense(0.0, 1000.0)),
+ LabeledPoint(1.0, Vectors.dense(0.0, -1.0))
+ ))
+ val results = model.transform(overFlowData).select("rawPrediction", "probability").collect()
+
+ // probabilities are correct when margins have to be adjusted
+ val raw1 = results(0).getAs[Vector](0)
+ val prob1 = results(0).getAs[Vector](1)
+ assert(raw1 === Vectors.dense(1000.0, 2000.0, 3000.0))
+ assert(prob1 ~== Vectors.dense(0.0, 0.0, 1.0) absTol eps)
+
+ // probabilities are correct when margins don't have to be adjusted
+ val raw2 = results(1).getAs[Vector](0)
+ val prob2 = results(1).getAs[Vector](1)
+ assert(raw2 === Vectors.dense(-1.0, -2.0, -3.0))
+ assert(prob2 ~== Vectors.dense(0.66524096, 0.24472847, 0.09003057) relTol eps)
+ }
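Worked numbers behind the first assertion, assuming the usual max-subtraction trick (this sketch subtracts the max unconditionally; the model only subtracts it when it is positive):

    val raw = Array(1000.0, 2000.0, 3000.0)
    val shifted = raw.map(_ - raw.max)  // Array(-2000.0, -1000.0, 0.0)
    val expd = shifted.map(math.exp)    // underflows harmlessly to Array(0.0, 0.0, 1.0)
    val probs = expd.map(_ / expd.sum)  // Array(0.0, 0.0, 1.0); naive exp(3000.0) would overflow to Infinity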
+
test("MultiClassSummarizer") {
val summarizer1 = (new MultiClassSummarizer)
.add(0.0).add(3.0).add(4.0).add(3.0).add(6.0)
@@ -789,6 +922,7 @@ class LogisticRegressionSuite
assert(model2.coefficients ~= coefficientsTheory absTol 1E-6)
/*
+ TODO: why is this needed? The correctness of L1 regularization is already checked elsewhere
Using the following R code to load the data and train the model using glmnet package.
library("glmnet")
@@ -813,17 +947,69 @@ class LogisticRegressionSuite
assert(model1.coefficients ~== coefficientsR absTol 1E-6)
}
+ test("multinomial logistic regression with intercept with strong L1 regularization") {
+ val trainer1 = (new LogisticRegression).setFitIntercept(true)
+ .setElasticNetParam(1.0).setRegParam(6.0).setStandardization(true)
+ val trainer2 = (new LogisticRegression).setFitIntercept(true)
+ .setElasticNetParam(1.0).setRegParam(6.0).setStandardization(false)
+
+ val sqlContext = multinomialDataset.sqlContext
+ import sqlContext.implicits._
+ val model1 = trainer1.fit(multinomialDataset)
+ val model2 = trainer2.fit(multinomialDataset)
+
+ val histogram = multinomialDataset.as[LabeledPoint].rdd.map(_.label)
+ .treeAggregate(new MultiClassSummarizer)(
+ seqOp = (c, v) => (c, v) match {
+ case (classSummarizer: MultiClassSummarizer, label: Double) => classSummarizer.add(label)
+ },
+ combOp = (c1, c2) => (c1, c2) match {
+ case (classSummarizer1: MultiClassSummarizer, classSummarizer2: MultiClassSummarizer) =>
+ classSummarizer1.merge(classSummarizer2)
+ }).histogram
+ val numFeatures = multinomialDataset.as[LabeledPoint].first().features.size
+ val numClasses = histogram.length
+
+ /*
+ For multinomial logistic regression with strong L1 regularization, all the coefficients
+ will be zeros. As a result, the intercepts will be proportional to the log counts in the
+ histogram.
+ {{{
+ \exp(b_k) = count_k * \exp(\lambda)
+ b_k = \log(count_k) + \lambda
+ }}}
+ \lambda is a free parameter, so choose \lambda such that the mean of the
+ intercepts is zero (see the numeric sketch after this test). This yields
+ {{{
+ b_k = \log(count_k)
+ b_k' = b_k - \mean(b_k)
+ }}}
+ */
+ val rawInterceptsTheory = histogram.map(c => math.log(c + 1)) // add 1 for smoothing
+ val rawMean = rawInterceptsTheory.sum / rawInterceptsTheory.length
+ val interceptsTheory = Vectors.dense(rawInterceptsTheory.map(_ - rawMean))
+ val coefficientsTheory = new DenseMatrix(numClasses, numFeatures,
+ Array.fill[Double](numClasses * numFeatures)(0.0), isTransposed = true)
+
+ assert(model1.interceptVector ~== interceptsTheory relTol 1E-3)
+ assert(model1.coefficientMatrix ~= coefficientsTheory absTol 1E-6)
+
+ assert(model2.interceptVector ~== interceptsTheory relTol 1E-3)
+ assert(model2.coefficientMatrix ~= coefficientsTheory absTol 1E-6)
+ }
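A numeric sketch of the derivation in the comment above, with hypothetical class counts (the test itself uses the dataset histogram plus add-one smoothing):

    val counts = Array(100.0, 200.0, 700.0)
    val rawB = counts.map(math.log)    // b_k = log(count_k), taking lambda = 0
    val mean = rawB.sum / rawB.length
    val centered = rawB.map(_ - mean)  // centered intercepts, summing to ~0.0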
+
test("evaluate on test set") {
- // TODO: add for multiclass
+ // TODO: add for multiclass when model summary becomes available
// Evaluate on test set should be same as that of the transformed training data.
val lr = new LogisticRegression()
.setMaxIter(10)
.setRegParam(1.0)
.setThreshold(0.6)
- val model = lr.fit(dataset)
+ val model = lr.fit(smallBinaryDataset)
val summary = model.summary.asInstanceOf[BinaryLogisticRegressionSummary]
- val sameSummary = model.evaluate(dataset).asInstanceOf[BinaryLogisticRegressionSummary]
+ val sameSummary =
+ model.evaluate(smallBinaryDataset).asInstanceOf[BinaryLogisticRegressionSummary]
assert(summary.areaUnderROC === sameSummary.areaUnderROC)
assert(summary.roc.collect() === sameSummary.roc.collect())
assert(summary.pr.collect === sameSummary.pr.collect())
@@ -840,7 +1026,7 @@ class LogisticRegressionSuite
.setMaxIter(10)
.setRegParam(1.0)
.setThreshold(0.6)
- val model = lr.fit(dataset)
+ val model = lr.fit(smallBinaryDataset)
assert(
model.summary
.objectiveHistory
@@ -934,9 +1120,16 @@ class LogisticRegressionSuite
assert(model5.interceptVector.size === 3)
}
+ test("intercept priors") {
+ // TODO
+ // Get coefficients from normal model with strong L1
+ // Set initial model with computed priors...
+ }
+
test("set initial model") {
// TODO: the binary one doesn't converge any faster
// TODO: should they converge after one or two iterations?
+ // We can just run the other models for a few iterations and then check the predictions
val lr = new LogisticRegression()
val model1 = lr.fit(binaryDataset)
val lr2 = new LogisticRegression().setInitialModel(model1)
@@ -949,7 +1142,7 @@ class LogisticRegressionSuite
}
test("logistic regression with all labels the same") {
- val sameLabels = dataset
+ val sameLabels = smallBinaryDataset
.withColumn("zeroLabel", lit(0.0))
.withColumn("oneLabel", lit(1.0))
@@ -990,6 +1183,76 @@ class LogisticRegressionSuite
assert(allOneNoInterceptModel.summary.totalIterations > 0)
}
+ test("multiclass logistic regression with all labels the same") {
+ val constantData = spark.createDataFrame(Seq(
+ LabeledPoint(4.0, Vectors.dense(0.0)),
+ LabeledPoint(4.0, Vectors.dense(1.0)),
+ LabeledPoint(4.0, Vectors.dense(2.0)))
+ )
+ val mlr = new LogisticRegression().setFamily("multinomial")
+ val model = mlr.fit(constantData)
+ val results = model.transform(constantData)
+ results.select("rawPrediction", "probability", "prediction").collect().foreach {
+ case Row(raw: Vector, prob: Vector, pred: Double) =>
+ assert(raw === Vectors.dense(Array(0.0, 0.0, 0.0, 0.0, Double.PositiveInfinity)))
+ assert(prob === Vectors.dense(Array(0.0, 0.0, 0.0, 0.0, 1.0)))
+ assert(pred === 4.0)
+ }
+
+ // force the model to be trained with only one class
+ val constantZeroData = spark.createDataFrame(Seq(
+ LabeledPoint(0.0, Vectors.dense(0.0)),
+ LabeledPoint(0.0, Vectors.dense(1.0)),
+ LabeledPoint(0.0, Vectors.dense(2.0)))
+ )
+ val modelZeroLabel = mlr.setFitIntercept(false).fit(constantZeroData)
+ val resultsZero = modelZeroLabel.transform(constantZeroData)
+ resultsZero.select("rawPrediction", "probability", "prediction").collect().foreach {
+ case Row(raw: Vector, prob: Vector, pred: Double) =>
+ assert(prob === Vectors.dense(Array(1.0)))
+ assert(pred === 0.0)
+ }
+
+ // ensure that the correct value is predicted when numClasses passed through metadata
+ val labelMeta = NominalAttribute.defaultAttr.withName("label").withNumValues(6).toMetadata()
+ val constantDataWithMetadata = constantData
+ .select(constantData("label").as("label", labelMeta), constantData("features"))
+ val modelWithMetadata = mlr.setFitIntercept(true).fit(constantDataWithMetadata)
+ val resultsWithMetadata = modelWithMetadata.transform(constantDataWithMetadata)
+ resultsWithMetadata.select("rawPrediction", "probability", "prediction").collect().foreach {
+ case Row(raw: Vector, prob: Vector, pred: Double) =>
+ assert(raw === Vectors.dense(Array(0.0, 0.0, 0.0, 0.0, Double.PositiveInfinity, 0.0)))
+ assert(prob === Vectors.dense(Array(0.0, 0.0, 0.0, 0.0, 1.0, 0.0)))
+ assert(pred === 4.0)
+ }
+ // TODO: check num iters is zero when it becomes available in the model
+ }
+
+ test("numClasses specified in metadata/inferred") {
+ val lr = new LogisticRegression().setMaxIter(1)
+
+ // specify more classes than unique label values
+ val labelMeta = NominalAttribute.defaultAttr.withName("label").withNumValues(4).toMetadata()
+ val df = smallMultinomialDataset.select(smallMultinomialDataset("label").as("label", labelMeta),
+ smallMultinomialDataset("features"))
+ val model1 = lr.fit(df)
+ assert(model1.numClasses === 4)
+ assert(model1.interceptVector.size === 4)
+
+ // specify two classes when there are really three
+ val labelMeta1 = NominalAttribute.defaultAttr.withName("label").withNumValues(2).toMetadata()
+ val df1 = smallMultinomialDataset.select(smallMultinomialDataset("label").as("label", labelMeta1),
+ smallMultinomialDataset("features"))
+ val thrown = intercept[IllegalArgumentException] {
+ lr.fit(df1)
+ }
+ assert(thrown.getMessage.contains("less than the number of unique labels"))
+
+ // lr should infer the number of classes if not specified
+ val model3 = lr.fit(smallMultinomialDataset)
+ assert(model3.numClasses === 3)
+ }
+
test("read/write") {
def checkModelData(model: LogisticRegressionModel, model2: LogisticRegressionModel): Unit = {
assert(model.intercept === model2.intercept)
@@ -998,7 +1261,7 @@ class LogisticRegressionSuite
assert(model.numFeatures === model2.numFeatures)
}
val lr = new LogisticRegression()
- testEstimatorAndModelReadWrite(lr, dataset, LogisticRegressionSuite.allParamSettings,
+ testEstimatorAndModelReadWrite(lr, smallBinaryDataset, LogisticRegressionSuite.allParamSettings,
checkModelData)
}
diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/MultinomialLogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/MultinomialLogisticRegressionSuite.scala
index 9c7e08820d93b..9969bb02db04b 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/classification/MultinomialLogisticRegressionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/classification/MultinomialLogisticRegressionSuite.scala
@@ -86,35 +86,35 @@ class MultinomialLogisticRegressionSuite
rdd.saveAsTextFile("target/tmp/MultinomialLogisticRegressionSuite/multinomialDataset")
}
- test("params") {
- ParamsSuite.checkParams(new LogisticRegression)
- val model = new LogisticRegressionModel("mLogReg",
- Matrices.dense(2, 1, Array(0.0, 0.0)), Vectors.dense(0.0, 0.0), 2, true)
- ParamsSuite.checkParams(model)
- }
-
- test("multinomial logistic regression: default params") {
- val mlr = new LogisticRegression
- assert(mlr.getLabelCol === "label")
- assert(mlr.getFeaturesCol === "features")
- assert(mlr.getPredictionCol === "prediction")
- assert(mlr.getRawPredictionCol === "rawPrediction")
- assert(mlr.getProbabilityCol === "probability")
- assert(!mlr.isDefined(mlr.weightCol))
- assert(!mlr.isDefined(mlr.thresholds))
- assert(mlr.getFitIntercept)
- assert(mlr.getStandardization)
- val model = mlr.fit(dataset)
- model.transform(dataset)
- .select("label", "probability", "prediction", "rawPrediction")
- .collect()
- assert(model.getFeaturesCol === "features")
- assert(model.getPredictionCol === "prediction")
- assert(model.getRawPredictionCol === "rawPrediction")
- assert(model.getProbabilityCol === "probability")
- assert(model.interceptVector !== Vectors.dense(0.0, 0.0))
- assert(model.hasParent)
- }
+// test("params") {
+// ParamsSuite.checkParams(new LogisticRegression)
+// val model = new LogisticRegressionModel("mLogReg",
+// Matrices.dense(2, 1, Array(0.0, 0.0)), Vectors.dense(0.0, 0.0), 2, true)
+// ParamsSuite.checkParams(model)
+// }
+//
+// test("multinomial logistic regression: default params") {
+// val mlr = new LogisticRegression
+// assert(mlr.getLabelCol === "label")
+// assert(mlr.getFeaturesCol === "features")
+// assert(mlr.getPredictionCol === "prediction")
+// assert(mlr.getRawPredictionCol === "rawPrediction")
+// assert(mlr.getProbabilityCol === "probability")
+// assert(!mlr.isDefined(mlr.weightCol))
+// assert(!mlr.isDefined(mlr.thresholds))
+// assert(mlr.getFitIntercept)
+// assert(mlr.getStandardization)
+// val model = mlr.fit(dataset)
+// model.transform(dataset)
+// .select("label", "probability", "prediction", "rawPrediction")
+// .collect()
+// assert(model.getFeaturesCol === "features")
+// assert(model.getPredictionCol === "prediction")
+// assert(model.getRawPredictionCol === "rawPrediction")
+// assert(model.getProbabilityCol === "probability")
+// assert(model.interceptVector !== Vectors.dense(0.0, 0.0))
+// assert(model.hasParent)
+// }
test("multinomial logistic regression with intercept without regularization") {
@@ -813,202 +813,6 @@ class MultinomialLogisticRegressionSuite
}
*/
- test("prediction") {
- val model = new LogisticRegressionModel("mLogReg",
- Matrices.dense(3, 2, Array(0.0, 0.0, 0.0, 1.0, 2.0, 3.0)),
- Vectors.dense(0.0, 0.0, 0.0), 3, true)
- val overFlowData = spark.createDataFrame(Seq(
- LabeledPoint(1.0, Vectors.dense(0.0, 1000.0)),
- LabeledPoint(1.0, Vectors.dense(0.0, -1.0))
- ))
- val results = model.transform(overFlowData).select("rawPrediction", "probability").collect()
-
- // probabilities are correct when margins have to be adjusted
- val raw1 = results(0).getAs[Vector](0)
- val prob1 = results(0).getAs[Vector](1)
- assert(raw1 === Vectors.dense(1000.0, 2000.0, 3000.0))
- assert(prob1 ~== Vectors.dense(0.0, 0.0, 1.0) absTol eps)
-
- // probabilities are correct when margins don't have to be adjusted
- val raw2 = results(1).getAs[Vector](0)
- val prob2 = results(1).getAs[Vector](1)
- assert(raw2 === Vectors.dense(-1.0, -2.0, -3.0))
- assert(prob2 ~== Vectors.dense(0.66524096, 0.24472847, 0.09003057) relTol eps)
- }
-
- test("multinomial logistic regression: Predictor, Classifier methods") {
- val mlr = new LogisticRegression
-
- val model = mlr.fit(dataset)
- assert(model.numClasses === 3)
- val numFeatures = dataset.select("features").first().getAs[Vector](0).size
- assert(model.numFeatures === numFeatures)
-
- val results = model.transform(dataset)
- // check that raw prediction is coefficients dot features + intercept
- results.select("rawPrediction", "features").collect().foreach {
- case Row(raw: Vector, features: Vector) =>
- assert(raw.size === 3)
- val margins = Array.tabulate(3) { k =>
- var margin = 0.0
- features.foreachActive { (index, value) =>
- margin += value * model.coefficientMatrix(k, index)
- }
- margin += model.interceptVector(k)
- margin
- }
- assert(raw ~== Vectors.dense(margins) relTol eps)
- }
-
- // Compare rawPrediction with probability
- results.select("rawPrediction", "probability").collect().foreach {
- case Row(raw: Vector, prob: Vector) =>
- assert(raw.size === 3)
- assert(prob.size === 3)
- val max = raw.toArray.max
- val subtract = if (max > 0) max else 0.0
- val sum = raw.toArray.map(x => math.exp(x - subtract)).sum
- val probFromRaw0 = math.exp(raw(0) - subtract) / sum
- val probFromRaw1 = math.exp(raw(1) - subtract) / sum
- assert(prob(0) ~== probFromRaw0 relTol eps)
- assert(prob(1) ~== probFromRaw1 relTol eps)
- assert(prob(2) ~== 1.0 - probFromRaw1 - probFromRaw0 relTol eps)
- }
-
- // Compare prediction with probability
- results.select("prediction", "probability").collect().foreach {
- case Row(pred: Double, prob: Vector) =>
- val predFromProb = prob.toArray.zipWithIndex.maxBy(_._1)._2
- assert(pred == predFromProb)
- }
- }
-
- test("multinomial logistic regression coefficients should be centered") {
- val mlr = new LogisticRegression().setMaxIter(1)
- val model = mlr.fit(dataset)
- assert(model.interceptVector.toArray.sum ~== 0.0 absTol 1e-6)
- assert(model.coefficientMatrix.toArray.sum ~== 0.0 absTol 1e-6)
- }
-
- test("numClasses specified in metadata/inferred") {
- val mlr = new LogisticRegression().setMaxIter(1)
-
- // specify more classes than unique label values
- val labelMeta = NominalAttribute.defaultAttr.withName("label").withNumValues(4).toMetadata()
- val df = dataset.select(dataset("label").as("label", labelMeta), dataset("features"))
- val model1 = mlr.fit(df)
- assert(model1.numClasses === 4)
- assert(model1.interceptVector.size === 4)
-
- // specify two classes when there are really three
- val labelMeta1 = NominalAttribute.defaultAttr.withName("label").withNumValues(2).toMetadata()
- val df1 = dataset.select(dataset("label").as("label", labelMeta1), dataset("features"))
- val thrown = intercept[IllegalArgumentException] {
- mlr.fit(df1)
- }
- assert(thrown.getMessage.contains("less than the number of unique labels"))
-
- // mlr should infer the number of classes if not specified
- val model3 = mlr.fit(dataset)
- assert(model3.numClasses === 3)
- }
-
- test("all labels the same") {
- val constantData = spark.createDataFrame(Seq(
- LabeledPoint(4.0, Vectors.dense(0.0)),
- LabeledPoint(4.0, Vectors.dense(1.0)),
- LabeledPoint(4.0, Vectors.dense(2.0)))
- )
- val mlr = new LogisticRegression().setFamily("multinomial")
- val model = mlr.fit(constantData)
- val results = model.transform(constantData)
- results.select("rawPrediction", "probability", "prediction").collect().foreach {
- case Row(raw: Vector, prob: Vector, pred: Double) =>
- assert(raw === Vectors.dense(Array(0.0, 0.0, 0.0, 0.0, Double.PositiveInfinity)))
- assert(prob === Vectors.dense(Array(0.0, 0.0, 0.0, 0.0, 1.0)))
- assert(pred === 4.0)
- }
-
- // force the model to be trained with only one class
- val constantZeroData = spark.createDataFrame(Seq(
- LabeledPoint(0.0, Vectors.dense(0.0)),
- LabeledPoint(0.0, Vectors.dense(1.0)),
- LabeledPoint(0.0, Vectors.dense(2.0)))
- )
- val modelZeroLabel = mlr.setFitIntercept(false).fit(constantZeroData)
- val resultsZero = modelZeroLabel.transform(constantZeroData)
- resultsZero.select("rawPrediction", "probability", "prediction").collect().foreach {
- case Row(raw: Vector, prob: Vector, pred: Double) =>
- assert(prob === Vectors.dense(Array(1.0)))
- assert(pred === 0.0)
- }
-
- // ensure that the correct value is predicted when numClasses passed through metadata
- val labelMeta = NominalAttribute.defaultAttr.withName("label").withNumValues(6).toMetadata()
- val constantDataWithMetadata = constantData
- .select(constantData("label").as("label", labelMeta), constantData("features"))
- val modelWithMetadata = mlr.setFitIntercept(true).fit(constantDataWithMetadata)
- val resultsWithMetadata = modelWithMetadata.transform(constantDataWithMetadata)
- resultsWithMetadata.select("rawPrediction", "probability", "prediction").collect().foreach {
- case Row(raw: Vector, prob: Vector, pred: Double) =>
- assert(raw === Vectors.dense(Array(0.0, 0.0, 0.0, 0.0, Double.PositiveInfinity, 0.0)))
- assert(prob === Vectors.dense(Array(0.0, 0.0, 0.0, 0.0, 1.0, 0.0)))
- assert(pred === 4.0)
- }
- // TODO: check num iters is zero when it become available in the model
- }
-
- test("weighted data") {
- val numClasses = 5
- val numPoints = 40
- val outlierData = MLTestingUtils.genClassificationInstancesWithWeightedOutliers(spark,
- numClasses, numPoints)
- val testData = spark.createDataFrame(Array.tabulate[LabeledPoint](numClasses) { i =>
- LabeledPoint(i.toDouble, Vectors.dense(i.toDouble))
- })
- val mlr = new LogisticRegression().setWeightCol("weight")
- val model = mlr.fit(outlierData)
- val results = model.transform(testData).select("label", "prediction").collect()
-
- // check that the predictions are the one to one mapping
- results.foreach { case Row(label: Double, pred: Double) =>
- assert(label === pred)
- }
- val (overSampledData, weightedData) =
- MLTestingUtils.genEquivalentOversampledAndWeightedInstances(outlierData, "label", "features",
- 42L)
- val weightedModel = mlr.fit(weightedData)
- val overSampledModel = mlr.setWeightCol("").fit(overSampledData)
- assert(weightedModel.coefficientMatrix ~== overSampledModel.coefficientMatrix relTol 0.01)
- }
-
- test("thresholds prediction") {
- val mlr = new LogisticRegression
- val model = mlr.fit(dataset)
- val basePredictions = model.transform(dataset).select("prediction").collect()
-
- // should predict all zeros
- model.setThresholds(Array(1, 1000, 1000))
- val zeroPredictions = model.transform(dataset).select("prediction").collect()
- assert(zeroPredictions.forall(_.getDouble(0) === 0.0))
-
- // should predict all ones
- model.setThresholds(Array(1000, 1, 1000))
- val onePredictions = model.transform(dataset).select("prediction").collect()
- assert(onePredictions.forall(_.getDouble(0) === 1.0))
-
- // should predict all twos
- model.setThresholds(Array(1000, 1000, 1))
- val twoPredictions = model.transform(dataset).select("prediction").collect()
- assert(twoPredictions.forall(_.getDouble(0) === 2.0))
-
- // constant threshold scaling is the same as no thresholds
- model.setThresholds(Array(1000, 1000, 1000))
- val scaledPredictions = model.transform(dataset).select("prediction").collect()
- assert(scaledPredictions.zip(basePredictions).forall { case (scaled, base) =>
- scaled.getDouble(0) === base.getDouble(0)
- })
- }
// test("read/write") {
// def checkModelData(
@@ -1024,16 +828,6 @@ class MultinomialLogisticRegressionSuite
// MultinomialLogisticRegressionSuite.allParamSettings,
// checkModelData)
// }
-
- test("should support all NumericType labels and not support other types") {
- val mlr = new LogisticRegression().setMaxIter(1)
- MLTestingUtils
- .checkNumericTypes[LogisticRegressionModel, LogisticRegression](
- mlr, spark) { (expected, actual) =>
- assert(expected.interceptVector === actual.interceptVector)
- assert(expected.coefficientMatrix.toArray === actual.coefficients.toArray)
- }
- }
}
object MultinomialLogisticRegressionSuite {
From d95370b9d73cb123657e278c0e297bb13ef18331 Mon Sep 17 00:00:00 2001
From: sethah
Date: Thu, 25 Aug 2016 14:33:34 -0700
Subject: [PATCH 05/24] model loading backward compat
---
.../classification/LogisticRegression.scala | 38 +++++++++++++------
.../MultinomialLogisticRegressionSuite.scala | 8 ----
2 files changed, 27 insertions(+), 19 deletions(-)
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
index 9b1845eaef98a..e15ebfe00bbac 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
@@ -602,12 +602,14 @@ class LogisticRegression @Since("1.2.0") (
/*
The intercepts are never regularized, so we always center the mean.
*/
+ // TODO: store model coefficients as multinomial representation?
+ // If so, zero out one set of coefs or use the +/- representation
val interceptVector = if (interceptsArray.nonEmpty && isMultinomial) {
val interceptMean = interceptsArray.sum / numClasses
interceptsArray.indices.foreach { i => interceptsArray(i) -= interceptMean }
Vectors.dense(interceptsArray)
- } else if (interceptsArray.nonEmpty) {
- Vectors.dense(interceptsArray)
+ } else if (interceptsArray.length == 2) {
+ Vectors.dense(interceptsArray.head)
} else {
Vectors.sparse(numClasses, Seq())
}
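Why centering is always safe here: the multinomial model is invariant to shifting every intercept by a common constant c, so the zero-mean representative is as valid as any other.
{{{
  P(k) = \exp(m_k + c) / \sum_j \exp(m_j + c) = \exp(m_k) / \sum_j \exp(m_j)
}}}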
@@ -980,19 +982,33 @@ object LogisticRegressionModel extends MLReadable[LogisticRegressionModel] {
override def load(path: String): LogisticRegressionModel = {
val metadata = DefaultParamsReader.loadMetadata(path, sc, className)
+ val versionRegex = "([0-9]+)\\.([0-9]+)\\.(.+)".r
+ val versionRegex(major, minor, _) = metadata.sparkVersion
val dataPath = new Path(path, "data").toString
val data = sparkSession.read.format("parquet").load(dataPath)
- val convertedCoefs = MLUtils.convertMatrixColumnsToML(data, "coefficientMatrix")
- val converted = MLUtils.convertVectorColumnsToML(convertedCoefs, "interceptVector")
- .select("numClasses", "numFeatures", "interceptVector", "coefficientMatrix",
- "isMultinomial")
- // TODO: numFeatures not needed?
- val Row(numClasses: Int, numFeatures: Int, interceptVector: Vector,
- coefficientMatrix: Matrix, isMultinomial: Boolean) = converted.head()
- val model = new LogisticRegressionModel(metadata.uid, coefficientMatrix, interceptVector,
- numClasses, isMultinomial)
+ val model = if (major.toInt < 2 || (major.toInt == 2 && minor.toInt == 0)) {
+ // 2.0 and before
+ val Row(numClasses: Int, numFeatures: Int, intercept: Double, coefficients: Vector) =
+ MLUtils.convertVectorColumnsToML(data, "coefficients")
+ .select("numClasses", "numFeatures", "intercept", "coefficients")
+ .head()
+ val coefficientMatrix =
+ new DenseMatrix(1, coefficients.size, coefficients.toArray, isTransposed = true)
+ val interceptVector = Vectors.dense(intercept)
+ new LogisticRegressionModel(metadata.uid, coefficientMatrix,
+ interceptVector, numClasses, isMultinomial = false)
+ } else {
+ // 2.1+
+ val Row(numClasses: Int, numFeatures: Int, interceptVector: Vector,
+ coefficientMatrix: Matrix, isMultinomial: Boolean) = data
+ .select("numClasses", "numFeatures", "interceptVector", "coefficientMatrix",
+ "isMultinomial").head()
+ new LogisticRegressionModel(metadata.uid, coefficientMatrix, interceptVector,
+ numClasses, isMultinomial)
+ }
+
DefaultParamsReader.getAndSetParams(model, metadata)
model
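A quick sketch of the version gate above, applied to a sample version string (the sample value is hypothetical):

    val versionRegex = "([0-9]+)\\.([0-9]+)\\.(.+)".r
    val versionRegex(major, minor, _) = "2.0.2"  // major = "2", minor = "0"
    // true here, so the loader takes the pre-2.1 (vector coefficients + scalar intercept) path
    val isLegacy = major.toInt < 2 || (major.toInt == 2 && minor.toInt == 0)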
diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/MultinomialLogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/MultinomialLogisticRegressionSuite.scala
index 9969bb02db04b..5725a47dd8652 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/classification/MultinomialLogisticRegressionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/classification/MultinomialLogisticRegressionSuite.scala
@@ -806,14 +806,6 @@ class MultinomialLogisticRegressionSuite
assert(model2.interceptVector.toArray.sum ~== 0.0 absTol eps)
}
- /*
- test("multinomial logistic regression with intercept with strong L1 regularization") {
- // TODO: implement this test to check that the priors on the intercepts are correct
- // TODO: when initial model becomes available
- }
- */
-
-
// test("read/write") {
// def checkModelData(
// model: LogisticRegressionModel,
From 942c3b7939879f360ce0a22c57cd6e31293fb044 Mon Sep 17 00:00:00 2001
From: sethah
Date: Thu, 25 Aug 2016 18:27:57 -0700
Subject: [PATCH 06/24] correcting initial model test and deleting multinomial
---
.../classification/LogisticRegression.scala | 52 +++++++++++--------
.../LogisticRegressionSuite.scala | 36 +++++++------
2 files changed, 49 insertions(+), 39 deletions(-)
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
index e15ebfe00bbac..ebaaa58065fa4 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
@@ -19,7 +19,7 @@ package org.apache.spark.ml.classification
import scala.collection.mutable
-import breeze.linalg.{DenseVector => BDV}
+import breeze.linalg.{DenseVector => BDV, View}
import breeze.optimize.{CachedDiffFunction, DiffFunction, LBFGS => BreezeLBFGS, OWLQN => BreezeOWLQN}
import org.apache.hadoop.fs.Path
@@ -83,7 +83,7 @@ private[classification] trait LogisticRegressionParams extends ProbabilisticClas
* If numClasses == 1 || numClasses == 2, set to "binomial".
* Else, set to "multinomial"
* - "binomial": Binary logistic regression with pivoting.
- * - "multinomial": Multinomial (softmax) regression without pivoting.
+ * - "multinomial": Multinomial logistic (softmax) regression without pivoting.
* Default is "auto".
*
* @group param
@@ -181,9 +181,8 @@ private[classification] trait LogisticRegressionParams extends ProbabilisticClas
}
/**
- * Logistic regression.
- * Currently, this class only supports binary classification. For multiclass classification,
- * use [[MultinomialLogisticRegression]]
+ * Logistic regression. Supports multinomial logistic (softmax) regression and binomial logistic
+ * regression.
*/
@Since("1.2.0")
class LogisticRegression @Since("1.2.0") (
@@ -476,10 +475,11 @@ class LogisticRegression @Since("1.2.0") (
if (initialModelIsValid) {
val initialCoefArray = initialCoefficientsWithIntercept.toArray
- val providedCoefArray = optInitialModel.get.coefficientMatrix.toArray
- providedCoefArray.indices.foreach { i =>
- val flatIndex = if ($(fitIntercept)) i + i / numFeatures else i
- initialCoefArray(flatIndex) = providedCoefArray(i)
+ val providedCoef = optInitialModel.get.coefficientMatrix
+ providedCoef.foreachActive { (row, col, value) =>
+ val flatIndex = row * numFeaturesPlusIntercept + col
+ // We need to scale the coefficients since they will be trained in the scaled space
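+ // (training runs on x / std, so a raw-space coefficient beta maps to beta * std here)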
+ initialCoefArray(flatIndex) = value * featuresStd(col)
}
if ($(fitIntercept)) {
optInitialModel.get.interceptVector.foreachActive { (index, value) =>
@@ -608,10 +608,10 @@ class LogisticRegression @Since("1.2.0") (
val interceptMean = interceptsArray.sum / numClasses
interceptsArray.indices.foreach { i => interceptsArray(i) -= interceptMean }
Vectors.dense(interceptsArray)
- } else if (interceptsArray.length == 2) {
- Vectors.dense(interceptsArray.head)
+ } else if (interceptsArray.length == 1) {
+ Vectors.dense(interceptsArray)
} else {
- Vectors.sparse(numClasses, Seq())
+ Vectors.sparse(numCoefficientSets, Seq())
}
(coefficientMatrix, interceptVector, arrayBuilder.result())
}
@@ -668,6 +668,7 @@ class LogisticRegressionModel private[spark] (
extends ProbabilisticClassificationModel[Vector, LogisticRegressionModel]
with LogisticRegressionParams with MLWritable {
+ // TODO: remove this
def this(uid: String, coefficients: Vector, intercept: Double) {
this(uid,
new DenseMatrix(1, coefficients.size, coefficients.toArray, isTransposed = true),
@@ -675,19 +676,28 @@ class LogisticRegressionModel private[spark] (
}
@Since("2.0.0")
- // TODO: this should convert sparse to sparse and dense to dense
- val coefficients: Vector = Vectors.dense(coefficientMatrix.toArray)
+ def coefficients: Vector = if (isMultinomial) {
+ throw new SparkException("Multinomial models contain a matrix of coefficients, use" +
+ "coefficientMatrix instead.")
+ } else {
+ _coefficients
+ }
+
+ // convert to appropriate vector representation without replicating data
+ private lazy val _coefficients: Vector = coefficientMatrix match {
+ case dm: DenseMatrix => Vectors.dense(dm.values)
+ case sm: SparseMatrix => Vectors.fromBreeze(sm.asBreeze.flatten(View.Require))
+ }
@Since("1.3.0")
- def intercept: Double = {
- if (isMultinomial) {
- logWarning("Multiclass model contains a vector of intercepts, use interceptVector instead." +
- "Returning 0.0 as placeholder.")
- }
+ def intercept: Double = if (isMultinomial) {
+ throw new SparkException("Multiclass model contains a vector of intercepts, use " +
+ "interceptVector instead. Returning 0.0 as placeholder.")
+ } else {
_intercept
}
- private val _intercept = if (!isMultinomial) interceptVector.toArray.head else 0.0
+ private lazy val _intercept = interceptVector.toArray.head
@Since("1.5.0")
override def setThreshold(value: Double): this.type = super.setThreshold(value)
@@ -943,7 +953,6 @@ class LogisticRegressionModel private[spark] (
@Since("1.6.0")
object LogisticRegressionModel extends MLReadable[LogisticRegressionModel] {
- // TODO: we need to be able to load old models as well
@Since("1.6.0")
override def read: MLReader[LogisticRegressionModel] = new LogisticRegressionModelReader
@@ -1009,7 +1018,6 @@ object LogisticRegressionModel extends MLReadable[LogisticRegressionModel] {
numClasses, isMultinomial)
}
-
DefaultParamsReader.getAndSetParams(model, metadata)
model
}
diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
index a8e94fafa50ed..3a9e0b4f856ca 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
@@ -1120,25 +1120,26 @@ class LogisticRegressionSuite
assert(model5.interceptVector.size === 3)
}
- test("intercept priors") {
- // TODO
- // Get coefficients from normal model with strong L1
- // Set initial model with computed priors...
- }
-
test("set initial model") {
- // TODO: the binary one doesn't converge any faster
- // TODO: should they converge after one or two iterations?
- // We can just run the other models for a few iterations and then check the predictions
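+ // A warm start from a converged model should already sit at the optimum, so even a
+ // short run (maxIter = 5) is expected to reproduce its predictions exactly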
val lr = new LogisticRegression()
- val model1 = lr.fit(binaryDataset)
- val lr2 = new LogisticRegression().setInitialModel(model1)
- val model2 = lr2.fit(binaryDataset)
+ val model1 = lr.fit(smallBinaryDataset)
+ val lr2 = new LogisticRegression().setInitialModel(model1).setMaxIter(5)
+ val model2 = lr2.fit(smallBinaryDataset)
+ val predictions1 = model1.transform(smallBinaryDataset).select("prediction").collect()
+ val predictions2 = model2.transform(smallBinaryDataset).select("prediction").collect()
+ predictions1.zip(predictions2).foreach { case (Row(p1: Double), Row(p2: Double)) =>
+ assert(p1 === p2)
+ }
val lr3 = new LogisticRegression()
- val model3 = lr3.fit(multinomialDataset)
- val lr4 = new LogisticRegression().setInitialModel(model3)
- val model4 = lr4.fit(multinomialDataset)
+ val model3 = lr3.fit(smallMultinomialDataset)
+ val lr4 = new LogisticRegression().setInitialModel(model3).setMaxIter(5)
+ val model4 = lr4.fit(smallMultinomialDataset)
+ val predictions3 = model3.transform(smallMultinomialDataset).select("prediction").collect()
+ val predictions4 = model4.transform(smallMultinomialDataset).select("prediction").collect()
+ predictions3.zip(predictions4).foreach { case (Row(p1: Double), Row(p2: Double)) =>
+ assert(p1 === p2)
+ }
}
test("logistic regression with all labels the same") {
@@ -1241,8 +1242,9 @@ class LogisticRegressionSuite
// specify two classes when there are really three
val labelMeta1 = NominalAttribute.defaultAttr.withName("label").withNumValues(2).toMetadata()
- val df1 = smallMultinomialDataset.select(smallMultinomialDataset("label").as("label", labelMeta1),
- smallMultinomialDataset("features"))
+ val df1 = smallMultinomialDataset
+ .select(smallMultinomialDataset("label").as("label", labelMeta1),
+ smallMultinomialDataset("features"))
val thrown = intercept[IllegalArgumentException] {
lr.fit(df1)
}
From ae6150c33b7e93e5c2b6a7b292953150239d9c25 Mon Sep 17 00:00:00 2001
From: sethah
Date: Thu, 25 Aug 2016 21:20:22 -0700
Subject: [PATCH 07/24] small fixes, remove temp constructor
---
.../classification/LogisticRegression.scala | 55 ++++---------------
.../ProbabilisticClassifier.scala | 27 +++++++--
.../classification/LogisticRegression.scala | 5 +-
.../LogisticRegressionSuite.scala | 3 +-
.../ml/classification/OneVsRestSuite.scala | 5 +-
.../spark/ml/tuning/CrossValidatorSuite.scala | 5 +-
.../ml/tuning/TrainValidationSplitSuite.scala | 5 +-
7 files changed, 47 insertions(+), 58 deletions(-)
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
index ebaaa58065fa4..2b3cdc5473529 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
@@ -365,7 +365,7 @@ class LogisticRegression @Since("1.2.0") (
if (!isMultinomial) {
require(isBinaryClassification, s"Binomial family only supports 1 or 2 " +
- s"outcome classes but found $numClasses")
+ s"outcome classes but found $numClasses")
}
if (isDefined(thresholds)) {
@@ -602,8 +602,6 @@ class LogisticRegression @Since("1.2.0") (
/*
The intercepts are never regularized, so we always center the mean.
*/
- // TODO: store model coefficients as multinomial representation?
- // If so, zero out one set of coefs or use the +/- representation
val interceptVector = if (interceptsArray.nonEmpty && isMultinomial) {
val interceptMean = interceptsArray.sum / numClasses
interceptsArray.indices.foreach { i => interceptsArray(i) -= interceptMean }
@@ -668,13 +666,6 @@ class LogisticRegressionModel private[spark] (
extends ProbabilisticClassificationModel[Vector, LogisticRegressionModel]
with LogisticRegressionParams with MLWritable {
- // TODO: remove this
- def this(uid: String, coefficients: Vector, intercept: Double) {
- this(uid,
- new DenseMatrix(1, coefficients.size, coefficients.toArray, isTransposed = true),
- Vectors.dense(intercept), 2, false)
- }
-
@Since("2.0.0")
def coefficients: Vector = if (isMultinomial) {
throw new SparkException("Multinomial models contain a matrix of coefficients, use" +
@@ -686,13 +677,14 @@ class LogisticRegressionModel private[spark] (
// convert to appropriate vector representation without replicating data
private lazy val _coefficients: Vector = coefficientMatrix match {
case dm: DenseMatrix => Vectors.dense(dm.values)
+ // TODO: better way to flatten sparse matrix?
case sm: SparseMatrix => Vectors.fromBreeze(sm.asBreeze.flatten(View.Require))
}
@Since("1.3.0")
def intercept: Double = if (isMultinomial) {
- throw new SparkException("Multiclass model contains a vector of intercepts, use " +
- "interceptVector instead. Returning 0.0 as placeholder.")
+ throw new SparkException("Multinomial models contain a vector of intercepts, use " +
+ "interceptVector instead.")
} else {
_intercept
}
@@ -730,6 +722,7 @@ class LogisticRegressionModel private[spark] (
}
/** Score (probability) for each class label. */
+ // TODO: do we need this anymore?
private val scores: Vector => Vector = (features) => {
val m = margins(features)
val maxMarginIndex = m.argmax
@@ -813,36 +806,11 @@ class LogisticRegressionModel private[spark] (
* Predict label for the given feature vector.
* The behavior of this can be adjusted using [[thresholds]].
*/
- override protected def predict(features: Vector): Double = {
+ override protected def predict(features: Vector): Double = if (isMultinomial) {
+ super.predict(features)
+ } else {
// Note: We should use getThreshold instead of $(threshold) since getThreshold is overridden.
- if (isMultinomial) {
- if (isDefined(thresholds)) {
- val thresholds: Array[Double] = getThresholds
- val probabilities = scores(features).toArray
- var argMax = 0
- var max = Double.NegativeInfinity
- var i = 0
- while (i < numClasses) {
- if (thresholds(i) == 0.0) {
- max = Double.PositiveInfinity
- argMax = i
- } else {
- val scaled = probabilities(i) / thresholds(i)
- if (scaled > max) {
- max = scaled
- argMax = i
- }
- }
- i += 1
- }
- argMax
- } else {
- scores(features).argmax
- }
- }
- else {
- if (score(features) > getThreshold) 1 else 0
- }
+ if (score(features) > getThreshold) 1 else 0
}
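
With this refactor the multinomial branch delegates to super.predict, so multiclass
thresholding lives in one place (the probability2prediction rewrite in
ProbabilisticClassifier.scala below), and only the binary single-threshold rule stays
here. A minimal sketch of that binary rule with made-up values (`score` and
`threshold` stand in for the positive-class probability and getThreshold; this is an
illustration, not the patched method):

    // Hypothetical values; the rule is: predict 1 iff P(class = 1) > threshold.
    val threshold = 0.5   // assumed default threshold
    val score = 0.73      // invented positive-class probability
    val prediction = if (score > threshold) 1.0 else 0.0
    assert(prediction == 1.0)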
override protected def raw2probabilityInPlace(rawPrediction: Vector): Vector = {
@@ -930,10 +898,10 @@ class LogisticRegressionModel private[spark] (
}
override protected def probability2prediction(probability: Vector): Double = {
- // Note: We should use getThreshold instead of $(threshold) since getThreshold is overridden.
if (isMultinomial) {
super.probability2prediction(probability)
} else {
+ // Note: We should use getThreshold instead of $(threshold) since getThreshold is overridden.
if (probability(1) > getThreshold) 1 else 0
}
}
@@ -983,8 +951,7 @@ object LogisticRegressionModel extends MLReadable[LogisticRegressionModel] {
}
}
- private class LogisticRegressionModelReader
- extends MLReader[LogisticRegressionModel] {
+ private class LogisticRegressionModelReader extends MLReader[LogisticRegressionModel] {
/** Checked against metadata when loading model */
private val className = classOf[LogisticRegressionModel].getName
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/ProbabilisticClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/ProbabilisticClassifier.scala
index 19df8f7edd43c..989bd19528a97 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/ProbabilisticClassifier.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/ProbabilisticClassifier.scala
@@ -201,11 +201,24 @@ abstract class ProbabilisticClassificationModel[
probability.argmax
} else {
val thresholds: Array[Double] = getThresholds
- val scaledProbability: Array[Double] =
- probability.toArray.zip(thresholds).map { case (p, t) =>
- if (t == 0.0) Double.PositiveInfinity else p / t
+ val probabilities = probability.toArray
+ var argMax = 0
+ var max = Double.NegativeInfinity
+ var i = 0
+ while (i < probability.size) {
+ if (thresholds(i) == 0.0) {
+ max = Double.PositiveInfinity
+ argMax = i
+ } else {
+ val scaled = probabilities(i) / thresholds(i)
+ if (scaled > max) {
+ max = scaled
+ argMax = i
+ }
}
- Vectors.dense(scaledProbability).argmax
+ i += 1
+ }
+ argMax
}
}
}
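
The loop above replaces the zip/map formulation with an allocation-free scan of the
same rule: predict the class i maximizing probability(i) / thresholds(i), treating a
zero threshold as positive infinity. A self-contained sketch with invented inputs
(ties and multiple zero thresholds may resolve differently from the patched loop):

    // Sketch of the p/t decision rule; probabilities and thresholds are made up.
    val probabilities = Array(0.40, 0.30, 0.30)
    val thresholds = Array(0.50, 0.25, 0.25)
    var argMax = 0
    var max = Double.NegativeInfinity
    for (i <- probabilities.indices) {
      val scaled =
        if (thresholds(i) == 0.0) Double.PositiveInfinity
        else probabilities(i) / thresholds(i)
      if (scaled > max) { max = scaled; argMax = i }
    }
    assert(argMax == 1) // 0.30 / 0.25 = 1.2 beats 0.40 / 0.50 = 0.8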
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala
index ad3dab33d2909..c3770dd0a12df 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala
@@ -430,10 +430,9 @@ class LogisticRegressionWithLBFGS
lr.setElasticNetParam(elasticNetParam)
lr.setStandardization(useFeatureScaling)
if (userSuppliedWeights) {
- // TODO: check this
val uid = Identifiable.randomUID("logreg-static")
- lr.setInitialModel(new org.apache.spark.ml.classification.LogisticRegressionModel(
- uid, new DenseMatrix(1, initialWeights.size, initialWeights.toArray, isTransposed=true),
+ lr.setInitialModel(new org.apache.spark.ml.classification.LogisticRegressionModel(uid,
+ new DenseMatrix(1, initialWeights.size, initialWeights.toArray, isTransposed = true),
Vectors.dense(0.0).asML, 2, false))
}
lr.setFitIntercept(addIntercept)
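
With the merged model class, a binomial initial model is stored in the multinomial
representation: a 1 x numFeatures row-major coefficient matrix plus a one-element
intercept vector, as constructed above. A sketch of that layout with hypothetical
weights:

    import org.apache.spark.ml.linalg.{DenseMatrix, Vectors}

    // Invented binary weights laid out as the single row of a coefficient matrix.
    val weights = Vectors.dense(0.1, -0.2, 0.3)
    val coefMatrix = new DenseMatrix(1, weights.size, weights.toArray, isTransposed = true)
    val interceptVector = Vectors.dense(0.0) // one intercept for the binary case
    assert(coefMatrix(0, 2) == 0.3)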
diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
index 3a9e0b4f856ca..f04d73f979509 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
@@ -113,7 +113,8 @@ class LogisticRegressionSuite
test("params") {
ParamsSuite.checkParams(new LogisticRegression)
- val model = new LogisticRegressionModel("logReg", Vectors.dense(0.0), 0.0)
+ val model = new LogisticRegressionModel("logReg",
+ new DenseMatrix(1, 1, Array(0.0), isTransposed = false), Vectors.dense(0.0), 2, false)
ParamsSuite.checkParams(model)
}
diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/OneVsRestSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/OneVsRestSuite.scala
index 361dd74cb082e..09e38786aa002 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/classification/OneVsRestSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/classification/OneVsRestSuite.scala
@@ -22,7 +22,7 @@ import org.apache.spark.ml.attribute.NominalAttribute
import org.apache.spark.ml.classification.LogisticRegressionSuite._
import org.apache.spark.ml.feature.LabeledPoint
import org.apache.spark.ml.feature.StringIndexer
-import org.apache.spark.ml.linalg.Vectors
+import org.apache.spark.ml.linalg.{DenseMatrix, Vectors}
import org.apache.spark.ml.param.{ParamMap, ParamsSuite}
import org.apache.spark.ml.util.{DefaultReadWriteTest, MetadataUtils, MLTestingUtils}
import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS
@@ -60,7 +60,8 @@ class OneVsRestSuite extends SparkFunSuite with MLlibTestSparkContext with Defau
test("params") {
ParamsSuite.checkParams(new OneVsRest)
- val lrModel = new LogisticRegressionModel("lr", Vectors.dense(0.0), 0.0)
+ val lrModel = new LogisticRegressionModel("logReg",
+ new DenseMatrix(1, 1, Array(0.0), isTransposed = false), Vectors.dense(0.0), 2, false)
val model = new OneVsRestModel("ovr", Metadata.empty, Array(lrModel))
ParamsSuite.checkParams(model)
}
diff --git a/mllib/src/test/scala/org/apache/spark/ml/tuning/CrossValidatorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/tuning/CrossValidatorSuite.scala
index 30bd390381e97..0fb26f26e7792 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/tuning/CrossValidatorSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/tuning/CrossValidatorSuite.scala
@@ -23,7 +23,7 @@ import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressio
import org.apache.spark.ml.classification.LogisticRegressionSuite.generateLogisticInput
import org.apache.spark.ml.evaluation.{BinaryClassificationEvaluator, Evaluator, RegressionEvaluator}
import org.apache.spark.ml.feature.HashingTF
-import org.apache.spark.ml.linalg.Vectors
+import org.apache.spark.ml.linalg.{DenseMatrix, Vectors}
import org.apache.spark.ml.param.{ParamMap, ParamPair}
import org.apache.spark.ml.param.shared.HasInputCol
import org.apache.spark.ml.regression.LinearRegression
@@ -244,7 +244,8 @@ class CrossValidatorSuite
test("read/write: CrossValidatorModel") {
val lr = new LogisticRegression()
.setThreshold(0.6)
- val lrModel = new LogisticRegressionModel(lr.uid, Vectors.dense(1.0, 2.0), 1.2)
+ val lrModel = new LogisticRegressionModel(lr.uid,
+ new DenseMatrix(1, 1, Array(0.0), isTransposed = false), Vectors.dense(0.0), 2, false)
.setThreshold(0.6)
val evaluator = new BinaryClassificationEvaluator()
.setMetricName("areaUnderPR") // not default metric
diff --git a/mllib/src/test/scala/org/apache/spark/ml/tuning/TrainValidationSplitSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/tuning/TrainValidationSplitSuite.scala
index c1e9c2fc1dc11..a05a1d641f1bb 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/tuning/TrainValidationSplitSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/tuning/TrainValidationSplitSuite.scala
@@ -22,7 +22,7 @@ import org.apache.spark.ml.{Estimator, Model}
import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel}
import org.apache.spark.ml.classification.LogisticRegressionSuite.generateLogisticInput
import org.apache.spark.ml.evaluation.{BinaryClassificationEvaluator, Evaluator, RegressionEvaluator}
-import org.apache.spark.ml.linalg.Vectors
+import org.apache.spark.ml.linalg.{DenseMatrix, Vectors}
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.param.shared.HasInputCol
import org.apache.spark.ml.regression.LinearRegression
@@ -133,7 +133,8 @@ class TrainValidationSplitSuite
test("read/write: TrainValidationSplitModel") {
val lr = new LogisticRegression()
.setThreshold(0.6)
- val lrModel = new LogisticRegressionModel(lr.uid, Vectors.dense(1.0, 2.0), 1.2)
+ val lrModel = new LogisticRegressionModel(lr.uid,
+ new DenseMatrix(1, 1, Array(0.0), isTransposed = false), Vectors.dense(0.0), 2, false)
.setThreshold(0.6)
val evaluator = new BinaryClassificationEvaluator()
val paramMaps = new ParamGridBuilder()
From 47fa5fde7a0f4ab17042989fb631cf772ff41069 Mon Sep 17 00:00:00 2001
From: sethah
Date: Thu, 25 Aug 2016 21:24:46 -0700
Subject: [PATCH 08/24] rebase
---
.../MultinomialLogisticRegression.scala | 632 ------------------
1 file changed, 632 deletions(-)
delete mode 100644 mllib/src/main/scala/org/apache/spark/ml/classification/MultinomialLogisticRegression.scala
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/MultinomialLogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/MultinomialLogisticRegression.scala
deleted file mode 100644
index 006f57c0ce260..0000000000000
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/MultinomialLogisticRegression.scala
+++ /dev/null
@@ -1,632 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.ml.classification
-
-import scala.collection.mutable
-
-import breeze.linalg.{DenseVector => BDV}
-import breeze.optimize.{CachedDiffFunction, LBFGS => BreezeLBFGS, OWLQN => BreezeOWLQN}
-import org.apache.hadoop.fs.Path
-
-import org.apache.spark.SparkException
-import org.apache.spark.annotation.{Experimental, Since}
-import org.apache.spark.internal.Logging
-import org.apache.spark.ml.feature.Instance
-import org.apache.spark.ml.linalg._
-import org.apache.spark.ml.param._
-import org.apache.spark.ml.param.shared._
-import org.apache.spark.ml.util._
-import org.apache.spark.mllib.linalg.VectorImplicits._
-import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer
-import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.{Dataset, Row}
-import org.apache.spark.sql.functions.{col, lit}
-import org.apache.spark.sql.types.DoubleType
-import org.apache.spark.storage.StorageLevel
-
-/**
- * Params for multinomial logistic (softmax) regression.
- */
-private[classification] trait MultinomialLogisticRegressionParams
- extends ProbabilisticClassifierParams with HasRegParam with HasElasticNetParam with HasMaxIter
- with HasFitIntercept with HasTol with HasStandardization with HasWeightCol
- with HasAggregationDepth {
-
- /**
- * Set thresholds in multiclass (or binary) classification to adjust the probability of
- * predicting each class. Array must have length equal to the number of classes, with values >= 0.
- * The class with largest value p/t is predicted, where p is the original probability of that
- * class and t is the class' threshold.
- *
- * @group setParam
- */
- def setThresholds(value: Array[Double]): this.type = {
- set(thresholds, value)
- }
-
- /**
- * Get thresholds for binary or multiclass classification.
- *
- * @group getParam
- */
- override def getThresholds: Array[Double] = {
- $(thresholds)
- }
-}
-
-/**
- * :: Experimental ::
- * Multinomial Logistic (softmax) regression.
- */
-@Since("2.1.0")
-@Experimental
-class MultinomialLogisticRegression @Since("2.1.0") (
- @Since("2.1.0") override val uid: String)
- extends ProbabilisticClassifier[Vector,
- MultinomialLogisticRegression, MultinomialLogisticRegressionModel]
- with MultinomialLogisticRegressionParams with DefaultParamsWritable with Logging {
-
- @Since("2.1.0")
- def this() = this(Identifiable.randomUID("mlogreg"))
-
- /**
- * Set the regularization parameter.
- * Default is 0.0.
- *
- * @group setParam
- */
- @Since("2.1.0")
- def setRegParam(value: Double): this.type = set(regParam, value)
- setDefault(regParam -> 0.0)
-
- /**
- * Set the ElasticNet mixing parameter.
- * For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.
- * For 0 < alpha < 1, the penalty is a combination of L1 and L2.
- * Default is 0.0 which is an L2 penalty.
- *
- * @group setParam
- */
- @Since("2.1.0")
- def setElasticNetParam(value: Double): this.type = set(elasticNetParam, value)
- setDefault(elasticNetParam -> 0.0)
-
- /**
- * Set the maximum number of iterations.
- * Default is 100.
- *
- * @group setParam
- */
- @Since("2.1.0")
- def setMaxIter(value: Int): this.type = set(maxIter, value)
- setDefault(maxIter -> 100)
-
- /**
- * Set the convergence tolerance of iterations.
- * Smaller value will lead to higher accuracy with the cost of more iterations.
- * Default is 1E-6.
- *
- * @group setParam
- */
- @Since("2.1.0")
- def setTol(value: Double): this.type = set(tol, value)
- setDefault(tol -> 1E-6)
-
- /**
- * Whether to fit an intercept term.
- * Default is true.
- *
- * @group setParam
- */
- @Since("2.1.0")
- def setFitIntercept(value: Boolean): this.type = set(fitIntercept, value)
- setDefault(fitIntercept -> true)
-
- /**
- * Whether to standardize the training features before fitting the model.
- * The coefficients of models will be always returned on the original scale,
- * so it will be transparent for users. Note that with/without standardization,
- * the models should always converge to the same solution when no regularization
- * is applied. In R's GLMNET package, the default behavior is true as well.
- * Default is true.
- *
- * @group setParam
- */
- @Since("2.1.0")
- def setStandardization(value: Boolean): this.type = set(standardization, value)
- setDefault(standardization -> true)
-
- /**
- * Sets the value of param [[weightCol]].
- * If this is not set or empty, we treat all instance weights as 1.0.
- * Default is not set, so all instances have weight one.
- *
- * @group setParam
- */
- @Since("2.1.0")
- def setWeightCol(value: String): this.type = set(weightCol, value)
-
- @Since("2.1.0")
- override def setThresholds(value: Array[Double]): this.type = super.setThresholds(value)
-
- /**
- * Suggested depth for treeAggregate (>= 2).
- * If the dimensions of features or the number of partitions are large,
- * this param could be adjusted to a larger size.
- * Default is 2.
- * @group expertSetParam
- */
- @Since("2.1.0")
- def setAggregationDepth(value: Int): this.type = set(aggregationDepth, value)
- setDefault(aggregationDepth -> 2)
-
- override protected[spark] def train(dataset: Dataset[_]): MultinomialLogisticRegressionModel = {
- val w = if (!isDefined(weightCol) || $(weightCol).isEmpty) lit(1.0) else col($(weightCol))
- val instances: RDD[Instance] =
- dataset.select(col($(labelCol)).cast(DoubleType), w, col($(featuresCol))).rdd.map {
- case Row(label: Double, weight: Double, features: Vector) =>
- Instance(label, weight, features)
- }
-
- val handlePersistence = dataset.rdd.getStorageLevel == StorageLevel.NONE
- if (handlePersistence) instances.persist(StorageLevel.MEMORY_AND_DISK)
-
- val instr = Instrumentation.create(this, instances)
- instr.logParams(regParam, elasticNetParam, standardization, thresholds,
- maxIter, tol, fitIntercept)
-
- val (summarizer, labelSummarizer) = {
- val seqOp = (c: (MultivariateOnlineSummarizer, MultiClassSummarizer),
- instance: Instance) =>
- (c._1.add(instance.features, instance.weight), c._2.add(instance.label, instance.weight))
-
- val combOp = (c1: (MultivariateOnlineSummarizer, MultiClassSummarizer),
- c2: (MultivariateOnlineSummarizer, MultiClassSummarizer)) =>
- (c1._1.merge(c2._1), c1._2.merge(c2._2))
-
- instances.treeAggregate(
- new MultivariateOnlineSummarizer, new MultiClassSummarizer)(seqOp, combOp)
- }
-
- val histogram = labelSummarizer.histogram
- val numInvalid = labelSummarizer.countInvalid
- val numFeatures = summarizer.mean.size
- val numFeaturesPlusIntercept = if (getFitIntercept) numFeatures + 1 else numFeatures
-
- val numClasses = MetadataUtils.getNumClasses(dataset.schema($(labelCol))) match {
- case Some(n: Int) =>
- require(n >= histogram.length, s"Specified number of classes $n was " +
- s"less than the number of unique labels ${histogram.length}")
- n
- case None => histogram.length
- }
-
- instr.logNumClasses(numClasses)
- instr.logNumFeatures(numFeatures)
-
- val (coefficients, intercepts, objectiveHistory) = {
- if (numInvalid != 0) {
- val msg = s"Classification labels should be in {0 to ${numClasses - 1} " +
- s"Found $numInvalid invalid labels."
- logError(msg)
- throw new SparkException(msg)
- }
-
- val isConstantLabel = histogram.count(_ != 0) == 1
-
- if ($(fitIntercept) && isConstantLabel) {
- // we want to produce a model that will always predict the constant label so all the
- // coefficients will be zero, and the constant label class intercept will be +inf
- val constantLabelIndex = Vectors.dense(histogram).argmax
- (Matrices.sparse(numClasses, numFeatures, Array.fill(numFeatures + 1)(0),
- Array.empty[Int], Array.empty[Double]),
- Vectors.sparse(numClasses, Seq((constantLabelIndex, Double.PositiveInfinity))),
- Array.empty[Double])
- } else {
- if (!$(fitIntercept) && isConstantLabel) {
- logWarning(s"All labels belong to a single class and fitIntercept=false. It's" +
- s"a dangerous ground, so the algorithm may not converge.")
- }
-
- val featuresStd = summarizer.variance.toArray.map(math.sqrt)
- val featuresMean = summarizer.mean.toArray
- if (!$(fitIntercept) && (0 until numFeatures).exists { i =>
- featuresStd(i) == 0.0 && featuresMean(i) != 0.0 }) {
- logWarning("Fitting MultinomialLogisticRegressionModel without intercept on dataset " +
- "with constant nonzero column, Spark MLlib outputs zero coefficients for constant " +
- "nonzero columns. This behavior is the same as R glmnet but different from LIBSVM.")
- }
-
- val regParamL1 = $(elasticNetParam) * $(regParam)
- val regParamL2 = (1.0 - $(elasticNetParam)) * $(regParam)
-
- val bcFeaturesStd = instances.context.broadcast(featuresStd)
- val costFun = new LogisticCostFun(instances, numClasses, $(fitIntercept),
- $(standardization), bcFeaturesStd, regParamL2, multinomial = true, $(aggregationDepth))
-
- val optimizer = if ($(elasticNetParam) == 0.0 || $(regParam) == 0.0) {
- new BreezeLBFGS[BDV[Double]]($(maxIter), 10, $(tol))
- } else {
- val standardizationParam = $(standardization)
- def regParamL1Fun = (index: Int) => {
- // Remove the L1 penalization on the intercept
- val isIntercept = $(fitIntercept) && ((index + 1) % numFeaturesPlusIntercept == 0)
- if (isIntercept) {
- 0.0
- } else {
- if (standardizationParam) {
- regParamL1
- } else {
- val featureIndex = if ($(fitIntercept)) {
- index % numFeaturesPlusIntercept
- } else {
- index % numFeatures
- }
- // If `standardization` is false, we still standardize the data
- // to improve the rate of convergence; as a result, we have to
- // perform this reverse standardization by penalizing each component
- // differently to get effectively the same objective function when
- // the training dataset is not standardized.
- if (featuresStd(featureIndex) != 0.0) {
- regParamL1 / featuresStd(featureIndex)
- } else {
- 0.0
- }
- }
- }
- }
- new BreezeOWLQN[Int, BDV[Double]]($(maxIter), 10, regParamL1Fun, $(tol))
- }
-
- val initialCoefficientsWithIntercept = Vectors.zeros(numClasses * numFeaturesPlusIntercept)
-
- if ($(fitIntercept)) {
- /*
- For multinomial logistic regression, when we initialize the coefficients as zeros,
- it will converge faster if we initialize the intercepts such that
- it follows the distribution of the labels.
- {{{
- P(1) = \exp(b_1) / Z
- ...
- P(K) = \exp(b_K) / Z
- where Z = \sum_{k=1}^{K} \exp(b_k)
- }}}
- Since this doesn't have a unique solution, one of the solutions that satisfies the
- above equations is
- {{{
- \exp(b_k) = count_k * \exp(\lambda)
- b_k = \log(count_k) * \lambda
- }}}
- \lambda is a free parameter, so choose the phase \lambda such that the
- mean is centered. This yields
- {{{
- b_k = \log(count_k)
- b_k' = b_k - \mean(b_k)
- }}}
- */
- val rawIntercepts = histogram.map(c => math.log(c + 1)) // add 1 for smoothing
- val rawMean = rawIntercepts.sum / rawIntercepts.length
- rawIntercepts.indices.foreach { i =>
- initialCoefficientsWithIntercept.toArray(i * numFeaturesPlusIntercept + numFeatures) =
- rawIntercepts(i) - rawMean
- }
- }
-
- val states = optimizer.iterations(new CachedDiffFunction(costFun),
- initialCoefficientsWithIntercept.asBreeze.toDenseVector)
-
- /*
- Note that in Multinomial Logistic Regression, the objective history
- (loss + regularization) is log-likelihood which is invariant under feature
- standardization. As a result, the objective history from optimizer is the same as the
- one in the original space.
- */
- val arrayBuilder = mutable.ArrayBuilder.make[Double]
- var state: optimizer.State = null
- while (states.hasNext) {
- state = states.next()
- arrayBuilder += state.adjustedValue
- }
-
- if (state == null) {
- val msg = s"${optimizer.getClass.getName} failed."
- logError(msg)
- throw new SparkException(msg)
- }
- bcFeaturesStd.destroy(blocking = false)
-
- /*
- The coefficients are trained in the scaled space; we're converting them back to
- the original space.
- Note that the intercept in scaled space and original space is the same;
- as a result, no scaling is needed.
- */
- val rawCoefficients = state.x.toArray
- val interceptsArray: Array[Double] = if ($(fitIntercept)) {
- Array.tabulate(numClasses) { i =>
- val coefIndex = (i + 1) * numFeaturesPlusIntercept - 1
- rawCoefficients(coefIndex)
- }
- } else {
- Array.empty
- }
-
- val coefficientArray: Array[Double] = Array.tabulate(numClasses * numFeatures) { i =>
- // flatIndex will loop though rawCoefficients, and skip the intercept terms.
- val flatIndex = if ($(fitIntercept)) i + i / numFeatures else i
- val featureIndex = i % numFeatures
- if (featuresStd(featureIndex) != 0.0) {
- rawCoefficients(flatIndex) / featuresStd(featureIndex)
- } else {
- 0.0
- }
- }
- val coefficientMatrix =
- new DenseMatrix(numClasses, numFeatures, coefficientArray, isTransposed = true)
-
- /*
- When no regularization is applied, the coefficients lack identifiability because
- we do not use a pivot class. We can add any constant value to the coefficients and
- get the same likelihood. So here, we choose the mean centered coefficients for
- reproducibility. This method follows the approach in glmnet, described here:
-
- Friedman, et al. "Regularization Paths for Generalized Linear Models via
- Coordinate Descent," https://core.ac.uk/download/files/153/6287975.pdf
- */
- if ($(regParam) == 0.0) {
- val coefficientMean = coefficientMatrix.values.sum / (numClasses * numFeatures)
- coefficientMatrix.update(_ - coefficientMean)
- }
- /*
- The intercepts are never regularized, so we always center the mean.
- */
- val interceptVector = if (interceptsArray.nonEmpty) {
- val interceptMean = interceptsArray.sum / numClasses
- interceptsArray.indices.foreach { i => interceptsArray(i) -= interceptMean }
- Vectors.dense(interceptsArray)
- } else {
- Vectors.sparse(numClasses, Seq())
- }
-
- (coefficientMatrix, interceptVector, arrayBuilder.result())
- }
- }
-
- if (handlePersistence) instances.unpersist()
-
- val model = copyValues(
- new MultinomialLogisticRegressionModel(uid, coefficients, intercepts, numClasses))
- instr.logSuccess(model)
- model
- }
-
- @Since("2.1.0")
- override def copy(extra: ParamMap): MultinomialLogisticRegression = defaultCopy(extra)
-}
-
-@Since("2.1.0")
-object MultinomialLogisticRegression extends DefaultParamsReadable[MultinomialLogisticRegression] {
-
- @Since("2.1.0")
- override def load(path: String): MultinomialLogisticRegression = super.load(path)
-}
-
-/**
- * :: Experimental ::
- * Model produced by [[MultinomialLogisticRegression]].
- */
-@Since("2.1.0")
-@Experimental
-class MultinomialLogisticRegressionModel private[spark] (
- @Since("2.1.0") override val uid: String,
- @Since("2.1.0") val coefficients: Matrix,
- @Since("2.1.0") val intercepts: Vector,
- @Since("2.1.0") val numClasses: Int)
- extends ProbabilisticClassificationModel[Vector, MultinomialLogisticRegressionModel]
- with MultinomialLogisticRegressionParams with MLWritable {
-
- @Since("2.1.0")
- override def setThresholds(value: Array[Double]): this.type = super.setThresholds(value)
-
- @Since("2.1.0")
- override def getThresholds: Array[Double] = super.getThresholds
-
- @Since("2.1.0")
- override val numFeatures: Int = coefficients.numCols
-
- /** Margin (rawPrediction) for each class label. */
- private val margins: Vector => Vector = (features) => {
- val m = intercepts.toDense.copy
- BLAS.gemv(1.0, coefficients, features, 1.0, m)
- m
- }
-
- /** Score (probability) for each class label. */
- private val scores: Vector => Vector = (features) => {
- val m = margins(features)
- val maxMarginIndex = m.argmax
- val marginArray = m.toArray
- val maxMargin = marginArray(maxMarginIndex)
-
- // adjust margins for overflow
- val sum = {
- var temp = 0.0
- var k = 0
- while (k < numClasses) {
- marginArray(k) = if (maxMargin > 0) {
- math.exp(marginArray(k) - maxMargin)
- } else {
- math.exp(marginArray(k))
- }
- temp += marginArray(k)
- k += 1
- }
- temp
- }
-
- val scores = Vectors.dense(marginArray)
- BLAS.scal(1 / sum, scores)
- scores
- }
-
- /**
- * Predict label for the given feature vector.
- * The behavior of this can be adjusted using [[thresholds]].
- */
- override protected def predict(features: Vector): Double = {
- if (isDefined(thresholds)) {
- val thresholds: Array[Double] = getThresholds
- val probabilities = scores(features).toArray
- var argMax = 0
- var max = Double.NegativeInfinity
- var i = 0
- while (i < numClasses) {
- if (thresholds(i) == 0.0) {
- max = Double.PositiveInfinity
- argMax = i
- } else {
- val scaled = probabilities(i) / thresholds(i)
- if (scaled > max) {
- max = scaled
- argMax = i
- }
- }
- i += 1
- }
- argMax
- } else {
- scores(features).argmax
- }
- }
-
- override protected def raw2probabilityInPlace(rawPrediction: Vector): Vector = {
- rawPrediction match {
- case dv: DenseVector =>
- val size = dv.size
- val values = dv.values
-
- // get the maximum margin
- val maxMarginIndex = rawPrediction.argmax
- val maxMargin = rawPrediction(maxMarginIndex)
-
- if (maxMargin == Double.PositiveInfinity) {
- var k = 0
- while (k < size) {
- values(k) = if (k == maxMarginIndex) 1.0 else 0.0
- k += 1
- }
- } else {
- val sum = {
- var temp = 0.0
- var k = 0
- while (k < numClasses) {
- values(k) = if (maxMargin > 0) {
- math.exp(values(k) - maxMargin)
- } else {
- math.exp(values(k))
- }
- temp += values(k)
- k += 1
- }
- temp
- }
- BLAS.scal(1 / sum, dv)
- }
- dv
- case sv: SparseVector =>
- throw new RuntimeException("Unexpected error in MultinomialLogisticRegressionModel:" +
- " raw2probabilitiesInPlace encountered SparseVector")
- }
- }
-
- override protected def predictRaw(features: Vector): Vector = margins(features)
-
- @Since("2.1.0")
- override def copy(extra: ParamMap): MultinomialLogisticRegressionModel = {
- val newModel =
- copyValues(
- new MultinomialLogisticRegressionModel(uid, coefficients, intercepts, numClasses), extra)
- newModel.setParent(parent)
- }
-
- /**
- * Returns a [[org.apache.spark.ml.util.MLWriter]] instance for this ML instance.
- *
- * This does not save the [[parent]] currently.
- */
- @Since("2.1.0")
- override def write: MLWriter =
- new MultinomialLogisticRegressionModel.MultinomialLogisticRegressionModelWriter(this)
-}
-
-
-@Since("2.1.0")
-object MultinomialLogisticRegressionModel extends MLReadable[MultinomialLogisticRegressionModel] {
-
- @Since("2.1.0")
- override def read: MLReader[MultinomialLogisticRegressionModel] =
- new MultinomialLogisticRegressionModelReader
-
- @Since("2.1.0")
- override def load(path: String): MultinomialLogisticRegressionModel = super.load(path)
-
- /** [[MLWriter]] instance for [[MultinomialLogisticRegressionModel]] */
- private[MultinomialLogisticRegressionModel]
- class MultinomialLogisticRegressionModelWriter(instance: MultinomialLogisticRegressionModel)
- extends MLWriter with Logging {
-
- private case class Data(
- numClasses: Int,
- numFeatures: Int,
- intercepts: Vector,
- coefficients: Matrix)
-
- override protected def saveImpl(path: String): Unit = {
- // Save metadata and Params
- DefaultParamsWriter.saveMetadata(instance, path, sc)
- // Save model data: numClasses, numFeatures, intercept, coefficients
- val data = Data(instance.numClasses, instance.numFeatures, instance.intercepts,
- instance.coefficients)
- val dataPath = new Path(path, "data").toString
- sqlContext.createDataFrame(Seq(data)).repartition(1).write.parquet(dataPath)
- }
- }
-
- private class MultinomialLogisticRegressionModelReader
- extends MLReader[MultinomialLogisticRegressionModel] {
-
- /** Checked against metadata when loading model */
- private val className = classOf[MultinomialLogisticRegressionModel].getName
-
- override def load(path: String): MultinomialLogisticRegressionModel = {
- val metadata = DefaultParamsReader.loadMetadata(path, sc, className)
-
- val dataPath = new Path(path, "data").toString
- val data = sqlContext.read.format("parquet").load(dataPath)
- .select("numClasses", "numFeatures", "intercepts", "coefficients").head()
- val numClasses = data.getAs[Int](data.fieldIndex("numClasses"))
- val intercepts = data.getAs[Vector](data.fieldIndex("intercepts"))
- val coefficients = data.getAs[Matrix](data.fieldIndex("coefficients"))
- val model =
- new MultinomialLogisticRegressionModel(metadata.uid, coefficients, intercepts, numClasses)
-
- DefaultParamsReader.getAndSetParams(model, metadata)
- model
- }
- }
-}
From 79273f7be4234de0d97347df02518b690fef7119 Mon Sep 17 00:00:00 2001
From: sethah
Date: Fri, 26 Aug 2016 08:21:56 -0700
Subject: [PATCH 09/24] removing old test suite
---
.../classification/LogisticRegression.scala | 3 +-
.../LogisticRegressionSuite.scala | 710 ++++++++++++++-
.../MultinomialLogisticRegressionSuite.scala | 842 ------------------
3 files changed, 710 insertions(+), 845 deletions(-)
delete mode 100644 mllib/src/test/scala/org/apache/spark/ml/classification/MultinomialLogisticRegressionSuite.scala
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
index 2b3cdc5473529..80426fc019e83 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
@@ -668,7 +668,7 @@ class LogisticRegressionModel private[spark] (
@Since("2.0.0")
def coefficients: Vector = if (isMultinomial) {
- throw new SparkException("Multinomial models contain a matrix of coefficients, use" +
+ throw new SparkException("Multinomial models contain a matrix of coefficients, use " +
"coefficientMatrix instead.")
} else {
_coefficients
@@ -1378,7 +1378,6 @@ class BinaryLogisticRegressionSummary private[classification] (
* $$
*
*
- *
* @param bcCoefficients The broadcast coefficients corresponding to the features.
* @param bcFeaturesStd The broadcast standard deviation values of the features.
* @param numClasses the number of possible outcomes for k classes classification problem in
diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
index f04d73f979509..47c1a7218fcbd 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
@@ -24,7 +24,7 @@ import scala.language.existentials
import scala.util.Random
import scala.util.control.Breaks._
-import org.apache.spark.SparkFunSuite
+import org.apache.spark.{SparkException, SparkFunSuite}
import org.apache.spark.ml.classification.LogisticRegressionSuite._
import org.apache.spark.ml.feature.{Instance, LabeledPoint}
import org.apache.spark.ml.linalg.{Matrices, DenseMatrix, Vector, Vectors}
@@ -364,6 +364,24 @@ class LogisticRegressionSuite
}
}
+ test("coefficients and intercept methods") {
+ val mlr = new LogisticRegression().setMaxIter(1)
+ val mlrModel = mlr.fit(smallMultinomialDataset)
+ val thrownCoef = intercept[SparkException] {
+ mlrModel.coefficients
+ }
+ val thrownIntercept = intercept[SparkException] {
+ mlrModel.intercept
+ }
+ assert(thrownCoef.getMessage().contains("use coefficientMatrix instead"))
+ assert(thrownIntercept.getMessage().contains("use interceptVector instead"))
+
+ val blr = new LogisticRegression().setMaxIter(1)
+ val blrModel = blr.fit(smallBinaryDataset)
+ assert(blrModel.coefficients.size === 1)
+ assert(blrModel.intercept !== 0.0)
+ }
+
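+ // Binary models keep the scalar coefficients / intercept accessors; multinomial
+ // models expose coefficientMatrix / interceptVector and throw from the scalar
+ // ones, as the test above exercises.
+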
test("overflow prediction for multiclass") {
val model = new LogisticRegressionModel("mLogReg",
Matrices.dense(3, 2, Array(0.0, 0.0, 0.0, 1.0, 2.0, 3.0)),
@@ -999,6 +1017,696 @@ class LogisticRegressionSuite
assert(model2.coefficientMatrix ~= coefficientsTheory absTol 1E-6)
}
+ test("multinomial logistic regression with intercept without regularization") {
+
+ val trainer1 = (new LogisticRegression).setFitIntercept(true)
+ .setElasticNetParam(0.0).setRegParam(0.0).setStandardization(true).setMaxIter(100)
+ val trainer2 = (new LogisticRegression).setFitIntercept(true)
+ .setElasticNetParam(0.0).setRegParam(0.0).setStandardization(false)
+
+ val model1 = trainer1.fit(multinomialDataset)
+ val model2 = trainer2.fit(multinomialDataset)
+
+ /*
+ Using the following R code to load the data and train the model using glmnet package.
+ > library("glmnet")
+ > data <- read.csv("path", header=FALSE)
+ > label = as.factor(data$V1)
+ > features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
+ > coefficients = coef(glmnet(features, label, family="multinomial", alpha = 0, lambda = 0))
+ > coefficients
+ $`0`
+ 5 x 1 sparse Matrix of class "dgCMatrix"
+ s0
+ -2.24493379
+ V2 0.25096771
+ V3 -0.03915938
+ V4 0.14766639
+ V5 0.36810817
+ $`1`
+ 5 x 1 sparse Matrix of class "dgCMatrix"
+ s0
+ 0.3778931
+ V2 -0.3327489
+ V3 0.8893666
+ V4 -0.2306948
+ V5 -0.4442330
+ $`2`
+ 5 x 1 sparse Matrix of class "dgCMatrix"
+ s0
+ 1.86704066
+ V2 0.08178121
+ V3 -0.85020722
+ V4 0.08302840
+ V5 0.07612480
+ */
+
+ val coefficientsR = new DenseMatrix(3, 4, Array(
+ 0.2509677, -0.0391594, 0.1476664, 0.3681082,
+ -0.3327489, 0.8893666, -0.2306948, -0.4442330,
+ 0.0817812, -0.8502072, 0.0830284, 0.0761248), isTransposed = true)
+ val interceptsR = Vectors.dense(-2.2449338, 0.3778931, 1.8670407)
+
+ assert(model1.coefficientMatrix ~== coefficientsR relTol 0.05)
+ assert(model1.coefficientMatrix.toArray.sum ~== 0.0 absTol eps)
+ assert(model1.interceptVector ~== interceptsR relTol 0.05)
+ assert(model1.interceptVector.toArray.sum ~== 0.0 absTol eps)
+ assert(model2.coefficientMatrix ~== coefficientsR relTol 0.05)
+ assert(model2.coefficientMatrix.toArray.sum ~== 0.0 absTol eps)
+ assert(model2.interceptVector ~== interceptsR relTol 0.05)
+ assert(model2.interceptVector.toArray.sum ~== 0.0 absTol eps)
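+ /*
+  Why the sums above are ~0: softmax probabilities are invariant to adding a
+  constant to every class's margin,
+  {{{
+  P(k) = \exp(m_k + c) / \sum_j \exp(m_j + c) = \exp(m_k) / \sum_j \exp(m_j)
+  }}}
+  so with regParam = 0 the solution is only identified up to such shifts; training
+  mean-centers the coefficients and intercepts to pick a reproducible representative.
+  */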
+ }
+
+ test("multinomial logistic regression without intercept without regularization") {
+
+ val trainer1 = (new LogisticRegression).setFitIntercept(false)
+ .setElasticNetParam(0.0).setRegParam(0.0).setStandardization(true)
+ val trainer2 = (new LogisticRegression).setFitIntercept(false)
+ .setElasticNetParam(0.0).setRegParam(0.0).setStandardization(false)
+
+ val model1 = trainer1.fit(multinomialDataset)
+ val model2 = trainer2.fit(multinomialDataset)
+
+ /*
+ Using the following R code to load the data and train the model using glmnet package.
+ library("glmnet")
+ data <- read.csv("path", header=FALSE)
+ label = as.factor(data$V1)
+ features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
+ coefficients = coef(glmnet(features, label, family="multinomial", alpha = 0, lambda = 0,
+ intercept=F))
+ > coefficients
+ $`0`
+ 5 x 1 sparse Matrix of class "dgCMatrix"
+ s0
+ .
+ V2 0.06992464
+ V3 -0.36562784
+ V4 0.12142680
+ V5 0.32052211
+ $`1`
+ 5 x 1 sparse Matrix of class "dgCMatrix"
+ s0
+ .
+ V2 -0.3036269
+ V3 0.9449630
+ V4 -0.2271038
+ V5 -0.4364839
+ $`2`
+ 5 x 1 sparse Matrix of class "dgCMatrix"
+ s0
+ .
+ V2 0.2337022
+ V3 -0.5793351
+ V4 0.1056770
+ V5 0.1159618
+ */
+
+ val coefficientsR = new DenseMatrix(3, 4, Array(
+ 0.0699246, -0.3656278, 0.1214268, 0.3205221,
+ -0.3036269, 0.9449630, -0.2271038, -0.4364839,
+ 0.2337022, -0.5793351, 0.1056770, 0.1159618), isTransposed = true)
+
+ assert(model1.coefficientMatrix ~== coefficientsR relTol 0.05)
+ assert(model1.coefficientMatrix.toArray.sum ~== 0.0 absTol eps)
+ assert(model1.interceptVector.toArray === Array.fill(3)(0.0))
+ assert(model1.interceptVector.toArray.sum ~== 0.0 absTol eps)
+ assert(model2.coefficientMatrix ~== coefficientsR relTol 0.05)
+ assert(model2.coefficientMatrix.toArray.sum ~== 0.0 absTol eps)
+ assert(model2.interceptVector.toArray === Array.fill(3)(0.0))
+ assert(model2.interceptVector.toArray.sum ~== 0.0 absTol eps)
+ }
+
+ test("multinomial logistic regression with intercept with L1 regularization") {
+
+ // use tighter constraints because the OWL-QN solver takes longer to converge
+ val trainer1 = (new LogisticRegression).setFitIntercept(true)
+ .setElasticNetParam(1.0).setRegParam(0.05).setStandardization(true)
+ .setMaxIter(300).setTol(1e-10)
+ val trainer2 = (new LogisticRegression).setFitIntercept(true)
+ .setElasticNetParam(1.0).setRegParam(0.05).setStandardization(false)
+ .setMaxIter(300).setTol(1e-10)
+
+ val model1 = trainer1.fit(multinomialDataset)
+ val model2 = trainer2.fit(multinomialDataset)
+
+ /*
+ Use the following R code to load the data and train the model using glmnet package.
+ library("glmnet")
+ data <- read.csv("path", header=FALSE)
+ label = as.factor(data$V1)
+ features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
+ coefficientsStd = coef(glmnet(features, label, family="multinomial", alpha = 1,
+ lambda = 0.05, standardize=T))
+ coefficients = coef(glmnet(features, label, family="multinomial", alpha = 1, lambda = 0.05,
+ standardize=F))
+ > coefficientsStd
+ $`0`
+ 5 x 1 sparse Matrix of class "dgCMatrix"
+ s0
+ -0.68988825
+ V2 .
+ V3 .
+ V4 .
+ V5 0.09404023
+
+ $`1`
+ 5 x 1 sparse Matrix of class "dgCMatrix"
+ s0
+ -0.2303499
+ V2 -0.1232443
+ V3 0.3258380
+ V4 -0.1564688
+ V5 -0.2053965
+
+ $`2`
+ 5 x 1 sparse Matrix of class "dgCMatrix"
+ s0
+ 0.9202381
+ V2 .
+ V3 -0.4803856
+ V4 .
+ V5 .
+
+ > coefficients
+ $`0`
+ 5 x 1 sparse Matrix of class "dgCMatrix"
+ s0
+ -0.44893320
+ V2 .
+ V3 .
+ V4 0.01933812
+ V5 0.03666044
+
+ $`1`
+ 5 x 1 sparse Matrix of class "dgCMatrix"
+ s0
+ 0.7376760
+ V2 -0.0577182
+ V3 .
+ V4 -0.2081718
+ V5 -0.1304592
+
+ $`2`
+ 5 x 1 sparse Matrix of class "dgCMatrix"
+ s0
+ -0.2887428
+ V2 .
+ V3 .
+ V4 .
+ V5 .
+ */
+
+ val coefficientsRStd = new DenseMatrix(3, 4, Array(
+ 0.0, 0.0, 0.0, 0.09404023,
+ -0.1232443, 0.3258380, -0.1564688, -0.2053965,
+ 0.0, -0.4803856, 0.0, 0.0), isTransposed = true)
+ val interceptsRStd = Vectors.dense(-0.68988825, -0.2303499, 0.9202381)
+
+ val coefficientsR = new DenseMatrix(3, 4, Array(
+ 0.0, 0.0, 0.01933812, 0.03666044,
+ -0.0577182, 0.0, -0.2081718, -0.1304592,
+ 0.0, 0.0, 0.0, 0.0), isTransposed = true)
+ val interceptsR = Vectors.dense(-0.44893320, 0.7376760, -0.2887428)
+
+ assert(model1.coefficientMatrix ~== coefficientsRStd absTol 0.02)
+ assert(model1.interceptVector ~== interceptsRStd relTol 0.1)
+ assert(model1.interceptVector.toArray.sum ~== 0.0 absTol eps)
+ assert(model2.coefficientMatrix ~== coefficientsR absTol 0.02)
+ assert(model2.interceptVector ~== interceptsR relTol 0.1)
+ assert(model2.interceptVector.toArray.sum ~== 0.0 absTol eps)
+ }
+
+ test("multinomial logistic regression without intercept with L1 regularization") {
+ val trainer1 = (new LogisticRegression).setFitIntercept(false)
+ .setElasticNetParam(1.0).setRegParam(0.05).setStandardization(true)
+ val trainer2 = (new LogisticRegression).setFitIntercept(false)
+ .setElasticNetParam(1.0).setRegParam(0.05).setStandardization(false)
+
+ val model1 = trainer1.fit(multinomialDataset)
+ val model2 = trainer2.fit(multinomialDataset)
+ /*
+ Use the following R code to load the data and train the model using glmnet package.
+ library("glmnet")
+ data <- read.csv("path", header=FALSE)
+ label = as.factor(data$V1)
+ features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
+ coefficientsStd = coef(glmnet(features, label, family="multinomial", alpha = 1,
+ lambda = 0.05, intercept=F, standardize=T))
+ coefficients = coef(glmnet(features, label, family="multinomial", alpha = 1, lambda = 0.05,
+ intercept=F, standardize=F))
+ > coefficientsStd
+ $`0`
+ 5 x 1 sparse Matrix of class "dgCMatrix"
+ s0
+ .
+ V2 .
+ V3 .
+ V4 .
+ V5 0.01525105
+
+ $`1`
+ 5 x 1 sparse Matrix of class "dgCMatrix"
+ s0
+ .
+ V2 -0.1502410
+ V3 0.5134658
+ V4 -0.1601146
+ V5 -0.2500232
+
+ $`2`
+ 5 x 1 sparse Matrix of class "dgCMatrix"
+ s0
+ .
+ V2 0.003301875
+ V3 .
+ V4 .
+ V5 .
+
+ > coefficients
+ $`0`
+ 5 x 1 sparse Matrix of class "dgCMatrix"
+ s0
+ .
+ V2 .
+ V3 .
+ V4 .
+ V5 .
+
+ $`1`
+ 5 x 1 sparse Matrix of class "dgCMatrix"
+ s0
+ .
+ V2 .
+ V3 0.1943624
+ V4 -0.1902577
+ V5 -0.1028789
+
+ $`2`
+ 5 x 1 sparse Matrix of class "dgCMatrix"
+ s0
+ .
+ V2 .
+ V3 .
+ V4 .
+ V5 .
+ */
+
+ val coefficientsRStd = new DenseMatrix(3, 4, Array(
+ 0.0, 0.0, 0.0, 0.01525105,
+ -0.1502410, 0.5134658, -0.1601146, -0.2500232,
+ 0.003301875, 0.0, 0.0, 0.0), isTransposed = true)
+
+ val coefficientsR = new DenseMatrix(3, 4, Array(
+ 0.0, 0.0, 0.0, 0.0,
+ 0.0, 0.1943624, -0.1902577, -0.1028789,
+ 0.0, 0.0, 0.0, 0.0), isTransposed = true)
+
+ assert(model1.coefficientMatrix ~== coefficientsRStd absTol 0.01)
+ assert(model1.interceptVector.toArray === Array.fill(3)(0.0))
+ assert(model1.interceptVector.toArray.sum ~== 0.0 absTol eps)
+ assert(model2.coefficientMatrix ~== coefficientsR absTol 0.01)
+ assert(model2.interceptVector.toArray === Array.fill(3)(0.0))
+ assert(model2.interceptVector.toArray.sum ~== 0.0 absTol eps)
+ }
+
+ test("multinomial logistic regression with intercept with L2 regularization") {
+ val trainer1 = (new LogisticRegression).setFitIntercept(true)
+ .setElasticNetParam(0.0).setRegParam(0.1).setStandardization(true)
+ val trainer2 = (new LogisticRegression).setFitIntercept(true)
+ .setElasticNetParam(0.0).setRegParam(0.1).setStandardization(false)
+
+ val model1 = trainer1.fit(multinomialDataset)
+ val model2 = trainer2.fit(multinomialDataset)
+ /*
+ Use the following R code to load the data and train the model using glmnet package.
+ library("glmnet")
+ data <- read.csv("path", header=FALSE)
+ label = as.factor(data$V1)
+ features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
+ coefficientsStd = coef(glmnet(features, label, family="multinomial", alpha = 0,
+ lambda = 0.1, intercept=T, standardize=T))
+ coefficients = coef(glmnet(features, label, family="multinomial", alpha = 0,
+ lambda = 0.1, intercept=T, standardize=F))
+ > coefficientsStd
+ $`0`
+ 5 x 1 sparse Matrix of class "dgCMatrix"
+ s0
+ -1.70040424
+ V2 0.17576070
+ V3 0.01527894
+ V4 0.10216108
+ V5 0.26099531
+
+ $`1`
+ 5 x 1 sparse Matrix of class "dgCMatrix"
+ s0
+ 0.2438590
+ V2 -0.2238875
+ V3 0.5967610
+ V4 -0.1555496
+ V5 -0.3010479
+
+ $`2`
+ 5 x 1 sparse Matrix of class "dgCMatrix"
+ s0
+ 1.45654525
+ V2 0.04812679
+ V3 -0.61203992
+ V4 0.05338850
+ V5 0.04005258
+
+ > coefficients
+ $`0`
+ 5 x 1 sparse Matrix of class "dgCMatrix"
+ s0
+ -1.65488543
+ V2 0.15715048
+ V3 0.01992903
+ V4 0.12428858
+ V5 0.22130317
+
+ $`1`
+ 5 x 1 sparse Matrix of class "dgCMatrix"
+ s0
+ 1.1297533
+ V2 -0.1974768
+ V3 0.2776373
+ V4 -0.1869445
+ V5 -0.2510320
+
+ $`2`
+ 5 x 1 sparse Matrix of class "dgCMatrix"
+ s0
+ 0.52513212
+ V2 0.04032627
+ V3 -0.29756637
+ V4 0.06265594
+ V5 0.02972883
+ */
+
+ val coefficientsRStd = new DenseMatrix(3, 4, Array(
+ 0.17576070, 0.01527894, 0.10216108, 0.26099531,
+ -0.2238875, 0.5967610, -0.1555496, -0.3010479,
+ 0.04812679, -0.61203992, 0.05338850, 0.04005258), isTransposed = true)
+ val interceptsRStd = Vectors.dense(-1.70040424, 0.2438590, 1.45654525)
+
+ val coefficientsR = new DenseMatrix(3, 4, Array(
+ 0.15715048, 0.01992903, 0.12428858, 0.22130317,
+ -0.1974768, 0.2776373, -0.1869445, -0.2510320,
+ 0.04032627, -0.29756637, 0.06265594, 0.02972883), isTransposed = true)
+ val interceptsR = Vectors.dense(-1.65488543, 1.1297533, 0.52513212)
+
+ assert(model1.coefficientMatrix ~== coefficientsRStd relTol 0.05)
+ assert(model1.interceptVector ~== interceptsRStd relTol 0.05)
+ assert(model1.interceptVector.toArray.sum ~== 0.0 absTol eps)
+ assert(model2.coefficientMatrix ~== coefficientsR relTol 0.05)
+ assert(model2.interceptVector ~== interceptsR relTol 0.05)
+ assert(model2.interceptVector.toArray.sum ~== 0.0 absTol eps)
+ }
+
+ test("multinomial logistic regression without intercept with L2 regularization") {
+ val trainer1 = (new LogisticRegression).setFitIntercept(false)
+ .setElasticNetParam(0.0).setRegParam(0.1).setStandardization(true)
+ val trainer2 = (new LogisticRegression).setFitIntercept(false)
+ .setElasticNetParam(0.0).setRegParam(0.1).setStandardization(false)
+
+ val model1 = trainer1.fit(multinomialDataset)
+ val model2 = trainer2.fit(multinomialDataset)
+ /*
+ Use the following R code to load the data and train the model using glmnet package.
+ library("glmnet")
+ data <- read.csv("path", header=FALSE)
+ label = as.factor(data$V1)
+ features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
+ coefficientsStd = coef(glmnet(features, label, family="multinomial", alpha = 0,
+ lambda = 0.1, intercept=F, standardize=T))
+ coefficients = coef(glmnet(features, label, family="multinomial", alpha = 0,
+ lambda = 0.1, intercept=F, standardize=F))
+ > coefficientsStd
+ $`0`
+ 5 x 1 sparse Matrix of class "dgCMatrix"
+ s0
+ .
+ V2 0.03904171
+ V3 -0.23354322
+ V4 0.08288096
+ V5 0.22706393
+
+ $`1`
+ 5 x 1 sparse Matrix of class "dgCMatrix"
+ s0
+ .
+ V2 -0.2061848
+ V3 0.6341398
+ V4 -0.1530059
+ V5 -0.2958455
+
+ $`2`
+ 5 x 1 sparse Matrix of class "dgCMatrix"
+ s0
+ .
+ V2 0.16714312
+ V3 -0.40059658
+ V4 0.07012496
+ V5 0.06878158
+ > coefficients
+ $`0`
+ 5 x 1 sparse Matrix of class "dgCMatrix"
+ s0
+ .
+ V2 -0.005704542
+ V3 -0.144466409
+ V4 0.092080736
+ V5 0.182927657
+
+ $`1`
+ 5 x 1 sparse Matrix of class "dgCMatrix"
+ s0
+ .
+ V2 -0.08469036
+ V3 0.38996748
+ V4 -0.16468436
+ V5 -0.22522976
+
+ $`2`
+ 5 x 1 sparse Matrix of class "dgCMatrix"
+ s0
+ .
+ V2 0.09039490
+ V3 -0.24550107
+ V4 0.07260362
+ V5 0.04230210
+ */
+ val coefficientsRStd = new DenseMatrix(3, 4, Array(
+ 0.03904171, -0.23354322, 0.08288096, 0.2270639,
+ -0.2061848, 0.6341398, -0.1530059, -0.2958455,
+ 0.16714312, -0.40059658, 0.07012496, 0.06878158), isTransposed = true)
+
+ val coefficientsR = new DenseMatrix(3, 4, Array(
+ -0.005704542, -0.144466409, 0.092080736, 0.182927657,
+ -0.08469036, 0.38996748, -0.16468436, -0.22522976,
+ 0.0903949, -0.24550107, 0.07260362, 0.0423021), isTransposed = true)
+
+ assert(model1.coefficientMatrix ~== coefficientsRStd absTol 0.01)
+ assert(model1.interceptVector.toArray === Array.fill(3)(0.0))
+ assert(model1.interceptVector.toArray.sum ~== 0.0 absTol eps)
+ assert(model2.coefficientMatrix ~== coefficientsR absTol 0.01)
+ assert(model2.interceptVector.toArray === Array.fill(3)(0.0))
+ assert(model2.interceptVector.toArray.sum ~== 0.0 absTol eps)
+ }
+
+ test("multinomial logistic regression with intercept with elasticnet regularization") {
+ val trainer1 = (new LogisticRegression).setFitIntercept(true)
+ .setElasticNetParam(0.5).setRegParam(0.1).setStandardization(true)
+ .setMaxIter(300).setTol(1e-10)
+ val trainer2 = (new LogisticRegression).setFitIntercept(true)
+ .setElasticNetParam(0.5).setRegParam(0.1).setStandardization(false)
+ .setMaxIter(300).setTol(1e-10)
+
+ val model1 = trainer1.fit(multinomialDataset)
+ val model2 = trainer2.fit(multinomialDataset)
+ /*
+ Use the following R code to load the data and train the model using glmnet package.
+ library("glmnet")
+ data <- read.csv("path", header=FALSE)
+ label = as.factor(data$V1)
+ features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
+ coefficientsStd = coef(glmnet(features, label, family="multinomial", alpha = 0.5,
+ lambda = 0.1, intercept=T, standardize=T))
+ coefficients = coef(glmnet(features, label, family="multinomial", alpha = 0.5,
+ lambda = 0.1, intercept=T, standardize=F))
+ > coefficientsStd
+ $`0`
+ 5 x 1 sparse Matrix of class "dgCMatrix"
+ s0
+ -0.5521819483
+ V2 0.0003092611
+ V3 .
+ V4 .
+ V5 0.0913818490
+
+ $`1`
+ 5 x 1 sparse Matrix of class "dgCMatrix"
+ s0
+ -0.27531989
+ V2 -0.09790029
+ V3 0.28502034
+ V4 -0.12416487
+ V5 -0.16513373
+
+ $`2`
+ 5 x 1 sparse Matrix of class "dgCMatrix"
+ s0
+ 0.8275018
+ V2 .
+ V3 -0.4044859
+ V4 .
+ V5 .
+
+ > coefficients
+ $`0`
+ 5 x 1 sparse Matrix of class "dgCMatrix"
+ s0
+ -0.39876213
+ V2 .
+ V3 .
+ V4 0.02547520
+ V5 0.03893991
+
+ $`1`
+ 5 x 1 sparse Matrix of class "dgCMatrix"
+ s0
+ 0.61089869
+ V2 -0.04224269
+ V3 .
+ V4 -0.18923970
+ V5 -0.09104249
+
+ $`2`
+ 5 x 1 sparse Matrix of class "dgCMatrix"
+ s0
+ -0.2121366
+ V2 .
+ V3 .
+ V4 .
+ V5 .
+ */
+
+ val coefficientsRStd = new DenseMatrix(3, 4, Array(
+ 0.0003092611, 0.0, 0.0, 0.091381849,
+ -0.09790029, 0.28502034, -0.12416487, -0.16513373,
+ 0.0, -0.4044859, 0.0, 0.0), isTransposed = true)
+ val interceptsRStd = Vectors.dense(-0.5521819483, -0.27531989, 0.8275018)
+
+ val coefficientsR = new DenseMatrix(3, 4, Array(
+ 0.0, 0.0, 0.0254752, 0.03893991,
+ -0.04224269, 0.0, -0.1892397, -0.09104249,
+ 0.0, 0.0, 0.0, 0.0), isTransposed = true)
+ val interceptsR = Vectors.dense(-0.39876213, 0.61089869, -0.2121366)
+
+ assert(model1.coefficientMatrix ~== coefficientsRStd absTol 0.01)
+ assert(model1.interceptVector ~== interceptsRStd absTol 0.01)
+ assert(model1.interceptVector.toArray.sum ~== 0.0 absTol eps)
+ assert(model2.coefficientMatrix ~== coefficientsR absTol 0.01)
+ assert(model2.interceptVector ~== interceptsR absTol 0.01)
+ assert(model2.interceptVector.toArray.sum ~== 0.0 absTol eps)
+ }
+
+ test("multinomial logistic regression without intercept with elasticnet regularization") {
+ val trainer1 = (new LogisticRegression).setFitIntercept(false)
+ .setElasticNetParam(0.5).setRegParam(0.1).setStandardization(true)
+ .setMaxIter(300).setTol(1e-10)
+ val trainer2 = (new LogisticRegression).setFitIntercept(false)
+ .setElasticNetParam(0.5).setRegParam(0.1).setStandardization(false)
+ .setMaxIter(300).setTol(1e-10)
+
+ val model1 = trainer1.fit(multinomialDataset)
+ val model2 = trainer2.fit(multinomialDataset)
+ /*
+ Use the following R code to load the data and train the model using glmnet package.
+ library("glmnet")
+ data <- read.csv("path", header=FALSE)
+ label = as.factor(data$V1)
+ features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
+ coefficientsStd = coef(glmnet(features, label, family="multinomial", alpha = 0.5,
+ lambda = 0.1, intercept=F, standardize=T))
+ coefficients = coef(glmnet(features, label, family="multinomial", alpha = 0.5,
+ lambda = 0.1, intercept=F, standardize=F))
+ > coefficientsStd
+ $`0`
+ 5 x 1 sparse Matrix of class "dgCMatrix"
+ s0
+ .
+ V2 .
+ V3 .
+ V4 .
+ V5 0.03543706
+
+ $`1`
+ 5 x 1 sparse Matrix of class "dgCMatrix"
+ s0
+ .
+ V2 -0.1187387
+ V3 0.4025482
+ V4 -0.1270969
+ V5 -0.1918386
+
+ $`2`
+ 5 x 1 sparse Matrix of class "dgCMatrix"
+ s0
+ .
+ V2 0.00774365
+ V3 .
+ V4 .
+ V5 .
+
+ > coefficients
+ $`0`
+ 5 x 1 sparse Matrix of class "dgCMatrix"
+ s0
+ .
+ V2 .
+ V3 .
+ V4 .
+ V5 .
+
+ $`1`
+ 5 x 1 sparse Matrix of class "dgCMatrix"
+ s0
+ .
+ V2 .
+ V3 0.14666497
+ V4 -0.16570638
+ V5 -0.05982875
+
+ $`2`
+ 5 x 1 sparse Matrix of class "dgCMatrix"
+ s0
+ .
+ V2 .
+ V3 .
+ V4 .
+ V5 .
+ */
+ val coefficientsRStd = new DenseMatrix(3, 4, Array(
+ 0.0, 0.0, 0.0, 0.03543706,
+ -0.1187387, 0.4025482, -0.1270969, -0.1918386,
+ 0.0, 0.0, 0.0, 0.00774365), isTransposed = true)
+
+ val coefficientsR = new DenseMatrix(3, 4, Array(
+ 0.0, 0.0, 0.0, 0.0,
+ 0.0, 0.14666497, -0.16570638, -0.05982875,
+ 0.0, 0.0, 0.0, 0.0), isTransposed = true)
+
+ assert(model1.coefficientMatrix ~== coefficientsRStd absTol 0.01)
+ assert(model1.interceptVector.toArray === Array.fill(3)(0.0))
+ assert(model1.interceptVector.toArray.sum ~== 0.0 absTol eps)
+ assert(model2.coefficientMatrix ~== coefficientsR absTol 0.01)
+ assert(model2.interceptVector.toArray === Array.fill(3)(0.0))
+ assert(model2.interceptVector.toArray.sum ~== 0.0 absTol eps)
+ }
+
test("evaluate on test set") {
// TODO: add for multiclass when model summary becomes available
// Evaluate on test set should be same as that of the transformed training data.
diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/MultinomialLogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/MultinomialLogisticRegressionSuite.scala
deleted file mode 100644
index 5725a47dd8652..0000000000000
--- a/mllib/src/test/scala/org/apache/spark/ml/classification/MultinomialLogisticRegressionSuite.scala
+++ /dev/null
@@ -1,842 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.ml.classification
-
-import scala.language.existentials
-
-import org.apache.spark.SparkFunSuite
-import org.apache.spark.ml.attribute.NominalAttribute
-import org.apache.spark.ml.classification.LogisticRegressionSuite._
-import org.apache.spark.ml.feature.LabeledPoint
-import org.apache.spark.ml.linalg._
-import org.apache.spark.ml.param.ParamsSuite
-import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils}
-import org.apache.spark.ml.util.TestingUtils._
-import org.apache.spark.mllib.util.MLlibTestSparkContext
-import org.apache.spark.sql.{DataFrame, Dataset, Row}
-
-class MultinomialLogisticRegressionSuite
- extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {
-
- @transient var dataset: Dataset[_] = _
- @transient var multinomialDataset: DataFrame = _
- private val eps: Double = 1e-5
-
- override def beforeAll(): Unit = {
- super.beforeAll()
-
- dataset = {
- val nPoints = 100
- val coefficients = Array(
- -0.57997, 0.912083, -0.371077,
- -0.16624, -0.84355, -0.048509)
-
- val xMean = Array(5.843, 3.057)
- val xVariance = Array(0.6856, 0.1899)
-
- val testData = generateMultinomialLogisticInput(
- coefficients, xMean, xVariance, addIntercept = true, nPoints, 42)
-
- val df = spark.createDataFrame(sc.parallelize(testData, 4))
- df.cache()
- df
- }
-
- multinomialDataset = {
- val nPoints = 10000
- val coefficients = Array(
- -0.57997, 0.912083, -0.371077, -0.819866, 2.688191,
- -0.16624, -0.84355, -0.048509, -0.301789, 4.170682)
-
- val xMean = Array(5.843, 3.057, 3.758, 1.199)
- val xVariance = Array(0.6856, 0.1899, 3.116, 0.581)
-
- val testData = generateMultinomialLogisticInput(
- coefficients, xMean, xVariance, addIntercept = true, nPoints, 42)
-
- val df = spark.createDataFrame(sc.parallelize(testData, 4))
- df.cache()
- df
- }
- }
-
- /**
- * Enable the ignored test to export the dataset into CSV format,
- * so we can validate the training accuracy compared with R's glmnet package.
- */
- ignore("export test data into CSV format") {
- val rdd = multinomialDataset.rdd.map { case Row(label: Double, features: Vector) =>
- label + "," + features.toArray.mkString(",")
- }.repartition(1)
- rdd.saveAsTextFile("target/tmp/MultinomialLogisticRegressionSuite/multinomialDataset")
- }
-
-// test("params") {
-// ParamsSuite.checkParams(new LogisticRegression)
-// val model = new LogisticRegressionModel("mLogReg",
-// Matrices.dense(2, 1, Array(0.0, 0.0)), Vectors.dense(0.0, 0.0), 2, true)
-// ParamsSuite.checkParams(model)
-// }
-//
-// test("multinomial logistic regression: default params") {
-// val mlr = new LogisticRegression
-// assert(mlr.getLabelCol === "label")
-// assert(mlr.getFeaturesCol === "features")
-// assert(mlr.getPredictionCol === "prediction")
-// assert(mlr.getRawPredictionCol === "rawPrediction")
-// assert(mlr.getProbabilityCol === "probability")
-// assert(!mlr.isDefined(mlr.weightCol))
-// assert(!mlr.isDefined(mlr.thresholds))
-// assert(mlr.getFitIntercept)
-// assert(mlr.getStandardization)
-// val model = mlr.fit(dataset)
-// model.transform(dataset)
-// .select("label", "probability", "prediction", "rawPrediction")
-// .collect()
-// assert(model.getFeaturesCol === "features")
-// assert(model.getPredictionCol === "prediction")
-// assert(model.getRawPredictionCol === "rawPrediction")
-// assert(model.getProbabilityCol === "probability")
-// assert(model.interceptVector !== Vectors.dense(0.0, 0.0))
-// assert(model.hasParent)
-// }
-
- test("multinomial logistic regression with intercept without regularization") {
-
- val trainer1 = (new LogisticRegression).setFitIntercept(true)
- .setElasticNetParam(0.0).setRegParam(0.0).setStandardization(true).setMaxIter(100)
- val trainer2 = (new LogisticRegression).setFitIntercept(true)
- .setElasticNetParam(0.0).setRegParam(0.0).setStandardization(false)
-
- val model1 = trainer1.fit(multinomialDataset)
- val model2 = trainer2.fit(multinomialDataset)
-
- /*
-      Use the following R code to load the data and train the model using the glmnet package.
- > library("glmnet")
- > data <- read.csv("path", header=FALSE)
- > label = as.factor(data$V1)
- > features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
- > coefficients = coef(glmnet(features, label, family="multinomial", alpha = 0, lambda = 0))
- > coefficients
- $`0`
- 5 x 1 sparse Matrix of class "dgCMatrix"
- s0
- -2.24493379
- V2 0.25096771
- V3 -0.03915938
- V4 0.14766639
- V5 0.36810817
- $`1`
- 5 x 1 sparse Matrix of class "dgCMatrix"
- s0
- 0.3778931
- V2 -0.3327489
- V3 0.8893666
- V4 -0.2306948
- V5 -0.4442330
- $`2`
- 5 x 1 sparse Matrix of class "dgCMatrix"
- s0
- 1.86704066
- V2 0.08178121
- V3 -0.85020722
- V4 0.08302840
- V5 0.07612480
- */
-
- val coefficientsR = new DenseMatrix(3, 4, Array(
- 0.2509677, -0.0391594, 0.1476664, 0.3681082,
- -0.3327489, 0.8893666, -0.2306948, -0.4442330,
- 0.0817812, -0.8502072, 0.0830284, 0.0761248), isTransposed = true)
- val interceptsR = Vectors.dense(-2.2449338, 0.3778931, 1.8670407)
-
- assert(model1.coefficientMatrix ~== coefficientsR relTol 0.05)
- assert(model1.coefficientMatrix.toArray.sum ~== 0.0 absTol eps)
- assert(model1.interceptVector ~== interceptsR relTol 0.05)
- assert(model1.interceptVector.toArray.sum ~== 0.0 absTol eps)
- assert(model2.coefficientMatrix ~== coefficientsR relTol 0.05)
- assert(model2.coefficientMatrix.toArray.sum ~== 0.0 absTol eps)
- assert(model2.interceptVector ~== interceptsR relTol 0.05)
- assert(model2.interceptVector.toArray.sum ~== 0.0 absTol eps)
- }
-
- test("multinomial logistic regression without intercept without regularization") {
-
- val trainer1 = (new LogisticRegression).setFitIntercept(false)
- .setElasticNetParam(0.0).setRegParam(0.0).setStandardization(true)
- val trainer2 = (new LogisticRegression).setFitIntercept(false)
- .setElasticNetParam(0.0).setRegParam(0.0).setStandardization(false)
-
- val model1 = trainer1.fit(multinomialDataset)
- val model2 = trainer2.fit(multinomialDataset)
-
- /*
-      Use the following R code to load the data and train the model using the glmnet package.
- library("glmnet")
- data <- read.csv("path", header=FALSE)
- label = as.factor(data$V1)
- features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
- coefficients = coef(glmnet(features, label, family="multinomial", alpha = 0, lambda = 0,
- intercept=F))
- > coefficients
- $`0`
- 5 x 1 sparse Matrix of class "dgCMatrix"
- s0
- .
- V2 0.06992464
- V3 -0.36562784
- V4 0.12142680
- V5 0.32052211
- $`1`
- 5 x 1 sparse Matrix of class "dgCMatrix"
- s0
- .
- V2 -0.3036269
- V3 0.9449630
- V4 -0.2271038
- V5 -0.4364839
- $`2`
- 5 x 1 sparse Matrix of class "dgCMatrix"
- s0
- .
- V2 0.2337022
- V3 -0.5793351
- V4 0.1056770
- V5 0.1159618
- */
-
- val coefficientsR = new DenseMatrix(3, 4, Array(
- 0.0699246, -0.3656278, 0.1214268, 0.3205221,
- -0.3036269, 0.9449630, -0.2271038, -0.4364839,
- 0.2337022, -0.5793351, 0.1056770, 0.1159618), isTransposed = true)
-
- assert(model1.coefficientMatrix ~== coefficientsR relTol 0.05)
- assert(model1.coefficientMatrix.toArray.sum ~== 0.0 absTol eps)
- assert(model1.interceptVector.toArray === Array.fill(3)(0.0))
- assert(model1.interceptVector.toArray.sum ~== 0.0 absTol eps)
- assert(model2.coefficientMatrix ~== coefficientsR relTol 0.05)
- assert(model2.coefficientMatrix.toArray.sum ~== 0.0 absTol eps)
- assert(model2.interceptVector.toArray === Array.fill(3)(0.0))
- assert(model2.interceptVector.toArray.sum ~== 0.0 absTol eps)
- }
-
- test("multinomial logistic regression with intercept with L1 regularization") {
-
- // use tighter constraints because OWL-QN solver takes longer to converge
- val trainer1 = (new LogisticRegression).setFitIntercept(true)
- .setElasticNetParam(1.0).setRegParam(0.05).setStandardization(true)
- .setMaxIter(300).setTol(1e-10)
- val trainer2 = (new LogisticRegression).setFitIntercept(true)
- .setElasticNetParam(1.0).setRegParam(0.05).setStandardization(false)
- .setMaxIter(300).setTol(1e-10)
-
- val model1 = trainer1.fit(multinomialDataset)
- val model2 = trainer2.fit(multinomialDataset)
-
- /*
- Use the following R code to load the data and train the model using glmnet package.
- library("glmnet")
- data <- read.csv("path", header=FALSE)
- label = as.factor(data$V1)
- features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
- coefficientsStd = coef(glmnet(features, label, family="multinomial", alpha = 1,
-      lambda = 0.05, standardize=T))
- coefficients = coef(glmnet(features, label, family="multinomial", alpha = 1, lambda = 0.05,
-      standardize=F))
- > coefficientsStd
- $`0`
- 5 x 1 sparse Matrix of class "dgCMatrix"
- s0
- -0.68988825
- V2 .
- V3 .
- V4 .
- V5 0.09404023
-
- $`1`
- 5 x 1 sparse Matrix of class "dgCMatrix"
- s0
- -0.2303499
- V2 -0.1232443
- V3 0.3258380
- V4 -0.1564688
- V5 -0.2053965
-
- $`2`
- 5 x 1 sparse Matrix of class "dgCMatrix"
- s0
- 0.9202381
- V2 .
- V3 -0.4803856
- V4 .
- V5 .
-
- > coefficients
- $`0`
- 5 x 1 sparse Matrix of class "dgCMatrix"
- s0
- -0.44893320
- V2 .
- V3 .
- V4 0.01933812
- V5 0.03666044
-
- $`1`
- 5 x 1 sparse Matrix of class "dgCMatrix"
- s0
- 0.7376760
- V2 -0.0577182
- V3 .
- V4 -0.2081718
- V5 -0.1304592
-
- $`2`
- 5 x 1 sparse Matrix of class "dgCMatrix"
- s0
- -0.2887428
- V2 .
- V3 .
- V4 .
- V5 .
- */
-
- val coefficientsRStd = new DenseMatrix(3, 4, Array(
- 0.0, 0.0, 0.0, 0.09404023,
- -0.1232443, 0.3258380, -0.1564688, -0.2053965,
- 0.0, -0.4803856, 0.0, 0.0), isTransposed = true)
- val interceptsRStd = Vectors.dense(-0.68988825, -0.2303499, 0.9202381)
-
- val coefficientsR = new DenseMatrix(3, 4, Array(
- 0.0, 0.0, 0.01933812, 0.03666044,
- -0.0577182, 0.0, -0.2081718, -0.1304592,
- 0.0, 0.0, 0.0, 0.0), isTransposed = true)
- val interceptsR = Vectors.dense(-0.44893320, 0.7376760, -0.2887428)
-
- assert(model1.coefficientMatrix ~== coefficientsRStd absTol 0.02)
- assert(model1.interceptVector ~== interceptsRStd relTol 0.1)
- assert(model1.interceptVector.toArray.sum ~== 0.0 absTol eps)
- assert(model2.coefficientMatrix ~== coefficientsR absTol 0.02)
- assert(model2.interceptVector ~== interceptsR relTol 0.1)
- assert(model2.interceptVector.toArray.sum ~== 0.0 absTol eps)
- }
-
- test("multinomial logistic regression without intercept with L1 regularization") {
- val trainer1 = (new LogisticRegression).setFitIntercept(false)
- .setElasticNetParam(1.0).setRegParam(0.05).setStandardization(true)
- val trainer2 = (new LogisticRegression).setFitIntercept(false)
- .setElasticNetParam(1.0).setRegParam(0.05).setStandardization(false)
-
- val model1 = trainer1.fit(multinomialDataset)
- val model2 = trainer2.fit(multinomialDataset)
- /*
- Use the following R code to load the data and train the model using glmnet package.
- library("glmnet")
- data <- read.csv("path", header=FALSE)
- label = as.factor(data$V1)
- features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
- coefficientsStd = coef(glmnet(features, label, family="multinomial", alpha = 1,
-      lambda = 0.05, intercept=F, standardize=T))
- coefficients = coef(glmnet(features, label, family="multinomial", alpha = 1, lambda = 0.05,
-      intercept=F, standardize=F))
- > coefficientsStd
- $`0`
- 5 x 1 sparse Matrix of class "dgCMatrix"
- s0
- .
- V2 .
- V3 .
- V4 .
- V5 0.01525105
-
- $`1`
- 5 x 1 sparse Matrix of class "dgCMatrix"
- s0
- .
- V2 -0.1502410
- V3 0.5134658
- V4 -0.1601146
- V5 -0.2500232
-
- $`2`
- 5 x 1 sparse Matrix of class "dgCMatrix"
- s0
- .
- V2 0.003301875
- V3 .
- V4 .
- V5 .
-
- > coefficients
- $`0`
- 5 x 1 sparse Matrix of class "dgCMatrix"
- s0
- .
- V2 .
- V3 .
- V4 .
- V5 .
-
- $`1`
- 5 x 1 sparse Matrix of class "dgCMatrix"
- s0
- .
- V2 .
- V3 0.1943624
- V4 -0.1902577
- V5 -0.1028789
-
- $`2`
- 5 x 1 sparse Matrix of class "dgCMatrix"
- s0
- .
- V2 .
- V3 .
- V4 .
- V5 .
- */
-
- val coefficientsRStd = new DenseMatrix(3, 4, Array(
- 0.0, 0.0, 0.0, 0.01525105,
- -0.1502410, 0.5134658, -0.1601146, -0.2500232,
- 0.003301875, 0.0, 0.0, 0.0), isTransposed = true)
-
- val coefficientsR = new DenseMatrix(3, 4, Array(
- 0.0, 0.0, 0.0, 0.0,
- 0.0, 0.1943624, -0.1902577, -0.1028789,
- 0.0, 0.0, 0.0, 0.0), isTransposed = true)
-
- assert(model1.coefficientMatrix ~== coefficientsRStd absTol 0.01)
- assert(model1.interceptVector.toArray === Array.fill(3)(0.0))
- assert(model1.interceptVector.toArray.sum ~== 0.0 absTol eps)
- assert(model2.coefficientMatrix ~== coefficientsR absTol 0.01)
- assert(model2.interceptVector.toArray === Array.fill(3)(0.0))
- assert(model2.interceptVector.toArray.sum ~== 0.0 absTol eps)
- }
-
- test("multinomial logistic regression with intercept with L2 regularization") {
- val trainer1 = (new LogisticRegression).setFitIntercept(true)
- .setElasticNetParam(0.0).setRegParam(0.1).setStandardization(true)
- val trainer2 = (new LogisticRegression).setFitIntercept(true)
- .setElasticNetParam(0.0).setRegParam(0.1).setStandardization(false)
-
- val model1 = trainer1.fit(multinomialDataset)
- val model2 = trainer2.fit(multinomialDataset)
- /*
- Use the following R code to load the data and train the model using glmnet package.
- library("glmnet")
- data <- read.csv("path", header=FALSE)
- label = as.factor(data$V1)
- features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
- coefficientsStd = coef(glmnet(features, label, family="multinomial", alpha = 0,
-      lambda = 0.1, intercept=T, standardize=T))
- coefficients = coef(glmnet(features, label, family="multinomial", alpha = 0,
-      lambda = 0.1, intercept=T, standardize=F))
- > coefficientsStd
- $`0`
- 5 x 1 sparse Matrix of class "dgCMatrix"
- s0
- -1.70040424
- V2 0.17576070
- V3 0.01527894
- V4 0.10216108
- V5 0.26099531
-
- $`1`
- 5 x 1 sparse Matrix of class "dgCMatrix"
- s0
- 0.2438590
- V2 -0.2238875
- V3 0.5967610
- V4 -0.1555496
- V5 -0.3010479
-
- $`2`
- 5 x 1 sparse Matrix of class "dgCMatrix"
- s0
- 1.45654525
- V2 0.04812679
- V3 -0.61203992
- V4 0.05338850
- V5 0.04005258
-
- > coefficients
- $`0`
- 5 x 1 sparse Matrix of class "dgCMatrix"
- s0
- -1.65488543
- V2 0.15715048
- V3 0.01992903
- V4 0.12428858
- V5 0.22130317
-
- $`1`
- 5 x 1 sparse Matrix of class "dgCMatrix"
- s0
- 1.1297533
- V2 -0.1974768
- V3 0.2776373
- V4 -0.1869445
- V5 -0.2510320
-
- $`2`
- 5 x 1 sparse Matrix of class "dgCMatrix"
- s0
- 0.52513212
- V2 0.04032627
- V3 -0.29756637
- V4 0.06265594
- V5 0.02972883
- */
-
- val coefficientsRStd = new DenseMatrix(3, 4, Array(
- 0.17576070, 0.01527894, 0.10216108, 0.26099531,
- -0.2238875, 0.5967610, -0.1555496, -0.3010479,
- 0.04812679, -0.61203992, 0.05338850, 0.04005258), isTransposed = true)
- val interceptsRStd = Vectors.dense(-1.70040424, 0.2438590, 1.45654525)
-
- val coefficientsR = new DenseMatrix(3, 4, Array(
- 0.15715048, 0.01992903, 0.12428858, 0.22130317,
- -0.1974768, 0.2776373, -0.1869445, -0.2510320,
- 0.04032627, -0.29756637, 0.06265594, 0.02972883), isTransposed = true)
- val interceptsR = Vectors.dense(-1.65488543, 1.1297533, 0.52513212)
-
- assert(model1.coefficientMatrix ~== coefficientsRStd relTol 0.05)
- assert(model1.interceptVector ~== interceptsRStd relTol 0.05)
- assert(model1.interceptVector.toArray.sum ~== 0.0 absTol eps)
- assert(model2.coefficientMatrix ~== coefficientsR relTol 0.05)
- assert(model2.interceptVector ~== interceptsR relTol 0.05)
- assert(model2.interceptVector.toArray.sum ~== 0.0 absTol eps)
- }
-
- test("multinomial logistic regression without intercept with L2 regularization") {
- val trainer1 = (new LogisticRegression).setFitIntercept(false)
- .setElasticNetParam(0.0).setRegParam(0.1).setStandardization(true)
- val trainer2 = (new LogisticRegression).setFitIntercept(false)
- .setElasticNetParam(0.0).setRegParam(0.1).setStandardization(false)
-
- val model1 = trainer1.fit(multinomialDataset)
- val model2 = trainer2.fit(multinomialDataset)
- /*
- Use the following R code to load the data and train the model using glmnet package.
- library("glmnet")
- data <- read.csv("path", header=FALSE)
- label = as.factor(data$V1)
- features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
- coefficientsStd = coef(glmnet(features, label, family="multinomial", alpha = 0,
-      lambda = 0.1, intercept=F, standardize=T))
- coefficients = coef(glmnet(features, label, family="multinomial", alpha = 0,
-      lambda = 0.1, intercept=F, standardize=F))
- > coefficientsStd
- $`0`
- 5 x 1 sparse Matrix of class "dgCMatrix"
- s0
- .
- V2 0.03904171
- V3 -0.23354322
- V4 0.08288096
- V5 0.22706393
-
- $`1`
- 5 x 1 sparse Matrix of class "dgCMatrix"
- s0
- .
- V2 -0.2061848
- V3 0.6341398
- V4 -0.1530059
- V5 -0.2958455
-
- $`2`
- 5 x 1 sparse Matrix of class "dgCMatrix"
- s0
- .
- V2 0.16714312
- V3 -0.40059658
- V4 0.07012496
- V5 0.06878158
- > coefficients
- $`0`
- 5 x 1 sparse Matrix of class "dgCMatrix"
- s0
- .
- V2 -0.005704542
- V3 -0.144466409
- V4 0.092080736
- V5 0.182927657
-
- $`1`
- 5 x 1 sparse Matrix of class "dgCMatrix"
- s0
- .
- V2 -0.08469036
- V3 0.38996748
- V4 -0.16468436
- V5 -0.22522976
-
- $`2`
- 5 x 1 sparse Matrix of class "dgCMatrix"
- s0
- .
- V2 0.09039490
- V3 -0.24550107
- V4 0.07260362
- V5 0.04230210
- */
- val coefficientsRStd = new DenseMatrix(3, 4, Array(
- 0.03904171, -0.23354322, 0.08288096, 0.2270639,
- -0.2061848, 0.6341398, -0.1530059, -0.2958455,
- 0.16714312, -0.40059658, 0.07012496, 0.06878158), isTransposed = true)
-
- val coefficientsR = new DenseMatrix(3, 4, Array(
- -0.005704542, -0.144466409, 0.092080736, 0.182927657,
- -0.08469036, 0.38996748, -0.16468436, -0.22522976,
- 0.0903949, -0.24550107, 0.07260362, 0.0423021), isTransposed = true)
-
- assert(model1.coefficientMatrix ~== coefficientsRStd absTol 0.01)
- assert(model1.interceptVector.toArray === Array.fill(3)(0.0))
- assert(model1.interceptVector.toArray.sum ~== 0.0 absTol eps)
- assert(model2.coefficientMatrix ~== coefficientsR absTol 0.01)
- assert(model2.interceptVector.toArray === Array.fill(3)(0.0))
- assert(model2.interceptVector.toArray.sum ~== 0.0 absTol eps)
- }
-
- test("multinomial logistic regression with intercept with elasticnet regularization") {
- val trainer1 = (new LogisticRegression).setFitIntercept(true)
- .setElasticNetParam(0.5).setRegParam(0.1).setStandardization(true)
- .setMaxIter(300).setTol(1e-10)
- val trainer2 = (new LogisticRegression).setFitIntercept(true)
- .setElasticNetParam(0.5).setRegParam(0.1).setStandardization(false)
- .setMaxIter(300).setTol(1e-10)
-
- val model1 = trainer1.fit(multinomialDataset)
- val model2 = trainer2.fit(multinomialDataset)
- /*
- Use the following R code to load the data and train the model using glmnet package.
- library("glmnet")
- data <- read.csv("path", header=FALSE)
- label = as.factor(data$V1)
- features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
- coefficientsStd = coef(glmnet(features, label, family="multinomial", alpha = 0.5,
-      lambda = 0.1, intercept=T, standardize=T))
- coefficients = coef(glmnet(features, label, family="multinomial", alpha = 0.5,
-      lambda = 0.1, intercept=T, standardize=F))
- > coefficientsStd
- $`0`
- 5 x 1 sparse Matrix of class "dgCMatrix"
- s0
- -0.5521819483
- V2 0.0003092611
- V3 .
- V4 .
- V5 0.0913818490
-
- $`1`
- 5 x 1 sparse Matrix of class "dgCMatrix"
- s0
- -0.27531989
- V2 -0.09790029
- V3 0.28502034
- V4 -0.12416487
- V5 -0.16513373
-
- $`2`
- 5 x 1 sparse Matrix of class "dgCMatrix"
- s0
- 0.8275018
- V2 .
- V3 -0.4044859
- V4 .
- V5 .
-
- > coefficients
- $`0`
- 5 x 1 sparse Matrix of class "dgCMatrix"
- s0
- -0.39876213
- V2 .
- V3 .
- V4 0.02547520
- V5 0.03893991
-
- $`1`
- 5 x 1 sparse Matrix of class "dgCMatrix"
- s0
- 0.61089869
- V2 -0.04224269
- V3 .
- V4 -0.18923970
- V5 -0.09104249
-
- $`2`
- 5 x 1 sparse Matrix of class "dgCMatrix"
- s0
- -0.2121366
- V2 .
- V3 .
- V4 .
- V5 .
- */
-
- val coefficientsRStd = new DenseMatrix(3, 4, Array(
- 0.0003092611, 0.0, 0.0, 0.091381849,
- -0.09790029, 0.28502034, -0.12416487, -0.16513373,
- 0.0, -0.4044859, 0.0, 0.0), isTransposed = true)
- val interceptsRStd = Vectors.dense(-0.5521819483, -0.27531989, 0.8275018)
-
- val coefficientsR = new DenseMatrix(3, 4, Array(
- 0.0, 0.0, 0.0254752, 0.03893991,
- -0.04224269, 0.0, -0.1892397, -0.09104249,
- 0.0, 0.0, 0.0, 0.0), isTransposed = true)
- val interceptsR = Vectors.dense(-0.39876213, 0.61089869, -0.2121366)
-
- assert(model1.coefficientMatrix ~== coefficientsRStd absTol 0.01)
- assert(model1.interceptVector ~== interceptsRStd absTol 0.01)
- assert(model1.interceptVector.toArray.sum ~== 0.0 absTol eps)
- assert(model2.coefficientMatrix ~== coefficientsR absTol 0.01)
- assert(model2.interceptVector ~== interceptsR absTol 0.01)
- assert(model2.interceptVector.toArray.sum ~== 0.0 absTol eps)
- }
-
- test("multinomial logistic regression without intercept with elasticnet regularization") {
- val trainer1 = (new LogisticRegression).setFitIntercept(false)
- .setElasticNetParam(0.5).setRegParam(0.1).setStandardization(true)
- .setMaxIter(300).setTol(1e-10)
- val trainer2 = (new LogisticRegression).setFitIntercept(false)
- .setElasticNetParam(0.5).setRegParam(0.1).setStandardization(false)
- .setMaxIter(300).setTol(1e-10)
-
- val model1 = trainer1.fit(multinomialDataset)
- val model2 = trainer2.fit(multinomialDataset)
- /*
- Use the following R code to load the data and train the model using glmnet package.
- library("glmnet")
- data <- read.csv("path", header=FALSE)
- label = as.factor(data$V1)
- features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
- coefficientsStd = coef(glmnet(features, label, family="multinomial", alpha = 0.5,
-      lambda = 0.1, intercept=F, standardize=T))
- coefficients = coef(glmnet(features, label, family="multinomial", alpha = 0.5,
-      lambda = 0.1, intercept=F, standardize=F))
- > coefficientsStd
- $`0`
- 5 x 1 sparse Matrix of class "dgCMatrix"
- s0
- .
- V2 .
- V3 .
- V4 .
- V5 0.03543706
-
- $`1`
- 5 x 1 sparse Matrix of class "dgCMatrix"
- s0
- .
- V2 -0.1187387
- V3 0.4025482
- V4 -0.1270969
- V5 -0.1918386
-
- $`2`
- 5 x 1 sparse Matrix of class "dgCMatrix"
- s0
- .
- V2 0.00774365
- V3 .
- V4 .
- V5 .
-
- > coefficients
- $`0`
- 5 x 1 sparse Matrix of class "dgCMatrix"
- s0
- .
- V2 .
- V3 .
- V4 .
- V5 .
-
- $`1`
- 5 x 1 sparse Matrix of class "dgCMatrix"
- s0
- .
- V2 .
- V3 0.14666497
- V4 -0.16570638
- V5 -0.05982875
-
- $`2`
- 5 x 1 sparse Matrix of class "dgCMatrix"
- s0
- .
- V2 .
- V3 .
- V4 .
- V5 .
- */
- val coefficientsRStd = new DenseMatrix(3, 4, Array(
- 0.0, 0.0, 0.0, 0.03543706,
- -0.1187387, 0.4025482, -0.1270969, -0.1918386,
-      0.00774365, 0.0, 0.0, 0.0), isTransposed = true)
-
- val coefficientsR = new DenseMatrix(3, 4, Array(
- 0.0, 0.0, 0.0, 0.0,
- 0.0, 0.14666497, -0.16570638, -0.05982875,
- 0.0, 0.0, 0.0, 0.0), isTransposed = true)
-
- assert(model1.coefficientMatrix ~== coefficientsRStd absTol 0.01)
- assert(model1.interceptVector.toArray === Array.fill(3)(0.0))
- assert(model1.interceptVector.toArray.sum ~== 0.0 absTol eps)
- assert(model2.coefficientMatrix ~== coefficientsR absTol 0.01)
- assert(model2.interceptVector.toArray === Array.fill(3)(0.0))
- assert(model2.interceptVector.toArray.sum ~== 0.0 absTol eps)
- }
-
-// test("read/write") {
-// def checkModelData(
-// model: LogisticRegressionModel,
-// model2: LogisticRegressionModel): Unit = {
-// assert(model.interceptVector === model2.interceptVector)
-// assert(model.coefficientMatrix.toArray === model2.coefficients.toArray)
-// assert(model.numClasses === model2.numClasses)
-// assert(model.numFeatures === model2.numFeatures)
-// }
-// val mlr = new LogisticRegression()
-// testEstimatorAndModelReadWrite(mlr, dataset,
-// MultinomialLogisticRegressionSuite.allParamSettings,
-// checkModelData)
-// }
-}
-
-object MultinomialLogisticRegressionSuite {
-
- /**
- * Mapping from all Params to valid settings which differ from the defaults.
- * This is useful for tests which need to exercise all Params, such as save/load.
- * This excludes input columns to simplify some tests.
- */
- val allParamSettings: Map[String, Any] = ProbabilisticClassifierSuite.allParamSettings ++ Map(
- "probabilityCol" -> "myProbability",
- "thresholds" -> Array(0.4, 0.6),
- "regParam" -> 0.01,
- "elasticNetParam" -> 0.1,
- "maxIter" -> 2, // intentionally small
- "fitIntercept" -> true,
- "tol" -> 0.8,
- "standardization" -> false
- )
-}
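With the standalone multinomial suite removed, its coverage moves into LogisticRegressionSuite
against the unified estimator. A sketch of the equivalent flow, assuming the same cached
training DataFrame (`dataset` here) and a hypothetical scratch path:

    import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel}

    val mlr = new LogisticRegression().setFamily("multinomial")
    val model = mlr.fit(dataset)
    // Persistence round-trip, standing in for the suite's commented-out read/write test.
    model.write.overwrite().save("target/tmp/mlorModel")  // hypothetical path
    val loaded = LogisticRegressionModel.load("target/tmp/mlorModel")
    assert(loaded.coefficientMatrix.toArray.sameElements(model.coefficientMatrix.toArray))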
From 262bc996063f4d07b9440d6164be01f497d180ef Mon Sep 17 00:00:00 2001
From: sethah
Date: Fri, 26 Aug 2016 09:36:05 -0700
Subject: [PATCH 10/24] some small fixes
---
.../classification/LogisticRegression.scala | 24 +++++++++----------
.../ProbabilisticClassifier.scala | 6 -----
.../classification/LogisticRegression.scala | 4 ++--
.../LogisticRegressionSuite.scala | 15 ++++++------
.../ml/classification/OneVsRestSuite.scala | 4 ++--
.../spark/ml/tuning/CrossValidatorSuite.scala | 4 ++--
.../ml/tuning/TrainValidationSplitSuite.scala | 4 ++--
7 files changed, 28 insertions(+), 33 deletions(-)
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
index 80426fc019e83..1a7d6a2aa68a5 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
@@ -68,6 +68,7 @@ private[classification] trait LogisticRegressionParams extends ProbabilisticClas
*
* @group setParam
*/
+ // TODO: Implement SPARK-11543?
def setThreshold(value: Double): this.type = {
if (isSet(thresholds)) clear(thresholds)
set(threshold, value)
@@ -88,14 +89,14 @@ private[classification] trait LogisticRegressionParams extends ProbabilisticClas
*
* @group param
*/
- @Since("2.0.0")
+ @Since("2.1.0")
final val family: Param[String] = new Param(this, "family",
"The name of family which is a description of the label distribution to be used in the " +
s"model. Supported options: ${supportedFamilyNames.mkString(", ")}.",
ParamValidators.inArray[String](supportedFamilyNames))
/** @group getParam */
- @Since("2.0.0")
+ @Since("2.1.0")
def getFamily: String = $(family)
/**
@@ -252,7 +253,7 @@ class LogisticRegression @Since("1.2.0") (
*
* @group setParam
*/
- @Since("2.0.0")
+ @Since("2.1.0")
def setFamily(value: String): this.type = set(family, value)
setDefault(family -> "auto")
@@ -271,7 +272,6 @@ class LogisticRegression @Since("1.2.0") (
setDefault(standardization -> true)
@Since("1.5.0")
- // TODO: Check this behavior
override def setThreshold(value: Double): this.type = super.setThreshold(value)
@Since("1.5.0")
@@ -354,18 +354,18 @@ class LogisticRegression @Since("1.2.0") (
val numClasses = MetadataUtils.getNumClasses(dataset.schema($(labelCol))) match {
case Some(n: Int) =>
require(n >= histogram.length, s"Specified number of classes $n was " +
- s"less than the number of unique labels ${histogram.length}")
+ s"less than the number of unique labels ${histogram.length}.")
n
case None => histogram.length
}
val isBinaryClassification = numClasses == 1 || numClasses == 2
- val isMultinomial = ($(family) == LogisticRegression.auto && !isBinaryClassification) ||
- ($(family) == LogisticRegression.multinomial)
+ val isMultinomial = ($(family) == LogisticRegression.Auto && !isBinaryClassification) ||
+ ($(family) == LogisticRegression.Multinomial)
val numCoefficientSets = if (isMultinomial) numClasses else 1
if (!isMultinomial) {
require(isBinaryClassification, s"Binomial family only supports 1 or 2 " +
- s"outcome classes but found $numClasses")
+ s"outcome classes but found $numClasses.")
}
if (isDefined(thresholds)) {
@@ -646,11 +646,11 @@ object LogisticRegression extends DefaultParamsReadable[LogisticRegression] {
@Since("1.6.0")
override def load(path: String): LogisticRegression = super.load(path)
- private val multinomial = "multinomial"
- private val binomial = "binomial"
- private val auto = "auto"
+ private val Multinomial = "multinomial"
+ private val Binomial = "binomial"
+ private val Auto = "auto"
- private[classification] lazy val supportedFamilyNames = Array(auto, binomial, multinomial)
+ private[classification] val supportedFamilyNames = Array(Auto, Binomial, Multinomial)
}
/**
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/ProbabilisticClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/ProbabilisticClassifier.scala
index 989bd19528a97..1a07aab663030 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/ProbabilisticClassifier.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/ProbabilisticClassifier.scala
@@ -219,12 +219,6 @@ abstract class ProbabilisticClassificationModel[
i += 1
}
argMax
-// val thresholds: Array[Double] = getThresholds
-// val scaledProbability: Array[Double] =
-// probability.toArray.zip(thresholds).map { case (p, t) =>
-// if (t == 0.0) Double.PositiveInfinity else p / t
-// }
-// Vectors.dense(scaledProbability).argmax
}
}
}
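The block deleted above was a commented-out description of thresholded prediction. A
standalone sketch of that rule (not the class's actual code path): each class probability is
scaled by its threshold, and the argmax of the scaled values is predicted.

    // p(i) / t(i), with a zero threshold always winning, then argmax over classes.
    def thresholdedPrediction(probability: Array[Double], thresholds: Array[Double]): Int = {
      val scaled = probability.zip(thresholds).map { case (p, t) =>
        if (t == 0.0) Double.PositiveInfinity else p / t
      }
      scaled.indices.maxBy(i => scaled(i))
    }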
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala
index c3770dd0a12df..d851b983349c9 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala
@@ -432,8 +432,8 @@ class LogisticRegressionWithLBFGS
if (userSuppliedWeights) {
val uid = Identifiable.randomUID("logreg-static")
lr.setInitialModel(new org.apache.spark.ml.classification.LogisticRegressionModel(uid,
- new DenseMatrix(1, initialWeights.size, initialWeights.toArray, isTransposed = true),
- Vectors.dense(0.0).asML, 2, false))
+ new DenseMatrix(1, initialWeights.size, initialWeights.toArray),
+ Vectors.dense(1.0).asML, 2, false))
}
lr.setFitIntercept(addIntercept)
lr.setMaxIter(optimizer.getNumIterations())
diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
index 47c1a7218fcbd..31f991b3fd5e4 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
@@ -17,22 +17,21 @@
package org.apache.spark.ml.classification
-import org.apache.spark.ml.attribute.NominalAttribute
-
import scala.collection.JavaConverters._
import scala.language.existentials
import scala.util.Random
import scala.util.control.Breaks._
import org.apache.spark.{SparkException, SparkFunSuite}
+import org.apache.spark.ml.attribute.NominalAttribute
import org.apache.spark.ml.classification.LogisticRegressionSuite._
-import org.apache.spark.ml.feature.{Instance, LabeledPoint}
-import org.apache.spark.ml.linalg.{Matrices, DenseMatrix, Vector, Vectors}
+import org.apache.spark.ml.feature.LabeledPoint
+import org.apache.spark.ml.linalg.{DenseMatrix, Matrices, Vector, Vectors}
import org.apache.spark.ml.param.ParamsSuite
import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils}
import org.apache.spark.ml.util.TestingUtils._
import org.apache.spark.mllib.util.MLlibTestSparkContext
-import org.apache.spark.sql.{DataFrame, Dataset, Row}
+import org.apache.spark.sql.{Dataset, Row}
import org.apache.spark.sql.functions.lit
class LogisticRegressionSuite
@@ -99,7 +98,7 @@ class LogisticRegressionSuite
}
/**
- * Enable the ignored test to export the smallBinaryDataset into CSV format,
+ * Enable the ignored test to export the dataset into CSV format,
* so we can validate the training accuracy compared with R's glmnet package.
*/
ignore("export test data into CSV format") {
@@ -114,7 +113,7 @@ class LogisticRegressionSuite
test("params") {
ParamsSuite.checkParams(new LogisticRegression)
val model = new LogisticRegressionModel("logReg",
- new DenseMatrix(1, 1, Array(0.0), isTransposed = false), Vectors.dense(0.0), 2, false)
+ new DenseMatrix(1, 1, Array(0.0)), Vectors.dense(0.0), 2, isMultinomial = false)
ParamsSuite.checkParams(model)
}
@@ -1839,6 +1838,7 @@ class LogisticRegressionSuite
predictions1.zip(predictions2).foreach { case (Row(p1: Double), Row(p2: Double)) =>
assert(p1 === p2)
}
+ assert(model2.summary.totalIterations === 1)
val lr3 = new LogisticRegression()
val model3 = lr3.fit(smallMultinomialDataset)
@@ -1849,6 +1849,7 @@ class LogisticRegressionSuite
predictions3.zip(predictions4).foreach { case (Row(p1: Double), Row(p2: Double)) =>
assert(p1 === p2)
}
+ // TODO: check that it converges in a single iteration when initial model is available
}
test("logistic regression with all labels the same") {
diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/OneVsRestSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/OneVsRestSuite.scala
index 09e38786aa002..3ae47029c8dd8 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/classification/OneVsRestSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/classification/OneVsRestSuite.scala
@@ -22,7 +22,7 @@ import org.apache.spark.ml.attribute.NominalAttribute
import org.apache.spark.ml.classification.LogisticRegressionSuite._
import org.apache.spark.ml.feature.LabeledPoint
import org.apache.spark.ml.feature.StringIndexer
-import org.apache.spark.ml.linalg.{DenseMatrix, Vectors}
+import org.apache.spark.ml.linalg.{Matrices, Vectors}
import org.apache.spark.ml.param.{ParamMap, ParamsSuite}
import org.apache.spark.ml.util.{DefaultReadWriteTest, MetadataUtils, MLTestingUtils}
import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS
@@ -61,7 +61,7 @@ class OneVsRestSuite extends SparkFunSuite with MLlibTestSparkContext with Defau
test("params") {
ParamsSuite.checkParams(new OneVsRest)
val lrModel = new LogisticRegressionModel("logReg",
- new DenseMatrix(1, 1, Array(0.0), isTransposed = false), Vectors.dense(0.0), 2, false)
+ Matrices.dense(1, 1, Array(0.0)), Vectors.dense(0.0), 2, false)
val model = new OneVsRestModel("ovr", Metadata.empty, Array(lrModel))
ParamsSuite.checkParams(model)
}
diff --git a/mllib/src/test/scala/org/apache/spark/ml/tuning/CrossValidatorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/tuning/CrossValidatorSuite.scala
index 0fb26f26e7792..87c7c82e4c3b2 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/tuning/CrossValidatorSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/tuning/CrossValidatorSuite.scala
@@ -23,7 +23,7 @@ import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressio
import org.apache.spark.ml.classification.LogisticRegressionSuite.generateLogisticInput
import org.apache.spark.ml.evaluation.{BinaryClassificationEvaluator, Evaluator, RegressionEvaluator}
import org.apache.spark.ml.feature.HashingTF
-import org.apache.spark.ml.linalg.{DenseMatrix, Vectors}
+import org.apache.spark.ml.linalg.{Matrices, Vectors}
import org.apache.spark.ml.param.{ParamMap, ParamPair}
import org.apache.spark.ml.param.shared.HasInputCol
import org.apache.spark.ml.regression.LinearRegression
@@ -245,7 +245,7 @@ class CrossValidatorSuite
val lr = new LogisticRegression()
.setThreshold(0.6)
val lrModel = new LogisticRegressionModel(lr.uid,
- new DenseMatrix(1, 1, Array(0.0), isTransposed = false), Vectors.dense(0.0), 2, false)
+ Matrices.dense(1, 1, Array(0.0)), Vectors.dense(0.0), 2, false)
.setThreshold(0.6)
val evaluator = new BinaryClassificationEvaluator()
.setMetricName("areaUnderPR") // not default metric
diff --git a/mllib/src/test/scala/org/apache/spark/ml/tuning/TrainValidationSplitSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/tuning/TrainValidationSplitSuite.scala
index a05a1d641f1bb..6c58bed9812c1 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/tuning/TrainValidationSplitSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/tuning/TrainValidationSplitSuite.scala
@@ -22,7 +22,7 @@ import org.apache.spark.ml.{Estimator, Model}
import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel}
import org.apache.spark.ml.classification.LogisticRegressionSuite.generateLogisticInput
import org.apache.spark.ml.evaluation.{BinaryClassificationEvaluator, Evaluator, RegressionEvaluator}
-import org.apache.spark.ml.linalg.{DenseMatrix, Vectors}
+import org.apache.spark.ml.linalg.{Matrices, Vectors}
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.param.shared.HasInputCol
import org.apache.spark.ml.regression.LinearRegression
@@ -134,7 +134,7 @@ class TrainValidationSplitSuite
val lr = new LogisticRegression()
.setThreshold(0.6)
val lrModel = new LogisticRegressionModel(lr.uid,
- new DenseMatrix(1, 1, Array(0.0), isTransposed = false), Vectors.dense(0.0), 2, false)
+ Matrices.dense(1, 1, Array(0.0)), Vectors.dense(0.0), 2, false)
.setThreshold(0.6)
val evaluator = new BinaryClassificationEvaluator()
val paramMaps = new ParamGridBuilder()
From b64ffad60d8f344a576227bf5f150eea5679aaa9 Mon Sep 17 00:00:00 2001
From: sethah
Date: Fri, 26 Aug 2016 10:52:29 -0700
Subject: [PATCH 11/24] use _coefficients
---
.../org/apache/spark/ml/classification/LogisticRegression.scala | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
index 1a7d6a2aa68a5..314fde435eb5a 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
@@ -705,7 +705,7 @@ class LogisticRegressionModel private[spark] (
/** Margin (rawPrediction) for class label 1. For binary classification only. */
private val margin: Vector => Double = (features) => {
- BLAS.dot(features, coefficients) + _intercept
+ BLAS.dot(features, _coefficients) + _intercept
}
/** Margin (rawPrediction) for each class label. */
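In the binary case, the margin fixed above is the single raw prediction score. As a
self-contained sketch of how it relates to the class-1 probability (the standard logistic
link, not the model's private implementation):

    // margin = features . coefficients + intercept
    def margin(features: Array[Double], coefficients: Array[Double], intercept: Double): Double = {
      var dot = 0.0
      var i = 0
      while (i < features.length) { dot += features(i) * coefficients(i); i += 1 }
      dot + intercept
    }

    // P(y = 1) = 1 / (1 + exp(-margin))
    def probabilityOfOne(m: Double): Double = 1.0 / (1.0 + math.exp(-m))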
From 7895c8176df4fccc144ddb29079af8dc8a9a1942 Mon Sep 17 00:00:00 2001
From: sethah
Date: Fri, 26 Aug 2016 11:15:58 -0700
Subject: [PATCH 12/24] use strings in supported families
---
.../spark/ml/classification/LogisticRegression.scala | 11 ++++-------
1 file changed, 4 insertions(+), 7 deletions(-)
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
index 314fde435eb5a..75fab6fc81094 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
@@ -359,8 +359,8 @@ class LogisticRegression @Since("1.2.0") (
case None => histogram.length
}
val isBinaryClassification = numClasses == 1 || numClasses == 2
- val isMultinomial = ($(family) == LogisticRegression.Auto && !isBinaryClassification) ||
- ($(family) == LogisticRegression.Multinomial)
+ val isMultinomial = ($(family) == "auto" && !isBinaryClassification) ||
+ ($(family) == "multinomial")
val numCoefficientSets = if (isMultinomial) numClasses else 1
if (!isMultinomial) {
@@ -646,11 +646,8 @@ object LogisticRegression extends DefaultParamsReadable[LogisticRegression] {
@Since("1.6.0")
override def load(path: String): LogisticRegression = super.load(path)
- private val Multinomial = "multinomial"
- private val Binomial = "binomial"
- private val Auto = "auto"
-
- private[classification] val supportedFamilyNames = Array(Auto, Binomial, Multinomial)
+ private[classification] val supportedFamilyNames =
+ Array("auto", "binomial", "multinomial").map(_.toLowerCase)
}
/**
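Because the family param is declared with ParamValidators.inArray over these names, an
unsupported value is rejected when the param is set rather than at fit time. A usage sketch
(the invalid name is hypothetical):

    val lr = new LogisticRegression()
    lr.setFamily("multinomial")   // accepted: one of supportedFamilyNames
    // lr.setFamily("gaussian")   // would throw IllegalArgumentException from the validator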
From c9b6d970a625fff921d0c512bb7a1dd4f7a10bf1 Mon Sep 17 00:00:00 2001
From: sethah
Date: Thu, 1 Sep 2016 21:43:32 -0700
Subject: [PATCH 13/24] mima exclusion for lr model constructor
---
project/MimaExcludes.scala | 3 +++
1 file changed, 3 insertions(+)
diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala
index 1bdcf9a623dc9..d4cbf510b9a5c 100644
--- a/project/MimaExcludes.scala
+++ b/project/MimaExcludes.scala
@@ -812,6 +812,9 @@ object MimaExcludes {
ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.ui.exec.ExecutorsListener.executorToTotalCores"),
ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.ui.exec.ExecutorsListener.executorToTasksMax"),
ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.ui.exec.ExecutorsListener.executorToJvmGCTime")
+ ) ++ Seq(
+ // [SPARK-17163] Unify logistic regression interface. Private constructor has new signature.
+ ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.ml.classification.LogisticRegressionModel.this")
)
}
From b532692f4f63c414dbd4bec38c0adecb5d83d853 Mon Sep 17 00:00:00 2001
From: sethah
Date: Fri, 9 Sep 2016 13:24:33 -0700
Subject: [PATCH 14/24] address initial review
---
.../classification/LogisticRegression.scala | 56 +++++++++++--------
.../LogisticRegressionSuite.scala | 24 +++++++-
2 files changed, 54 insertions(+), 26 deletions(-)
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
index 75fab6fc81094..b5ef73cfa8356 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
@@ -50,7 +50,7 @@ private[classification] trait LogisticRegressionParams extends ProbabilisticClas
with HasRegParam with HasElasticNetParam with HasMaxIter with HasFitIntercept with HasTol
with HasStandardization with HasWeightCol with HasThreshold with HasAggregationDepth {
- import LogisticRegression._
+ import org.apache.spark.ml.classification.LogisticRegression.supportedFamilyNames
/**
* Set threshold in binary classification, in range [0, 1].
@@ -377,7 +377,7 @@ class LogisticRegression @Since("1.2.0") (
instr.logNumClasses(numClasses)
instr.logNumFeatures(numFeatures)
- val (coefficients, intercept, objectiveHistory) = {
+ val (coefficientMatrix, interceptVector, objectiveHistory) = {
if (numInvalid != 0) {
val msg = s"Classification labels should be in [0 to ${numClasses - 1}]. " +
s"Found $numInvalid invalid labels."
@@ -385,20 +385,25 @@ class LogisticRegression @Since("1.2.0") (
throw new SparkException(msg)
}
- val isConstantLabel = histogram.count(_ != 0) == 1
+ val isConstantLabel = histogram.count(_ != 0.0) == 1
if ($(fitIntercept) && isConstantLabel) {
logWarning(s"All labels are the same value and fitIntercept=true, so the coefficients " +
s"will be zeros. Training is not needed.")
val constantLabelIndex = Vectors.dense(histogram).argmax
- val coefficientMatrix = Matrices.sparse(numCoefficientSets, numFeatures,
- Array.fill(numFeatures + 1)(0), Array.empty[Int], Array.empty[Double])
- val interceptVector = if (isMultinomial) {
+ val coefMatrix = if (numFeatures < numClasses) {
+ new SparseMatrix(numCoefficientSets, numFeatures,
+ Array.fill(numFeatures + 1)(0), Array.empty[Int], Array.empty[Double])
+ } else {
+ new SparseMatrix(numCoefficientSets, numFeatures, Array.fill(numClasses + 1)(0),
+ Array.empty[Int], Array.empty[Double], isTransposed = true)
+ }
+ val interceptVec = if (isMultinomial) {
Vectors.sparse(numClasses, Seq((constantLabelIndex, Double.PositiveInfinity)))
} else {
Vectors.dense(if (numClasses == 2) Double.PositiveInfinity else Double.NegativeInfinity)
}
- (coefficientMatrix, interceptVector, Array.empty[Double])
+ (coefMatrix, interceptVec, Array.empty[Double])
} else {
if (!$(fitIntercept) && isConstantLabel) {
logWarning(s"All labels belong to a single class and fitIntercept=false. It's a " +
@@ -460,31 +465,34 @@ class LogisticRegression @Since("1.2.0") (
val initialCoefficientsWithIntercept =
Vectors.zeros(numCoefficientSets * numFeaturesPlusIntercept)
- val initialModelIsValid = optInitialModel.exists { model =>
- val providedCoefs = model.coefficientMatrix
- val modelValid = (providedCoefs.numRows == numCoefficientSets) &&
- (providedCoefs.numCols == numFeatures) &&
- (model.interceptVector.size == numCoefficientSets)
- if (!modelValid) {
- logWarning(s"Initial coefficients will be ignored! Its dimensions " +
- s"(${providedCoefs.numRows}, ${providedCoefs.numCols}) did not match the expected " +
- s"size ($numCoefficientSets, $numFeatures)")
- }
- modelValid
+ val initialModelIsValid = optInitialModel match {
+ case Some(_initialModel) =>
+ val providedCoefs = _initialModel.coefficientMatrix
+ val modelIsValid = (providedCoefs.numRows == numCoefficientSets) &&
+ (providedCoefs.numCols == numFeatures) &&
+ (_initialModel.interceptVector.size == numCoefficientSets) &&
+ (_initialModel.getFitIntercept == $(fitIntercept))
+ if (!modelIsValid) {
+          logWarning(s"Initial coefficients will be ignored! Their dimensions " +
+ s"(${providedCoefs.numRows}, ${providedCoefs.numCols}) did not match the " +
+ s"expected size ($numCoefficientSets, $numFeatures)")
+ }
+ modelIsValid
+ case None => false
}
if (initialModelIsValid) {
- val initialCoefArray = initialCoefficientsWithIntercept.toArray
+ val initialCoefWithInterceptArray = initialCoefficientsWithIntercept.toArray
val providedCoef = optInitialModel.get.coefficientMatrix
providedCoef.foreachActive { (row, col, value) =>
val flatIndex = row * numFeaturesPlusIntercept + col
// We need to scale the coefficients since they will be trained in the scaled space
- initialCoefArray(flatIndex) = value * featuresStd(col)
+ initialCoefWithInterceptArray(flatIndex) = value * featuresStd(col)
}
if ($(fitIntercept)) {
optInitialModel.get.interceptVector.foreachActive { (index, value) =>
val coefIndex = (index + 1) * numFeaturesPlusIntercept - 1
- initialCoefArray(coefIndex) = value
+ initialCoefWithInterceptArray(coefIndex) = value
}
}
} else if ($(fitIntercept) && isMultinomial) {
@@ -549,13 +557,13 @@ class LogisticRegression @Since("1.2.0") (
state = states.next()
arrayBuilder += state.adjustedValue
}
+ bcFeaturesStd.destroy(blocking = false)
if (state == null) {
val msg = s"${optimizer.getClass.getName} failed."
logError(msg)
throw new SparkException(msg)
}
- bcFeaturesStd.destroy(blocking = false)
/*
The coefficients are trained in the scaled space; we're converting them back to
@@ -617,8 +625,8 @@ class LogisticRegression @Since("1.2.0") (
if (handlePersistence) instances.unpersist()
- val model = copyValues(new LogisticRegressionModel(uid, coefficients, intercept, numClasses,
- isMultinomial))
+ val model = copyValues(new LogisticRegressionModel(uid, coefficientMatrix, interceptVector,
+ numClasses, isMultinomial))
// TODO: implement summary model for multinomial case
val m = if (!isMultinomial) {
val (summaryModel, probabilityColName) = model.findSummaryModelAndProbabilityCol()
diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
index 31f991b3fd5e4..5af825ca0c0c3 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
@@ -26,7 +26,7 @@ import org.apache.spark.{SparkException, SparkFunSuite}
import org.apache.spark.ml.attribute.NominalAttribute
import org.apache.spark.ml.classification.LogisticRegressionSuite._
import org.apache.spark.ml.feature.LabeledPoint
-import org.apache.spark.ml.linalg.{DenseMatrix, Matrices, Vector, Vectors}
+import org.apache.spark.ml.linalg._
import org.apache.spark.ml.param.ParamsSuite
import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils}
import org.apache.spark.ml.util.TestingUtils._
@@ -1849,7 +1849,7 @@ class LogisticRegressionSuite
predictions3.zip(predictions4).foreach { case (Row(p1: Double), Row(p2: Double)) =>
assert(p1 === p2)
}
- // TODO: check that it converges in a single iteration when initial model is available
+ // TODO: check that it converges in a single iteration when model summary is available
}
test("logistic regression with all labels the same") {
@@ -1894,6 +1894,26 @@ class LogisticRegressionSuite
assert(allOneNoInterceptModel.summary.totalIterations > 0)
}
+ test("compressed storage") {
+ val moreClassesThanFeatures = spark.createDataFrame(Seq(
+ LabeledPoint(4.0, Vectors.dense(0.0, 0.0, 0.0)),
+ LabeledPoint(4.0, Vectors.dense(1.0, 1.0, 1.0)),
+ LabeledPoint(4.0, Vectors.dense(2.0, 2.0, 2.0)))
+ )
+ val mlr = new LogisticRegression().setFamily("multinomial")
+ val model = mlr.fit(moreClassesThanFeatures)
+ assert(model.coefficientMatrix.isInstanceOf[SparseMatrix])
+ assert(model.coefficientMatrix.asInstanceOf[SparseMatrix].colPtrs.length === 4)
+ val moreFeaturesThanClasses = spark.createDataFrame(Seq(
+ LabeledPoint(1.0, Vectors.dense(0.0, 0.0, 0.0)),
+ LabeledPoint(1.0, Vectors.dense(1.0, 1.0, 1.0)),
+ LabeledPoint(1.0, Vectors.dense(2.0, 2.0, 2.0)))
+ )
+ val model2 = mlr.fit(moreFeaturesThanClasses)
+ assert(model2.coefficientMatrix.isInstanceOf[SparseMatrix])
+ assert(model2.coefficientMatrix.asInstanceOf[SparseMatrix].colPtrs.length === 3)
+ }
+
test("multiclass logistic regression with all labels the same") {
val constantData = spark.createDataFrame(Seq(
LabeledPoint(4.0, Vectors.dense(0.0)),
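The constant-label path in this patch stores the all-zero coefficient matrix sparsely,
choosing the orientation with the smaller pointer array. A standalone sketch of that choice,
using the corrected condition from the later "fix compression bug" patch in this series:

    import org.apache.spark.ml.linalg.SparseMatrix

    def emptyCoefficients(numCoefficientSets: Int, numFeatures: Int): SparseMatrix = {
      if (numFeatures < numCoefficientSets) {
        // column-major (CSC): colPtrs has numFeatures + 1 entries
        new SparseMatrix(numCoefficientSets, numFeatures,
          Array.fill(numFeatures + 1)(0), Array.empty[Int], Array.empty[Double])
      } else {
        // row-major (CSR): the pointer array has numCoefficientSets + 1 entries
        new SparseMatrix(numCoefficientSets, numFeatures,
          Array.fill(numCoefficientSets + 1)(0), Array.empty[Int], Array.empty[Double],
          isTransposed = true)
      }
    }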
From af8fb453e86b08956d06ee1f37ef3eb393287b74 Mon Sep 17 00:00:00 2001
From: sethah
Date: Fri, 9 Sep 2016 13:49:49 -0700
Subject: [PATCH 15/24] rewriting family detection logic
---
.../ml/classification/LogisticRegression.scala | 14 ++++++++------
1 file changed, 8 insertions(+), 6 deletions(-)
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
index b5ef73cfa8356..c4f283be0b4f4 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
@@ -359,14 +359,16 @@ class LogisticRegression @Since("1.2.0") (
case None => histogram.length
}
val isBinaryClassification = numClasses == 1 || numClasses == 2
- val isMultinomial = ($(family) == "auto" && !isBinaryClassification) ||
- ($(family) == "multinomial")
- val numCoefficientSets = if (isMultinomial) numClasses else 1
-
- if (!isMultinomial) {
- require(isBinaryClassification, s"Binomial family only supports 1 or 2 " +
+ val isMultinomial = $(family) match {
+ case "binomial" =>
+ require(isBinaryClassification, s"Binomial family only supports 1 or 2 " +
s"outcome classes but found $numClasses.")
+ false
+ case "multinomial" => true
+ case "auto" => !isBinaryClassification
+ case other => throw new IllegalArgumentException(s"Unsupported family: $other")
}
+ val numCoefficientSets = if (isMultinomial) numClasses else 1
if (isDefined(thresholds)) {
require($(thresholds).length == numClasses, this.getClass.getSimpleName +
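The match above makes the family/label-count interaction explicit. A usage sketch against
this suite's datasets (smallBinaryDataset has two classes, smallMultinomialDataset three):

    // "auto" resolves from the number of classes.
    new LogisticRegression().setFamily("auto").fit(smallBinaryDataset)       // binomial path
    new LogisticRegression().setFamily("auto").fit(smallMultinomialDataset)  // multinomial path
    // "multinomial" may be forced even for two classes.
    new LogisticRegression().setFamily("multinomial").fit(smallBinaryDataset)
    // "binomial" on three classes trips the require above:
    //   "Binomial family only supports 1 or 2 outcome classes but found 3."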
From b27cb2c190f4cdc7bd8540c8cd83e55814e52378 Mon Sep 17 00:00:00 2001
From: sethah
Date: Fri, 9 Sep 2016 14:32:22 -0700
Subject: [PATCH 16/24] set family explicitly in tests
---
.../LogisticRegressionSuite.scala | 78 ++++++++++---------
1 file changed, 43 insertions(+), 35 deletions(-)
diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
index 5af825ca0c0c3..2060d7d113376 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
@@ -153,7 +153,7 @@ class LogisticRegressionSuite
}
test("setThreshold, getThreshold") {
- val lr = new LogisticRegression
+ val lr = new LogisticRegression().setFamily("binomial")
// default
assert(lr.getThreshold === 0.5, "LogisticRegression.threshold should default to 0.5")
withClue("LogisticRegression should not have thresholds set by default.") {
@@ -170,7 +170,7 @@ class LogisticRegressionSuite
lr.setThreshold(0.5)
assert(lr.getThresholds === Array(0.5, 0.5))
// Set via thresholds
- val lr2 = new LogisticRegression
+ val lr2 = new LogisticRegression().setFamily("binomial")
lr2.setThresholds(Array(0.3, 0.7))
val expectedThreshold = 1.0 / (1.0 + 0.3 / 0.7)
assert(lr2.getThreshold ~== expectedThreshold relTol 1E-7)
@@ -234,11 +234,16 @@ class LogisticRegressionSuite
}
test("logistic regression doesn't fit intercept when fitIntercept is off") {
- val lr = new LogisticRegression
+ val lr = new LogisticRegression().setFamily("binomial")
lr.setFitIntercept(false)
val model = lr.fit(smallBinaryDataset)
assert(model.intercept === 0.0)
+ val mlr = new LogisticRegression().setFamily("multinomial")
+ mlr.setFitIntercept(false)
+ val mlrModel = mlr.fit(smallMultinomialDataset)
+ assert(mlrModel.interceptVector === Vectors.sparse(3, Seq()))
+
// copied model must have the same parent.
MLTestingUtils.checkCopy(model)
}
@@ -288,7 +293,7 @@ class LogisticRegressionSuite
}
test("multinomial logistic regression: Predictor, Classifier methods") {
- val mlr = new LogisticRegression
+ val mlr = new LogisticRegression().setFamily("multinomial")
val model = mlr.fit(smallMultinomialDataset)
assert(model.numClasses === 3)
@@ -335,7 +340,7 @@ class LogisticRegressionSuite
}
test("binary logistic regression: Predictor, Classifier methods") {
- val lr = new LogisticRegression
+ val lr = new LogisticRegression().setFamily("binomial")
val model = lr.fit(smallBinaryDataset)
assert(model.numClasses === 2)
@@ -364,7 +369,7 @@ class LogisticRegressionSuite
}
test("coefficients and intercept methods") {
- val mlr = new LogisticRegression().setMaxIter(1)
+ val mlr = new LogisticRegression().setMaxIter(1).setFamily("multinomial")
val mlrModel = mlr.fit(smallMultinomialDataset)
val thrownCoef = intercept[SparkException] {
mlrModel.coefficients
@@ -375,7 +380,7 @@ class LogisticRegressionSuite
assert(thrownCoef.getMessage().contains("use coefficientMatrix instead"))
assert(thrownIntercept.getMessage().contains("use interceptVector instead"))
- val blr = new LogisticRegression().setMaxIter(1)
+ val blr = new LogisticRegression().setMaxIter(1).setFamily("binomial")
val blrModel = blr.fit(smallBinaryDataset)
assert(blrModel.coefficients.size === 1)
assert(blrModel.intercept !== 0.0)
@@ -1751,7 +1756,7 @@ class LogisticRegressionSuite
val testData = spark.createDataFrame(Array.tabulate[LabeledPoint](numClasses) { i =>
LabeledPoint(i.toDouble, Vectors.dense(i.toDouble))
})
- val lr = new LogisticRegression().setWeightCol("weight")
+ val lr = new LogisticRegression().setFamily("binomial").setWeightCol("weight")
val model = lr.fit(outlierData)
val results = model.transform(testData).select("label", "prediction").collect()
@@ -1775,7 +1780,7 @@ class LogisticRegressionSuite
val testData = spark.createDataFrame(Array.tabulate[LabeledPoint](numClasses) { i =>
LabeledPoint(i.toDouble, Vectors.dense(i.toDouble))
})
- val mlr = new LogisticRegression().setWeightCol("weight")
+ val mlr = new LogisticRegression().setFamily("multinomial").setWeightCol("weight")
val model = mlr.fit(outlierData)
val results = model.transform(testData).select("label", "prediction").collect()
@@ -1829,9 +1834,9 @@ class LogisticRegressionSuite
}
test("set initial model") {
- val lr = new LogisticRegression()
+ val lr = new LogisticRegression().setFamily("binomial")
val model1 = lr.fit(smallBinaryDataset)
- val lr2 = new LogisticRegression().setInitialModel(model1).setMaxIter(5)
+ val lr2 = new LogisticRegression().setInitialModel(model1).setMaxIter(5).setFamily("binomial")
val model2 = lr2.fit(smallBinaryDataset)
val predictions1 = model1.transform(smallBinaryDataset).select("prediction").collect()
val predictions2 = model2.transform(smallBinaryDataset).select("prediction").collect()
@@ -1840,9 +1845,10 @@ class LogisticRegressionSuite
}
assert(model2.summary.totalIterations === 1)
- val lr3 = new LogisticRegression()
+ val lr3 = new LogisticRegression().setFamily("multinomial")
val model3 = lr3.fit(smallMultinomialDataset)
- val lr4 = new LogisticRegression().setInitialModel(model3).setMaxIter(5)
+ val lr4 = new LogisticRegression()
+ .setInitialModel(model3).setMaxIter(5).setFamily("multinomial")
val model4 = lr4.fit(smallMultinomialDataset)
val predictions3 = model3.transform(smallMultinomialDataset).select("prediction").collect()
val predictions4 = model4.transform(smallMultinomialDataset).select("prediction").collect()
@@ -1852,7 +1858,7 @@ class LogisticRegressionSuite
// TODO: check that it converges in a single iteration when model summary is available
}
- test("logistic regression with all labels the same") {
+ test("binary logistic regression with all labels the same") {
val sameLabels = smallBinaryDataset
.withColumn("zeroLabel", lit(0.0))
.withColumn("oneLabel", lit(1.0))
@@ -1861,6 +1867,7 @@ class LogisticRegressionSuite
val lrIntercept = new LogisticRegression()
.setFitIntercept(true)
.setMaxIter(3)
+ .setFamily("binomial")
val allZeroInterceptModel = lrIntercept
.setLabelCol("zeroLabel")
@@ -1880,6 +1887,7 @@ class LogisticRegressionSuite
val lrNoIntercept = new LogisticRegression()
.setFitIntercept(false)
.setMaxIter(3)
+ .setFamily("binomial")
val allZeroNoInterceptModel = lrNoIntercept
.setLabelCol("zeroLabel")
@@ -1894,26 +1902,6 @@ class LogisticRegressionSuite
assert(allOneNoInterceptModel.summary.totalIterations > 0)
}
- test("compressed storage") {
- val moreClassesThanFeatures = spark.createDataFrame(Seq(
- LabeledPoint(4.0, Vectors.dense(0.0, 0.0, 0.0)),
- LabeledPoint(4.0, Vectors.dense(1.0, 1.0, 1.0)),
- LabeledPoint(4.0, Vectors.dense(2.0, 2.0, 2.0)))
- )
- val mlr = new LogisticRegression().setFamily("multinomial")
- val model = mlr.fit(moreClassesThanFeatures)
- assert(model.coefficientMatrix.isInstanceOf[SparseMatrix])
- assert(model.coefficientMatrix.asInstanceOf[SparseMatrix].colPtrs.length === 4)
- val moreFeaturesThanClasses = spark.createDataFrame(Seq(
- LabeledPoint(1.0, Vectors.dense(0.0, 0.0, 0.0)),
- LabeledPoint(1.0, Vectors.dense(1.0, 1.0, 1.0)),
- LabeledPoint(1.0, Vectors.dense(2.0, 2.0, 2.0)))
- )
- val model2 = mlr.fit(moreFeaturesThanClasses)
- assert(model2.coefficientMatrix.isInstanceOf[SparseMatrix])
- assert(model2.coefficientMatrix.asInstanceOf[SparseMatrix].colPtrs.length === 3)
- }
-
test("multiclass logistic regression with all labels the same") {
val constantData = spark.createDataFrame(Seq(
LabeledPoint(4.0, Vectors.dense(0.0)),
@@ -1959,8 +1947,28 @@ class LogisticRegressionSuite
// TODO: check num iters is zero when it become available in the model
}
+ test("compressed storage") {
+ val moreClassesThanFeatures = spark.createDataFrame(Seq(
+ LabeledPoint(4.0, Vectors.dense(0.0, 0.0, 0.0)),
+ LabeledPoint(4.0, Vectors.dense(1.0, 1.0, 1.0)),
+ LabeledPoint(4.0, Vectors.dense(2.0, 2.0, 2.0)))
+ )
+ val mlr = new LogisticRegression().setFamily("multinomial")
+ val model = mlr.fit(moreClassesThanFeatures)
+ assert(model.coefficientMatrix.isInstanceOf[SparseMatrix])
+ assert(model.coefficientMatrix.asInstanceOf[SparseMatrix].colPtrs.length === 4)
+ val moreFeaturesThanClasses = spark.createDataFrame(Seq(
+ LabeledPoint(1.0, Vectors.dense(0.0, 0.0, 0.0)),
+ LabeledPoint(1.0, Vectors.dense(1.0, 1.0, 1.0)),
+ LabeledPoint(1.0, Vectors.dense(2.0, 2.0, 2.0)))
+ )
+ val model2 = mlr.fit(moreFeaturesThanClasses)
+ assert(model2.coefficientMatrix.isInstanceOf[SparseMatrix])
+ assert(model2.coefficientMatrix.asInstanceOf[SparseMatrix].colPtrs.length === 3)
+ }
+
test("numClasses specified in metadata/inferred") {
- val lr = new LogisticRegression().setMaxIter(1)
+ val lr = new LogisticRegression().setMaxIter(1).setFamily("multinomial")
// specify more classes than unique label values
val labelMeta = NominalAttribute.defaultAttr.withName("label").withNumValues(4).toMetadata()
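Note: the test above pins numClasses through label-column metadata rather than through the observed labels. A minimal sketch of that mechanism (df is a hypothetical DataFrame with "features" and "label" columns):

    import org.apache.spark.ml.attribute.NominalAttribute
    import org.apache.spark.sql.functions.col

    // Declare the label column as nominal with 4 values; even if the data
    // only contains labels 0 through 2, training will use numClasses = 4
    // instead of the inferred histogram length.
    val labelMeta = NominalAttribute.defaultAttr
      .withName("label")
      .withNumValues(4)
      .toMetadata()
    val dfWithMeta = df.select(col("features"), col("label").as("label", labelMeta))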
From be030b5269518fd5c018e9e172cea7685addcb03 Mon Sep 17 00:00:00 2001
From: sethah
Date: Fri, 9 Sep 2016 16:01:32 -0700
Subject: [PATCH 17/24] fix compression bug
---
.../apache/spark/ml/classification/LogisticRegression.scala | 5 +++--
.../spark/ml/classification/LogisticRegressionSuite.scala | 5 +++++
2 files changed, 8 insertions(+), 2 deletions(-)
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
index c4f283be0b4f4..fb491dd9d60be 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
@@ -358,6 +358,7 @@ class LogisticRegression @Since("1.2.0") (
n
case None => histogram.length
}
+
val isBinaryClassification = numClasses == 1 || numClasses == 2
val isMultinomial = $(family) match {
case "binomial" =>
@@ -393,11 +394,11 @@ class LogisticRegression @Since("1.2.0") (
logWarning(s"All labels are the same value and fitIntercept=true, so the coefficients " +
s"will be zeros. Training is not needed.")
val constantLabelIndex = Vectors.dense(histogram).argmax
- val coefMatrix = if (numFeatures < numClasses) {
+ val coefMatrix = if (numFeatures < numCoefficientSets) {
new SparseMatrix(numCoefficientSets, numFeatures,
Array.fill(numFeatures + 1)(0), Array.empty[Int], Array.empty[Double])
} else {
- new SparseMatrix(numCoefficientSets, numFeatures, Array.fill(numClasses + 1)(0),
+ new SparseMatrix(numCoefficientSets, numFeatures, Array.fill(numCoefficientSets + 1)(0),
Array.empty[Int], Array.empty[Double], isTransposed = true)
}
val interceptVec = if (isMultinomial) {
diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
index 2060d7d113376..e3e3000018a12 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
@@ -1965,6 +1965,11 @@ class LogisticRegressionSuite
val model2 = mlr.fit(moreFeaturesThanClasses)
assert(model2.coefficientMatrix.isInstanceOf[SparseMatrix])
assert(model2.coefficientMatrix.asInstanceOf[SparseMatrix].colPtrs.length === 3)
+
+ val blr = new LogisticRegression().setFamily("binomial")
+ val blrModel = blr.fit(moreFeaturesThanClasses)
+ assert(blrModel.coefficientMatrix.isInstanceOf[SparseMatrix])
+ assert(blrModel.coefficientMatrix.asInstanceOf[SparseMatrix].colPtrs.length === 2)
}
test("numClasses specified in metadata/inferred") {
From 73158e5b24e5e58de8284aef84297bdefa75e8ca Mon Sep 17 00:00:00 2001
From: sethah
Date: Fri, 9 Sep 2016 19:18:56 -0700
Subject: [PATCH 18/24] use regex util
---
.../apache/spark/ml/classification/LogisticRegression.scala | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
index fb491dd9d60be..4eb9dfd9cc128 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
@@ -42,6 +42,7 @@ import org.apache.spark.sql.{DataFrame, Dataset, Row}
import org.apache.spark.sql.functions.{col, lit}
import org.apache.spark.sql.types.DoubleType
import org.apache.spark.storage.StorageLevel
+import org.apache.spark.util.VersionUtils
/**
* Params for logistic regression.
@@ -298,6 +299,7 @@ class LogisticRegression @Since("1.2.0") (
* If the dimensions of features or the number of partitions are large,
* this param could be adjusted to a larger size.
* Default is 2.
+ *
* @group expertSetParam
*/
@Since("2.1.0")
@@ -966,8 +968,7 @@ object LogisticRegressionModel extends MLReadable[LogisticRegressionModel] {
override def load(path: String): LogisticRegressionModel = {
val metadata = DefaultParamsReader.loadMetadata(path, sc, className)
- val versionRegex = "([0-9]+)\\.([0-9]+)\\.(.+)".r
- val versionRegex(major, minor, _) = metadata.sparkVersion
+ val (major, minor) = VersionUtils.majorMinorVersion(metadata.sparkVersion)
val dataPath = new Path(path, "data").toString
val data = sparkSession.read.format("parquet").load(dataPath)
@@ -1386,6 +1387,7 @@ class BinaryLogisticRegressionSummary private[classification] (
* $$
*
*
+ *
* @param bcCoefficients The broadcast coefficients corresponding to the features.
* @param bcFeaturesStd The broadcast standard deviation values of the features.
* @param numClasses the number of possible outcomes for k classes classification problem in
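For context, VersionUtils.majorMinorVersion parses the leading "major.minor" out of a Spark version string, which is what the deleted inline regex did. A behavioral sketch (not the actual Spark implementation):

    // Accepts strings like "2.0.1" or "2.1.0-SNAPSHOT".
    def majorMinor(sparkVersion: String): (Int, Int) = {
      val versionRegex = """^(\d+)\.(\d+)(\..*)?$""".r
      sparkVersion match {
        case versionRegex(major, minor, _) => (major.toInt, minor.toInt)
        case _ => throw new IllegalArgumentException(
          s"Could not extract major and minor version numbers from '$sparkVersion'")
      }
    }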
From f538e1e36c6be6201b4408afbc89f2a9daf6cea7 Mon Sep 17 00:00:00 2001
From: sethah
Date: Mon, 12 Sep 2016 15:09:18 -0700
Subject: [PATCH 19/24] sparse storage for binary LOR
---
.../classification/LogisticRegression.scala | 49 ++++++++++++-------
.../LogisticRegressionSuite.scala | 4 +-
2 files changed, 35 insertions(+), 18 deletions(-)
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
index 4eb9dfd9cc128..ed730230c6d29 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
@@ -299,7 +299,7 @@ class LogisticRegression @Since("1.2.0") (
* If the dimensions of features or the number of partitions are large,
* this param could be adjusted to a larger size.
* Default is 2.
- *
+ *
* @group expertSetParam
*/
@Since("2.1.0")
@@ -361,14 +361,13 @@ class LogisticRegression @Since("1.2.0") (
case None => histogram.length
}
- val isBinaryClassification = numClasses == 1 || numClasses == 2
val isMultinomial = $(family) match {
case "binomial" =>
- require(isBinaryClassification, s"Binomial family only supports 1 or 2 " +
+ require(numClasses == 1 || numClasses == 2, s"Binomial family only supports 1 or 2 " +
s"outcome classes but found $numClasses.")
false
case "multinomial" => true
- case "auto" => !isBinaryClassification
+ case "auto" => numClasses > 2
case other => throw new IllegalArgumentException(s"Unsupported family: $other")
}
val numCoefficientSets = if (isMultinomial) numClasses else 1
@@ -396,6 +395,7 @@ class LogisticRegression @Since("1.2.0") (
logWarning(s"All labels are the same value and fitIntercept=true, so the coefficients " +
s"will be zeros. Training is not needed.")
val constantLabelIndex = Vectors.dense(histogram).argmax
+ // TODO: use `compressed` after SPARK-17471
val coefMatrix = if (numFeatures < numCoefficientSets) {
new SparseMatrix(numCoefficientSets, numFeatures,
Array.fill(numFeatures + 1)(0), Array.empty[Int], Array.empty[Double])
@@ -587,21 +587,34 @@ class LogisticRegression @Since("1.2.0") (
0.0
}
}
- val coefficientMatrix =
- new DenseMatrix(numCoefficientSets, numFeatures, coefficientArray, isTransposed = true)
if ($(regParam) == 0.0 && isMultinomial) {
/*
- When no regularization is applied, the coefficients lack identifiability because
- we do not use a pivot class. We can add any constant value to the coefficients and
- get the same likelihood. So here, we choose the mean centered coefficients for
+ When no regularization is applied, the multinomial coefficients lack identifiability
+ because we do not use a pivot class. We can add any constant value to the coefficients
+ and get the same likelihood. So here, we choose the mean centered coefficients for
reproducibility. This method follows the approach in glmnet, described here:
Friedman, et al. "Regularization Paths for Generalized Linear Models via
Coordinate Descent," https://core.ac.uk/download/files/153/6287975.pdf
*/
- val coefficientMean = coefficientMatrix.values.sum / coefficientMatrix.values.length
- coefficientMatrix.update(_ - coefficientMean)
+ val coefficientMean = coefficientArray.sum / coefficientArray.length
+ coefficientArray.indices.foreach { i => coefficientArray(i) -= coefficientMean }
+ }
+
+ val denseCoefficientMatrix =
+ new DenseMatrix(numCoefficientSets, numFeatures, coefficientArray, isTransposed = true)
+ // TODO: use `denseCoefficientMatrix.compressed` after SPARK-17471
+ val compressedCoefficientMatrix = if (isMultinomial) {
+ denseCoefficientMatrix
+ } else {
+ val compressedVector = Vectors.dense(coefficientArray).compressed
+ compressedVector match {
+ case dv: DenseVector => denseCoefficientMatrix
+ case sv: SparseVector =>
+ new SparseMatrix(1, numFeatures, Array(0, sv.indices.length), sv.indices, sv.values,
+ isTransposed = true)
+ }
}
val interceptsArray: Array[Double] = if ($(fitIntercept)) {
@@ -612,10 +625,8 @@ class LogisticRegression @Since("1.2.0") (
} else {
Array[Double]()
}
- /*
- The intercepts are never regularized, so we always center the mean.
- */
val interceptVector = if (interceptsArray.nonEmpty && isMultinomial) {
+ // The intercepts are never regularized, so we always center the mean.
val interceptMean = interceptsArray.sum / numClasses
interceptsArray.indices.foreach { i => interceptsArray(i) -= interceptMean }
Vectors.dense(interceptsArray)
@@ -624,7 +635,7 @@ class LogisticRegression @Since("1.2.0") (
} else {
Vectors.sparse(numCoefficientSets, Seq())
}
- (coefficientMatrix, interceptVector, arrayBuilder.result())
+ (compressedCoefficientMatrix, interceptVector.compressed, arrayBuilder.result())
}
}
@@ -687,8 +698,12 @@ class LogisticRegressionModel private[spark] (
// convert to appropriate vector representation without replicating data
private lazy val _coefficients: Vector = coefficientMatrix match {
case dm: DenseMatrix => Vectors.dense(dm.values)
- // TODO: better way to flatten sparse matrix?
- case sm: SparseMatrix => Vectors.fromBreeze(sm.asBreeze.flatten(View.Require))
+ case sm: SparseMatrix =>
+ if (coefficientMatrix.isTransposed) {
+ Vectors.sparse(coefficientMatrix.numCols, sm.rowIndices, sm.values)
+ } else {
+ throw new IllegalStateException("LogisticRegressionModel coefficients should be row major.")
+ }
}
@Since("1.3.0")
diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
index e3e3000018a12..e7304401f324b 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
@@ -604,7 +604,9 @@ class LogisticRegressionSuite
val coefficientsR2 = Vectors.dense(0.0, 0.0, -0.1665453, 0.0)
assert(model2.intercept ~== interceptR2 relTol 1E-2)
- assert(model2.coefficients ~= coefficientsR2 absTol 1E-3)
+ assert(model2.coefficients ~== coefficientsR2 absTol 1E-3)
+ // TODO: move this to a standalone test of compression after SPARK-17471
+ assert(model2.coefficients.isInstanceOf[SparseVector])
}
test("binary logistic regression without intercept with L1 regularization") {
From a3a7d20414ff9a2c9df1fdf60417a4c307dd472f Mon Sep 17 00:00:00 2001
From: sethah
Date: Tue, 13 Sep 2016 22:41:45 -0700
Subject: [PATCH 20/24] remove scores method and address review comments
---
.../classification/LogisticRegression.scala | 46 ++++---------------
.../ProbabilisticClassifier.scala | 3 +-
2 files changed, 10 insertions(+), 39 deletions(-)
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
index ed730230c6d29..151002b37df65 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
@@ -19,7 +19,7 @@ package org.apache.spark.ml.classification
import scala.collection.mutable
-import breeze.linalg.{DenseVector => BDV, View}
+import breeze.linalg.{DenseVector => BDV}
import breeze.optimize.{CachedDiffFunction, DiffFunction, LBFGS => BreezeLBFGS, OWLQN => BreezeOWLQN}
import org.apache.hadoop.fs.Path
@@ -696,14 +696,13 @@ class LogisticRegressionModel private[spark] (
}
// convert to appropriate vector representation without replicating data
- private lazy val _coefficients: Vector = coefficientMatrix match {
- case dm: DenseMatrix => Vectors.dense(dm.values)
- case sm: SparseMatrix =>
- if (coefficientMatrix.isTransposed) {
- Vectors.sparse(coefficientMatrix.numCols, sm.rowIndices, sm.values)
- } else {
- throw new IllegalStateException("LogisticRegressionModel coefficients should be row major.")
- }
+ private lazy val _coefficients: Vector = {
+ require(coefficientMatrix.isTransposed,
+ "LogisticRegressionModel coefficients should be row major.")
+ coefficientMatrix match {
+ case dm: DenseMatrix => Vectors.dense(dm.values)
+ case sm: SparseMatrix => Vectors.sparse(coefficientMatrix.numCols, sm.rowIndices, sm.values)
+ }
}
@Since("1.3.0")
@@ -746,35 +745,6 @@ class LogisticRegressionModel private[spark] (
1.0 / (1.0 + math.exp(-m))
}
- /** Score (probability) for each class label. */
- // TODO: do we need this anymore?
- private val scores: Vector => Vector = (features) => {
- val m = margins(features)
- val maxMarginIndex = m.argmax
- val marginArray = m.toArray
- val maxMargin = marginArray(maxMarginIndex)
-
- // adjust margins for overflow
- val sum = {
- var temp = 0.0
- var k = 0
- while (k < numClasses) {
- marginArray(k) = if (maxMargin > 0) {
- math.exp(marginArray(k) - maxMargin)
- } else {
- math.exp(marginArray(k))
- }
- temp += marginArray(k)
- k += 1
- }
- temp
- }
-
- val scores = Vectors.dense(marginArray)
- BLAS.scal(1 / sum, scores)
- scores
- }
-
@Since("1.6.0")
override val numFeatures: Int = coefficientMatrix.numCols
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/ProbabilisticClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/ProbabilisticClassifier.scala
index 1a07aab663030..1b6e77542cc80 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/ProbabilisticClassifier.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/ProbabilisticClassifier.scala
@@ -205,7 +205,8 @@ abstract class ProbabilisticClassificationModel[
var argMax = 0
var max = Double.NegativeInfinity
var i = 0
- while (i < probability.size) {
+ val probabilitySize = probability.size
+ while (i < probabilitySize) {
if (thresholds(i) == 0.0) {
max = Double.PositiveInfinity
argMax = i
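The ProbabilisticClassifier hunk above simply hoists probability.size out of the loop condition so it is read once. For reference, the full thresholded-argmax loop it belongs to, with the else branch reconstructed from the probability/threshold scaling semantics (a sketch, not a verbatim copy):

    def predictWithThresholds(probability: Array[Double], thresholds: Array[Double]): Int = {
      var argMax = 0
      var max = Double.NegativeInfinity
      var i = 0
      val probabilitySize = probability.length  // hoisted: evaluated once
      while (i < probabilitySize) {
        if (thresholds(i) == 0.0) {
          // a zero threshold makes this class win outright
          max = Double.PositiveInfinity
          argMax = i
        } else {
          val scaled = probability(i) / thresholds(i)
          if (scaled > max) {
            max = scaled
            argMax = i
          }
        }
        i += 1
      }
      argMax
    }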
From cb1666e43ba604de780f937cc5d3eaac28e7a0f8 Mon Sep 17 00:00:00 2001
From: sethah
Date: Wed, 14 Sep 2016 07:42:16 -0700
Subject: [PATCH 21/24] fix transposed coefficient matrices in test suites
---
.../org/apache/spark/ml/classification/OneVsRestSuite.scala | 4 ++--
.../org/apache/spark/ml/tuning/CrossValidatorSuite.scala | 4 ++--
.../apache/spark/ml/tuning/TrainValidationSplitSuite.scala | 4 ++--
3 files changed, 6 insertions(+), 6 deletions(-)
diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/OneVsRestSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/OneVsRestSuite.scala
index 3ae47029c8dd8..01a043195ad3f 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/classification/OneVsRestSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/classification/OneVsRestSuite.scala
@@ -22,7 +22,7 @@ import org.apache.spark.ml.attribute.NominalAttribute
import org.apache.spark.ml.classification.LogisticRegressionSuite._
import org.apache.spark.ml.feature.LabeledPoint
import org.apache.spark.ml.feature.StringIndexer
-import org.apache.spark.ml.linalg.{Matrices, Vectors}
+import org.apache.spark.ml.linalg.{DenseMatrix, Vectors}
import org.apache.spark.ml.param.{ParamMap, ParamsSuite}
import org.apache.spark.ml.util.{DefaultReadWriteTest, MetadataUtils, MLTestingUtils}
import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS
@@ -61,7 +61,7 @@ class OneVsRestSuite extends SparkFunSuite with MLlibTestSparkContext with Defau
test("params") {
ParamsSuite.checkParams(new OneVsRest)
val lrModel = new LogisticRegressionModel("logReg",
- Matrices.dense(1, 1, Array(0.0)), Vectors.dense(0.0), 2, false)
+ new DenseMatrix(1, 1, Array(0.0), isTransposed = true), Vectors.dense(0.0), 2, false)
val model = new OneVsRestModel("ovr", Metadata.empty, Array(lrModel))
ParamsSuite.checkParams(model)
}
diff --git a/mllib/src/test/scala/org/apache/spark/ml/tuning/CrossValidatorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/tuning/CrossValidatorSuite.scala
index 87c7c82e4c3b2..a0a2e87b10edf 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/tuning/CrossValidatorSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/tuning/CrossValidatorSuite.scala
@@ -23,7 +23,7 @@ import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressio
import org.apache.spark.ml.classification.LogisticRegressionSuite.generateLogisticInput
import org.apache.spark.ml.evaluation.{BinaryClassificationEvaluator, Evaluator, RegressionEvaluator}
import org.apache.spark.ml.feature.HashingTF
-import org.apache.spark.ml.linalg.{Matrices, Vectors}
+import org.apache.spark.ml.linalg.{DenseMatrix, Vectors}
import org.apache.spark.ml.param.{ParamMap, ParamPair}
import org.apache.spark.ml.param.shared.HasInputCol
import org.apache.spark.ml.regression.LinearRegression
@@ -245,7 +245,7 @@ class CrossValidatorSuite
val lr = new LogisticRegression()
.setThreshold(0.6)
val lrModel = new LogisticRegressionModel(lr.uid,
- Matrices.dense(1, 1, Array(0.0)), Vectors.dense(0.0), 2, false)
+ new DenseMatrix(1, 1, Array(0.0), isTransposed = true), Vectors.dense(0.0), 2, false)
.setThreshold(0.6)
val evaluator = new BinaryClassificationEvaluator()
.setMetricName("areaUnderPR") // not default metric
diff --git a/mllib/src/test/scala/org/apache/spark/ml/tuning/TrainValidationSplitSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/tuning/TrainValidationSplitSuite.scala
index 6c58bed9812c1..39e23e6c45dbb 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/tuning/TrainValidationSplitSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/tuning/TrainValidationSplitSuite.scala
@@ -22,7 +22,7 @@ import org.apache.spark.ml.{Estimator, Model}
import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel}
import org.apache.spark.ml.classification.LogisticRegressionSuite.generateLogisticInput
import org.apache.spark.ml.evaluation.{BinaryClassificationEvaluator, Evaluator, RegressionEvaluator}
-import org.apache.spark.ml.linalg.{Matrices, Vectors}
+import org.apache.spark.ml.linalg.{DenseMatrix, Vectors}
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.param.shared.HasInputCol
import org.apache.spark.ml.regression.LinearRegression
@@ -134,7 +134,7 @@ class TrainValidationSplitSuite
val lr = new LogisticRegression()
.setThreshold(0.6)
val lrModel = new LogisticRegressionModel(lr.uid,
- Matrices.dense(1, 1, Array(0.0)), Vectors.dense(0.0), 2, false)
+ new DenseMatrix(1, 1, Array(0.0), isTransposed = true), Vectors.dense(0.0), 2, false)
.setThreshold(0.6)
val evaluator = new BinaryClassificationEvaluator()
val paramMaps = new ParamGridBuilder()
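These test fixes matter because the merged model now requires row-major coefficients, and isTransposed = true changes how the backing values array is laid out, not just a flag. A quick illustration:

    import org.apache.spark.ml.linalg.DenseMatrix

    val values = Array(1.0, 2.0, 3.0, 4.0, 5.0, 6.0)
    val colMajor = new DenseMatrix(2, 3, values)                       // column-major
    val rowMajor = new DenseMatrix(2, 3, values, isTransposed = true)  // row-major
    assert(colMajor(0, 1) == 3.0)  // values consumed column by column
    assert(rowMajor(0, 1) == 2.0)  // same values, consumed row by row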
From bd7fca10e2081372574a6c9dd59da4aca9aaf13e Mon Sep 17 00:00:00 2001
From: sethah
Date: Wed, 14 Sep 2016 14:02:26 -0700
Subject: [PATCH 22/24] update scaladoc and correct predict method
---
.../classification/LogisticRegression.scala | 44 ++++++++++++++++++-
1 file changed, 43 insertions(+), 1 deletion(-)
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
index 151002b37df65..de5e23780c86a 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
@@ -687,6 +687,11 @@ class LogisticRegressionModel private[spark] (
extends ProbabilisticClassificationModel[Vector, LogisticRegressionModel]
with LogisticRegressionParams with MLWritable {
+ /**
+ * A vector of model coefficients for "binomial" logistic regression. If this model was trained
+ * using the "multinomial" family then an exception is thrown.
+ * @return Vector
+ */
@Since("2.0.0")
def coefficients: Vector = if (isMultinomial) {
throw new SparkException("Multinomial models contain a matrix of coefficients, use " +
@@ -705,6 +710,11 @@ class LogisticRegressionModel private[spark] (
}
}
+ /**
+ * The model intercept for "binomial" logistic regression. If this model was fit with the
+ * "multinomial" family then an exception is thrown.
+ * @return Double
+ */
@Since("1.3.0")
def intercept: Double = if (isMultinomial) {
throw new SparkException("Multinomial models contain a vector of intercepts, use " +
@@ -745,6 +755,34 @@ class LogisticRegressionModel private[spark] (
1.0 / (1.0 + math.exp(-m))
}
+ /** Score (probability) for each class label. */
+ private val scores: Vector => Vector = (features) => {
+ val m = margins(features)
+ val maxMarginIndex = m.argmax
+ val marginArray = m.toArray
+ val maxMargin = marginArray(maxMarginIndex)
+
+ // adjust margins for overflow
+ val sum = {
+ var temp = 0.0
+ var k = 0
+ while (k < numClasses) {
+ marginArray(k) = if (maxMargin > 0) {
+ math.exp(marginArray(k) - maxMargin)
+ } else {
+ math.exp(marginArray(k))
+ }
+ temp += marginArray(k)
+ k += 1
+ }
+ temp
+ }
+
+ val scores = Vectors.dense(marginArray)
+ BLAS.scal(1 / sum, scores)
+ scores
+ }
+
@Since("1.6.0")
override val numFeatures: Int = coefficientMatrix.numCols
@@ -802,7 +840,11 @@ class LogisticRegressionModel private[spark] (
* The behavior of this can be adjusted using [[thresholds]].
*/
override protected def predict(features: Vector): Double = if (isMultinomial) {
- super.predict(features)
+ if (isDefined(thresholds)) {
+ probability2prediction(scores(features))
+ } else {
+ super.predict(features)
+ }
} else {
// Note: We should use getThreshold instead of $(threshold) since getThreshold is overridden.
if (score(features) > getThreshold) 1 else 0
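The restored scores function is a numerically stable softmax over the margins. A simplified sketch that subtracts the max margin unconditionally; the patch only subtracts when the max is positive, which yields the same probabilities since the factor exp(-maxMargin) cancels in the normalization:

    def softmax(margins: Array[Double]): Array[Double] = {
      val maxMargin = margins.max
      // shift by the max so exp never overflows
      val exps = margins.map(m => math.exp(m - maxMargin))
      val sum = exps.sum
      exps.map(_ / sum)
    }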
From 38fad988956458aac59109613c7d468855a0faf8 Mon Sep 17 00:00:00 2001
From: sethah
Date: Wed, 14 Sep 2016 17:05:19 -0700
Subject: [PATCH 23/24] revert predict changes and add tests
---
.../classification/LogisticRegression.scala | 34 +--------------
.../LogisticRegressionSuite.scala | 43 ++++++++++++++++++-
2 files changed, 43 insertions(+), 34 deletions(-)
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
index de5e23780c86a..2229009571a0d 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
@@ -755,34 +755,6 @@ class LogisticRegressionModel private[spark] (
1.0 / (1.0 + math.exp(-m))
}
- /** Score (probability) for each class label. */
- private val scores: Vector => Vector = (features) => {
- val m = margins(features)
- val maxMarginIndex = m.argmax
- val marginArray = m.toArray
- val maxMargin = marginArray(maxMarginIndex)
-
- // adjust margins for overflow
- val sum = {
- var temp = 0.0
- var k = 0
- while (k < numClasses) {
- marginArray(k) = if (maxMargin > 0) {
- math.exp(marginArray(k) - maxMargin)
- } else {
- math.exp(marginArray(k))
- }
- temp += marginArray(k)
- k += 1
- }
- temp
- }
-
- val scores = Vectors.dense(marginArray)
- BLAS.scal(1 / sum, scores)
- scores
- }
-
@Since("1.6.0")
override val numFeatures: Int = coefficientMatrix.numCols
@@ -840,11 +812,7 @@ class LogisticRegressionModel private[spark] (
* The behavior of this can be adjusted using [[thresholds]].
*/
override protected def predict(features: Vector): Double = if (isMultinomial) {
- if (isDefined(thresholds)) {
- probability2prediction(scores(features))
- } else {
- super.predict(features)
- }
+ super.predict(features)
} else {
// Note: We should use getThreshold instead of $(threshold) since getThreshold is overridden.
if (score(features) > getThreshold) 1 else 0
diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
index e7304401f324b..e6d520f69bd7c 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
@@ -231,6 +231,12 @@ class LogisticRegressionSuite
assert(scaledPredictions.zip(basePredictions).forall { case (scaled, base) =>
scaled.getDouble(0) === base.getDouble(0)
})
+
+ // force it to use the predict method
+ model.setRawPredictionCol("").setProbabilityCol("").setThresholds(Array(0, 1, 1))
+ val predictionsWithPredict =
+ model.transform(smallMultinomialDataset).select("prediction").collect()
+ assert(predictionsWithPredict.forall(_.getDouble(0) === 0.0))
}
test("logistic regression doesn't fit intercept when fitIntercept is off") {
@@ -293,6 +299,8 @@ class LogisticRegressionSuite
}
test("multinomial logistic regression: Predictor, Classifier methods") {
+ val sqlContext = smallMultinomialDataset.sqlContext
+ import sqlContext.implicits._
val mlr = new LogisticRegression().setFamily("multinomial")
val model = mlr.fit(smallMultinomialDataset)
@@ -337,9 +345,27 @@ class LogisticRegressionSuite
val predFromProb = prob.toArray.zipWithIndex.maxBy(_._1)._2
assert(pred == predFromProb)
}
+
+ // force it to use probability2prediction
+ model.setProbabilityCol("")
+ val resultsUsingProb2Predict =
+ model.transform(smallMultinomialDataset).select("prediction").as[Double].collect()
+ resultsUsingProb2Predict.zip(results.select("prediction").as[Double].collect()).foreach {
+ case (pred1, pred2) => assert(pred1 === pred2)
+ }
+
+ // force it to use predict
+ model.setRawPredictionCol("").setProbabilityCol("")
+ val resultsUsingPredict =
+ model.transform(smallMultinomialDataset).select("prediction").as[Double].collect()
+ resultsUsingPredict.zip(results.select("prediction").as[Double].collect()).foreach {
+ case (pred1, pred2) => assert(pred1 === pred2)
+ }
}
test("binary logistic regression: Predictor, Classifier methods") {
+ val sqlContext = smallBinaryDataset.sqlContext
+ import sqlContext.implicits._
val lr = new LogisticRegression().setFamily("binomial")
val model = lr.fit(smallBinaryDataset)
@@ -347,7 +373,6 @@ class LogisticRegressionSuite
val numFeatures = smallBinaryDataset.select("features").first().getAs[Vector](0).size
assert(model.numFeatures === numFeatures)
- val threshold = model.getThreshold
val results = model.transform(smallBinaryDataset)
// Compare rawPrediction with probability
@@ -366,6 +391,22 @@ class LogisticRegressionSuite
val predFromProb = prob.toArray.zipWithIndex.maxBy(_._1)._2
assert(pred == predFromProb)
}
+
+ // force it to use probability2prediction
+ model.setProbabilityCol("")
+ val resultsUsingProb2Predict =
+ model.transform(smallBinaryDataset).select("prediction").as[Double].collect()
+ resultsUsingProb2Predict.zip(results.select("prediction").as[Double].collect()).foreach {
+ case (pred1, pred2) => assert(pred1 === pred2)
+ }
+
+ // force it to use predict
+ model.setRawPredictionCol("").setProbabilityCol("")
+ val resultsUsingPredict =
+ model.transform(smallBinaryDataset).select("prediction").as[Double].collect()
+ resultsUsingPredict.zip(results.select("prediction").as[Double].collect()).foreach {
+ case (pred1, pred2) => assert(pred1 === pred2)
+ }
}
test("coefficients and intercept methods") {
From 4dae59569732ace5cb2cf583d6db315fb3eda596 Mon Sep 17 00:00:00 2001
From: sethah
Date: Mon, 19 Sep 2016 17:47:56 -0700
Subject: [PATCH 24/24] address code review, add secondary constructor
---
.../spark/ml/classification/LogisticRegression.scala | 10 ++++++++--
.../ml/classification/LogisticRegressionSuite.scala | 5 ++---
.../spark/ml/classification/OneVsRestSuite.scala | 3 +--
.../apache/spark/ml/tuning/CrossValidatorSuite.scala | 3 +--
.../spark/ml/tuning/TrainValidationSplitSuite.scala | 3 +--
5 files changed, 13 insertions(+), 11 deletions(-)
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
index 2229009571a0d..343d50c790e85 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
@@ -75,7 +75,6 @@ private[classification] trait LogisticRegressionParams extends ProbabilisticClas
set(threshold, value)
}
-
/**
* Param for the name of family which is a description of the label distribution
* to be used in the model.
@@ -687,6 +686,14 @@ class LogisticRegressionModel private[spark] (
extends ProbabilisticClassificationModel[Vector, LogisticRegressionModel]
with LogisticRegressionParams with MLWritable {
+ require(coefficientMatrix.numRows == interceptVector.size, s"Dimension mismatch! Expected " +
+ s"coefficientMatrix.numRows == interceptVector.size, but ${coefficientMatrix.numRows} != " +
+ s"${interceptVector.size}")
+
+ private[spark] def this(uid: String, coefficients: Vector, intercept: Double) =
+ this(uid, new DenseMatrix(1, coefficients.size, coefficients.toArray, isTransposed = true),
+ Vectors.dense(intercept), 2, isMultinomial = false)
+
/**
* A vector of model coefficients for "binomial" logistic regression. If this model was trained
* using the "multinomial" family then an exception is thrown.
@@ -1382,7 +1389,6 @@ class BinaryLogisticRegressionSummary private[classification] (
* $$
*
*
- *
* @param bcCoefficients The broadcast coefficients corresponding to the features.
* @param bcFeaturesStd The broadcast standard deviation values of the features.
* @param numClasses the number of possible outcomes for k classes classification problem in
diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
index e6d520f69bd7c..2623759f24d91 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
@@ -26,7 +26,7 @@ import org.apache.spark.{SparkException, SparkFunSuite}
import org.apache.spark.ml.attribute.NominalAttribute
import org.apache.spark.ml.classification.LogisticRegressionSuite._
import org.apache.spark.ml.feature.LabeledPoint
-import org.apache.spark.ml.linalg._
+import org.apache.spark.ml.linalg.{DenseMatrix, Matrices, SparseMatrix, SparseVector, Vector, Vectors}
import org.apache.spark.ml.param.ParamsSuite
import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils}
import org.apache.spark.ml.util.TestingUtils._
@@ -112,8 +112,7 @@ class LogisticRegressionSuite
test("params") {
ParamsSuite.checkParams(new LogisticRegression)
- val model = new LogisticRegressionModel("logReg",
- new DenseMatrix(1, 1, Array(0.0)), Vectors.dense(0.0), 2, isMultinomial = false)
+ val model = new LogisticRegressionModel("logReg", Vectors.dense(0.0), 0.0)
ParamsSuite.checkParams(model)
}
diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/OneVsRestSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/OneVsRestSuite.scala
index 01a043195ad3f..99dd5854ff649 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/classification/OneVsRestSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/classification/OneVsRestSuite.scala
@@ -60,8 +60,7 @@ class OneVsRestSuite extends SparkFunSuite with MLlibTestSparkContext with Defau
test("params") {
ParamsSuite.checkParams(new OneVsRest)
- val lrModel = new LogisticRegressionModel("logReg",
- new DenseMatrix(1, 1, Array(0.0), isTransposed = true), Vectors.dense(0.0), 2, false)
+ val lrModel = new LogisticRegressionModel("lr", Vectors.dense(0.0), 0.0)
val model = new OneVsRestModel("ovr", Metadata.empty, Array(lrModel))
ParamsSuite.checkParams(model)
}
diff --git a/mllib/src/test/scala/org/apache/spark/ml/tuning/CrossValidatorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/tuning/CrossValidatorSuite.scala
index a0a2e87b10edf..750dc5bf01e6a 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/tuning/CrossValidatorSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/tuning/CrossValidatorSuite.scala
@@ -244,8 +244,7 @@ class CrossValidatorSuite
test("read/write: CrossValidatorModel") {
val lr = new LogisticRegression()
.setThreshold(0.6)
- val lrModel = new LogisticRegressionModel(lr.uid,
- new DenseMatrix(1, 1, Array(0.0), isTransposed = true), Vectors.dense(0.0), 2, false)
+ val lrModel = new LogisticRegressionModel(lr.uid, Vectors.dense(1.0, 2.0), 1.2)
.setThreshold(0.6)
val evaluator = new BinaryClassificationEvaluator()
.setMetricName("areaUnderPR") // not default metric
diff --git a/mllib/src/test/scala/org/apache/spark/ml/tuning/TrainValidationSplitSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/tuning/TrainValidationSplitSuite.scala
index 39e23e6c45dbb..9971371e47288 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/tuning/TrainValidationSplitSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/tuning/TrainValidationSplitSuite.scala
@@ -133,8 +133,7 @@ class TrainValidationSplitSuite
test("read/write: TrainValidationSplitModel") {
val lr = new LogisticRegression()
.setThreshold(0.6)
- val lrModel = new LogisticRegressionModel(lr.uid,
- new DenseMatrix(1, 1, Array(0.0), isTransposed = true), Vectors.dense(0.0), 2, false)
+ val lrModel = new LogisticRegressionModel(lr.uid, Vectors.dense(1.0, 2.0), 1.2)
.setThreshold(0.6)
val evaluator = new BinaryClassificationEvaluator()
val paramMaps = new ParamGridBuilder()
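The new secondary constructor packs binary coefficients and an intercept into the general multinomial representation, which is what lets the test suites above drop the verbose DenseMatrix construction. A sketch of what it expands to (the constructors are private[spark], so the calls are shown as comments):

    import org.apache.spark.ml.linalg.{DenseMatrix, Vectors}

    val coefficients = Vectors.dense(1.0, 2.0)
    val intercept = 1.2
    // new LogisticRegressionModel(uid, coefficients, intercept) is equivalent to:
    val coefficientMatrix =
      new DenseMatrix(1, coefficients.size, coefficients.toArray, isTransposed = true)
    val interceptVector = Vectors.dense(intercept)
    // new LogisticRegressionModel(uid, coefficientMatrix, interceptVector,
    //   numClasses = 2, isMultinomial = false)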