From 52ec9cd1c9920650cb588d15e1301e998a036371 Mon Sep 17 00:00:00 2001 From: sethah Date: Wed, 24 Aug 2016 17:13:33 -0700 Subject: [PATCH 01/24] first pass at merging MLOR with LOR --- .../classification/LogisticRegression.scala | 422 ++++++++++++++---- .../classification/LogisticRegression.scala | 5 +- .../MultinomialLogisticRegressionSuite.scala | 210 ++++----- 3 files changed, 454 insertions(+), 183 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index 757d52052d87f..c8c06a4d7752b 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -50,6 +50,8 @@ private[classification] trait LogisticRegressionParams extends ProbabilisticClas with HasRegParam with HasElasticNetParam with HasMaxIter with HasFitIntercept with HasTol with HasStandardization with HasWeightCol with HasThreshold with HasAggregationDepth { + import LogisticRegression._ + /** * Set threshold in binary classification, in range [0, 1]. * @@ -71,6 +73,25 @@ private[classification] trait LogisticRegressionParams extends ProbabilisticClas set(threshold, value) } + + /** + * Param for the name of family which is a description of the error distribution + * to be used in the model. + * Supported options: "multinomial", "binomial". + * Default is "multinomial". + * + * @group param + */ + @Since("2.0.0") + final val family: Param[String] = new Param(this, "family", + "The name of family which is a description of the error distribution to be used in the " + + s"model. Supported options: ${supportedFamilyNames.mkString(", ")}.", + ParamValidators.inArray[String](supportedFamilyNames)) + + /** @group getParam */ + @Since("2.0.0") + def getFamily: String = $(family) + /** * Get threshold for binary classification. * @@ -220,6 +241,17 @@ class LogisticRegression @Since("1.2.0") ( def setFitIntercept(value: Boolean): this.type = set(fitIntercept, value) setDefault(fitIntercept -> true) + /** + * Sets the value of param [[family]]. + * Default is "multinomial". + * + * @group setParam + */ + // TODO: don't use strings? + @Since("2.0.0") + def setFamily(value: String): this.type = set(family, value) + setDefault(family -> "multinomial") + /** * Whether to standardize the training features before fitting the model. 
* The coefficients of models will be always returned on the original scale, @@ -311,8 +343,25 @@ class LogisticRegression @Since("1.2.0") ( val histogram = labelSummarizer.histogram val numInvalid = labelSummarizer.countInvalid - val numClasses = histogram.length val numFeatures = summarizer.mean.size + val numFeaturesPlusIntercept = if (getFitIntercept) numFeatures + 1 else numFeatures + + val numClasses = MetadataUtils.getNumClasses(dataset.schema($(labelCol))) match { + case Some(n: Int) => + require(n >= histogram.length, s"Specified number of classes $n was " + + s"less than the number of unique labels ${histogram.length}") + n + case None => histogram.length + } + val isBinaryClassification = numClasses == 1 || numClasses == 2 + val isMultinomial = !((!isSet(family) && isBinaryClassification) || $(family) == "binomial") + val numCoefficientSets = if (isMultinomial) numClasses else 1 + + // TODO: use enumeration or similar + if (!isMultinomial) { + require(isBinaryClassification, s"Binomial family only supports 1 or 2" + + s"outcome classes but found $numClasses") + } if (isDefined(thresholds)) { require($(thresholds).length == numClasses, this.getClass.getSimpleName + @@ -333,22 +382,18 @@ class LogisticRegression @Since("1.2.0") ( val isConstantLabel = histogram.count(_ != 0) == 1 - if (numClasses > 2) { - val msg = s"LogisticRegression with ElasticNet in ML package only supports " + - s"binary classification. Found $numClasses in the input dataset. Consider using " + - s"MultinomialLogisticRegression instead." - logError(msg) - throw new SparkException(msg) - } else if ($(fitIntercept) && numClasses == 2 && isConstantLabel) { - logWarning(s"All labels are one and fitIntercept=true, so the coefficients will be " + - s"zeros and the intercept will be positive infinity; as a result, " + - s"training is not needed.") - (Vectors.sparse(numFeatures, Seq()), Double.PositiveInfinity, Array.empty[Double]) - } else if ($(fitIntercept) && numClasses == 1) { - logWarning(s"All labels are zero and fitIntercept=true, so the coefficients will be " + - s"zeros and the intercept will be negative infinity; as a result, " + - s"training is not needed.") - (Vectors.sparse(numFeatures, Seq()), Double.NegativeInfinity, Array.empty[Double]) + if ($(fitIntercept) && isConstantLabel) { + logWarning(s"All labels are the same value and fitIntercept=true, so the coefficients " + + s"will be zeros. Training is not needed.") + val constantLabelIndex = Vectors.dense(histogram).argmax + val coefficientMatrix = Matrices.sparse(numCoefficientSets, numFeatures, + Array.fill(numFeatures + 1)(0), Array.empty[Int], Array.empty[Double]) + val interceptVector = if (isMultinomial) { + Vectors.sparse(numClasses, Seq((constantLabelIndex, Double.PositiveInfinity))) + } else { + Vectors.dense(if (numClasses == 2) Double.PositiveInfinity else Double.NegativeInfinity) + } + (coefficientMatrix, interceptVector, Array.empty[Double]) } else { if (!$(fitIntercept) && isConstantLabel) { logWarning(s"All labels belong to a single class and fitIntercept=false. 
It's a " + @@ -370,35 +415,52 @@ class LogisticRegression @Since("1.2.0") ( val bcFeaturesStd = instances.context.broadcast(featuresStd) val costFun = new LogisticCostFun(instances, numClasses, $(fitIntercept), - $(standardization), bcFeaturesStd, regParamL2, multinomial = false, $(aggregationDepth)) + $(standardization), bcFeaturesStd, regParamL2, multinomial = isMultinomial, + $(aggregationDepth)) val optimizer = if ($(elasticNetParam) == 0.0 || $(regParam) == 0.0) { new BreezeLBFGS[BDV[Double]]($(maxIter), 10, $(tol)) } else { val standardizationParam = $(standardization) + // TODO: check this works in both cases def regParamL1Fun = (index: Int) => { // Remove the L1 penalization on the intercept - if (index == numFeatures) { + val isIntercept = $(fitIntercept) && ((index + 1) % numFeaturesPlusIntercept == 0) + if (isIntercept) { 0.0 } else { if (standardizationParam) { regParamL1 } else { + val featureIndex = if ($(fitIntercept)) { + index % numFeaturesPlusIntercept + } else { + index % numFeatures + } // If `standardization` is false, we still standardize the data // to improve the rate of convergence; as a result, we have to // perform this reverse standardization by penalizing each component // differently to get effectively the same objective function when // the training dataset is not standardized. - if (featuresStd(index) != 0.0) regParamL1 / featuresStd(index) else 0.0 + if (featuresStd(featureIndex) != 0.0) { + regParamL1 / featuresStd(featureIndex) + } else { + 0.0 + } } } } new BreezeOWLQN[Int, BDV[Double]]($(maxIter), 10, regParamL1Fun, $(tol)) } - val initialCoefficientsWithIntercept = - Vectors.zeros(if ($(fitIntercept)) numFeatures + 1 else numFeatures) + // TODO: double check this + val initialCoefficientsWithIntercept = if (isMultinomial) { + Vectors.zeros(numClasses * numFeaturesPlusIntercept) + } else { + Vectors.zeros(numFeaturesPlusIntercept) + } + // TODO: need to add this for multinomial case if (optInitialModel.isDefined && optInitialModel.get.coefficients.size != numFeatures) { val vecSize = optInitialModel.get.coefficients.size logWarning( @@ -406,13 +468,46 @@ class LogisticRegression @Since("1.2.0") ( s"expected size $numFeatures") } - if (optInitialModel.isDefined && optInitialModel.get.coefficients.size == numFeatures) { - val initialCoefficientsWithInterceptArray = initialCoefficientsWithIntercept.toArray - optInitialModel.get.coefficients.foreachActive { case (index, value) => - initialCoefficientsWithInterceptArray(index) = value - } - if ($(fitIntercept)) { - initialCoefficientsWithInterceptArray(numFeatures) == optInitialModel.get.intercept + // TODO: removing initial model for now +// if (optInitialModel.isDefined && optInitialModel.get.coefficients.size == numFeatures) { +// val initialCoefficientsWithInterceptArray = initialCoefficientsWithIntercept.toArray +// optInitialModel.get.coefficients.foreachActive { case (index, value) => +// initialCoefficientsWithInterceptArray(index) = value +// } +// if ($(fitIntercept)) { +// initialCoefficientsWithInterceptArray(numFeatures) == optInitialModel.get.intercept +// } +// } + if ($(fitIntercept) && isMultinomial) { + // TODO: can we merge the logic or something here? + /* + For multinomial logistic regression, when we initialize the coefficients as zeros, + it will converge faster if we initialize the intercepts such that + it follows the distribution of the labels. + {{{ + P(1) = \exp(b_1) / Z + ... 
+ P(K) = \exp(b_K) / Z
+ where Z = \sum_{k=1}^{K} \exp(b_k)
+ }}}
+ Since this doesn't have a unique solution, one of the solutions that satisfies the
+ above equations is
+ {{{
+ \exp(b_k) = count_k * \exp(\lambda)
+ b_k = \log(count_k) + \lambda
+ }}}
+ \lambda is a free parameter, so choose the phase \lambda such that the
+ mean is centered. This yields
+ {{{
+ b_k = \log(count_k)
+ b_k' = b_k - \mean(b_k)
+ }}}
+ */
+ val rawIntercepts = histogram.map(c => math.log(c + 1)) // add 1 for smoothing
+ val rawMean = rawIntercepts.sum / rawIntercepts.length
+ rawIntercepts.indices.foreach { i =>
+ initialCoefficientsWithIntercept.toArray(i * numFeaturesPlusIntercept + numFeatures) =
+ rawIntercepts(i) - rawMean } } else if ($(fitIntercept)) { /*
@@ -452,6 +547,7 @@ class LogisticRegression @Since("1.2.0") ( logError(msg) throw new SparkException(msg) }
+ bcFeaturesStd.destroy(blocking = false)
/* The coefficients are trained in the scaled space; we're converting them back to
@@ -460,25 +556,62 @@ class LogisticRegression @Since("1.2.0") ( as a result, no scaling is needed. */ val rawCoefficients = state.x.toArray.clone()
- var i = 0
- while (i < numFeatures) {
- rawCoefficients(i) *= { if (featuresStd(i) != 0.0) 1.0 / featuresStd(i) else 0.0 }
- i += 1
+ // TODO: I think this will work for both binomial and multinomial
+ val coefficientArray = Array.tabulate(numCoefficientSets * numFeatures) { i =>
+ // flatIndex will loop through rawCoefficients, and skip the intercept terms.
+ val flatIndex = if ($(fitIntercept)) i + i / numFeatures else i
+ val featureIndex = i % numFeatures
+ if (featuresStd(featureIndex) != 0.0) {
+ rawCoefficients(flatIndex) / featuresStd(featureIndex)
+ } else {
+ 0.0
+ } }
- bcFeaturesStd.destroy(blocking = false)
+ val coefficientMatrix =
+ new DenseMatrix(numCoefficientSets, numFeatures, coefficientArray, isTransposed = true)
- if ($(fitIntercept)) {
- (Vectors.dense(rawCoefficients.dropRight(1)).compressed, rawCoefficients.last,
- arrayBuilder.result())
+ if ($(regParam) == 0.0 && isMultinomial) {
+ /*
+ When no regularization is applied, the coefficients lack identifiability because
+ we do not use a pivot class. We can add any constant value to the coefficients and
+ get the same likelihood. So here, we choose the mean centered coefficients for
+ reproducibility. This method follows the approach in glmnet, described here:
+
+ Friedman, et al. "Regularization Paths for Generalized Linear Models via
+ Coordinate Descent," https://core.ac.uk/download/files/153/6287975.pdf
+ */
+ val coefficientMean = coefficientMatrix.values.sum / coefficientMatrix.values.length
+ coefficientMatrix.update(_ - coefficientMean)
+ }
+
+ val interceptsArray: Array[Double] = if ($(fitIntercept)) {
+ Array.tabulate(numCoefficientSets) { i =>
+ val coefIndex = (i + 1) * numFeaturesPlusIntercept - 1
+ rawCoefficients(coefIndex)
+ }
+ } else {
+ Array[Double]()
+ }
+ /*
+ The intercepts are never regularized, so we always center the mean. 
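
To make the flattened layout used throughout this hunk concrete: with fitIntercept enabled, the solver vector stores each class k as a contiguous row [w_k0, ..., w_k(numFeatures-1), b_k]. A minimal self-contained sketch of the intercept warm start above, assuming a toy 3-class histogram (values illustrative only, not from the patch's tests):

    // Sketch only: toy label histogram, fitIntercept assumed true.
    val histogram = Array(100.0, 300.0, 600.0)              // class counts
    val numFeatures = 4
    val numFeaturesPlusIntercept = numFeatures + 1
    val flat = new Array[Double](histogram.length * numFeaturesPlusIntercept)
    val rawIntercepts = histogram.map(c => math.log(c + 1)) // add 1 for smoothing
    val rawMean = rawIntercepts.sum / rawIntercepts.length
    rawIntercepts.indices.foreach { k =>
      // class k's intercept slot sits at the end of its row in the flat vector
      flat(k * numFeaturesPlusIntercept + numFeatures) = rawIntercepts(k) - rawMean
    }
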
+ */
+ val interceptVector = if (interceptsArray.nonEmpty && isMultinomial) {
+ val interceptMean = interceptsArray.sum / numClasses
+ interceptsArray.indices.foreach { i => interceptsArray(i) -= interceptMean }
+ Vectors.dense(interceptsArray)
+ } else if (interceptsArray.nonEmpty) {
+ Vectors.dense(interceptsArray) } else {
- (Vectors.dense(rawCoefficients).compressed, 0.0, arrayBuilder.result())
+ Vectors.sparse(numClasses, Seq()) }
+ (coefficientMatrix, interceptVector, arrayBuilder.result()) } } if (handlePersistence) instances.unpersist()
- val model = copyValues(new LogisticRegressionModel(uid, coefficients, intercept))
+ val model = copyValues(new LogisticRegressionModel(uid, coefficients, intercept, numClasses,
+ isMultinomial)) val (summaryModel, probabilityColName) = model.findSummaryModelAndProbabilityCol() val logRegSummary = new BinaryLogisticRegressionTrainingSummary( summaryModel.transform(dataset),
@@ -500,6 +633,8 @@ object LogisticRegression extends DefaultParamsReadable[LogisticRegression] { @Since("1.6.0") override def load(path: String): LogisticRegression = super.load(path)
+
+ private[classification] lazy val supportedFamilyNames = Array("binomial", "multinomial") } /**
@@ -508,11 +643,34 @@ class LogisticRegressionModel private[spark] ( @Since("1.4.0") override val uid: String,
- @Since("2.0.0") val coefficients: Vector,
- @Since("1.3.0") val intercept: Double)
+ @Since("2.1.0") val coefficientMatrix: Matrix,
+ @Since("2.1.0") val interceptVector: Vector,
+ @Since("1.3.0") override val numClasses: Int,
+ private val isMultinomial: Boolean) extends ProbabilisticClassificationModel[Vector, LogisticRegressionModel] with LogisticRegressionParams with MLWritable {
+ def this(uid: String, coefficients: Vector, intercept: Double) {
+ this(uid,
+ new DenseMatrix(1, coefficients.size, coefficients.toArray, isTransposed = true),
+ Vectors.dense(intercept), 2, false)
+ }
+
+ @Since("2.0.0")
+ // TODO: this should convert sparse to sparse and dense to dense
+ val coefficients: Vector = Vectors.dense(coefficientMatrix.toArray)
+
+ @Since("1.3.0")
+ def intercept: Double = {
+ if (isMultinomial) {
+ logWarning("Multiclass model contains a vector of intercepts, use interceptVector instead. " +
+ "Returning 0.0 as placeholder.")
+ }
+ _intercept
+ }
+
+ private val _intercept = if (!isMultinomial) interceptVector.toArray.head else 0.0
+ @Since("1.5.0") override def setThreshold(value: Double): this.type = super.setThreshold(value)
@@ -527,7 +685,14 @@ class LogisticRegressionModel private[spark] ( /** Margin (rawPrediction) for class label 1. For binary classification only. */ private val margin: Vector => Double = (features) => {
- BLAS.dot(features, coefficients) + intercept
+ BLAS.dot(features, coefficients) + _intercept
+ }
+
+ /** Margin (rawPrediction) for each class label. */
+ private val margins: Vector => Vector = (features) => {
+ val m = interceptVector.toDense.copy
+ BLAS.gemv(1.0, coefficientMatrix, features, 1.0, m)
+ m } /** Score (probability) for class label 1. For binary classification only. */
@@ -536,11 +701,36 @@ class LogisticRegressionModel private[spark] ( 1.0 / (1.0 + math.exp(-m)) }
- @Since("1.6.0")
- override val numFeatures: Int = coefficients.size
+ /** Score (probability) for each class label. 
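
The mean-centering above (and the coefficient centering in the unregularized multinomial case) is safe because softmax probabilities are invariant to adding one constant to every margin; a tiny worked check with illustrative values:

    // Shift-invariance check; values illustrative only.
    def softmax(z: Array[Double]): Array[Double] = {
      val e = z.map(math.exp)
      val s = e.sum
      e.map(_ / s)
    }
    val b = Array(1.0, 2.0, 3.0)
    val centered = b.map(_ - b.sum / b.length) // Array(-1.0, 0.0, 1.0)
    // softmax(b) and softmax(centered) agree element-wise up to floating point.
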
*/ + private val scores: Vector => Vector = (features) => { + val m = margins(features) + val maxMarginIndex = m.argmax + val marginArray = m.toArray + val maxMargin = marginArray(maxMarginIndex) - @Since("1.3.0") - override val numClasses: Int = 2 + // adjust margins for overflow + val sum = { + var temp = 0.0 + var k = 0 + while (k < numClasses) { + marginArray(k) = if (maxMargin > 0) { + math.exp(marginArray(k) - maxMargin) + } else { + math.exp(marginArray(k)) + } + temp += marginArray(k) + k += 1 + } + temp + } + + val scores = Vectors.dense(marginArray) + BLAS.scal(1 / sum, scores) + scores + } + + @Since("1.6.0") + override val numFeatures: Int = coefficientMatrix.numCols private var trainingSummary: Option[LogisticRegressionTrainingSummary] = None @@ -597,19 +787,80 @@ class LogisticRegressionModel private[spark] ( */ override protected def predict(features: Vector): Double = { // Note: We should use getThreshold instead of $(threshold) since getThreshold is overridden. - if (score(features) > getThreshold) 1 else 0 + if (isMultinomial) { + if (isDefined(thresholds)) { + val thresholds: Array[Double] = getThresholds + val probabilities = scores(features).toArray + var argMax = 0 + var max = Double.NegativeInfinity + var i = 0 + while (i < numClasses) { + if (thresholds(i) == 0.0) { + max = Double.PositiveInfinity + argMax = i + } else { + val scaled = probabilities(i) / thresholds(i) + if (scaled > max) { + max = scaled + argMax = i + } + } + i += 1 + } + argMax + } else { + scores(features).argmax + } + } + else { + if (score(features) > getThreshold) 1 else 0 + } } override protected def raw2probabilityInPlace(rawPrediction: Vector): Vector = { rawPrediction match { case dv: DenseVector => - var i = 0 - val size = dv.size - while (i < size) { - dv.values(i) = 1.0 / (1.0 + math.exp(-dv.values(i))) - i += 1 + if (isMultinomial) { + val size = dv.size + val values = dv.values + + // get the maximum margin + val maxMarginIndex = rawPrediction.argmax + val maxMargin = rawPrediction(maxMarginIndex) + + if (maxMargin == Double.PositiveInfinity) { + var k = 0 + while (k < size) { + values(k) = if (k == maxMarginIndex) 1.0 else 0.0 + k += 1 + } + } else { + val sum = { + var temp = 0.0 + var k = 0 + while (k < numClasses) { + values(k) = if (maxMargin > 0) { + math.exp(values(k) - maxMargin) + } else { + math.exp(values(k)) + } + temp += values(k) + k += 1 + } + temp + } + BLAS.scal(1 / sum, dv) + } + dv + } else { + var i = 0 + val size = dv.size + while (i < size) { + dv.values(i) = 1.0 / (1.0 + math.exp(-dv.values(i))) + i += 1 + } + dv } - dv case sv: SparseVector => throw new RuntimeException("Unexpected error in LogisticRegressionModel:" + " raw2probabilitiesInPlace encountered SparseVector") @@ -617,33 +868,46 @@ class LogisticRegressionModel private[spark] ( } override protected def predictRaw(features: Vector): Vector = { - val m = margin(features) - Vectors.dense(-m, m) + if (isMultinomial) { + margins(features) + } else { + val m = margin(features) + Vectors.dense(-m, m) + } } @Since("1.4.0") override def copy(extra: ParamMap): LogisticRegressionModel = { - val newModel = copyValues(new LogisticRegressionModel(uid, coefficients, intercept), extra) + val newModel = copyValues(new LogisticRegressionModel(uid, coefficientMatrix, interceptVector, + numClasses, isMultinomial), extra) if (trainingSummary.isDefined) newModel.setSummary(trainingSummary.get) newModel.setParent(parent) } - + // TODO: basically check all these methods override protected def raw2prediction(rawPrediction: 
Vector): Double = { - // Note: We should use getThreshold instead of $(threshold) since getThreshold is overridden. - val t = getThreshold - val rawThreshold = if (t == 0.0) { - Double.NegativeInfinity - } else if (t == 1.0) { - Double.PositiveInfinity + if (isMultinomial) { + super.raw2prediction(rawPrediction) } else { - math.log(t / (1.0 - t)) + // Note: We should use getThreshold instead of $(threshold) since getThreshold is overridden. + val t = getThreshold + val rawThreshold = if (t == 0.0) { + Double.NegativeInfinity + } else if (t == 1.0) { + Double.PositiveInfinity + } else { + math.log(t / (1.0 - t)) + } + if (rawPrediction(1) > rawThreshold) 1 else 0 } - if (rawPrediction(1) > rawThreshold) 1 else 0 } override protected def probability2prediction(probability: Vector): Double = { // Note: We should use getThreshold instead of $(threshold) since getThreshold is overridden. - if (probability(1) > getThreshold) 1 else 0 + if (isMultinomial) { + super.probability2prediction(probability) + } else { + if (probability(1) > getThreshold) 1 else 0 + } } /** @@ -676,15 +940,16 @@ object LogisticRegressionModel extends MLReadable[LogisticRegressionModel] { private case class Data( numClasses: Int, numFeatures: Int, - intercept: Double, - coefficients: Vector) + interceptVector: Vector, + coefficientMatrix: Matrix, + isMultinomial: Boolean) override protected def saveImpl(path: String): Unit = { // Save metadata and Params DefaultParamsWriter.saveMetadata(instance, path, sc) // Save model data: numClasses, numFeatures, intercept, coefficients - val data = Data(instance.numClasses, instance.numFeatures, instance.intercept, - instance.coefficients) + val data = Data(instance.numClasses, instance.numFeatures, instance.interceptVector, + instance.coefficientMatrix, instance.isMultinomial) val dataPath = new Path(path, "data").toString sparkSession.createDataFrame(Seq(data)).repartition(1).write.parquet(dataPath) } @@ -702,13 +967,15 @@ object LogisticRegressionModel extends MLReadable[LogisticRegressionModel] { val dataPath = new Path(path, "data").toString val data = sparkSession.read.format("parquet").load(dataPath) - // We will need numClasses, numFeatures in the future for multinomial logreg support. - // TODO: remove numClasses and numFeatures fields? - val Row(numClasses: Int, numFeatures: Int, intercept: Double, coefficients: Vector) = - MLUtils.convertVectorColumnsToML(data, "coefficients") - .select("numClasses", "numFeatures", "intercept", "coefficients") - .head() - val model = new LogisticRegressionModel(metadata.uid, coefficients, intercept) + val convertedCoefs = MLUtils.convertMatrixColumnsToML(data, "coefficientMatrix") + val converted = MLUtils.convertVectorColumnsToML(convertedCoefs, "interceptVector") + .select("numClasses", "numFeatures", "interceptVector", "coefficientMatrix", + "isMultinomial") + // TODO: numFeatures not needed? + val Row(numClasses: Int, numFeatures: Int, interceptVector: Vector, + coefficientMatrix: Matrix, isMultinomial: Boolean) = converted.head() + val model = new LogisticRegressionModel(metadata.uid, coefficientMatrix, interceptVector, + numClasses, isMultinomial) DefaultParamsReader.getAndSetParams(model, metadata) model @@ -1103,6 +1370,7 @@ class BinaryLogisticRegressionSummary private[classification] ( * $$ *

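
Both `scores` and `raw2probabilityInPlace` above guard against overflow by shifting the margins by their maximum before exponentiating; a standalone sketch of the same computation, simplified to an array:

    // Overflow-safe softmax sketch: math.exp(1000.0) is Infinity, but shifting
    // by the max margin first keeps every exponent <= 0. The shift is only
    // applied when the max margin is positive, mirroring the code above.
    def stableSoftmax(margins: Array[Double]): Array[Double] = {
      val maxMargin = margins.max
      val exps = margins.map { m =>
        if (maxMargin > 0) math.exp(m - maxMargin) else math.exp(m)
      }
      val sum = exps.sum
      exps.map(_ / sum)
    }
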
* + * * @param bcCoefficients The broadcast coefficients corresponding to the features. * @param bcFeaturesStd The broadcast standard deviation values of the features. * @param numClasses the number of possible outcomes for k classes classification problem in diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala index e4cbf5acbc11d..ad3dab33d2909 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala @@ -19,6 +19,7 @@ package org.apache.spark.mllib.classification import org.apache.spark.SparkContext import org.apache.spark.annotation.Since +import org.apache.spark.ml.linalg.DenseMatrix import org.apache.spark.ml.util.Identifiable import org.apache.spark.mllib.classification.impl.GLMClassificationModel import org.apache.spark.mllib.linalg.{DenseVector, Vector, Vectors} @@ -429,9 +430,11 @@ class LogisticRegressionWithLBFGS lr.setElasticNetParam(elasticNetParam) lr.setStandardization(useFeatureScaling) if (userSuppliedWeights) { + // TODO: check this val uid = Identifiable.randomUID("logreg-static") lr.setInitialModel(new org.apache.spark.ml.classification.LogisticRegressionModel( - uid, initialWeights.asML, 1.0)) + uid, new DenseMatrix(1, initialWeights.size, initialWeights.toArray, isTransposed=true), + Vectors.dense(0.0).asML, 2, false)) } lr.setFitIntercept(addIntercept) lr.setMaxIter(optimizer.getNumIterations()) diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/MultinomialLogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/MultinomialLogisticRegressionSuite.scala index 0913fe559c562..9c7e08820d93b 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/MultinomialLogisticRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/MultinomialLogisticRegressionSuite.scala @@ -87,14 +87,14 @@ class MultinomialLogisticRegressionSuite } test("params") { - ParamsSuite.checkParams(new MultinomialLogisticRegression) - val model = new MultinomialLogisticRegressionModel("mLogReg", - Matrices.dense(2, 1, Array(0.0, 0.0)), Vectors.dense(0.0, 0.0), 2) + ParamsSuite.checkParams(new LogisticRegression) + val model = new LogisticRegressionModel("mLogReg", + Matrices.dense(2, 1, Array(0.0, 0.0)), Vectors.dense(0.0, 0.0), 2, true) ParamsSuite.checkParams(model) } test("multinomial logistic regression: default params") { - val mlr = new MultinomialLogisticRegression + val mlr = new LogisticRegression assert(mlr.getLabelCol === "label") assert(mlr.getFeaturesCol === "features") assert(mlr.getPredictionCol === "prediction") @@ -112,15 +112,15 @@ class MultinomialLogisticRegressionSuite assert(model.getPredictionCol === "prediction") assert(model.getRawPredictionCol === "rawPrediction") assert(model.getProbabilityCol === "probability") - assert(model.intercepts !== Vectors.dense(0.0, 0.0)) + assert(model.interceptVector !== Vectors.dense(0.0, 0.0)) assert(model.hasParent) } test("multinomial logistic regression with intercept without regularization") { - val trainer1 = (new MultinomialLogisticRegression).setFitIntercept(true) + val trainer1 = (new LogisticRegression).setFitIntercept(true) .setElasticNetParam(0.0).setRegParam(0.0).setStandardization(true).setMaxIter(100) - val trainer2 = (new MultinomialLogisticRegression).setFitIntercept(true) + val trainer2 = 
(new LogisticRegression).setFitIntercept(true) .setElasticNetParam(0.0).setRegParam(0.0).setStandardization(false) val model1 = trainer1.fit(multinomialDataset) @@ -166,21 +166,21 @@ class MultinomialLogisticRegressionSuite 0.0817812, -0.8502072, 0.0830284, 0.0761248), isTransposed = true) val interceptsR = Vectors.dense(-2.2449338, 0.3778931, 1.8670407) - assert(model1.coefficients ~== coefficientsR relTol 0.05) - assert(model1.coefficients.toArray.sum ~== 0.0 absTol eps) - assert(model1.intercepts ~== interceptsR relTol 0.05) - assert(model1.intercepts.toArray.sum ~== 0.0 absTol eps) - assert(model2.coefficients ~== coefficientsR relTol 0.05) - assert(model2.coefficients.toArray.sum ~== 0.0 absTol eps) - assert(model2.intercepts ~== interceptsR relTol 0.05) - assert(model2.intercepts.toArray.sum ~== 0.0 absTol eps) + assert(model1.coefficientMatrix ~== coefficientsR relTol 0.05) + assert(model1.coefficientMatrix.toArray.sum ~== 0.0 absTol eps) + assert(model1.interceptVector ~== interceptsR relTol 0.05) + assert(model1.interceptVector.toArray.sum ~== 0.0 absTol eps) + assert(model2.coefficientMatrix ~== coefficientsR relTol 0.05) + assert(model2.coefficientMatrix.toArray.sum ~== 0.0 absTol eps) + assert(model2.interceptVector ~== interceptsR relTol 0.05) + assert(model2.interceptVector.toArray.sum ~== 0.0 absTol eps) } test("multinomial logistic regression without intercept without regularization") { - val trainer1 = (new MultinomialLogisticRegression).setFitIntercept(false) + val trainer1 = (new LogisticRegression).setFitIntercept(false) .setElasticNetParam(0.0).setRegParam(0.0).setStandardization(true) - val trainer2 = (new MultinomialLogisticRegression).setFitIntercept(false) + val trainer2 = (new LogisticRegression).setFitIntercept(false) .setElasticNetParam(0.0).setRegParam(0.0).setStandardization(false) val model1 = trainer1.fit(multinomialDataset) @@ -226,23 +226,23 @@ class MultinomialLogisticRegressionSuite -0.3036269, 0.9449630, -0.2271038, -0.4364839, 0.2337022, -0.5793351, 0.1056770, 0.1159618), isTransposed = true) - assert(model1.coefficients ~== coefficientsR relTol 0.05) - assert(model1.coefficients.toArray.sum ~== 0.0 absTol eps) - assert(model1.intercepts.toArray === Array.fill(3)(0.0)) - assert(model1.intercepts.toArray.sum ~== 0.0 absTol eps) - assert(model2.coefficients ~== coefficientsR relTol 0.05) - assert(model2.coefficients.toArray.sum ~== 0.0 absTol eps) - assert(model2.intercepts.toArray === Array.fill(3)(0.0)) - assert(model2.intercepts.toArray.sum ~== 0.0 absTol eps) + assert(model1.coefficientMatrix ~== coefficientsR relTol 0.05) + assert(model1.coefficientMatrix.toArray.sum ~== 0.0 absTol eps) + assert(model1.interceptVector.toArray === Array.fill(3)(0.0)) + assert(model1.interceptVector.toArray.sum ~== 0.0 absTol eps) + assert(model2.coefficientMatrix ~== coefficientsR relTol 0.05) + assert(model2.coefficientMatrix.toArray.sum ~== 0.0 absTol eps) + assert(model2.interceptVector.toArray === Array.fill(3)(0.0)) + assert(model2.interceptVector.toArray.sum ~== 0.0 absTol eps) } test("multinomial logistic regression with intercept with L1 regularization") { // use tighter constraints because OWL-QN solver takes longer to converge - val trainer1 = (new MultinomialLogisticRegression).setFitIntercept(true) + val trainer1 = (new LogisticRegression).setFitIntercept(true) .setElasticNetParam(1.0).setRegParam(0.05).setStandardization(true) .setMaxIter(300).setTol(1e-10) - val trainer2 = (new MultinomialLogisticRegression).setFitIntercept(true) + val trainer2 = (new 
LogisticRegression).setFitIntercept(true) .setElasticNetParam(1.0).setRegParam(0.05).setStandardization(false) .setMaxIter(300).setTol(1e-10) @@ -328,18 +328,18 @@ class MultinomialLogisticRegressionSuite 0.0, 0.0, 0.0, 0.0), isTransposed = true) val interceptsR = Vectors.dense(-0.44893320, 0.7376760, -0.2887428) - assert(model1.coefficients ~== coefficientsRStd absTol 0.02) - assert(model1.intercepts ~== interceptsRStd relTol 0.1) - assert(model1.intercepts.toArray.sum ~== 0.0 absTol eps) - assert(model2.coefficients ~== coefficientsR absTol 0.02) - assert(model2.intercepts ~== interceptsR relTol 0.1) - assert(model2.intercepts.toArray.sum ~== 0.0 absTol eps) + assert(model1.coefficientMatrix ~== coefficientsRStd absTol 0.02) + assert(model1.interceptVector ~== interceptsRStd relTol 0.1) + assert(model1.interceptVector.toArray.sum ~== 0.0 absTol eps) + assert(model2.coefficientMatrix ~== coefficientsR absTol 0.02) + assert(model2.interceptVector ~== interceptsR relTol 0.1) + assert(model2.interceptVector.toArray.sum ~== 0.0 absTol eps) } test("multinomial logistic regression without intercept with L1 regularization") { - val trainer1 = (new MultinomialLogisticRegression).setFitIntercept(false) + val trainer1 = (new LogisticRegression).setFitIntercept(false) .setElasticNetParam(1.0).setRegParam(0.05).setStandardization(true) - val trainer2 = (new MultinomialLogisticRegression).setFitIntercept(false) + val trainer2 = (new LogisticRegression).setFitIntercept(false) .setElasticNetParam(1.0).setRegParam(0.05).setStandardization(false) val model1 = trainer1.fit(multinomialDataset) @@ -421,18 +421,18 @@ class MultinomialLogisticRegressionSuite 0.0, 0.1943624, -0.1902577, -0.1028789, 0.0, 0.0, 0.0, 0.0), isTransposed = true) - assert(model1.coefficients ~== coefficientsRStd absTol 0.01) - assert(model1.intercepts.toArray === Array.fill(3)(0.0)) - assert(model1.intercepts.toArray.sum ~== 0.0 absTol eps) - assert(model2.coefficients ~== coefficientsR absTol 0.01) - assert(model2.intercepts.toArray === Array.fill(3)(0.0)) - assert(model2.intercepts.toArray.sum ~== 0.0 absTol eps) + assert(model1.coefficientMatrix ~== coefficientsRStd absTol 0.01) + assert(model1.interceptVector.toArray === Array.fill(3)(0.0)) + assert(model1.interceptVector.toArray.sum ~== 0.0 absTol eps) + assert(model2.coefficientMatrix ~== coefficientsR absTol 0.01) + assert(model2.interceptVector.toArray === Array.fill(3)(0.0)) + assert(model2.interceptVector.toArray.sum ~== 0.0 absTol eps) } test("multinomial logistic regression with intercept with L2 regularization") { - val trainer1 = (new MultinomialLogisticRegression).setFitIntercept(true) + val trainer1 = (new LogisticRegression).setFitIntercept(true) .setElasticNetParam(0.0).setRegParam(0.1).setStandardization(true) - val trainer2 = (new MultinomialLogisticRegression).setFitIntercept(true) + val trainer2 = (new LogisticRegression).setFitIntercept(true) .setElasticNetParam(0.0).setRegParam(0.1).setStandardization(false) val model1 = trainer1.fit(multinomialDataset) @@ -516,18 +516,18 @@ class MultinomialLogisticRegressionSuite 0.04032627, -0.29756637, 0.06265594, 0.02972883), isTransposed = true) val interceptsR = Vectors.dense(-1.65488543, 1.1297533, 0.52513212) - assert(model1.coefficients ~== coefficientsRStd relTol 0.05) - assert(model1.intercepts ~== interceptsRStd relTol 0.05) - assert(model1.intercepts.toArray.sum ~== 0.0 absTol eps) - assert(model2.coefficients ~== coefficientsR relTol 0.05) - assert(model2.intercepts ~== interceptsR relTol 0.05) - 
assert(model2.intercepts.toArray.sum ~== 0.0 absTol eps) + assert(model1.coefficientMatrix ~== coefficientsRStd relTol 0.05) + assert(model1.interceptVector ~== interceptsRStd relTol 0.05) + assert(model1.interceptVector.toArray.sum ~== 0.0 absTol eps) + assert(model2.coefficientMatrix ~== coefficientsR relTol 0.05) + assert(model2.interceptVector ~== interceptsR relTol 0.05) + assert(model2.interceptVector.toArray.sum ~== 0.0 absTol eps) } test("multinomial logistic regression without intercept with L2 regularization") { - val trainer1 = (new MultinomialLogisticRegression).setFitIntercept(false) + val trainer1 = (new LogisticRegression).setFitIntercept(false) .setElasticNetParam(0.0).setRegParam(0.1).setStandardization(true) - val trainer2 = (new MultinomialLogisticRegression).setFitIntercept(false) + val trainer2 = (new LogisticRegression).setFitIntercept(false) .setElasticNetParam(0.0).setRegParam(0.1).setStandardization(false) val model1 = trainer1.fit(multinomialDataset) @@ -607,19 +607,19 @@ class MultinomialLogisticRegressionSuite -0.08469036, 0.38996748, -0.16468436, -0.22522976, 0.0903949, -0.24550107, 0.07260362, 0.0423021), isTransposed = true) - assert(model1.coefficients ~== coefficientsRStd absTol 0.01) - assert(model1.intercepts.toArray === Array.fill(3)(0.0)) - assert(model1.intercepts.toArray.sum ~== 0.0 absTol eps) - assert(model2.coefficients ~== coefficientsR absTol 0.01) - assert(model2.intercepts.toArray === Array.fill(3)(0.0)) - assert(model2.intercepts.toArray.sum ~== 0.0 absTol eps) + assert(model1.coefficientMatrix ~== coefficientsRStd absTol 0.01) + assert(model1.interceptVector.toArray === Array.fill(3)(0.0)) + assert(model1.interceptVector.toArray.sum ~== 0.0 absTol eps) + assert(model2.coefficientMatrix ~== coefficientsR absTol 0.01) + assert(model2.interceptVector.toArray === Array.fill(3)(0.0)) + assert(model2.interceptVector.toArray.sum ~== 0.0 absTol eps) } test("multinomial logistic regression with intercept with elasticnet regularization") { - val trainer1 = (new MultinomialLogisticRegression).setFitIntercept(true) + val trainer1 = (new LogisticRegression).setFitIntercept(true) .setElasticNetParam(0.5).setRegParam(0.1).setStandardization(true) .setMaxIter(300).setTol(1e-10) - val trainer2 = (new MultinomialLogisticRegression).setFitIntercept(true) + val trainer2 = (new LogisticRegression).setFitIntercept(true) .setElasticNetParam(0.5).setRegParam(0.1).setStandardization(false) .setMaxIter(300).setTol(1e-10) @@ -704,19 +704,19 @@ class MultinomialLogisticRegressionSuite 0.0, 0.0, 0.0, 0.0), isTransposed = true) val interceptsR = Vectors.dense(-0.39876213, 0.61089869, -0.2121366) - assert(model1.coefficients ~== coefficientsRStd absTol 0.01) - assert(model1.intercepts ~== interceptsRStd absTol 0.01) - assert(model1.intercepts.toArray.sum ~== 0.0 absTol eps) - assert(model2.coefficients ~== coefficientsR absTol 0.01) - assert(model2.intercepts ~== interceptsR absTol 0.01) - assert(model2.intercepts.toArray.sum ~== 0.0 absTol eps) + assert(model1.coefficientMatrix ~== coefficientsRStd absTol 0.01) + assert(model1.interceptVector ~== interceptsRStd absTol 0.01) + assert(model1.interceptVector.toArray.sum ~== 0.0 absTol eps) + assert(model2.coefficientMatrix ~== coefficientsR absTol 0.01) + assert(model2.interceptVector ~== interceptsR absTol 0.01) + assert(model2.interceptVector.toArray.sum ~== 0.0 absTol eps) } test("multinomial logistic regression without intercept with elasticnet regularization") { - val trainer1 = (new 
MultinomialLogisticRegression).setFitIntercept(false) + val trainer1 = (new LogisticRegression).setFitIntercept(false) .setElasticNetParam(0.5).setRegParam(0.1).setStandardization(true) .setMaxIter(300).setTol(1e-10) - val trainer2 = (new MultinomialLogisticRegression).setFitIntercept(false) + val trainer2 = (new LogisticRegression).setFitIntercept(false) .setElasticNetParam(0.5).setRegParam(0.1).setStandardization(false) .setMaxIter(300).setTol(1e-10) @@ -798,12 +798,12 @@ class MultinomialLogisticRegressionSuite 0.0, 0.14666497, -0.16570638, -0.05982875, 0.0, 0.0, 0.0, 0.0), isTransposed = true) - assert(model1.coefficients ~== coefficientsRStd absTol 0.01) - assert(model1.intercepts.toArray === Array.fill(3)(0.0)) - assert(model1.intercepts.toArray.sum ~== 0.0 absTol eps) - assert(model2.coefficients ~== coefficientsR absTol 0.01) - assert(model2.intercepts.toArray === Array.fill(3)(0.0)) - assert(model2.intercepts.toArray.sum ~== 0.0 absTol eps) + assert(model1.coefficientMatrix ~== coefficientsRStd absTol 0.01) + assert(model1.interceptVector.toArray === Array.fill(3)(0.0)) + assert(model1.interceptVector.toArray.sum ~== 0.0 absTol eps) + assert(model2.coefficientMatrix ~== coefficientsR absTol 0.01) + assert(model2.interceptVector.toArray === Array.fill(3)(0.0)) + assert(model2.interceptVector.toArray.sum ~== 0.0 absTol eps) } /* @@ -814,9 +814,9 @@ class MultinomialLogisticRegressionSuite */ test("prediction") { - val model = new MultinomialLogisticRegressionModel("mLogReg", + val model = new LogisticRegressionModel("mLogReg", Matrices.dense(3, 2, Array(0.0, 0.0, 0.0, 1.0, 2.0, 3.0)), - Vectors.dense(0.0, 0.0, 0.0), 3) + Vectors.dense(0.0, 0.0, 0.0), 3, true) val overFlowData = spark.createDataFrame(Seq( LabeledPoint(1.0, Vectors.dense(0.0, 1000.0)), LabeledPoint(1.0, Vectors.dense(0.0, -1.0)) @@ -837,7 +837,7 @@ class MultinomialLogisticRegressionSuite } test("multinomial logistic regression: Predictor, Classifier methods") { - val mlr = new MultinomialLogisticRegression + val mlr = new LogisticRegression val model = mlr.fit(dataset) assert(model.numClasses === 3) @@ -852,9 +852,9 @@ class MultinomialLogisticRegressionSuite val margins = Array.tabulate(3) { k => var margin = 0.0 features.foreachActive { (index, value) => - margin += value * model.coefficients(k, index) + margin += value * model.coefficientMatrix(k, index) } - margin += model.intercepts(k) + margin += model.interceptVector(k) margin } assert(raw ~== Vectors.dense(margins) relTol eps) @@ -884,21 +884,21 @@ class MultinomialLogisticRegressionSuite } test("multinomial logistic regression coefficients should be centered") { - val mlr = new MultinomialLogisticRegression().setMaxIter(1) + val mlr = new LogisticRegression().setMaxIter(1) val model = mlr.fit(dataset) - assert(model.intercepts.toArray.sum ~== 0.0 absTol 1e-6) - assert(model.coefficients.toArray.sum ~== 0.0 absTol 1e-6) + assert(model.interceptVector.toArray.sum ~== 0.0 absTol 1e-6) + assert(model.coefficientMatrix.toArray.sum ~== 0.0 absTol 1e-6) } test("numClasses specified in metadata/inferred") { - val mlr = new MultinomialLogisticRegression().setMaxIter(1) + val mlr = new LogisticRegression().setMaxIter(1) // specify more classes than unique label values val labelMeta = NominalAttribute.defaultAttr.withName("label").withNumValues(4).toMetadata() val df = dataset.select(dataset("label").as("label", labelMeta), dataset("features")) val model1 = mlr.fit(df) assert(model1.numClasses === 4) - assert(model1.intercepts.size === 4) + 
assert(model1.interceptVector.size === 4) // specify two classes when there are really three val labelMeta1 = NominalAttribute.defaultAttr.withName("label").withNumValues(2).toMetadata() @@ -919,7 +919,7 @@ class MultinomialLogisticRegressionSuite LabeledPoint(4.0, Vectors.dense(1.0)), LabeledPoint(4.0, Vectors.dense(2.0))) ) - val mlr = new MultinomialLogisticRegression + val mlr = new LogisticRegression().setFamily("multinomial") val model = mlr.fit(constantData) val results = model.transform(constantData) results.select("rawPrediction", "probability", "prediction").collect().foreach { @@ -966,7 +966,7 @@ class MultinomialLogisticRegressionSuite val testData = spark.createDataFrame(Array.tabulate[LabeledPoint](numClasses) { i => LabeledPoint(i.toDouble, Vectors.dense(i.toDouble)) }) - val mlr = new MultinomialLogisticRegression().setWeightCol("weight") + val mlr = new LogisticRegression().setWeightCol("weight") val model = mlr.fit(outlierData) val results = model.transform(testData).select("label", "prediction").collect() @@ -979,11 +979,11 @@ class MultinomialLogisticRegressionSuite 42L) val weightedModel = mlr.fit(weightedData) val overSampledModel = mlr.setWeightCol("").fit(overSampledData) - assert(weightedModel.coefficients ~== overSampledModel.coefficients relTol 0.01) + assert(weightedModel.coefficientMatrix ~== overSampledModel.coefficientMatrix relTol 0.01) } test("thresholds prediction") { - val mlr = new MultinomialLogisticRegression + val mlr = new LogisticRegression val model = mlr.fit(dataset) val basePredictions = model.transform(dataset).select("prediction").collect() @@ -1010,28 +1010,28 @@ class MultinomialLogisticRegressionSuite }) } - test("read/write") { - def checkModelData( - model: MultinomialLogisticRegressionModel, - model2: MultinomialLogisticRegressionModel): Unit = { - assert(model.intercepts === model2.intercepts) - assert(model.coefficients.toArray === model2.coefficients.toArray) - assert(model.numClasses === model2.numClasses) - assert(model.numFeatures === model2.numFeatures) - } - val mlr = new MultinomialLogisticRegression() - testEstimatorAndModelReadWrite(mlr, dataset, - MultinomialLogisticRegressionSuite.allParamSettings, - checkModelData) - } +// test("read/write") { +// def checkModelData( +// model: LogisticRegressionModel, +// model2: LogisticRegressionModel): Unit = { +// assert(model.interceptVector === model2.interceptVector) +// assert(model.coefficientMatrix.toArray === model2.coefficients.toArray) +// assert(model.numClasses === model2.numClasses) +// assert(model.numFeatures === model2.numFeatures) +// } +// val mlr = new LogisticRegression() +// testEstimatorAndModelReadWrite(mlr, dataset, +// MultinomialLogisticRegressionSuite.allParamSettings, +// checkModelData) +// } test("should support all NumericType labels and not support other types") { - val mlr = new MultinomialLogisticRegression().setMaxIter(1) + val mlr = new LogisticRegression().setMaxIter(1) MLTestingUtils - .checkNumericTypes[MultinomialLogisticRegressionModel, MultinomialLogisticRegression]( + .checkNumericTypes[LogisticRegressionModel, LogisticRegression]( mlr, spark) { (expected, actual) => - assert(expected.intercepts === actual.intercepts) - assert(expected.coefficients.toArray === actual.coefficients.toArray) + assert(expected.interceptVector === actual.interceptVector) + assert(expected.coefficientMatrix.toArray === actual.coefficients.toArray) } } } From d4675bea0c531a786381adb9c4763f97ae8bcb9e Mon Sep 17 00:00:00 2001 From: sethah Date: Wed, 24 Aug 2016 22:05:46 
-0700 Subject: [PATCH 02/24] add initial model --- .../classification/LogisticRegression.scala | 46 +++++++++++-------- .../LogisticRegressionSuite.scala | 36 ++++++++++++++- 2 files changed, 61 insertions(+), 21 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index c8c06a4d7752b..15a2450f464de 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -354,10 +354,10 @@ class LogisticRegression @Since("1.2.0") ( case None => histogram.length } val isBinaryClassification = numClasses == 1 || numClasses == 2 + // TODO: use enumeration or similar val isMultinomial = !((!isSet(family) && isBinaryClassification) || $(family) == "binomial") val numCoefficientSets = if (isMultinomial) numClasses else 1 - // TODO: use enumeration or similar if (!isMultinomial) { require(isBinaryClassification, s"Binomial family only supports 1 or 2" + s"outcome classes but found $numClasses") @@ -461,25 +461,33 @@ class LogisticRegression @Since("1.2.0") ( } // TODO: need to add this for multinomial case - if (optInitialModel.isDefined && optInitialModel.get.coefficients.size != numFeatures) { - val vecSize = optInitialModel.get.coefficients.size - logWarning( - s"Initial coefficients will be ignored!! As its size $vecSize did not match the " + - s"expected size $numFeatures") + val initialModelIsValid = optInitialModel.exists { model => + val providedCoefs = model.coefficientMatrix + val modelValid = (providedCoefs.numRows == numCoefficientSets) && + (providedCoefs.numCols == numFeatures) && + (model.interceptVector.size == numCoefficientSets) + if (!modelValid) { + logWarning(s"Initial coefficients will be ignored! Its dimensions " + + s"(${providedCoefs.numRows}, ${providedCoefs.numCols}) did not match the expected " + + s"size ($numCoefficientSets, $numFeatures)") + } + modelValid } - // TODO: removing initial model for now -// if (optInitialModel.isDefined && optInitialModel.get.coefficients.size == numFeatures) { -// val initialCoefficientsWithInterceptArray = initialCoefficientsWithIntercept.toArray -// optInitialModel.get.coefficients.foreachActive { case (index, value) => -// initialCoefficientsWithInterceptArray(index) = value -// } -// if ($(fitIntercept)) { -// initialCoefficientsWithInterceptArray(numFeatures) == optInitialModel.get.intercept -// } -// } - if ($(fitIntercept) && isMultinomial) { - // TODO: can we merge the logic or something here? + if (initialModelIsValid) { + val initialCoefArray = initialCoefficientsWithIntercept.toArray + val providedCoefArray = optInitialModel.get.coefficientMatrix.toArray + providedCoefArray.indices.foreach { i => + val flatIndex = if ($(fitIntercept)) i + i / numFeatures else i + initialCoefArray(flatIndex) = providedCoefArray(i) + } + if ($(fitIntercept)) { + optInitialModel.get.interceptVector.foreachActive { (index, value) => + val coefIndex = (index + 1) * numFeaturesPlusIntercept - 1 + initialCoefArray(coefIndex) = value + } + } + } else if ($(fitIntercept) && isMultinomial) { /* For multinomial logistic regression, when we initialize the coefficients as zeros, it will converge faster if we initialize the intercepts such that @@ -556,7 +564,6 @@ class LogisticRegression @Since("1.2.0") ( as a result, no scaling is needed. 
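
Concretely, the conversion back to the original scale divides each coefficient by its feature's sample standard deviation, with constant (zero-variance) features pinned to zero; a minimal sketch with illustrative numbers:

    // w_orig(j) = w_scaled(j) / sigma(j); zero-variance features forced to 0.0.
    // Values illustrative only.
    val featuresStd = Array(2.0, 0.5, 0.0)
    val scaled = Array(0.8, -1.2, 3.4)
    val original = scaled.indices.map { j =>
      if (featuresStd(j) != 0.0) scaled(j) / featuresStd(j) else 0.0
    }
    // original == Vector(0.4, -2.4, 0.0)
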
*/ val rawCoefficients = state.x.toArray.clone() - // TODO: I think this will work for both binomial and multinomial val coefficientArray = Array.tabulate(numCoefficientSets * numFeatures) { i => // flatIndex will loop though rawCoefficients, and skip the intercept terms. val flatIndex = if ($(fitIntercept)) i + i / numFeatures else i @@ -612,6 +619,7 @@ class LogisticRegression @Since("1.2.0") ( val model = copyValues(new LogisticRegressionModel(uid, coefficients, intercept, numClasses, isMultinomial)) + // TODO: need to implement model summary for MLOR... probably best to do it in another JIRA val (summaryModel, probabilityColName) = model.findSummaryModelAndProbabilityCol() val logRegSummary = new BinaryLogisticRegressionTrainingSummary( summaryModel.transform(dataset), diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala index a1b48539c46e0..a0af82c2ea42c 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala @@ -25,7 +25,7 @@ import scala.util.control.Breaks._ import org.apache.spark.SparkFunSuite import org.apache.spark.ml.classification.LogisticRegressionSuite._ import org.apache.spark.ml.feature.{Instance, LabeledPoint} -import org.apache.spark.ml.linalg.{Vector, Vectors} +import org.apache.spark.ml.linalg.{DenseMatrix, Vector, Vectors} import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils} import org.apache.spark.ml.util.TestingUtils._ @@ -37,7 +37,8 @@ class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { @transient var dataset: Dataset[_] = _ - @transient var binaryDataset: DataFrame = _ + @transient var binaryDataset: Dataset[_] = _ + @transient var multinomialDataset: Dataset[_] = _ private val eps: Double = 1e-5 override def beforeAll(): Unit = { @@ -57,6 +58,23 @@ class LogisticRegressionSuite spark.createDataFrame(sc.parallelize(testData, 4)) } + + multinomialDataset = { + val nPoints = 10000 + val coefficients = Array( + -0.57997, 0.912083, -0.371077, -0.819866, 2.688191, + -0.16624, -0.84355, -0.048509, -0.301789, 4.170682) + + val xMean = Array(5.843, 3.057, 3.758, 1.199) + val xVariance = Array(0.6856, 0.1899, 3.116, 0.581) + + val testData = generateMultinomialLogisticInput( + coefficients, xMean, xVariance, addIntercept = true, nPoints, 42) + + val df = spark.createDataFrame(sc.parallelize(testData, 4)) + df.cache() + df + } } /** @@ -886,6 +904,20 @@ class LogisticRegressionSuite assert(model1a0.intercept ~== model1b.intercept absTol 1E-3) } + test("set initial model") { + // TODO: the binary one doesn't converge any faster + // TODO: should they converge after one or two iterations? 
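
The test below exercises the initial-model path added in this patch; the flat-index copy it relies on can be sketched in isolation (toy dimensions, fitIntercept assumed true):

    // Sketch of laying a provided (numClasses x numFeatures) coefficient matrix
    // into the flat solver vector, row-major, skipping one trailing intercept
    // slot per class row. Dimensions and values illustrative only.
    val numClasses = 3
    val numFeatures = 4
    val flat = new Array[Double](numClasses * (numFeatures + 1))
    val provided = Array.tabulate(numClasses * numFeatures)(_ * 0.1)
    provided.indices.foreach { i =>
      // i / numFeatures counts how many intercept slots have been skipped so far
      flat(i + i / numFeatures) = provided(i)
    }
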
+ val lr = new LogisticRegression() + val model1 = lr.fit(binaryDataset) + val lr2 = new LogisticRegression().setInitialModel(model1) + val model2 = lr2.fit(binaryDataset) + + val lr3 = new LogisticRegression() + val model3 = lr3.fit(multinomialDataset) + val lr4 = new LogisticRegression().setInitialModel(model3) + val model4 = lr4.fit(multinomialDataset) + } + test("logistic regression with all labels the same") { val sameLabels = dataset .withColumn("zeroLabel", lit(0.0)) From a399ef3ab4b9720f081b2e234f993eef61c5587b Mon Sep 17 00:00:00 2001 From: sethah Date: Thu, 25 Aug 2016 09:16:33 -0700 Subject: [PATCH 03/24] fixing some todos, added dual support for weighted tests --- .../classification/LogisticRegression.scala | 65 +++++---- .../LogisticRegressionSuite.scala | 138 +++++++++++------- 2 files changed, 120 insertions(+), 83 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index 15a2450f464de..aca96aa3ba3a1 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -75,16 +75,22 @@ private[classification] trait LogisticRegressionParams extends ProbabilisticClas /** - * Param for the name of family which is a description of the error distribution + * Param for the name of family which is a description of the label distribution * to be used in the model. - * Supported options: "multinomial", "binomial". - * Default is "multinomial". + * Supported options: "auto", "multinomial", "binomial". + * Supported options: + * - "auto": Automatically select the family based on the number of classes: + * If numClasses == 1 || numClasses == 2, set to "binomial". + * Else, set to "multinomial" + * - "binomial": Binary logistic regression with pivoting. + * - "multinomial": Multinomial (softmax) regression without pivoting. + * Default is "auto". * * @group param */ @Since("2.0.0") final val family: Param[String] = new Param(this, "family", - "The name of family which is a description of the error distribution to be used in the " + + "The name of family which is a description of the label distribution to be used in the " + s"model. Supported options: ${supportedFamilyNames.mkString(", ")}.", ParamValidators.inArray[String](supportedFamilyNames)) @@ -243,14 +249,13 @@ class LogisticRegression @Since("1.2.0") ( /** * Sets the value of param [[family]]. - * Default is "multinomial". + * Default is "auto". * * @group setParam */ - // TODO: don't use strings? @Since("2.0.0") def setFamily(value: String): this.type = set(family, value) - setDefault(family -> "multinomial") + setDefault(family -> "auto") /** * Whether to standardize the training features before fitting the model. 
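
With the default now "auto", the family resolution computed a few hunks below reduces to the rule sketched here (an equivalent restatement, assuming the value has already passed the ParamValidators.inArray check):

    // Equivalent restatement of the isMultinomial decision; sketch only.
    def isMultinomialFamily(family: String, numClasses: Int): Boolean =
      family match {
        case "auto"        => numClasses > 2 // 1 or 2 classes resolve to binomial
        case "binomial"    => false          // numClasses <= 2 is required later
        case "multinomial" => true
      }
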
@@ -267,6 +272,7 @@ class LogisticRegression @Since("1.2.0") ( setDefault(standardization -> true) @Since("1.5.0") + // TODO: Check this behavior override def setThreshold(value: Double): this.type = super.setThreshold(value) @Since("1.5.0") @@ -354,12 +360,12 @@ class LogisticRegression @Since("1.2.0") ( case None => histogram.length } val isBinaryClassification = numClasses == 1 || numClasses == 2 - // TODO: use enumeration or similar - val isMultinomial = !((!isSet(family) && isBinaryClassification) || $(family) == "binomial") + val isMultinomial = ($(family) == LogisticRegression.auto && !isBinaryClassification) || + ($(family) == LogisticRegression.multinomial) val numCoefficientSets = if (isMultinomial) numClasses else 1 if (!isMultinomial) { - require(isBinaryClassification, s"Binomial family only supports 1 or 2" + + require(isBinaryClassification, s"Binomial family only supports 1 or 2 " + s"outcome classes but found $numClasses") } @@ -422,7 +428,6 @@ class LogisticRegression @Since("1.2.0") ( new BreezeLBFGS[BDV[Double]]($(maxIter), 10, $(tol)) } else { val standardizationParam = $(standardization) - // TODO: check this works in both cases def regParamL1Fun = (index: Int) => { // Remove the L1 penalization on the intercept val isIntercept = $(fitIntercept) && ((index + 1) % numFeaturesPlusIntercept == 0) @@ -453,14 +458,8 @@ class LogisticRegression @Since("1.2.0") ( new BreezeOWLQN[Int, BDV[Double]]($(maxIter), 10, regParamL1Fun, $(tol)) } - // TODO: double check this - val initialCoefficientsWithIntercept = if (isMultinomial) { - Vectors.zeros(numClasses * numFeaturesPlusIntercept) - } else { - Vectors.zeros(numFeaturesPlusIntercept) - } + val initialCoefficientsWithIntercept = Vectors.zeros(numCoefficientSets * numFeatures) - // TODO: need to add this for multinomial case val initialModelIsValid = optInitialModel.exists { model => val providedCoefs = model.coefficientMatrix val modelValid = (providedCoefs.numRows == numCoefficientSets) && @@ -619,15 +618,19 @@ class LogisticRegression @Since("1.2.0") ( val model = copyValues(new LogisticRegressionModel(uid, coefficients, intercept, numClasses, isMultinomial)) - // TODO: need to implement model summary for MLOR... 
probably best to do it in another JIRA - val (summaryModel, probabilityColName) = model.findSummaryModelAndProbabilityCol() - val logRegSummary = new BinaryLogisticRegressionTrainingSummary( - summaryModel.transform(dataset), - probabilityColName, - $(labelCol), - $(featuresCol), - objectiveHistory) - val m = model.setSummary(logRegSummary) + // TODO: implement summary model for multinomial case + val m = if (!isMultinomial) { + val (summaryModel, probabilityColName) = model.findSummaryModelAndProbabilityCol() + val logRegSummary = new BinaryLogisticRegressionTrainingSummary( + summaryModel.transform(dataset), + probabilityColName, + $(labelCol), + $(featuresCol), + objectiveHistory) + model.setSummary(logRegSummary) + } else { + model + } instr.logSuccess(m) m } @@ -642,7 +645,11 @@ object LogisticRegression extends DefaultParamsReadable[LogisticRegression] { @Since("1.6.0") override def load(path: String): LogisticRegression = super.load(path) - private[classification] lazy val supportedFamilyNames = Array("binomial", "multinomial") + private val multinomial = "multinomial" + private val binomial = "binomial" + private val auto = "auto" + + private[classification] lazy val supportedFamilyNames = Array(auto, binomial, multinomial) } /** @@ -891,7 +898,7 @@ class LogisticRegressionModel private[spark] ( if (trainingSummary.isDefined) newModel.setSummary(trainingSummary.get) newModel.setParent(parent) } - // TODO: basically check all these methods + override protected def raw2prediction(rawPrediction: Vector): Double = { if (isMultinomial) { super.raw2prediction(rawPrediction) diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala index a0af82c2ea42c..899158e45954a 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala @@ -85,6 +85,9 @@ class LogisticRegressionSuite binaryDataset.rdd.map { case Row(label: Double, features: Vector) => label + "," + features.toArray.mkString(",") }.repartition(1).saveAsTextFile("target/tmp/LogisticRegressionSuite/binaryDataset") + multinomialDataset.rdd.map { case Row(label: Double, features: Vector) => + label + "," + features.toArray.mkString(",") + }.repartition(1).saveAsTextFile("target/tmp/LogisticRegressionSuite/multinomialDataset") } test("params") { @@ -100,6 +103,7 @@ class LogisticRegressionSuite assert(lr.getPredictionCol === "prediction") assert(lr.getRawPredictionCol === "rawPrediction") assert(lr.getProbabilityCol === "probability") + assert(lr.getFamily === "multinomial") assert(!lr.isDefined(lr.weightCol)) assert(lr.getFitIntercept) assert(lr.getStandardization) @@ -221,7 +225,6 @@ class LogisticRegressionSuite } test("logistic regression: Predictor, Classifier methods") { - val spark = this.spark val lr = new LogisticRegression val model = lr.fit(dataset) @@ -811,6 +814,7 @@ class LogisticRegressionSuite } test("evaluate on test set") { + // TODO: add for multiclass // Evaluate on test set should be same as that of the transformed training data. 
val lr = new LogisticRegression() .setMaxIter(10) @@ -845,63 +849,89 @@ class LogisticRegressionSuite } - test("binary logistic regression with weighted samples") { - val (dataset, weightedDataset) = { - val nPoints = 1000 - val coefficients = Array(-0.57997, 0.912083, -0.371077, -0.819866, 2.688191) - val xMean = Array(5.843, 3.057, 3.758, 1.199) - val xVariance = Array(0.6856, 0.1899, 3.116, 0.581) - val testData = - generateMultinomialLogisticInput(coefficients, xMean, xVariance, true, nPoints, 42) - - // Let's over-sample the positive samples twice. - val data1 = testData.flatMap { case labeledPoint: LabeledPoint => - if (labeledPoint.label == 1.0) { - Iterator(labeledPoint, labeledPoint) - } else { - Iterator(labeledPoint) - } - } + test("binary logistic regression with weighted data") { + val numClasses = 2 + val numPoints = 40 + val outlierData = MLTestingUtils.genClassificationInstancesWithWeightedOutliers(spark, + numClasses, numPoints) + val testData = spark.createDataFrame(Array.tabulate[LabeledPoint](numClasses) { i => + LabeledPoint(i.toDouble, Vectors.dense(i.toDouble)) + }) + val lr = new LogisticRegression().setWeightCol("weight") + val model = lr.fit(outlierData) + val results = model.transform(testData).select("label", "prediction").collect() + + // check that the predictions are the one to one mapping + results.foreach { case Row(label: Double, pred: Double) => + assert(label === pred) + } + val (overSampledData, weightedData) = + MLTestingUtils.genEquivalentOversampledAndWeightedInstances(outlierData, "label", "features", + 42L) + val weightedModel = lr.fit(weightedData) + val overSampledModel = lr.setWeightCol("").fit(overSampledData) + assert(weightedModel.coefficientMatrix ~== overSampledModel.coefficientMatrix relTol 0.01) + } - val rnd = new Random(8392) - val data2 = testData.flatMap { case LabeledPoint(label: Double, features: Vector) => - if (rnd.nextGaussian() > 0.0) { - if (label == 1.0) { - Iterator( - Instance(label, 1.2, features), - Instance(label, 0.8, features), - Instance(0.0, 0.0, features)) - } else { - Iterator( - Instance(label, 0.3, features), - Instance(1.0, 0.0, features), - Instance(label, 0.1, features), - Instance(label, 0.6, features)) - } - } else { - if (label == 1.0) { - Iterator(Instance(label, 2.0, features)) - } else { - Iterator(Instance(label, 1.0, features)) - } - } - } + test("multinomial logistic regression with weighted data") { + val numClasses = 5 + val numPoints = 40 + val outlierData = MLTestingUtils.genClassificationInstancesWithWeightedOutliers(spark, + numClasses, numPoints) + val testData = spark.createDataFrame(Array.tabulate[LabeledPoint](numClasses) { i => + LabeledPoint(i.toDouble, Vectors.dense(i.toDouble)) + }) + val mlr = new LogisticRegression().setWeightCol("weight") + val model = mlr.fit(outlierData) + val results = model.transform(testData).select("label", "prediction").collect() + + // check that the predictions are the one to one mapping + results.foreach { case Row(label: Double, pred: Double) => + assert(label === pred) + } + val (overSampledData, weightedData) = + MLTestingUtils.genEquivalentOversampledAndWeightedInstances(outlierData, "label", "features", + 42L) + val weightedModel = mlr.fit(weightedData) + val overSampledModel = mlr.setWeightCol("").fit(overSampledData) + assert(weightedModel.coefficientMatrix ~== overSampledModel.coefficientMatrix relTol 0.01) + } - (spark.createDataFrame(sc.parallelize(data1, 4)), - spark.createDataFrame(sc.parallelize(data2, 4))) + test("set family") { + val lr = new 
LogisticRegression().setMaxIter(1) + // don't set anything for binary classification + val model1 = lr.fit(binaryDataset) + assert(model1.coefficientMatrix.numRows === 1 && model1.coefficientMatrix.numCols === 4) + assert(model1.interceptVector.size === 1) + + // set to multinomial for binary classification + val model2 = lr.setFamily("multinomial").fit(binaryDataset) + assert(model2.coefficientMatrix.numRows === 2 && model2.coefficientMatrix.numCols === 4) + assert(model2.interceptVector.size === 2) + + // set to binary for binary classification + val model3 = lr.setFamily("binomial").fit(binaryDataset) + assert(model3.coefficientMatrix.numRows === 1 && model3.coefficientMatrix.numCols === 4) + assert(model3.interceptVector.size === 1) + + // don't set anything for multiclass classification + val mlr = new LogisticRegression().setMaxIter(1) + val model4 = mlr.fit(multinomialDataset) + assert(model4.coefficientMatrix.numRows === 3 && model4.coefficientMatrix.numCols === 4) + assert(model4.interceptVector.size === 3) + + // set to binary for multiclass classification + mlr.setFamily("binomial") + val thrown = intercept[IllegalArgumentException] { + mlr.fit(multinomialDataset) } + assert(thrown.getMessage.contains("Binomial family only supports 1 or 2 outcome classes")) - val trainer1a = (new LogisticRegression).setFitIntercept(true) - .setRegParam(0.0).setStandardization(true) - val trainer1b = (new LogisticRegression).setFitIntercept(true).setWeightCol("weight") - .setRegParam(0.0).setStandardization(true) - val model1a0 = trainer1a.fit(dataset) - val model1a1 = trainer1a.fit(weightedDataset) - val model1b = trainer1b.fit(weightedDataset) - assert(model1a0.coefficients !~= model1a1.coefficients absTol 1E-3) - assert(model1a0.intercept !~= model1a1.intercept absTol 1E-3) - assert(model1a0.coefficients ~== model1b.coefficients absTol 1E-3) - assert(model1a0.intercept ~== model1b.intercept absTol 1E-3) + // set to multinomial for multiclass + mlr.setFamily("multinomial") + val model5 = mlr.fit(multinomialDataset) + assert(model5.coefficientMatrix.numRows === 3 && model5.coefficientMatrix.numCols === 4) + assert(model5.interceptVector.size === 3) } test("set initial model") { From a35469019ba6ca0cb0fd9877c28ae02aba46d337 Mon Sep 17 00:00:00 2001 From: sethah Date: Thu, 25 Aug 2016 13:11:44 -0700 Subject: [PATCH 04/24] all auxiliary tests are merged to LOR, and added initial model test --- .../classification/LogisticRegression.scala | 6 +- .../LogisticRegressionSuite.scala | 315 ++++++++++++++++-- .../MultinomialLogisticRegressionSuite.scala | 264 ++------------- 3 files changed, 322 insertions(+), 263 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index aca96aa3ba3a1..9b1845eaef98a 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -458,7 +458,8 @@ class LogisticRegression @Since("1.2.0") ( new BreezeOWLQN[Int, BDV[Double]]($(maxIter), 10, regParamL1Fun, $(tol)) } - val initialCoefficientsWithIntercept = Vectors.zeros(numCoefficientSets * numFeatures) + val initialCoefficientsWithIntercept = + Vectors.zeros(numCoefficientSets * numFeaturesPlusIntercept) val initialModelIsValid = optInitialModel.exists { model => val providedCoefs = model.coefficientMatrix @@ -678,7 +679,7 @@ class LogisticRegressionModel private[spark] ( 
@Since("1.3.0") def intercept: Double = { if (isMultinomial) { - logWarning("Multiclass model contains an vector of intercepts, use interceptVector instead." + + logWarning("Multiclass model contains a vector of intercepts, use interceptVector instead." + "Returning 0.0 as placeholder.") } _intercept @@ -940,6 +941,7 @@ class LogisticRegressionModel private[spark] ( @Since("1.6.0") object LogisticRegressionModel extends MLReadable[LogisticRegressionModel] { + // TODO: we need to be able to load old models as well @Since("1.6.0") override def read: MLReader[LogisticRegressionModel] = new LogisticRegressionModelReader diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala index 899158e45954a..a8e94fafa50ed 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala @@ -17,6 +17,8 @@ package org.apache.spark.ml.classification +import org.apache.spark.ml.attribute.NominalAttribute + import scala.collection.JavaConverters._ import scala.language.existentials import scala.util.Random @@ -25,7 +27,7 @@ import scala.util.control.Breaks._ import org.apache.spark.SparkFunSuite import org.apache.spark.ml.classification.LogisticRegressionSuite._ import org.apache.spark.ml.feature.{Instance, LabeledPoint} -import org.apache.spark.ml.linalg.{DenseMatrix, Vector, Vectors} +import org.apache.spark.ml.linalg.{Matrices, DenseMatrix, Vector, Vectors} import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils} import org.apache.spark.ml.util.TestingUtils._ @@ -36,7 +38,8 @@ import org.apache.spark.sql.functions.lit class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { - @transient var dataset: Dataset[_] = _ + @transient var smallBinaryDataset: Dataset[_] = _ + @transient var smallMultinomialDataset: Dataset[_] = _ @transient var binaryDataset: Dataset[_] = _ @transient var multinomialDataset: Dataset[_] = _ private val eps: Double = 1e-5 @@ -44,7 +47,25 @@ class LogisticRegressionSuite override def beforeAll(): Unit = { super.beforeAll() - dataset = spark.createDataFrame(generateLogisticInput(1.0, 1.0, nPoints = 100, seed = 42)) + smallBinaryDataset = + spark.createDataFrame(generateLogisticInput(1.0, 1.0, nPoints = 100, seed = 42)) + + smallMultinomialDataset = { + val nPoints = 100 + val coefficients = Array( + -0.57997, 0.912083, -0.371077, + -0.16624, -0.84355, -0.048509) + + val xMean = Array(5.843, 3.057) + val xVariance = Array(0.6856, 0.1899) + + val testData = generateMultinomialLogisticInput( + coefficients, xMean, xVariance, addIntercept = true, nPoints, 42) + + val df = spark.createDataFrame(sc.parallelize(testData, 4)) + df.cache() + df + } binaryDataset = { val nPoints = 10000 @@ -78,7 +99,7 @@ class LogisticRegressionSuite } /** - * Enable the ignored test to export the dataset into CSV format, + * Enable the ignored test to export the smallBinaryDataset into CSV format, * so we can validate the training accuracy compared with R's glmnet package. 
*/ ignore("export test data into CSV format") { @@ -103,12 +124,12 @@ class LogisticRegressionSuite assert(lr.getPredictionCol === "prediction") assert(lr.getRawPredictionCol === "rawPrediction") assert(lr.getProbabilityCol === "probability") - assert(lr.getFamily === "multinomial") + assert(lr.getFamily === "auto") assert(!lr.isDefined(lr.weightCol)) assert(lr.getFitIntercept) assert(lr.getStandardization) - val model = lr.fit(dataset) - model.transform(dataset) + val model = lr.fit(smallBinaryDataset) + model.transform(smallBinaryDataset) .select("label", "probability", "prediction", "rawPrediction") .collect() assert(model.getThreshold === 0.5) @@ -122,11 +143,11 @@ class LogisticRegressionSuite test("empty probabilityCol") { val lr = new LogisticRegression().setProbabilityCol("") - val model = lr.fit(dataset) + val model = lr.fit(smallBinaryDataset) assert(model.hasSummary) // Validate that we re-insert a probability column for evaluation val fieldNames = model.summary.predictions.schema.fieldNames - assert(dataset.schema.fieldNames.toSet.subsetOf( + assert(smallBinaryDataset.schema.fieldNames.toSet.subsetOf( fieldNames.toSet)) assert(fieldNames.exists(s => s.startsWith("probability_"))) } @@ -163,17 +184,59 @@ class LogisticRegressionSuite // thresholds and threshold must be consistent: values withClue("fit with ParamMap should throw error if threshold, thresholds do not match.") { intercept[IllegalArgumentException] { - val lr2model = lr2.fit(dataset, + val lr2model = lr2.fit(smallBinaryDataset, lr2.thresholds -> Array(0.3, 0.7), lr2.threshold -> (expectedThreshold / 2.0)) lr2model.getThreshold } } } + test("thresholds prediction") { + val blr = new LogisticRegression().setFamily("binomial") + val binaryModel = blr.fit(smallBinaryDataset) + + binaryModel.setThreshold(1.0) + val binaryZeroPredictions = + binaryModel.transform(smallBinaryDataset).select("prediction").collect() + assert(binaryZeroPredictions.forall(_.getDouble(0) === 0.0)) + + binaryModel.setThreshold(0.0) + val binaryOnePredictions = + binaryModel.transform(smallBinaryDataset).select("prediction").collect() + assert(binaryOnePredictions.forall(_.getDouble(0) === 1.0)) + + + val mlr = new LogisticRegression().setFamily("multinomial") + val model = mlr.fit(smallMultinomialDataset) + val basePredictions = model.transform(smallMultinomialDataset).select("prediction").collect() + + // should predict all zeros + model.setThresholds(Array(1, 1000, 1000)) + val zeroPredictions = model.transform(smallMultinomialDataset).select("prediction").collect() + assert(zeroPredictions.forall(_.getDouble(0) === 0.0)) + + // should predict all ones + model.setThresholds(Array(1000, 1, 1000)) + val onePredictions = model.transform(smallMultinomialDataset).select("prediction").collect() + assert(onePredictions.forall(_.getDouble(0) === 1.0)) + + // should predict all twos + model.setThresholds(Array(1000, 1000, 1)) + val twoPredictions = model.transform(smallMultinomialDataset).select("prediction").collect() + assert(twoPredictions.forall(_.getDouble(0) === 2.0)) + + // constant threshold scaling is the same as no thresholds + model.setThresholds(Array(1000, 1000, 1000)) + val scaledPredictions = model.transform(smallMultinomialDataset).select("prediction").collect() + assert(scaledPredictions.zip(basePredictions).forall { case (scaled, base) => + scaled.getDouble(0) === base.getDouble(0) + }) + } + test("logistic regression doesn't fit intercept when fitIntercept is off") { val lr = new LogisticRegression lr.setFitIntercept(false) - val 
model = lr.fit(dataset) + val model = lr.fit(smallBinaryDataset) assert(model.intercept === 0.0) // copied model must have the same parent. @@ -187,7 +250,7 @@ class LogisticRegressionSuite .setRegParam(1.0) .setThreshold(0.6) .setProbabilityCol("myProbability") - val model = lr.fit(dataset) + val model = lr.fit(smallBinaryDataset) val parent = model.parent.asInstanceOf[LogisticRegression] assert(parent.getMaxIter === 10) assert(parent.getRegParam === 1.0) @@ -196,16 +259,16 @@ class LogisticRegressionSuite // Modify model params, and check that the params worked. model.setThreshold(1.0) - val predAllZero = model.transform(dataset) + val predAllZero = model.transform(smallBinaryDataset) .select("prediction", "myProbability") .collect() .map { case Row(pred: Double, prob: Vector) => pred } assert(predAllZero.forall(_ === 0), s"With threshold=1.0, expected predictions to be all 0, but only" + - s" ${predAllZero.count(_ === 0)} of ${dataset.count()} were 0.") + s" ${predAllZero.count(_ === 0)} of ${smallBinaryDataset.count()} were 0.") // Call transform with params, and check that the params worked. val predNotAllZero = - model.transform(dataset, model.threshold -> 0.0, + model.transform(smallBinaryDataset, model.threshold -> 0.0, model.probabilityCol -> "myProb") .select("prediction", "myProb") .collect() @@ -214,7 +277,7 @@ class LogisticRegressionSuite // Call fit() with new params, and check as many params as we can. lr.setThresholds(Array(0.6, 0.4)) - val model2 = lr.fit(dataset, lr.maxIter -> 5, lr.regParam -> 0.1, + val model2 = lr.fit(smallBinaryDataset, lr.maxIter -> 5, lr.regParam -> 0.1, lr.probabilityCol -> "theProb") val parent2 = model2.parent.asInstanceOf[LogisticRegression] assert(parent2.getMaxIter === 5) @@ -224,16 +287,63 @@ class LogisticRegressionSuite assert(model2.getProbabilityCol === "theProb") } - test("logistic regression: Predictor, Classifier methods") { + test("multinomial logistic regression: Predictor, Classifier methods") { + val mlr = new LogisticRegression + + val model = mlr.fit(smallMultinomialDataset) + assert(model.numClasses === 3) + val numFeatures = smallMultinomialDataset.select("features").first().getAs[Vector](0).size + assert(model.numFeatures === numFeatures) + + val results = model.transform(smallMultinomialDataset) + // check that raw prediction is coefficients dot features + intercept + results.select("rawPrediction", "features").collect().foreach { + case Row(raw: Vector, features: Vector) => + assert(raw.size === 3) + val margins = Array.tabulate(3) { k => + var margin = 0.0 + features.foreachActive { (index, value) => + margin += value * model.coefficientMatrix(k, index) + } + margin += model.interceptVector(k) + margin + } + assert(raw ~== Vectors.dense(margins) relTol eps) + } + + // Compare rawPrediction with probability + results.select("rawPrediction", "probability").collect().foreach { + case Row(raw: Vector, prob: Vector) => + assert(raw.size === 3) + assert(prob.size === 3) + val max = raw.toArray.max + val subtract = if (max > 0) max else 0.0 + val sum = raw.toArray.map(x => math.exp(x - subtract)).sum + val probFromRaw0 = math.exp(raw(0) - subtract) / sum + val probFromRaw1 = math.exp(raw(1) - subtract) / sum + assert(prob(0) ~== probFromRaw0 relTol eps) + assert(prob(1) ~== probFromRaw1 relTol eps) + assert(prob(2) ~== 1.0 - probFromRaw1 - probFromRaw0 relTol eps) + } + + // Compare prediction with probability + results.select("prediction", "probability").collect().foreach { + case Row(pred: Double, prob: Vector) => + val 
predFromProb = prob.toArray.zipWithIndex.maxBy(_._1)._2 + assert(pred == predFromProb) + } + } + + test("binary logistic regression: Predictor, Classifier methods") { val lr = new LogisticRegression - val model = lr.fit(dataset) + val model = lr.fit(smallBinaryDataset) assert(model.numClasses === 2) - val numFeatures = dataset.select("features").first().getAs[Vector](0).size + val numFeatures = smallBinaryDataset.select("features").first().getAs[Vector](0).size assert(model.numFeatures === numFeatures) val threshold = model.getThreshold - val results = model.transform(dataset) + val results = model.transform(smallBinaryDataset) // Compare rawPrediction with probability results.select("rawPrediction", "probability").collect().foreach { @@ -253,6 +363,29 @@ class LogisticRegressionSuite } } + test("overflow prediction for multiclass") { + val model = new LogisticRegressionModel("mLogReg", + Matrices.dense(3, 2, Array(0.0, 0.0, 0.0, 1.0, 2.0, 3.0)), + Vectors.dense(0.0, 0.0, 0.0), 3, true) + val overFlowData = spark.createDataFrame(Seq( + LabeledPoint(1.0, Vectors.dense(0.0, 1000.0)), + LabeledPoint(1.0, Vectors.dense(0.0, -1.0)) + )) + val results = model.transform(overFlowData).select("rawPrediction", "probability").collect() + + // probabilities are correct when margins have to be adjusted + val raw1 = results(0).getAs[Vector](0) + val prob1 = results(0).getAs[Vector](1) + assert(raw1 === Vectors.dense(1000.0, 2000.0, 3000.0)) + assert(prob1 ~== Vectors.dense(0.0, 0.0, 1.0) absTol eps) + + // probabilities are correct when margins don't have to be adjusted + val raw2 = results(1).getAs[Vector](0) + val prob2 = results(1).getAs[Vector](1) + assert(raw2 === Vectors.dense(-1.0, -2.0, -3.0)) + assert(prob2 ~== Vectors.dense(0.66524096, 0.24472847, 0.09003057) relTol eps) + } + test("MultiClassSummarizer") { val summarizer1 = (new MultiClassSummarizer) .add(0.0).add(3.0).add(4.0).add(3.0).add(6.0) @@ -789,6 +922,7 @@ class LogisticRegressionSuite assert(model2.coefficients ~= coefficientsTheory absTol 1E-6) /* + TODO: why is this needed? The correctness of L1 regularization is already checked elsewhere Using the following R code to load the data and train the model using glmnet package. library("glmnet") @@ -813,17 +947,69 @@ class LogisticRegressionSuite assert(model1.coefficients ~== coefficientsR absTol 1E-6) } + test("multinomial logistic regression with intercept with strong L1 regularization") { + val trainer1 = (new LogisticRegression).setFitIntercept(true) + .setElasticNetParam(1.0).setRegParam(6.0).setStandardization(true) + val trainer2 = (new LogisticRegression).setFitIntercept(true) + .setElasticNetParam(1.0).setRegParam(6.0).setStandardization(false) + + val sqlContext = multinomialDataset.sqlContext + import sqlContext.implicits._ + val model1 = trainer1.fit(multinomialDataset) + val model2 = trainer2.fit(multinomialDataset) + + val histogram = multinomialDataset.as[LabeledPoint].rdd.map(_.label) + .treeAggregate(new MultiClassSummarizer)( + seqOp = (c, v) => (c, v) match { + case (classSummarizer: MultiClassSummarizer, label: Double) => classSummarizer.add(label) + }, + combOp = (c1, c2) => (c1, c2) match { + case (classSummarizer1: MultiClassSummarizer, classSummarizer2: MultiClassSummarizer) => + classSummarizer1.merge(classSummarizer2) + }).histogram + val numFeatures = multinomialDataset.as[LabeledPoint].first().features.size + val numClasses = histogram.length + + /* + For multinomial logistic regression with strong L1 regularization, all the coefficients + will be zeros. 
As a result, the intercepts will be proportional to the log counts in the
+       histogram.
+       {{{
+         \exp(b_k) = count_k * \exp(\lambda)
+         b_k = \log(count_k) + \lambda
+       }}}
+       \lambda is a free parameter, so choose \lambda such that the
+       mean of the intercepts is zero. This yields
+       {{{
+         b_k = \log(count_k)
+         b_k' = b_k - \mean(b_k)
+       }}}
+     */
+    val rawInterceptsTheory = histogram.map(c => math.log(c + 1)) // add 1 for smoothing
+    val rawMean = rawInterceptsTheory.sum / rawInterceptsTheory.length
+    val interceptsTheory = Vectors.dense(rawInterceptsTheory.map(_ - rawMean))
+    val coefficientsTheory = new DenseMatrix(numClasses, numFeatures,
+      Array.fill[Double](numClasses * numFeatures)(0.0), isTransposed = true)
+
+    assert(model1.interceptVector ~== interceptsTheory relTol 1E-3)
+    assert(model1.coefficientMatrix ~= coefficientsTheory absTol 1E-6)
+
+    assert(model2.interceptVector ~== interceptsTheory relTol 1E-3)
+    assert(model2.coefficientMatrix ~= coefficientsTheory absTol 1E-6)
+  }
+
   test("evaluate on test set") {
-    // TODO: add for multiclass
+    // TODO: add for multiclass when model summary becomes available
     // Evaluate on test set should be same as that of the transformed training data.
     val lr = new LogisticRegression()
       .setMaxIter(10)
       .setRegParam(1.0)
       .setThreshold(0.6)
-    val model = lr.fit(dataset)
+    val model = lr.fit(smallBinaryDataset)
     val summary = model.summary.asInstanceOf[BinaryLogisticRegressionSummary]
 
-    val sameSummary = model.evaluate(dataset).asInstanceOf[BinaryLogisticRegressionSummary]
+    val sameSummary =
+      model.evaluate(smallBinaryDataset).asInstanceOf[BinaryLogisticRegressionSummary]
     assert(summary.areaUnderROC === sameSummary.areaUnderROC)
     assert(summary.roc.collect() === sameSummary.roc.collect())
     assert(summary.pr.collect === sameSummary.pr.collect())
@@ -840,7 +1026,7 @@ class LogisticRegressionSuite
       .setMaxIter(10)
       .setRegParam(1.0)
       .setThreshold(0.6)
-    val model = lr.fit(dataset)
+    val model = lr.fit(smallBinaryDataset)
     assert(
       model.summary
         .objectiveHistory
@@ -934,9 +1120,16 @@ class LogisticRegressionSuite
     assert(model5.interceptVector.size === 3)
   }
 
+  test("intercept priors") {
+    // TODO
+    // Get coefficients from normal model with strong L1
+    // Set initial model with computed priors...
+  }
+
   test("set initial model") {
     // TODO: the binary one doesn't converge any faster
     // TODO: should they converge after one or two iterations?
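    // A minimal sketch of the warm start exercised below (the setMaxIter value is
    // illustrative only):
    //   val warm = new LogisticRegression().setInitialModel(model1).setMaxIter(1)
    //   warm.fit(binaryDataset)  // optimization starts from model1's coefficients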
+ // We can just run the other ones for a few iterations then check the predictions val lr = new LogisticRegression() val model1 = lr.fit(binaryDataset) val lr2 = new LogisticRegression().setInitialModel(model1) @@ -949,7 +1142,7 @@ class LogisticRegressionSuite } test("logistic regression with all labels the same") { - val sameLabels = dataset + val sameLabels = smallBinaryDataset .withColumn("zeroLabel", lit(0.0)) .withColumn("oneLabel", lit(1.0)) @@ -990,6 +1183,76 @@ class LogisticRegressionSuite assert(allOneNoInterceptModel.summary.totalIterations > 0) } + test("multiclass logistic regression with all labels the same") { + val constantData = spark.createDataFrame(Seq( + LabeledPoint(4.0, Vectors.dense(0.0)), + LabeledPoint(4.0, Vectors.dense(1.0)), + LabeledPoint(4.0, Vectors.dense(2.0))) + ) + val mlr = new LogisticRegression().setFamily("multinomial") + val model = mlr.fit(constantData) + val results = model.transform(constantData) + results.select("rawPrediction", "probability", "prediction").collect().foreach { + case Row(raw: Vector, prob: Vector, pred: Double) => + assert(raw === Vectors.dense(Array(0.0, 0.0, 0.0, 0.0, Double.PositiveInfinity))) + assert(prob === Vectors.dense(Array(0.0, 0.0, 0.0, 0.0, 1.0))) + assert(pred === 4.0) + } + + // force the model to be trained with only one class + val constantZeroData = spark.createDataFrame(Seq( + LabeledPoint(0.0, Vectors.dense(0.0)), + LabeledPoint(0.0, Vectors.dense(1.0)), + LabeledPoint(0.0, Vectors.dense(2.0))) + ) + val modelZeroLabel = mlr.setFitIntercept(false).fit(constantZeroData) + val resultsZero = modelZeroLabel.transform(constantZeroData) + resultsZero.select("rawPrediction", "probability", "prediction").collect().foreach { + case Row(raw: Vector, prob: Vector, pred: Double) => + assert(prob === Vectors.dense(Array(1.0))) + assert(pred === 0.0) + } + + // ensure that the correct value is predicted when numClasses passed through metadata + val labelMeta = NominalAttribute.defaultAttr.withName("label").withNumValues(6).toMetadata() + val constantDataWithMetadata = constantData + .select(constantData("label").as("label", labelMeta), constantData("features")) + val modelWithMetadata = mlr.setFitIntercept(true).fit(constantDataWithMetadata) + val resultsWithMetadata = modelWithMetadata.transform(constantDataWithMetadata) + resultsWithMetadata.select("rawPrediction", "probability", "prediction").collect().foreach { + case Row(raw: Vector, prob: Vector, pred: Double) => + assert(raw === Vectors.dense(Array(0.0, 0.0, 0.0, 0.0, Double.PositiveInfinity, 0.0))) + assert(prob === Vectors.dense(Array(0.0, 0.0, 0.0, 0.0, 1.0, 0.0))) + assert(pred === 4.0) + } + // TODO: check num iters is zero when it become available in the model + } + + test("numClasses specified in metadata/inferred") { + val lr = new LogisticRegression().setMaxIter(1) + + // specify more classes than unique label values + val labelMeta = NominalAttribute.defaultAttr.withName("label").withNumValues(4).toMetadata() + val df = smallMultinomialDataset.select(smallMultinomialDataset("label").as("label", labelMeta), + smallMultinomialDataset("features")) + val model1 = lr.fit(df) + assert(model1.numClasses === 4) + assert(model1.interceptVector.size === 4) + + // specify two classes when there are really three + val labelMeta1 = NominalAttribute.defaultAttr.withName("label").withNumValues(2).toMetadata() + val df1 = smallMultinomialDataset.select(smallMultinomialDataset("label").as("label", labelMeta1), + smallMultinomialDataset("features")) + val thrown = 
intercept[IllegalArgumentException] { + lr.fit(df1) + } + assert(thrown.getMessage.contains("less than the number of unique labels")) + + // lr should infer the number of classes if not specified + val model3 = lr.fit(smallMultinomialDataset) + assert(model3.numClasses === 3) + } + test("read/write") { def checkModelData(model: LogisticRegressionModel, model2: LogisticRegressionModel): Unit = { assert(model.intercept === model2.intercept) @@ -998,7 +1261,7 @@ class LogisticRegressionSuite assert(model.numFeatures === model2.numFeatures) } val lr = new LogisticRegression() - testEstimatorAndModelReadWrite(lr, dataset, LogisticRegressionSuite.allParamSettings, + testEstimatorAndModelReadWrite(lr, smallBinaryDataset, LogisticRegressionSuite.allParamSettings, checkModelData) } diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/MultinomialLogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/MultinomialLogisticRegressionSuite.scala index 9c7e08820d93b..9969bb02db04b 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/MultinomialLogisticRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/MultinomialLogisticRegressionSuite.scala @@ -86,35 +86,35 @@ class MultinomialLogisticRegressionSuite rdd.saveAsTextFile("target/tmp/MultinomialLogisticRegressionSuite/multinomialDataset") } - test("params") { - ParamsSuite.checkParams(new LogisticRegression) - val model = new LogisticRegressionModel("mLogReg", - Matrices.dense(2, 1, Array(0.0, 0.0)), Vectors.dense(0.0, 0.0), 2, true) - ParamsSuite.checkParams(model) - } - - test("multinomial logistic regression: default params") { - val mlr = new LogisticRegression - assert(mlr.getLabelCol === "label") - assert(mlr.getFeaturesCol === "features") - assert(mlr.getPredictionCol === "prediction") - assert(mlr.getRawPredictionCol === "rawPrediction") - assert(mlr.getProbabilityCol === "probability") - assert(!mlr.isDefined(mlr.weightCol)) - assert(!mlr.isDefined(mlr.thresholds)) - assert(mlr.getFitIntercept) - assert(mlr.getStandardization) - val model = mlr.fit(dataset) - model.transform(dataset) - .select("label", "probability", "prediction", "rawPrediction") - .collect() - assert(model.getFeaturesCol === "features") - assert(model.getPredictionCol === "prediction") - assert(model.getRawPredictionCol === "rawPrediction") - assert(model.getProbabilityCol === "probability") - assert(model.interceptVector !== Vectors.dense(0.0, 0.0)) - assert(model.hasParent) - } +// test("params") { +// ParamsSuite.checkParams(new LogisticRegression) +// val model = new LogisticRegressionModel("mLogReg", +// Matrices.dense(2, 1, Array(0.0, 0.0)), Vectors.dense(0.0, 0.0), 2, true) +// ParamsSuite.checkParams(model) +// } +// +// test("multinomial logistic regression: default params") { +// val mlr = new LogisticRegression +// assert(mlr.getLabelCol === "label") +// assert(mlr.getFeaturesCol === "features") +// assert(mlr.getPredictionCol === "prediction") +// assert(mlr.getRawPredictionCol === "rawPrediction") +// assert(mlr.getProbabilityCol === "probability") +// assert(!mlr.isDefined(mlr.weightCol)) +// assert(!mlr.isDefined(mlr.thresholds)) +// assert(mlr.getFitIntercept) +// assert(mlr.getStandardization) +// val model = mlr.fit(dataset) +// model.transform(dataset) +// .select("label", "probability", "prediction", "rawPrediction") +// .collect() +// assert(model.getFeaturesCol === "features") +// assert(model.getPredictionCol === "prediction") +// 
assert(model.getRawPredictionCol === "rawPrediction") +// assert(model.getProbabilityCol === "probability") +// assert(model.interceptVector !== Vectors.dense(0.0, 0.0)) +// assert(model.hasParent) +// } test("multinomial logistic regression with intercept without regularization") { @@ -813,202 +813,6 @@ class MultinomialLogisticRegressionSuite } */ - test("prediction") { - val model = new LogisticRegressionModel("mLogReg", - Matrices.dense(3, 2, Array(0.0, 0.0, 0.0, 1.0, 2.0, 3.0)), - Vectors.dense(0.0, 0.0, 0.0), 3, true) - val overFlowData = spark.createDataFrame(Seq( - LabeledPoint(1.0, Vectors.dense(0.0, 1000.0)), - LabeledPoint(1.0, Vectors.dense(0.0, -1.0)) - )) - val results = model.transform(overFlowData).select("rawPrediction", "probability").collect() - - // probabilities are correct when margins have to be adjusted - val raw1 = results(0).getAs[Vector](0) - val prob1 = results(0).getAs[Vector](1) - assert(raw1 === Vectors.dense(1000.0, 2000.0, 3000.0)) - assert(prob1 ~== Vectors.dense(0.0, 0.0, 1.0) absTol eps) - - // probabilities are correct when margins don't have to be adjusted - val raw2 = results(1).getAs[Vector](0) - val prob2 = results(1).getAs[Vector](1) - assert(raw2 === Vectors.dense(-1.0, -2.0, -3.0)) - assert(prob2 ~== Vectors.dense(0.66524096, 0.24472847, 0.09003057) relTol eps) - } - - test("multinomial logistic regression: Predictor, Classifier methods") { - val mlr = new LogisticRegression - - val model = mlr.fit(dataset) - assert(model.numClasses === 3) - val numFeatures = dataset.select("features").first().getAs[Vector](0).size - assert(model.numFeatures === numFeatures) - - val results = model.transform(dataset) - // check that raw prediction is coefficients dot features + intercept - results.select("rawPrediction", "features").collect().foreach { - case Row(raw: Vector, features: Vector) => - assert(raw.size === 3) - val margins = Array.tabulate(3) { k => - var margin = 0.0 - features.foreachActive { (index, value) => - margin += value * model.coefficientMatrix(k, index) - } - margin += model.interceptVector(k) - margin - } - assert(raw ~== Vectors.dense(margins) relTol eps) - } - - // Compare rawPrediction with probability - results.select("rawPrediction", "probability").collect().foreach { - case Row(raw: Vector, prob: Vector) => - assert(raw.size === 3) - assert(prob.size === 3) - val max = raw.toArray.max - val subtract = if (max > 0) max else 0.0 - val sum = raw.toArray.map(x => math.exp(x - subtract)).sum - val probFromRaw0 = math.exp(raw(0) - subtract) / sum - val probFromRaw1 = math.exp(raw(1) - subtract) / sum - assert(prob(0) ~== probFromRaw0 relTol eps) - assert(prob(1) ~== probFromRaw1 relTol eps) - assert(prob(2) ~== 1.0 - probFromRaw1 - probFromRaw0 relTol eps) - } - - // Compare prediction with probability - results.select("prediction", "probability").collect().foreach { - case Row(pred: Double, prob: Vector) => - val predFromProb = prob.toArray.zipWithIndex.maxBy(_._1)._2 - assert(pred == predFromProb) - } - } - - test("multinomial logistic regression coefficients should be centered") { - val mlr = new LogisticRegression().setMaxIter(1) - val model = mlr.fit(dataset) - assert(model.interceptVector.toArray.sum ~== 0.0 absTol 1e-6) - assert(model.coefficientMatrix.toArray.sum ~== 0.0 absTol 1e-6) - } - - test("numClasses specified in metadata/inferred") { - val mlr = new LogisticRegression().setMaxIter(1) - - // specify more classes than unique label values - val labelMeta = 
NominalAttribute.defaultAttr.withName("label").withNumValues(4).toMetadata() - val df = dataset.select(dataset("label").as("label", labelMeta), dataset("features")) - val model1 = mlr.fit(df) - assert(model1.numClasses === 4) - assert(model1.interceptVector.size === 4) - - // specify two classes when there are really three - val labelMeta1 = NominalAttribute.defaultAttr.withName("label").withNumValues(2).toMetadata() - val df1 = dataset.select(dataset("label").as("label", labelMeta1), dataset("features")) - val thrown = intercept[IllegalArgumentException] { - mlr.fit(df1) - } - assert(thrown.getMessage.contains("less than the number of unique labels")) - - // mlr should infer the number of classes if not specified - val model3 = mlr.fit(dataset) - assert(model3.numClasses === 3) - } - - test("all labels the same") { - val constantData = spark.createDataFrame(Seq( - LabeledPoint(4.0, Vectors.dense(0.0)), - LabeledPoint(4.0, Vectors.dense(1.0)), - LabeledPoint(4.0, Vectors.dense(2.0))) - ) - val mlr = new LogisticRegression().setFamily("multinomial") - val model = mlr.fit(constantData) - val results = model.transform(constantData) - results.select("rawPrediction", "probability", "prediction").collect().foreach { - case Row(raw: Vector, prob: Vector, pred: Double) => - assert(raw === Vectors.dense(Array(0.0, 0.0, 0.0, 0.0, Double.PositiveInfinity))) - assert(prob === Vectors.dense(Array(0.0, 0.0, 0.0, 0.0, 1.0))) - assert(pred === 4.0) - } - - // force the model to be trained with only one class - val constantZeroData = spark.createDataFrame(Seq( - LabeledPoint(0.0, Vectors.dense(0.0)), - LabeledPoint(0.0, Vectors.dense(1.0)), - LabeledPoint(0.0, Vectors.dense(2.0))) - ) - val modelZeroLabel = mlr.setFitIntercept(false).fit(constantZeroData) - val resultsZero = modelZeroLabel.transform(constantZeroData) - resultsZero.select("rawPrediction", "probability", "prediction").collect().foreach { - case Row(raw: Vector, prob: Vector, pred: Double) => - assert(prob === Vectors.dense(Array(1.0))) - assert(pred === 0.0) - } - - // ensure that the correct value is predicted when numClasses passed through metadata - val labelMeta = NominalAttribute.defaultAttr.withName("label").withNumValues(6).toMetadata() - val constantDataWithMetadata = constantData - .select(constantData("label").as("label", labelMeta), constantData("features")) - val modelWithMetadata = mlr.setFitIntercept(true).fit(constantDataWithMetadata) - val resultsWithMetadata = modelWithMetadata.transform(constantDataWithMetadata) - resultsWithMetadata.select("rawPrediction", "probability", "prediction").collect().foreach { - case Row(raw: Vector, prob: Vector, pred: Double) => - assert(raw === Vectors.dense(Array(0.0, 0.0, 0.0, 0.0, Double.PositiveInfinity, 0.0))) - assert(prob === Vectors.dense(Array(0.0, 0.0, 0.0, 0.0, 1.0, 0.0))) - assert(pred === 4.0) - } - // TODO: check num iters is zero when it become available in the model - } - - test("weighted data") { - val numClasses = 5 - val numPoints = 40 - val outlierData = MLTestingUtils.genClassificationInstancesWithWeightedOutliers(spark, - numClasses, numPoints) - val testData = spark.createDataFrame(Array.tabulate[LabeledPoint](numClasses) { i => - LabeledPoint(i.toDouble, Vectors.dense(i.toDouble)) - }) - val mlr = new LogisticRegression().setWeightCol("weight") - val model = mlr.fit(outlierData) - val results = model.transform(testData).select("label", "prediction").collect() - - // check that the predictions are the one to one mapping - results.foreach { case Row(label: Double, pred: 
Double) => - assert(label === pred) - } - val (overSampledData, weightedData) = - MLTestingUtils.genEquivalentOversampledAndWeightedInstances(outlierData, "label", "features", - 42L) - val weightedModel = mlr.fit(weightedData) - val overSampledModel = mlr.setWeightCol("").fit(overSampledData) - assert(weightedModel.coefficientMatrix ~== overSampledModel.coefficientMatrix relTol 0.01) - } - - test("thresholds prediction") { - val mlr = new LogisticRegression - val model = mlr.fit(dataset) - val basePredictions = model.transform(dataset).select("prediction").collect() - - // should predict all zeros - model.setThresholds(Array(1, 1000, 1000)) - val zeroPredictions = model.transform(dataset).select("prediction").collect() - assert(zeroPredictions.forall(_.getDouble(0) === 0.0)) - - // should predict all ones - model.setThresholds(Array(1000, 1, 1000)) - val onePredictions = model.transform(dataset).select("prediction").collect() - assert(onePredictions.forall(_.getDouble(0) === 1.0)) - - // should predict all twos - model.setThresholds(Array(1000, 1000, 1)) - val twoPredictions = model.transform(dataset).select("prediction").collect() - assert(twoPredictions.forall(_.getDouble(0) === 2.0)) - - // constant threshold scaling is the same as no thresholds - model.setThresholds(Array(1000, 1000, 1000)) - val scaledPredictions = model.transform(dataset).select("prediction").collect() - assert(scaledPredictions.zip(basePredictions).forall { case (scaled, base) => - scaled.getDouble(0) === base.getDouble(0) - }) - } // test("read/write") { // def checkModelData( @@ -1024,16 +828,6 @@ class MultinomialLogisticRegressionSuite // MultinomialLogisticRegressionSuite.allParamSettings, // checkModelData) // } - - test("should support all NumericType labels and not support other types") { - val mlr = new LogisticRegression().setMaxIter(1) - MLTestingUtils - .checkNumericTypes[LogisticRegressionModel, LogisticRegression]( - mlr, spark) { (expected, actual) => - assert(expected.interceptVector === actual.interceptVector) - assert(expected.coefficientMatrix.toArray === actual.coefficients.toArray) - } - } } object MultinomialLogisticRegressionSuite { From d95370b9d73cb123657e278c0e297bb13ef18331 Mon Sep 17 00:00:00 2001 From: sethah Date: Thu, 25 Aug 2016 14:33:34 -0700 Subject: [PATCH 05/24] model loading backward compat --- .../classification/LogisticRegression.scala | 38 +++++++++++++------ .../MultinomialLogisticRegressionSuite.scala | 8 ---- 2 files changed, 27 insertions(+), 19 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index 9b1845eaef98a..e15ebfe00bbac 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -602,12 +602,14 @@ class LogisticRegression @Since("1.2.0") ( /* The intercepts are never regularized, so we always center the mean. */ + // TODO: store model coefficients as multinomial representation? 
+ // If so, zero out one set of coefs or use the +/- representation val interceptVector = if (interceptsArray.nonEmpty && isMultinomial) { val interceptMean = interceptsArray.sum / numClasses interceptsArray.indices.foreach { i => interceptsArray(i) -= interceptMean } Vectors.dense(interceptsArray) - } else if (interceptsArray.nonEmpty) { - Vectors.dense(interceptsArray) + } else if (interceptsArray.length == 2) { + Vectors.dense(interceptsArray.head) } else { Vectors.sparse(numClasses, Seq()) } @@ -980,19 +982,33 @@ object LogisticRegressionModel extends MLReadable[LogisticRegressionModel] { override def load(path: String): LogisticRegressionModel = { val metadata = DefaultParamsReader.loadMetadata(path, sc, className) + val versionRegex = "([0-9]+)\\.([0-9]+)\\.(.+)".r + val versionRegex(major, minor, _) = metadata.sparkVersion val dataPath = new Path(path, "data").toString val data = sparkSession.read.format("parquet").load(dataPath) - val convertedCoefs = MLUtils.convertMatrixColumnsToML(data, "coefficientMatrix") - val converted = MLUtils.convertVectorColumnsToML(convertedCoefs, "interceptVector") - .select("numClasses", "numFeatures", "interceptVector", "coefficientMatrix", - "isMultinomial") - // TODO: numFeatures not needed? - val Row(numClasses: Int, numFeatures: Int, interceptVector: Vector, - coefficientMatrix: Matrix, isMultinomial: Boolean) = converted.head() - val model = new LogisticRegressionModel(metadata.uid, coefficientMatrix, interceptVector, - numClasses, isMultinomial) + val model = if (major.toInt < 2 || (major.toInt == 2 && minor.toInt == 0)) { + // 2.0 and before + val Row(numClasses: Int, numFeatures: Int, intercept: Double, coefficients: Vector) = + MLUtils.convertVectorColumnsToML(data, "coefficients") + .select("numClasses", "numFeatures", "intercept", "coefficients") + .head() + val coefficientMatrix = + new DenseMatrix(1, coefficients.size, coefficients.toArray, isTransposed = true) + val interceptVector = Vectors.dense(intercept) + new LogisticRegressionModel(metadata.uid, coefficientMatrix, + interceptVector, numClasses, isMultinomial = false) + } else { + // 2.1+ + val Row(numClasses: Int, numFeatures: Int, interceptVector: Vector, + coefficientMatrix: Matrix, isMultinomial: Boolean) = data + .select("numClasses", "numFeatures", "interceptVector", "coefficientMatrix", + "isMultinomial").head() + new LogisticRegressionModel(metadata.uid, coefficientMatrix, interceptVector, + numClasses, isMultinomial) + } + DefaultParamsReader.getAndSetParams(model, metadata) model diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/MultinomialLogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/MultinomialLogisticRegressionSuite.scala index 9969bb02db04b..5725a47dd8652 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/MultinomialLogisticRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/MultinomialLogisticRegressionSuite.scala @@ -806,14 +806,6 @@ class MultinomialLogisticRegressionSuite assert(model2.interceptVector.toArray.sum ~== 0.0 absTol eps) } - /* - test("multinomial logistic regression with intercept with strong L1 regularization") { - // TODO: implement this test to check that the priors on the intercepts are correct - // TODO: when initial model becomes available - } - */ - - // test("read/write") { // def checkModelData( // model: LogisticRegressionModel, From 942c3b7939879f360ce0a22c57cd6e31293fb044 Mon Sep 17 00:00:00 2001 From: sethah Date: Thu, 25 Aug 
2016 18:27:57 -0700 Subject: [PATCH 06/24] correcting initial model test and deleting multinomial --- .../classification/LogisticRegression.scala | 52 +++++++++++-------- .../LogisticRegressionSuite.scala | 36 +++++++------ 2 files changed, 49 insertions(+), 39 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index e15ebfe00bbac..ebaaa58065fa4 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -19,7 +19,7 @@ package org.apache.spark.ml.classification import scala.collection.mutable -import breeze.linalg.{DenseVector => BDV} +import breeze.linalg.{DenseVector => BDV, View} import breeze.optimize.{CachedDiffFunction, DiffFunction, LBFGS => BreezeLBFGS, OWLQN => BreezeOWLQN} import org.apache.hadoop.fs.Path @@ -83,7 +83,7 @@ private[classification] trait LogisticRegressionParams extends ProbabilisticClas * If numClasses == 1 || numClasses == 2, set to "binomial". * Else, set to "multinomial" * - "binomial": Binary logistic regression with pivoting. - * - "multinomial": Multinomial (softmax) regression without pivoting. + * - "multinomial": Multinomial logistic (softmax) regression without pivoting. * Default is "auto". * * @group param @@ -181,9 +181,8 @@ private[classification] trait LogisticRegressionParams extends ProbabilisticClas } /** - * Logistic regression. - * Currently, this class only supports binary classification. For multiclass classification, - * use [[MultinomialLogisticRegression]] + * Logistic regression. Supports multinomial logistic (softmax) regression and binomial logistic + * regression. 
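+ * A brief illustration of the two formulations (notation added here for clarity: \beta_k and
+ * b_k denote the k-th coefficient row and intercept):
+ * {{{
+ *   multinomial: P(y = k | x) = \exp(\beta_k^T x + b_k) / \sum_j \exp(\beta_j^T x + b_j)
+ *   binomial:    P(y = 1 | x) = 1 / (1 + \exp(-(\beta^T x + b)))
+ * }}}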
*/ @Since("1.2.0") class LogisticRegression @Since("1.2.0") ( @@ -476,10 +475,11 @@ class LogisticRegression @Since("1.2.0") ( if (initialModelIsValid) { val initialCoefArray = initialCoefficientsWithIntercept.toArray - val providedCoefArray = optInitialModel.get.coefficientMatrix.toArray - providedCoefArray.indices.foreach { i => - val flatIndex = if ($(fitIntercept)) i + i / numFeatures else i - initialCoefArray(flatIndex) = providedCoefArray(i) + val providedCoef = optInitialModel.get.coefficientMatrix + providedCoef.foreachActive { (row, col, value) => + val flatIndex = row * numFeaturesPlusIntercept + col + // We need to scale the coefficients since they will be trained in the scaled space + initialCoefArray(flatIndex) = value * featuresStd(col) } if ($(fitIntercept)) { optInitialModel.get.interceptVector.foreachActive { (index, value) => @@ -608,10 +608,10 @@ class LogisticRegression @Since("1.2.0") ( val interceptMean = interceptsArray.sum / numClasses interceptsArray.indices.foreach { i => interceptsArray(i) -= interceptMean } Vectors.dense(interceptsArray) - } else if (interceptsArray.length == 2) { - Vectors.dense(interceptsArray.head) + } else if (interceptsArray.length == 1) { + Vectors.dense(interceptsArray) } else { - Vectors.sparse(numClasses, Seq()) + Vectors.sparse(numCoefficientSets, Seq()) } (coefficientMatrix, interceptVector, arrayBuilder.result()) } @@ -668,6 +668,7 @@ class LogisticRegressionModel private[spark] ( extends ProbabilisticClassificationModel[Vector, LogisticRegressionModel] with LogisticRegressionParams with MLWritable { + // TODO: remove this def this(uid: String, coefficients: Vector, intercept: Double) { this(uid, new DenseMatrix(1, coefficients.size, coefficients.toArray, isTransposed = true), @@ -675,19 +676,28 @@ class LogisticRegressionModel private[spark] ( } @Since("2.0.0") - // TODO: this should convert sparse to sparse and dense to dense - val coefficients: Vector = Vectors.dense(coefficientMatrix.toArray) + def coefficients: Vector = if (isMultinomial) { + throw new SparkException("Multinomial models contain a matrix of coefficients, use" + + "coefficientMatrix instead.") + } else { + _coefficients + } + + // convert to appropriate vector representation without replicating data + private lazy val _coefficients: Vector = coefficientMatrix match { + case dm: DenseMatrix => Vectors.dense(dm.values) + case sm: SparseMatrix => Vectors.fromBreeze(sm.asBreeze.flatten(View.Require)) + } @Since("1.3.0") - def intercept: Double = { - if (isMultinomial) { - logWarning("Multiclass model contains a vector of intercepts, use interceptVector instead." + - "Returning 0.0 as placeholder.") - } + def intercept: Double = if (isMultinomial) { + throw new SparkException("Multiclass model contains a vector of intercepts, use " + + "interceptVector instead. 
Returning 0.0 as placeholder.") + } else { _intercept } - private val _intercept = if (!isMultinomial) interceptVector.toArray.head else 0.0 + private lazy val _intercept = interceptVector.toArray.head @Since("1.5.0") override def setThreshold(value: Double): this.type = super.setThreshold(value) @@ -943,7 +953,6 @@ class LogisticRegressionModel private[spark] ( @Since("1.6.0") object LogisticRegressionModel extends MLReadable[LogisticRegressionModel] { - // TODO: we need to be able to load old models as well @Since("1.6.0") override def read: MLReader[LogisticRegressionModel] = new LogisticRegressionModelReader @@ -1009,7 +1018,6 @@ object LogisticRegressionModel extends MLReadable[LogisticRegressionModel] { numClasses, isMultinomial) } - DefaultParamsReader.getAndSetParams(model, metadata) model } diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala index a8e94fafa50ed..3a9e0b4f856ca 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala @@ -1120,25 +1120,26 @@ class LogisticRegressionSuite assert(model5.interceptVector.size === 3) } - test("intercept priors") { - // TODO - // Get coefficients from normal model with strong L1 - // Set initial model with computed priors... - } - test("set initial model") { - // TODO: the binary one doesn't converge any faster - // TODO: should they converge after one or two iterations? - // We can just run the other ones for a few iterations then check the predictions val lr = new LogisticRegression() - val model1 = lr.fit(binaryDataset) - val lr2 = new LogisticRegression().setInitialModel(model1) - val model2 = lr2.fit(binaryDataset) + val model1 = lr.fit(smallBinaryDataset) + val lr2 = new LogisticRegression().setInitialModel(model1).setMaxIter(5) + val model2 = lr2.fit(smallBinaryDataset) + val predictions1 = model1.transform(smallBinaryDataset).select("prediction").collect() + val predictions2 = model2.transform(smallBinaryDataset).select("prediction").collect() + predictions1.zip(predictions2).foreach { case (Row(p1: Double), Row(p2: Double)) => + assert(p1 === p2) + } val lr3 = new LogisticRegression() - val model3 = lr3.fit(multinomialDataset) - val lr4 = new LogisticRegression().setInitialModel(model3) - val model4 = lr4.fit(multinomialDataset) + val model3 = lr3.fit(smallMultinomialDataset) + val lr4 = new LogisticRegression().setInitialModel(model3).setMaxIter(5) + val model4 = lr4.fit(smallMultinomialDataset) + val predictions3 = model3.transform(smallMultinomialDataset).select("prediction").collect() + val predictions4 = model4.transform(smallMultinomialDataset).select("prediction").collect() + predictions3.zip(predictions4).foreach { case (Row(p1: Double), Row(p2: Double)) => + assert(p1 === p2) + } } test("logistic regression with all labels the same") { @@ -1241,8 +1242,9 @@ class LogisticRegressionSuite // specify two classes when there are really three val labelMeta1 = NominalAttribute.defaultAttr.withName("label").withNumValues(2).toMetadata() - val df1 = smallMultinomialDataset.select(smallMultinomialDataset("label").as("label", labelMeta1), - smallMultinomialDataset("features")) + val df1 = smallMultinomialDataset + .select(smallMultinomialDataset("label").as("label", labelMeta1), + smallMultinomialDataset("features")) val thrown = intercept[IllegalArgumentException] { 
lr.fit(df1) } From ae6150c33b7e93e5c2b6a7b292953150239d9c25 Mon Sep 17 00:00:00 2001 From: sethah Date: Thu, 25 Aug 2016 21:20:22 -0700 Subject: [PATCH 07/24] small fixes, remove temp constructor --- .../classification/LogisticRegression.scala | 55 ++++--------------- .../ProbabilisticClassifier.scala | 27 +++++++-- .../classification/LogisticRegression.scala | 5 +- .../LogisticRegressionSuite.scala | 3 +- .../ml/classification/OneVsRestSuite.scala | 5 +- .../spark/ml/tuning/CrossValidatorSuite.scala | 5 +- .../ml/tuning/TrainValidationSplitSuite.scala | 5 +- 7 files changed, 47 insertions(+), 58 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index ebaaa58065fa4..2b3cdc5473529 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -365,7 +365,7 @@ class LogisticRegression @Since("1.2.0") ( if (!isMultinomial) { require(isBinaryClassification, s"Binomial family only supports 1 or 2 " + - s"outcome classes but found $numClasses") + s"outcome classes but found $numClasses") } if (isDefined(thresholds)) { @@ -602,8 +602,6 @@ class LogisticRegression @Since("1.2.0") ( /* The intercepts are never regularized, so we always center the mean. */ - // TODO: store model coefficients as multinomial representation? - // If so, zero out one set of coefs or use the +/- representation val interceptVector = if (interceptsArray.nonEmpty && isMultinomial) { val interceptMean = interceptsArray.sum / numClasses interceptsArray.indices.foreach { i => interceptsArray(i) -= interceptMean } @@ -668,13 +666,6 @@ class LogisticRegressionModel private[spark] ( extends ProbabilisticClassificationModel[Vector, LogisticRegressionModel] with LogisticRegressionParams with MLWritable { - // TODO: remove this - def this(uid: String, coefficients: Vector, intercept: Double) { - this(uid, - new DenseMatrix(1, coefficients.size, coefficients.toArray, isTransposed = true), - Vectors.dense(intercept), 2, false) - } - @Since("2.0.0") def coefficients: Vector = if (isMultinomial) { throw new SparkException("Multinomial models contain a matrix of coefficients, use" + @@ -686,13 +677,14 @@ class LogisticRegressionModel private[spark] ( // convert to appropriate vector representation without replicating data private lazy val _coefficients: Vector = coefficientMatrix match { case dm: DenseMatrix => Vectors.dense(dm.values) + // TODO: better way to flatten sparse matrix? case sm: SparseMatrix => Vectors.fromBreeze(sm.asBreeze.flatten(View.Require)) } @Since("1.3.0") def intercept: Double = if (isMultinomial) { - throw new SparkException("Multiclass model contains a vector of intercepts, use " + - "interceptVector instead. Returning 0.0 as placeholder.") + throw new SparkException("Multinomial models contain a vector of intercepts, use " + + "interceptVector instead.") } else { _intercept } @@ -730,6 +722,7 @@ class LogisticRegressionModel private[spark] ( } /** Score (probability) for each class label. */ + // TODO: do we need this anymore? private val scores: Vector => Vector = (features) => { val m = margins(features) val maxMarginIndex = m.argmax @@ -813,36 +806,11 @@ class LogisticRegressionModel private[spark] ( * Predict label for the given feature vector. * The behavior of this can be adjusted using [[thresholds]]. 
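   * A short sketch of that rule, mirroring the implementation in
   * ProbabilisticClassificationModel (a zero threshold wins outright):
   * {{{
   *   prediction = argmax_k probability(k) / thresholds(k)
   * }}}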
*/ - override protected def predict(features: Vector): Double = { + override protected def predict(features: Vector): Double = if (isMultinomial) { + super.predict(features) + } else { // Note: We should use getThreshold instead of $(threshold) since getThreshold is overridden. - if (isMultinomial) { - if (isDefined(thresholds)) { - val thresholds: Array[Double] = getThresholds - val probabilities = scores(features).toArray - var argMax = 0 - var max = Double.NegativeInfinity - var i = 0 - while (i < numClasses) { - if (thresholds(i) == 0.0) { - max = Double.PositiveInfinity - argMax = i - } else { - val scaled = probabilities(i) / thresholds(i) - if (scaled > max) { - max = scaled - argMax = i - } - } - i += 1 - } - argMax - } else { - scores(features).argmax - } - } - else { - if (score(features) > getThreshold) 1 else 0 - } + if (score(features) > getThreshold) 1 else 0 } override protected def raw2probabilityInPlace(rawPrediction: Vector): Vector = { @@ -930,10 +898,10 @@ class LogisticRegressionModel private[spark] ( } override protected def probability2prediction(probability: Vector): Double = { - // Note: We should use getThreshold instead of $(threshold) since getThreshold is overridden. if (isMultinomial) { super.probability2prediction(probability) } else { + // Note: We should use getThreshold instead of $(threshold) since getThreshold is overridden. if (probability(1) > getThreshold) 1 else 0 } } @@ -983,8 +951,7 @@ object LogisticRegressionModel extends MLReadable[LogisticRegressionModel] { } } - private class LogisticRegressionModelReader - extends MLReader[LogisticRegressionModel] { + private class LogisticRegressionModelReader extends MLReader[LogisticRegressionModel] { /** Checked against metadata when loading model */ private val className = classOf[LogisticRegressionModel].getName diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/ProbabilisticClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/ProbabilisticClassifier.scala index 19df8f7edd43c..989bd19528a97 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/ProbabilisticClassifier.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/ProbabilisticClassifier.scala @@ -201,11 +201,30 @@ abstract class ProbabilisticClassificationModel[ probability.argmax } else { val thresholds: Array[Double] = getThresholds - val scaledProbability: Array[Double] = - probability.toArray.zip(thresholds).map { case (p, t) => - if (t == 0.0) Double.PositiveInfinity else p / t + val probabilities = probability.toArray + var argMax = 0 + var max = Double.NegativeInfinity + var i = 0 + while (i < probability.size) { + if (thresholds(i) == 0.0) { + max = Double.PositiveInfinity + argMax = i + } else { + val scaled = probabilities(i) / thresholds(i) + if (scaled > max) { + max = scaled + argMax = i + } } - Vectors.dense(scaledProbability).argmax + i += 1 + } + argMax +// val thresholds: Array[Double] = getThresholds +// val scaledProbability: Array[Double] = +// probability.toArray.zip(thresholds).map { case (p, t) => +// if (t == 0.0) Double.PositiveInfinity else p / t +// } +// Vectors.dense(scaledProbability).argmax } } } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala index ad3dab33d2909..c3770dd0a12df 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala +++ 
b/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala @@ -430,10 +430,9 @@ class LogisticRegressionWithLBFGS lr.setElasticNetParam(elasticNetParam) lr.setStandardization(useFeatureScaling) if (userSuppliedWeights) { - // TODO: check this val uid = Identifiable.randomUID("logreg-static") - lr.setInitialModel(new org.apache.spark.ml.classification.LogisticRegressionModel( - uid, new DenseMatrix(1, initialWeights.size, initialWeights.toArray, isTransposed=true), + lr.setInitialModel(new org.apache.spark.ml.classification.LogisticRegressionModel(uid, + new DenseMatrix(1, initialWeights.size, initialWeights.toArray, isTransposed = true), Vectors.dense(0.0).asML, 2, false)) } lr.setFitIntercept(addIntercept) diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala index 3a9e0b4f856ca..f04d73f979509 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala @@ -113,7 +113,8 @@ class LogisticRegressionSuite test("params") { ParamsSuite.checkParams(new LogisticRegression) - val model = new LogisticRegressionModel("logReg", Vectors.dense(0.0), 0.0) + val model = new LogisticRegressionModel("logReg", + new DenseMatrix(1, 1, Array(0.0), isTransposed = false), Vectors.dense(0.0), 2, false) ParamsSuite.checkParams(model) } diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/OneVsRestSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/OneVsRestSuite.scala index 361dd74cb082e..09e38786aa002 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/OneVsRestSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/OneVsRestSuite.scala @@ -22,7 +22,7 @@ import org.apache.spark.ml.attribute.NominalAttribute import org.apache.spark.ml.classification.LogisticRegressionSuite._ import org.apache.spark.ml.feature.LabeledPoint import org.apache.spark.ml.feature.StringIndexer -import org.apache.spark.ml.linalg.Vectors +import org.apache.spark.ml.linalg.{DenseMatrix, Vectors} import org.apache.spark.ml.param.{ParamMap, ParamsSuite} import org.apache.spark.ml.util.{DefaultReadWriteTest, MetadataUtils, MLTestingUtils} import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS @@ -60,7 +60,8 @@ class OneVsRestSuite extends SparkFunSuite with MLlibTestSparkContext with Defau test("params") { ParamsSuite.checkParams(new OneVsRest) - val lrModel = new LogisticRegressionModel("lr", Vectors.dense(0.0), 0.0) + val lrModel = new LogisticRegressionModel("logReg", + new DenseMatrix(1, 1, Array(0.0), isTransposed = false), Vectors.dense(0.0), 2, false) val model = new OneVsRestModel("ovr", Metadata.empty, Array(lrModel)) ParamsSuite.checkParams(model) } diff --git a/mllib/src/test/scala/org/apache/spark/ml/tuning/CrossValidatorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/tuning/CrossValidatorSuite.scala index 30bd390381e97..0fb26f26e7792 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/tuning/CrossValidatorSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/tuning/CrossValidatorSuite.scala @@ -23,7 +23,7 @@ import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressio import org.apache.spark.ml.classification.LogisticRegressionSuite.generateLogisticInput import org.apache.spark.ml.evaluation.{BinaryClassificationEvaluator, 
Evaluator, RegressionEvaluator} import org.apache.spark.ml.feature.HashingTF -import org.apache.spark.ml.linalg.Vectors +import org.apache.spark.ml.linalg.{DenseMatrix, Vectors} import org.apache.spark.ml.param.{ParamMap, ParamPair} import org.apache.spark.ml.param.shared.HasInputCol import org.apache.spark.ml.regression.LinearRegression @@ -244,7 +244,8 @@ class CrossValidatorSuite test("read/write: CrossValidatorModel") { val lr = new LogisticRegression() .setThreshold(0.6) - val lrModel = new LogisticRegressionModel(lr.uid, Vectors.dense(1.0, 2.0), 1.2) + val lrModel = new LogisticRegressionModel(lr.uid, + new DenseMatrix(1, 1, Array(0.0), isTransposed = false), Vectors.dense(0.0), 2, false) .setThreshold(0.6) val evaluator = new BinaryClassificationEvaluator() .setMetricName("areaUnderPR") // not default metric diff --git a/mllib/src/test/scala/org/apache/spark/ml/tuning/TrainValidationSplitSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/tuning/TrainValidationSplitSuite.scala index c1e9c2fc1dc11..a05a1d641f1bb 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/tuning/TrainValidationSplitSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/tuning/TrainValidationSplitSuite.scala @@ -22,7 +22,7 @@ import org.apache.spark.ml.{Estimator, Model} import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel} import org.apache.spark.ml.classification.LogisticRegressionSuite.generateLogisticInput import org.apache.spark.ml.evaluation.{BinaryClassificationEvaluator, Evaluator, RegressionEvaluator} -import org.apache.spark.ml.linalg.Vectors +import org.apache.spark.ml.linalg.{DenseMatrix, Vectors} import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.param.shared.HasInputCol import org.apache.spark.ml.regression.LinearRegression @@ -133,7 +133,8 @@ class TrainValidationSplitSuite test("read/write: TrainValidationSplitModel") { val lr = new LogisticRegression() .setThreshold(0.6) - val lrModel = new LogisticRegressionModel(lr.uid, Vectors.dense(1.0, 2.0), 1.2) + val lrModel = new LogisticRegressionModel(lr.uid, + new DenseMatrix(1, 1, Array(0.0), isTransposed = false), Vectors.dense(0.0), 2, false) .setThreshold(0.6) val evaluator = new BinaryClassificationEvaluator() val paramMaps = new ParamGridBuilder() From 47fa5fde7a0f4ab17042989fb631cf772ff41069 Mon Sep 17 00:00:00 2001 From: sethah Date: Thu, 25 Aug 2016 21:24:46 -0700 Subject: [PATCH 08/24] rebase --- .../MultinomialLogisticRegression.scala | 632 ------------------ 1 file changed, 632 deletions(-) delete mode 100644 mllib/src/main/scala/org/apache/spark/ml/classification/MultinomialLogisticRegression.scala diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/MultinomialLogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/MultinomialLogisticRegression.scala deleted file mode 100644 index 006f57c0ce260..0000000000000 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/MultinomialLogisticRegression.scala +++ /dev/null @@ -1,632 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.ml.classification - -import scala.collection.mutable - -import breeze.linalg.{DenseVector => BDV} -import breeze.optimize.{CachedDiffFunction, LBFGS => BreezeLBFGS, OWLQN => BreezeOWLQN} -import org.apache.hadoop.fs.Path - -import org.apache.spark.SparkException -import org.apache.spark.annotation.{Experimental, Since} -import org.apache.spark.internal.Logging -import org.apache.spark.ml.feature.Instance -import org.apache.spark.ml.linalg._ -import org.apache.spark.ml.param._ -import org.apache.spark.ml.param.shared._ -import org.apache.spark.ml.util._ -import org.apache.spark.mllib.linalg.VectorImplicits._ -import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer -import org.apache.spark.rdd.RDD -import org.apache.spark.sql.{Dataset, Row} -import org.apache.spark.sql.functions.{col, lit} -import org.apache.spark.sql.types.DoubleType -import org.apache.spark.storage.StorageLevel - -/** - * Params for multinomial logistic (softmax) regression. - */ -private[classification] trait MultinomialLogisticRegressionParams - extends ProbabilisticClassifierParams with HasRegParam with HasElasticNetParam with HasMaxIter - with HasFitIntercept with HasTol with HasStandardization with HasWeightCol - with HasAggregationDepth { - - /** - * Set thresholds in multiclass (or binary) classification to adjust the probability of - * predicting each class. Array must have length equal to the number of classes, with values >= 0. - * The class with largest value p/t is predicted, where p is the original probability of that - * class and t is the class' threshold. - * - * @group setParam - */ - def setThresholds(value: Array[Double]): this.type = { - set(thresholds, value) - } - - /** - * Get thresholds for binary or multiclass classification. - * - * @group getParam - */ - override def getThresholds: Array[Double] = { - $(thresholds) - } -} - -/** - * :: Experimental :: - * Multinomial Logistic (softmax) regression. - */ -@Since("2.1.0") -@Experimental -class MultinomialLogisticRegression @Since("2.1.0") ( - @Since("2.1.0") override val uid: String) - extends ProbabilisticClassifier[Vector, - MultinomialLogisticRegression, MultinomialLogisticRegressionModel] - with MultinomialLogisticRegressionParams with DefaultParamsWritable with Logging { - - @Since("2.1.0") - def this() = this(Identifiable.randomUID("mlogreg")) - - /** - * Set the regularization parameter. - * Default is 0.0. - * - * @group setParam - */ - @Since("2.1.0") - def setRegParam(value: Double): this.type = set(regParam, value) - setDefault(regParam -> 0.0) - - /** - * Set the ElasticNet mixing parameter. - * For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty. - * For 0 < alpha < 1, the penalty is a combination of L1 and L2. - * Default is 0.0 which is an L2 penalty. - * - * @group setParam - */ - @Since("2.1.0") - def setElasticNetParam(value: Double): this.type = set(elasticNetParam, value) - setDefault(elasticNetParam -> 0.0) - - /** - * Set the maximum number of iterations. - * Default is 100. 
- * - * @group setParam - */ - @Since("2.1.0") - def setMaxIter(value: Int): this.type = set(maxIter, value) - setDefault(maxIter -> 100) - - /** - * Set the convergence tolerance of iterations. - * Smaller value will lead to higher accuracy with the cost of more iterations. - * Default is 1E-6. - * - * @group setParam - */ - @Since("2.1.0") - def setTol(value: Double): this.type = set(tol, value) - setDefault(tol -> 1E-6) - - /** - * Whether to fit an intercept term. - * Default is true. - * - * @group setParam - */ - @Since("2.1.0") - def setFitIntercept(value: Boolean): this.type = set(fitIntercept, value) - setDefault(fitIntercept -> true) - - /** - * Whether to standardize the training features before fitting the model. - * The coefficients of models will be always returned on the original scale, - * so it will be transparent for users. Note that with/without standardization, - * the models should always converge to the same solution when no regularization - * is applied. In R's GLMNET package, the default behavior is true as well. - * Default is true. - * - * @group setParam - */ - @Since("2.1.0") - def setStandardization(value: Boolean): this.type = set(standardization, value) - setDefault(standardization -> true) - - /** - * Sets the value of param [[weightCol]]. - * If this is not set or empty, we treat all instance weights as 1.0. - * Default is not set, so all instances have weight one. - * - * @group setParam - */ - @Since("2.1.0") - def setWeightCol(value: String): this.type = set(weightCol, value) - - @Since("2.1.0") - override def setThresholds(value: Array[Double]): this.type = super.setThresholds(value) - - /** - * Suggested depth for treeAggregate (>= 2). - * If the dimensions of features or the number of partitions are large, - * this param could be adjusted to a larger size. - * Default is 2. 
- * @group expertSetParam - */ - @Since("2.1.0") - def setAggregationDepth(value: Int): this.type = set(aggregationDepth, value) - setDefault(aggregationDepth -> 2) - - override protected[spark] def train(dataset: Dataset[_]): MultinomialLogisticRegressionModel = { - val w = if (!isDefined(weightCol) || $(weightCol).isEmpty) lit(1.0) else col($(weightCol)) - val instances: RDD[Instance] = - dataset.select(col($(labelCol)).cast(DoubleType), w, col($(featuresCol))).rdd.map { - case Row(label: Double, weight: Double, features: Vector) => - Instance(label, weight, features) - } - - val handlePersistence = dataset.rdd.getStorageLevel == StorageLevel.NONE - if (handlePersistence) instances.persist(StorageLevel.MEMORY_AND_DISK) - - val instr = Instrumentation.create(this, instances) - instr.logParams(regParam, elasticNetParam, standardization, thresholds, - maxIter, tol, fitIntercept) - - val (summarizer, labelSummarizer) = { - val seqOp = (c: (MultivariateOnlineSummarizer, MultiClassSummarizer), - instance: Instance) => - (c._1.add(instance.features, instance.weight), c._2.add(instance.label, instance.weight)) - - val combOp = (c1: (MultivariateOnlineSummarizer, MultiClassSummarizer), - c2: (MultivariateOnlineSummarizer, MultiClassSummarizer)) => - (c1._1.merge(c2._1), c1._2.merge(c2._2)) - - instances.treeAggregate( - new MultivariateOnlineSummarizer, new MultiClassSummarizer)(seqOp, combOp) - } - - val histogram = labelSummarizer.histogram - val numInvalid = labelSummarizer.countInvalid - val numFeatures = summarizer.mean.size - val numFeaturesPlusIntercept = if (getFitIntercept) numFeatures + 1 else numFeatures - - val numClasses = MetadataUtils.getNumClasses(dataset.schema($(labelCol))) match { - case Some(n: Int) => - require(n >= histogram.length, s"Specified number of classes $n was " + - s"less than the number of unique labels ${histogram.length}") - n - case None => histogram.length - } - - instr.logNumClasses(numClasses) - instr.logNumFeatures(numFeatures) - - val (coefficients, intercepts, objectiveHistory) = { - if (numInvalid != 0) { - val msg = s"Classification labels should be in {0 to ${numClasses - 1} " + - s"Found $numInvalid invalid labels." - logError(msg) - throw new SparkException(msg) - } - - val isConstantLabel = histogram.count(_ != 0) == 1 - - if ($(fitIntercept) && isConstantLabel) { - // we want to produce a model that will always predict the constant label so all the - // coefficients will be zero, and the constant label class intercept will be +inf - val constantLabelIndex = Vectors.dense(histogram).argmax - (Matrices.sparse(numClasses, numFeatures, Array.fill(numFeatures + 1)(0), - Array.empty[Int], Array.empty[Double]), - Vectors.sparse(numClasses, Seq((constantLabelIndex, Double.PositiveInfinity))), - Array.empty[Double]) - } else { - if (!$(fitIntercept) && isConstantLabel) { - logWarning(s"All labels belong to a single class and fitIntercept=false. It's" + - s"a dangerous ground, so the algorithm may not converge.") - } - - val featuresStd = summarizer.variance.toArray.map(math.sqrt) - val featuresMean = summarizer.mean.toArray - if (!$(fitIntercept) && (0 until numFeatures).exists { i => - featuresStd(i) == 0.0 && featuresMean(i) != 0.0 }) { - logWarning("Fitting MultinomialLogisticRegressionModel without intercept on dataset " + - "with constant nonzero column, Spark MLlib outputs zero coefficients for constant " + - "nonzero columns. 
This behavior is the same as R glmnet but different from LIBSVM.") - } - - val regParamL1 = $(elasticNetParam) * $(regParam) - val regParamL2 = (1.0 - $(elasticNetParam)) * $(regParam) - - val bcFeaturesStd = instances.context.broadcast(featuresStd) - val costFun = new LogisticCostFun(instances, numClasses, $(fitIntercept), - $(standardization), bcFeaturesStd, regParamL2, multinomial = true, $(aggregationDepth)) - - val optimizer = if ($(elasticNetParam) == 0.0 || $(regParam) == 0.0) { - new BreezeLBFGS[BDV[Double]]($(maxIter), 10, $(tol)) - } else { - val standardizationParam = $(standardization) - def regParamL1Fun = (index: Int) => { - // Remove the L1 penalization on the intercept - val isIntercept = $(fitIntercept) && ((index + 1) % numFeaturesPlusIntercept == 0) - if (isIntercept) { - 0.0 - } else { - if (standardizationParam) { - regParamL1 - } else { - val featureIndex = if ($(fitIntercept)) { - index % numFeaturesPlusIntercept - } else { - index % numFeatures - } - // If `standardization` is false, we still standardize the data - // to improve the rate of convergence; as a result, we have to - // perform this reverse standardization by penalizing each component - // differently to get effectively the same objective function when - // the training dataset is not standardized. - if (featuresStd(featureIndex) != 0.0) { - regParamL1 / featuresStd(featureIndex) - } else { - 0.0 - } - } - } - } - new BreezeOWLQN[Int, BDV[Double]]($(maxIter), 10, regParamL1Fun, $(tol)) - } - - val initialCoefficientsWithIntercept = Vectors.zeros(numClasses * numFeaturesPlusIntercept) - - if ($(fitIntercept)) { - /* - For multinomial logistic regression, when we initialize the coefficients as zeros, - it will converge faster if we initialize the intercepts such that - it follows the distribution of the labels. - {{{ - P(1) = \exp(b_1) / Z - ... - P(K) = \exp(b_K) / Z - where Z = \sum_{k=1}^{K} \exp(b_k) - }}} - Since this doesn't have a unique solution, one of the solutions that satisfies the - above equations is - {{{ - \exp(b_k) = count_k * \exp(\lambda) - b_k = \log(count_k) * \lambda - }}} - \lambda is a free parameter, so choose the phase \lambda such that the - mean is centered. This yields - {{{ - b_k = \log(count_k) - b_k' = b_k - \mean(b_k) - }}} - */ - val rawIntercepts = histogram.map(c => math.log(c + 1)) // add 1 for smoothing - val rawMean = rawIntercepts.sum / rawIntercepts.length - rawIntercepts.indices.foreach { i => - initialCoefficientsWithIntercept.toArray(i * numFeaturesPlusIntercept + numFeatures) = - rawIntercepts(i) - rawMean - } - } - - val states = optimizer.iterations(new CachedDiffFunction(costFun), - initialCoefficientsWithIntercept.asBreeze.toDenseVector) - - /* - Note that in Multinomial Logistic Regression, the objective history - (loss + regularization) is log-likelihood which is invariant under feature - standardization. As a result, the objective history from optimizer is the same as the - one in the original space. - */ - val arrayBuilder = mutable.ArrayBuilder.make[Double] - var state: optimizer.State = null - while (states.hasNext) { - state = states.next() - arrayBuilder += state.adjustedValue - } - - if (state == null) { - val msg = s"${optimizer.getClass.getName} failed." - logError(msg) - throw new SparkException(msg) - } - bcFeaturesStd.destroy(blocking = false) - - /* - The coefficients are trained in the scaled space; we're converting them back to - the original space. 
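Aside: a standalone sketch (plain Scala, hypothetical label counts) of the intercept initialization derived in the comment above: each intercept starts at the smoothed log count of its label, mean-centered to pin down the free shift parameter lambda:

val histogram = Array(30.0, 50.0, 20.0)                  // hypothetical label counts
val rawIntercepts = histogram.map(c => math.log(c + 1))  // add 1 for smoothing
val rawMean = rawIntercepts.sum / rawIntercepts.length
val initialIntercepts = rawIntercepts.map(_ - rawMean)   // b_k' = b_k - mean(b_k)
// Under softmax these intercepts reproduce the (smoothed) empirical class
// proportions, which tends to reduce the number of iterations to converge.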
- Note that the intercept in scaled space and original space is the same; - as a result, no scaling is needed. - */ - val rawCoefficients = state.x.toArray - val interceptsArray: Array[Double] = if ($(fitIntercept)) { - Array.tabulate(numClasses) { i => - val coefIndex = (i + 1) * numFeaturesPlusIntercept - 1 - rawCoefficients(coefIndex) - } - } else { - Array.empty - } - - val coefficientArray: Array[Double] = Array.tabulate(numClasses * numFeatures) { i => - // flatIndex will loop though rawCoefficients, and skip the intercept terms. - val flatIndex = if ($(fitIntercept)) i + i / numFeatures else i - val featureIndex = i % numFeatures - if (featuresStd(featureIndex) != 0.0) { - rawCoefficients(flatIndex) / featuresStd(featureIndex) - } else { - 0.0 - } - } - val coefficientMatrix = - new DenseMatrix(numClasses, numFeatures, coefficientArray, isTransposed = true) - - /* - When no regularization is applied, the coefficients lack identifiability because - we do not use a pivot class. We can add any constant value to the coefficients and - get the same likelihood. So here, we choose the mean centered coefficients for - reproducibility. This method follows the approach in glmnet, described here: - - Friedman, et al. "Regularization Paths for Generalized Linear Models via - Coordinate Descent," https://core.ac.uk/download/files/153/6287975.pdf - */ - if ($(regParam) == 0.0) { - val coefficientMean = coefficientMatrix.values.sum / (numClasses * numFeatures) - coefficientMatrix.update(_ - coefficientMean) - } - /* - The intercepts are never regularized, so we always center the mean. - */ - val interceptVector = if (interceptsArray.nonEmpty) { - val interceptMean = interceptsArray.sum / numClasses - interceptsArray.indices.foreach { i => interceptsArray(i) -= interceptMean } - Vectors.dense(interceptsArray) - } else { - Vectors.sparse(numClasses, Seq()) - } - - (coefficientMatrix, interceptVector, arrayBuilder.result()) - } - } - - if (handlePersistence) instances.unpersist() - - val model = copyValues( - new MultinomialLogisticRegressionModel(uid, coefficients, intercepts, numClasses)) - instr.logSuccess(model) - model - } - - @Since("2.1.0") - override def copy(extra: ParamMap): MultinomialLogisticRegression = defaultCopy(extra) -} - -@Since("2.1.0") -object MultinomialLogisticRegression extends DefaultParamsReadable[MultinomialLogisticRegression] { - - @Since("2.1.0") - override def load(path: String): MultinomialLogisticRegression = super.load(path) -} - -/** - * :: Experimental :: - * Model produced by [[MultinomialLogisticRegression]]. - */ -@Since("2.1.0") -@Experimental -class MultinomialLogisticRegressionModel private[spark] ( - @Since("2.1.0") override val uid: String, - @Since("2.1.0") val coefficients: Matrix, - @Since("2.1.0") val intercepts: Vector, - @Since("2.1.0") val numClasses: Int) - extends ProbabilisticClassificationModel[Vector, MultinomialLogisticRegressionModel] - with MultinomialLogisticRegressionParams with MLWritable { - - @Since("2.1.0") - override def setThresholds(value: Array[Double]): this.type = super.setThresholds(value) - - @Since("2.1.0") - override def getThresholds: Array[Double] = super.getThresholds - - @Since("2.1.0") - override val numFeatures: Int = coefficients.numCols - - /** Margin (rawPrediction) for each class label. */ - private val margins: Vector => Vector = (features) => { - val m = intercepts.toDense.copy - BLAS.gemv(1.0, coefficients, features, 1.0, m) - m - } - - /** Score (probability) for each class label. 
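Aside: why the mean-centering above is safe when regParam == 0. Softmax probabilities are invariant to adding one constant to every class's margin, so subtracting the mean coefficient (and, just below, the mean intercept) merely picks a canonical representative from a family of equivalent solutions. A self-contained check, assuming only the Scala standard library:

def softmax(margins: Array[Double]): Array[Double] = {
  val max = margins.max                          // overflow guard
  val exps = margins.map(m => math.exp(m - max))
  val sum = exps.sum
  exps.map(_ / sum)
}
val margins = Array(1.0, 2.0, 4.0)
val mean = margins.sum / margins.length
val centered = margins.map(_ - mean)
// Identical probabilities before and after centering.
assert(softmax(margins).zip(softmax(centered)).forall { case (a, b) => math.abs(a - b) < 1e-12 })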
*/ - private val scores: Vector => Vector = (features) => { - val m = margins(features) - val maxMarginIndex = m.argmax - val marginArray = m.toArray - val maxMargin = marginArray(maxMarginIndex) - - // adjust margins for overflow - val sum = { - var temp = 0.0 - var k = 0 - while (k < numClasses) { - marginArray(k) = if (maxMargin > 0) { - math.exp(marginArray(k) - maxMargin) - } else { - math.exp(marginArray(k)) - } - temp += marginArray(k) - k += 1 - } - temp - } - - val scores = Vectors.dense(marginArray) - BLAS.scal(1 / sum, scores) - scores - } - - /** - * Predict label for the given feature vector. - * The behavior of this can be adjusted using [[thresholds]]. - */ - override protected def predict(features: Vector): Double = { - if (isDefined(thresholds)) { - val thresholds: Array[Double] = getThresholds - val probabilities = scores(features).toArray - var argMax = 0 - var max = Double.NegativeInfinity - var i = 0 - while (i < numClasses) { - if (thresholds(i) == 0.0) { - max = Double.PositiveInfinity - argMax = i - } else { - val scaled = probabilities(i) / thresholds(i) - if (scaled > max) { - max = scaled - argMax = i - } - } - i += 1 - } - argMax - } else { - scores(features).argmax - } - } - - override protected def raw2probabilityInPlace(rawPrediction: Vector): Vector = { - rawPrediction match { - case dv: DenseVector => - val size = dv.size - val values = dv.values - - // get the maximum margin - val maxMarginIndex = rawPrediction.argmax - val maxMargin = rawPrediction(maxMarginIndex) - - if (maxMargin == Double.PositiveInfinity) { - var k = 0 - while (k < size) { - values(k) = if (k == maxMarginIndex) 1.0 else 0.0 - k += 1 - } - } else { - val sum = { - var temp = 0.0 - var k = 0 - while (k < numClasses) { - values(k) = if (maxMargin > 0) { - math.exp(values(k) - maxMargin) - } else { - math.exp(values(k)) - } - temp += values(k) - k += 1 - } - temp - } - BLAS.scal(1 / sum, dv) - } - dv - case sv: SparseVector => - throw new RuntimeException("Unexpected error in MultinomialLogisticRegressionModel:" + - " raw2probabilitiesInPlace encountered SparseVector") - } - } - - override protected def predictRaw(features: Vector): Vector = margins(features) - - @Since("2.1.0") - override def copy(extra: ParamMap): MultinomialLogisticRegressionModel = { - val newModel = - copyValues( - new MultinomialLogisticRegressionModel(uid, coefficients, intercepts, numClasses), extra) - newModel.setParent(parent) - } - - /** - * Returns a [[org.apache.spark.ml.util.MLWriter]] instance for this ML instance. - * - * This does not save the [[parent]] currently. 
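Aside: the maxMargin == Double.PositiveInfinity branch in raw2probabilityInPlace above matters because a constant-label model stores an infinite intercept, and exp(inf - inf) is NaN, so the code emits a one-hot distribution instead of normalizing. A tiny illustration of the hazard (plain Scala, hypothetical margins):

val margins = Array(0.0, Double.PositiveInfinity)
val naive = margins.map(m => math.exp(m - margins.max))
assert(naive(1).isNaN)                           // exp(inf - inf) = NaN
// Guarded version: put all probability mass on the infinite-margin class.
val maxIdx = margins.indexOf(margins.max)
val probs = margins.indices.map(i => if (i == maxIdx) 1.0 else 0.0)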
- */ - @Since("2.1.0") - override def write: MLWriter = - new MultinomialLogisticRegressionModel.MultinomialLogisticRegressionModelWriter(this) -} - - -@Since("2.1.0") -object MultinomialLogisticRegressionModel extends MLReadable[MultinomialLogisticRegressionModel] { - - @Since("2.1.0") - override def read: MLReader[MultinomialLogisticRegressionModel] = - new MultinomialLogisticRegressionModelReader - - @Since("2.1.0") - override def load(path: String): MultinomialLogisticRegressionModel = super.load(path) - - /** [[MLWriter]] instance for [[MultinomialLogisticRegressionModel]] */ - private[MultinomialLogisticRegressionModel] - class MultinomialLogisticRegressionModelWriter(instance: MultinomialLogisticRegressionModel) - extends MLWriter with Logging { - - private case class Data( - numClasses: Int, - numFeatures: Int, - intercepts: Vector, - coefficients: Matrix) - - override protected def saveImpl(path: String): Unit = { - // Save metadata and Params - DefaultParamsWriter.saveMetadata(instance, path, sc) - // Save model data: numClasses, numFeatures, intercept, coefficients - val data = Data(instance.numClasses, instance.numFeatures, instance.intercepts, - instance.coefficients) - val dataPath = new Path(path, "data").toString - sqlContext.createDataFrame(Seq(data)).repartition(1).write.parquet(dataPath) - } - } - - private class MultinomialLogisticRegressionModelReader - extends MLReader[MultinomialLogisticRegressionModel] { - - /** Checked against metadata when loading model */ - private val className = classOf[MultinomialLogisticRegressionModel].getName - - override def load(path: String): MultinomialLogisticRegressionModel = { - val metadata = DefaultParamsReader.loadMetadata(path, sc, className) - - val dataPath = new Path(path, "data").toString - val data = sqlContext.read.format("parquet").load(dataPath) - .select("numClasses", "numFeatures", "intercepts", "coefficients").head() - val numClasses = data.getAs[Int](data.fieldIndex("numClasses")) - val intercepts = data.getAs[Vector](data.fieldIndex("intercepts")) - val coefficients = data.getAs[Matrix](data.fieldIndex("coefficients")) - val model = - new MultinomialLogisticRegressionModel(metadata.uid, coefficients, intercepts, numClasses) - - DefaultParamsReader.getAndSetParams(model, metadata) - model - } - } -} From 79273f7be4234de0d97347df02518b690fef7119 Mon Sep 17 00:00:00 2001 From: sethah Date: Fri, 26 Aug 2016 08:21:56 -0700 Subject: [PATCH 09/24] removing old test suite --- .../classification/LogisticRegression.scala | 3 +- .../LogisticRegressionSuite.scala | 710 ++++++++++++++- .../MultinomialLogisticRegressionSuite.scala | 842 ------------------ 3 files changed, 710 insertions(+), 845 deletions(-) delete mode 100644 mllib/src/test/scala/org/apache/spark/ml/classification/MultinomialLogisticRegressionSuite.scala diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index 2b3cdc5473529..80426fc019e83 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -668,7 +668,7 @@ class LogisticRegressionModel private[spark] ( @Since("2.0.0") def coefficients: Vector = if (isMultinomial) { - throw new SparkException("Multinomial models contain a matrix of coefficients, use" + + throw new SparkException("Multinomial models contain a matrix of coefficients, use " + "coefficientMatrix 
instead.") } else { _coefficients @@ -1378,7 +1378,6 @@ class BinaryLogisticRegressionSummary private[classification] ( * $$ *
* - * * @param bcCoefficients The broadcast coefficients corresponding to the features. * @param bcFeaturesStd The broadcast standard deviation values of the features. * @param numClasses the number of possible outcomes for k classes classification problem in diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala index f04d73f979509..47c1a7218fcbd 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala @@ -24,7 +24,7 @@ import scala.language.existentials import scala.util.Random import scala.util.control.Breaks._ -import org.apache.spark.SparkFunSuite +import org.apache.spark.{SparkException, SparkFunSuite} import org.apache.spark.ml.classification.LogisticRegressionSuite._ import org.apache.spark.ml.feature.{Instance, LabeledPoint} import org.apache.spark.ml.linalg.{Matrices, DenseMatrix, Vector, Vectors} @@ -364,6 +364,24 @@ class LogisticRegressionSuite } } + test("coefficients and intercept methods") { + val mlr = new LogisticRegression().setMaxIter(1) + val mlrModel = mlr.fit(smallMultinomialDataset) + val thrownCoef = intercept[SparkException] { + mlrModel.coefficients + } + val thrownIntercept = intercept[SparkException] { + mlrModel.intercept + } + assert(thrownCoef.getMessage().contains("use coefficientMatrix instead")) + assert(thrownIntercept.getMessage().contains("use interceptVector instead")) + + val blr = new LogisticRegression().setMaxIter(1) + val blrModel = blr.fit(smallBinaryDataset) + assert(blrModel.coefficients.size === 1) + assert(blrModel.intercept !== 0.0) + } + test("overflow prediction for multiclass") { val model = new LogisticRegressionModel("mLogReg", Matrices.dense(3, 2, Array(0.0, 0.0, 0.0, 1.0, 2.0, 3.0)), @@ -999,6 +1017,696 @@ class LogisticRegressionSuite assert(model2.coefficientMatrix ~= coefficientsTheory absTol 1E-6) } + test("multinomial logistic regression with intercept without regularization") { + + val trainer1 = (new LogisticRegression).setFitIntercept(true) + .setElasticNetParam(0.0).setRegParam(0.0).setStandardization(true).setMaxIter(100) + val trainer2 = (new LogisticRegression).setFitIntercept(true) + .setElasticNetParam(0.0).setRegParam(0.0).setStandardization(false) + + val model1 = trainer1.fit(multinomialDataset) + val model2 = trainer2.fit(multinomialDataset) + + /* + Using the following R code to load the data and train the model using glmnet package. 
+ > library("glmnet") + > data <- read.csv("path", header=FALSE) + > label = as.factor(data$V1) + > features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5)) + > coefficients = coef(glmnet(features, label, family="multinomial", alpha = 0, lambda = 0)) + > coefficients + $`0` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + -2.24493379 + V2 0.25096771 + V3 -0.03915938 + V4 0.14766639 + V5 0.36810817 + $`1` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + 0.3778931 + V2 -0.3327489 + V3 0.8893666 + V4 -0.2306948 + V5 -0.4442330 + $`2` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + 1.86704066 + V2 0.08178121 + V3 -0.85020722 + V4 0.08302840 + V5 0.07612480 + */ + + val coefficientsR = new DenseMatrix(3, 4, Array( + 0.2509677, -0.0391594, 0.1476664, 0.3681082, + -0.3327489, 0.8893666, -0.2306948, -0.4442330, + 0.0817812, -0.8502072, 0.0830284, 0.0761248), isTransposed = true) + val interceptsR = Vectors.dense(-2.2449338, 0.3778931, 1.8670407) + + assert(model1.coefficientMatrix ~== coefficientsR relTol 0.05) + assert(model1.coefficientMatrix.toArray.sum ~== 0.0 absTol eps) + assert(model1.interceptVector ~== interceptsR relTol 0.05) + assert(model1.interceptVector.toArray.sum ~== 0.0 absTol eps) + assert(model2.coefficientMatrix ~== coefficientsR relTol 0.05) + assert(model2.coefficientMatrix.toArray.sum ~== 0.0 absTol eps) + assert(model2.interceptVector ~== interceptsR relTol 0.05) + assert(model2.interceptVector.toArray.sum ~== 0.0 absTol eps) + } + + test("multinomial logistic regression without intercept without regularization") { + + val trainer1 = (new LogisticRegression).setFitIntercept(false) + .setElasticNetParam(0.0).setRegParam(0.0).setStandardization(true) + val trainer2 = (new LogisticRegression).setFitIntercept(false) + .setElasticNetParam(0.0).setRegParam(0.0).setStandardization(false) + + val model1 = trainer1.fit(multinomialDataset) + val model2 = trainer2.fit(multinomialDataset) + + /* + Using the following R code to load the data and train the model using glmnet package. + library("glmnet") + data <- read.csv("path", header=FALSE) + label = as.factor(data$V1) + features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5)) + coefficients = coef(glmnet(features, label, family="multinomial", alpha = 0, lambda = 0, + intercept=F)) + > coefficients + $`0` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + . + V2 0.06992464 + V3 -0.36562784 + V4 0.12142680 + V5 0.32052211 + $`1` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + . + V2 -0.3036269 + V3 0.9449630 + V4 -0.2271038 + V5 -0.4364839 + $`2` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + . 
+ V2 0.2337022 + V3 -0.5793351 + V4 0.1056770 + V5 0.1159618 + */ + + val coefficientsR = new DenseMatrix(3, 4, Array( + 0.0699246, -0.3656278, 0.1214268, 0.3205221, + -0.3036269, 0.9449630, -0.2271038, -0.4364839, + 0.2337022, -0.5793351, 0.1056770, 0.1159618), isTransposed = true) + + assert(model1.coefficientMatrix ~== coefficientsR relTol 0.05) + assert(model1.coefficientMatrix.toArray.sum ~== 0.0 absTol eps) + assert(model1.interceptVector.toArray === Array.fill(3)(0.0)) + assert(model1.interceptVector.toArray.sum ~== 0.0 absTol eps) + assert(model2.coefficientMatrix ~== coefficientsR relTol 0.05) + assert(model2.coefficientMatrix.toArray.sum ~== 0.0 absTol eps) + assert(model2.interceptVector.toArray === Array.fill(3)(0.0)) + assert(model2.interceptVector.toArray.sum ~== 0.0 absTol eps) + } + + test("multinomial logistic regression with intercept with L1 regularization") { + + // use tighter constraints because OWL-QN solver takes longer to converge + val trainer1 = (new LogisticRegression).setFitIntercept(true) + .setElasticNetParam(1.0).setRegParam(0.05).setStandardization(true) + .setMaxIter(300).setTol(1e-10) + val trainer2 = (new LogisticRegression).setFitIntercept(true) + .setElasticNetParam(1.0).setRegParam(0.05).setStandardization(false) + .setMaxIter(300).setTol(1e-10) + + val model1 = trainer1.fit(multinomialDataset) + val model2 = trainer2.fit(multinomialDataset) + + /* + Use the following R code to load the data and train the model using glmnet package. + library("glmnet") + data <- read.csv("path", header=FALSE) + label = as.factor(data$V1) + features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5)) + coefficientsStd = coef(glmnet(features, label, family="multinomial", alpha = 1, + lambda = 0.05, standardization=T)) + coefficients = coef(glmnet(features, label, family="multinomial", alpha = 1, lambda = 0.05, + standardization=F)) + > coefficientsStd + $`0` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + -0.68988825 + V2 . + V3 . + V4 . + V5 0.09404023 + + $`1` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + -0.2303499 + V2 -0.1232443 + V3 0.3258380 + V4 -0.1564688 + V5 -0.2053965 + + $`2` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + 0.9202381 + V2 . + V3 -0.4803856 + V4 . + V5 . + + > coefficients + $`0` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + -0.44893320 + V2 . + V3 . + V4 0.01933812 + V5 0.03666044 + + $`1` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + 0.7376760 + V2 -0.0577182 + V3 . + V4 -0.2081718 + V5 -0.1304592 + + $`2` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + -0.2887428 + V2 . + V3 . + V4 . + V5 . 
+ */ + + val coefficientsRStd = new DenseMatrix(3, 4, Array( + 0.0, 0.0, 0.0, 0.09404023, + -0.1232443, 0.3258380, -0.1564688, -0.2053965, + 0.0, -0.4803856, 0.0, 0.0), isTransposed = true) + val interceptsRStd = Vectors.dense(-0.68988825, -0.2303499, 0.9202381) + + val coefficientsR = new DenseMatrix(3, 4, Array( + 0.0, 0.0, 0.01933812, 0.03666044, + -0.0577182, 0.0, -0.2081718, -0.1304592, + 0.0, 0.0, 0.0, 0.0), isTransposed = true) + val interceptsR = Vectors.dense(-0.44893320, 0.7376760, -0.2887428) + + assert(model1.coefficientMatrix ~== coefficientsRStd absTol 0.02) + assert(model1.interceptVector ~== interceptsRStd relTol 0.1) + assert(model1.interceptVector.toArray.sum ~== 0.0 absTol eps) + assert(model2.coefficientMatrix ~== coefficientsR absTol 0.02) + assert(model2.interceptVector ~== interceptsR relTol 0.1) + assert(model2.interceptVector.toArray.sum ~== 0.0 absTol eps) + } + + test("multinomial logistic regression without intercept with L1 regularization") { + val trainer1 = (new LogisticRegression).setFitIntercept(false) + .setElasticNetParam(1.0).setRegParam(0.05).setStandardization(true) + val trainer2 = (new LogisticRegression).setFitIntercept(false) + .setElasticNetParam(1.0).setRegParam(0.05).setStandardization(false) + + val model1 = trainer1.fit(multinomialDataset) + val model2 = trainer2.fit(multinomialDataset) + /* + Use the following R code to load the data and train the model using glmnet package. + library("glmnet") + data <- read.csv("path", header=FALSE) + label = as.factor(data$V1) + features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5)) + coefficientsStd = coef(glmnet(features, label, family="multinomial", alpha = 1, + lambda = 0.05, intercept=F, standardization=T)) + coefficients = coef(glmnet(features, label, family="multinomial", alpha = 1, lambda = 0.05, + intercept=F, standardization=F)) + > coefficientsStd + $`0` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + . + V2 . + V3 . + V4 . + V5 0.01525105 + + $`1` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + . + V2 -0.1502410 + V3 0.5134658 + V4 -0.1601146 + V5 -0.2500232 + + $`2` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + . + V2 0.003301875 + V3 . + V4 . + V5 . + + > coefficients + $`0` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + . + V2 . + V3 . + V4 . + V5 . + + $`1` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + . + V2 . + V3 0.1943624 + V4 -0.1902577 + V5 -0.1028789 + + $`2` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + . + V2 . + V3 . + V4 . + V5 . 
+ */ + + val coefficientsRStd = new DenseMatrix(3, 4, Array( + 0.0, 0.0, 0.0, 0.01525105, + -0.1502410, 0.5134658, -0.1601146, -0.2500232, + 0.003301875, 0.0, 0.0, 0.0), isTransposed = true) + + val coefficientsR = new DenseMatrix(3, 4, Array( + 0.0, 0.0, 0.0, 0.0, + 0.0, 0.1943624, -0.1902577, -0.1028789, + 0.0, 0.0, 0.0, 0.0), isTransposed = true) + + assert(model1.coefficientMatrix ~== coefficientsRStd absTol 0.01) + assert(model1.interceptVector.toArray === Array.fill(3)(0.0)) + assert(model1.interceptVector.toArray.sum ~== 0.0 absTol eps) + assert(model2.coefficientMatrix ~== coefficientsR absTol 0.01) + assert(model2.interceptVector.toArray === Array.fill(3)(0.0)) + assert(model2.interceptVector.toArray.sum ~== 0.0 absTol eps) + } + + test("multinomial logistic regression with intercept with L2 regularization") { + val trainer1 = (new LogisticRegression).setFitIntercept(true) + .setElasticNetParam(0.0).setRegParam(0.1).setStandardization(true) + val trainer2 = (new LogisticRegression).setFitIntercept(true) + .setElasticNetParam(0.0).setRegParam(0.1).setStandardization(false) + + val model1 = trainer1.fit(multinomialDataset) + val model2 = trainer2.fit(multinomialDataset) + /* + Use the following R code to load the data and train the model using glmnet package. + library("glmnet") + data <- read.csv("path", header=FALSE) + label = as.factor(data$V1) + features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5)) + coefficientsStd = coef(glmnet(features, label, family="multinomial", alpha = 0, + lambda = 0.1, intercept=T, standardization=T)) + coefficients = coef(glmnet(features, label, family="multinomial", alpha = 0, + lambda = 0.1, intercept=T, standardization=F)) + > coefficientsStd + $`0` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + -1.70040424 + V2 0.17576070 + V3 0.01527894 + V4 0.10216108 + V5 0.26099531 + + $`1` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + 0.2438590 + V2 -0.2238875 + V3 0.5967610 + V4 -0.1555496 + V5 -0.3010479 + + $`2` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + 1.45654525 + V2 0.04812679 + V3 -0.61203992 + V4 0.05338850 + V5 0.04005258 + + > coefficients + $`0` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + -1.65488543 + V2 0.15715048 + V3 0.01992903 + V4 0.12428858 + V5 0.22130317 + + $`1` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + 1.1297533 + V2 -0.1974768 + V3 0.2776373 + V4 -0.1869445 + V5 -0.2510320 + + $`2` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + 0.52513212 + V2 0.04032627 + V3 -0.29756637 + V4 0.06265594 + V5 0.02972883 + */ + + val coefficientsRStd = new DenseMatrix(3, 4, Array( + 0.17576070, 0.01527894, 0.10216108, 0.26099531, + -0.2238875, 0.5967610, -0.1555496, -0.3010479, + 0.04812679, -0.61203992, 0.05338850, 0.04005258), isTransposed = true) + val interceptsRStd = Vectors.dense(-1.70040424, 0.2438590, 1.45654525) + + val coefficientsR = new DenseMatrix(3, 4, Array( + 0.15715048, 0.01992903, 0.12428858, 0.22130317, + -0.1974768, 0.2776373, -0.1869445, -0.2510320, + 0.04032627, -0.29756637, 0.06265594, 0.02972883), isTransposed = true) + val interceptsR = Vectors.dense(-1.65488543, 1.1297533, 0.52513212) + + assert(model1.coefficientMatrix ~== coefficientsRStd relTol 0.05) + assert(model1.interceptVector ~== interceptsRStd relTol 0.05) + assert(model1.interceptVector.toArray.sum ~== 0.0 absTol eps) + assert(model2.coefficientMatrix ~== coefficientsR relTol 0.05) + assert(model2.interceptVector ~== interceptsR relTol 0.05) + assert(model2.interceptVector.toArray.sum ~== 0.0 absTol eps) + } + + 
test("multinomial logistic regression without intercept with L2 regularization") { + val trainer1 = (new LogisticRegression).setFitIntercept(false) + .setElasticNetParam(0.0).setRegParam(0.1).setStandardization(true) + val trainer2 = (new LogisticRegression).setFitIntercept(false) + .setElasticNetParam(0.0).setRegParam(0.1).setStandardization(false) + + val model1 = trainer1.fit(multinomialDataset) + val model2 = trainer2.fit(multinomialDataset) + /* + Use the following R code to load the data and train the model using glmnet package. + library("glmnet") + data <- read.csv("path", header=FALSE) + label = as.factor(data$V1) + features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5)) + coefficientsStd = coef(glmnet(features, label, family="multinomial", alpha = 0, + lambda = 0.1, intercept=F, standardization=T)) + coefficients = coef(glmnet(features, label, family="multinomial", alpha = 0, + lambda = 0.1, intercept=F, standardization=F)) + > coefficientsStd + $`0` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + . + V2 0.03904171 + V3 -0.23354322 + V4 0.08288096 + V5 0.22706393 + + $`1` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + . + V2 -0.2061848 + V3 0.6341398 + V4 -0.1530059 + V5 -0.2958455 + + $`2` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + . + V2 0.16714312 + V3 -0.40059658 + V4 0.07012496 + V5 0.06878158 + > coefficients + $`0` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + . + V2 -0.005704542 + V3 -0.144466409 + V4 0.092080736 + V5 0.182927657 + + $`1` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + . + V2 -0.08469036 + V3 0.38996748 + V4 -0.16468436 + V5 -0.22522976 + + $`2` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + . + V2 0.09039490 + V3 -0.24550107 + V4 0.07260362 + V5 0.04230210 + */ + val coefficientsRStd = new DenseMatrix(3, 4, Array( + 0.03904171, -0.23354322, 0.08288096, 0.2270639, + -0.2061848, 0.6341398, -0.1530059, -0.2958455, + 0.16714312, -0.40059658, 0.07012496, 0.06878158), isTransposed = true) + + val coefficientsR = new DenseMatrix(3, 4, Array( + -0.005704542, -0.144466409, 0.092080736, 0.182927657, + -0.08469036, 0.38996748, -0.16468436, -0.22522976, + 0.0903949, -0.24550107, 0.07260362, 0.0423021), isTransposed = true) + + assert(model1.coefficientMatrix ~== coefficientsRStd absTol 0.01) + assert(model1.interceptVector.toArray === Array.fill(3)(0.0)) + assert(model1.interceptVector.toArray.sum ~== 0.0 absTol eps) + assert(model2.coefficientMatrix ~== coefficientsR absTol 0.01) + assert(model2.interceptVector.toArray === Array.fill(3)(0.0)) + assert(model2.interceptVector.toArray.sum ~== 0.0 absTol eps) + } + + test("multinomial logistic regression with intercept with elasticnet regularization") { + val trainer1 = (new LogisticRegression).setFitIntercept(true) + .setElasticNetParam(0.5).setRegParam(0.1).setStandardization(true) + .setMaxIter(300).setTol(1e-10) + val trainer2 = (new LogisticRegression).setFitIntercept(true) + .setElasticNetParam(0.5).setRegParam(0.1).setStandardization(false) + .setMaxIter(300).setTol(1e-10) + + val model1 = trainer1.fit(multinomialDataset) + val model2 = trainer2.fit(multinomialDataset) + /* + Use the following R code to load the data and train the model using glmnet package. 
+ library("glmnet") + data <- read.csv("path", header=FALSE) + label = as.factor(data$V1) + features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5)) + coefficientsStd = coef(glmnet(features, label, family="multinomial", alpha = 0.5, + lambda = 0.1, intercept=T, standardization=T)) + coefficients = coef(glmnet(features, label, family="multinomial", alpha = 0.5, + lambda = 0.1, intercept=T, standardization=F)) + > coefficientsStd + $`0` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + -0.5521819483 + V2 0.0003092611 + V3 . + V4 . + V5 0.0913818490 + + $`1` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + -0.27531989 + V2 -0.09790029 + V3 0.28502034 + V4 -0.12416487 + V5 -0.16513373 + + $`2` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + 0.8275018 + V2 . + V3 -0.4044859 + V4 . + V5 . + + > coefficients + $`0` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + -0.39876213 + V2 . + V3 . + V4 0.02547520 + V5 0.03893991 + + $`1` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + 0.61089869 + V2 -0.04224269 + V3 . + V4 -0.18923970 + V5 -0.09104249 + + $`2` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + -0.2121366 + V2 . + V3 . + V4 . + V5 . + */ + + val coefficientsRStd = new DenseMatrix(3, 4, Array( + 0.0003092611, 0.0, 0.0, 0.091381849, + -0.09790029, 0.28502034, -0.12416487, -0.16513373, + 0.0, -0.4044859, 0.0, 0.0), isTransposed = true) + val interceptsRStd = Vectors.dense(-0.5521819483, -0.27531989, 0.8275018) + + val coefficientsR = new DenseMatrix(3, 4, Array( + 0.0, 0.0, 0.0254752, 0.03893991, + -0.04224269, 0.0, -0.1892397, -0.09104249, + 0.0, 0.0, 0.0, 0.0), isTransposed = true) + val interceptsR = Vectors.dense(-0.39876213, 0.61089869, -0.2121366) + + assert(model1.coefficientMatrix ~== coefficientsRStd absTol 0.01) + assert(model1.interceptVector ~== interceptsRStd absTol 0.01) + assert(model1.interceptVector.toArray.sum ~== 0.0 absTol eps) + assert(model2.coefficientMatrix ~== coefficientsR absTol 0.01) + assert(model2.interceptVector ~== interceptsR absTol 0.01) + assert(model2.interceptVector.toArray.sum ~== 0.0 absTol eps) + } + + test("multinomial logistic regression without intercept with elasticnet regularization") { + val trainer1 = (new LogisticRegression).setFitIntercept(false) + .setElasticNetParam(0.5).setRegParam(0.1).setStandardization(true) + .setMaxIter(300).setTol(1e-10) + val trainer2 = (new LogisticRegression).setFitIntercept(false) + .setElasticNetParam(0.5).setRegParam(0.1).setStandardization(false) + .setMaxIter(300).setTol(1e-10) + + val model1 = trainer1.fit(multinomialDataset) + val model2 = trainer2.fit(multinomialDataset) + /* + Use the following R code to load the data and train the model using glmnet package. + library("glmnet") + data <- read.csv("path", header=FALSE) + label = as.factor(data$V1) + features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5)) + coefficientsStd = coef(glmnet(features, label, family="multinomial", alpha = 0.5, + lambda = 0.1, intercept=F, standardization=T)) + coefficients = coef(glmnet(features, label, family="multinomial", alpha = 0.5, + lambda = 0.1, intercept=F, standardization=F)) + > coefficientsStd + $`0` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + . + V2 . + V3 . + V4 . + V5 0.03543706 + + $`1` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + . + V2 -0.1187387 + V3 0.4025482 + V4 -0.1270969 + V5 -0.1918386 + + $`2` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + . + V2 0.00774365 + V3 . + V4 . + V5 . + + > coefficients + $`0` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + . + V2 . 
+ V3 . + V4 . + V5 . + + $`1` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + . + V2 . + V3 0.14666497 + V4 -0.16570638 + V5 -0.05982875 + + $`2` + 5 x 1 sparse Matrix of class "dgCMatrix" + s0 + . + V2 . + V3 . + V4 . + V5 . + */ + val coefficientsRStd = new DenseMatrix(3, 4, Array( + 0.0, 0.0, 0.0, 0.03543706, + -0.1187387, 0.4025482, -0.1270969, -0.1918386, + 0.0, 0.0, 0.0, 0.00774365), isTransposed = true) + + val coefficientsR = new DenseMatrix(3, 4, Array( + 0.0, 0.0, 0.0, 0.0, + 0.0, 0.14666497, -0.16570638, -0.05982875, + 0.0, 0.0, 0.0, 0.0), isTransposed = true) + + assert(model1.coefficientMatrix ~== coefficientsRStd absTol 0.01) + assert(model1.interceptVector.toArray === Array.fill(3)(0.0)) + assert(model1.interceptVector.toArray.sum ~== 0.0 absTol eps) + assert(model2.coefficientMatrix ~== coefficientsR absTol 0.01) + assert(model2.interceptVector.toArray === Array.fill(3)(0.0)) + assert(model2.interceptVector.toArray.sum ~== 0.0 absTol eps) + } + test("evaluate on test set") { // TODO: add for multiclass when model summary becomes available // Evaluate on test set should be same as that of the transformed training data. diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/MultinomialLogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/MultinomialLogisticRegressionSuite.scala deleted file mode 100644 index 5725a47dd8652..0000000000000 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/MultinomialLogisticRegressionSuite.scala +++ /dev/null @@ -1,842 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.ml.classification - -import scala.language.existentials - -import org.apache.spark.SparkFunSuite -import org.apache.spark.ml.attribute.NominalAttribute -import org.apache.spark.ml.classification.LogisticRegressionSuite._ -import org.apache.spark.ml.feature.LabeledPoint -import org.apache.spark.ml.linalg._ -import org.apache.spark.ml.param.ParamsSuite -import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils} -import org.apache.spark.ml.util.TestingUtils._ -import org.apache.spark.mllib.util.MLlibTestSparkContext -import org.apache.spark.sql.{DataFrame, Dataset, Row} - -class MultinomialLogisticRegressionSuite - extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { - - @transient var dataset: Dataset[_] = _ - @transient var multinomialDataset: DataFrame = _ - private val eps: Double = 1e-5 - - override def beforeAll(): Unit = { - super.beforeAll() - - dataset = { - val nPoints = 100 - val coefficients = Array( - -0.57997, 0.912083, -0.371077, - -0.16624, -0.84355, -0.048509) - - val xMean = Array(5.843, 3.057) - val xVariance = Array(0.6856, 0.1899) - - val testData = generateMultinomialLogisticInput( - coefficients, xMean, xVariance, addIntercept = true, nPoints, 42) - - val df = spark.createDataFrame(sc.parallelize(testData, 4)) - df.cache() - df - } - - multinomialDataset = { - val nPoints = 10000 - val coefficients = Array( - -0.57997, 0.912083, -0.371077, -0.819866, 2.688191, - -0.16624, -0.84355, -0.048509, -0.301789, 4.170682) - - val xMean = Array(5.843, 3.057, 3.758, 1.199) - val xVariance = Array(0.6856, 0.1899, 3.116, 0.581) - - val testData = generateMultinomialLogisticInput( - coefficients, xMean, xVariance, addIntercept = true, nPoints, 42) - - val df = spark.createDataFrame(sc.parallelize(testData, 4)) - df.cache() - df - } - } - - /** - * Enable the ignored test to export the dataset into CSV format, - * so we can validate the training accuracy compared with R's glmnet package. 
- */ - ignore("export test data into CSV format") { - val rdd = multinomialDataset.rdd.map { case Row(label: Double, features: Vector) => - label + "," + features.toArray.mkString(",") - }.repartition(1) - rdd.saveAsTextFile("target/tmp/MultinomialLogisticRegressionSuite/multinomialDataset") - } - -// test("params") { -// ParamsSuite.checkParams(new LogisticRegression) -// val model = new LogisticRegressionModel("mLogReg", -// Matrices.dense(2, 1, Array(0.0, 0.0)), Vectors.dense(0.0, 0.0), 2, true) -// ParamsSuite.checkParams(model) -// } -// -// test("multinomial logistic regression: default params") { -// val mlr = new LogisticRegression -// assert(mlr.getLabelCol === "label") -// assert(mlr.getFeaturesCol === "features") -// assert(mlr.getPredictionCol === "prediction") -// assert(mlr.getRawPredictionCol === "rawPrediction") -// assert(mlr.getProbabilityCol === "probability") -// assert(!mlr.isDefined(mlr.weightCol)) -// assert(!mlr.isDefined(mlr.thresholds)) -// assert(mlr.getFitIntercept) -// assert(mlr.getStandardization) -// val model = mlr.fit(dataset) -// model.transform(dataset) -// .select("label", "probability", "prediction", "rawPrediction") -// .collect() -// assert(model.getFeaturesCol === "features") -// assert(model.getPredictionCol === "prediction") -// assert(model.getRawPredictionCol === "rawPrediction") -// assert(model.getProbabilityCol === "probability") -// assert(model.interceptVector !== Vectors.dense(0.0, 0.0)) -// assert(model.hasParent) -// } - - test("multinomial logistic regression with intercept without regularization") { - - val trainer1 = (new LogisticRegression).setFitIntercept(true) - .setElasticNetParam(0.0).setRegParam(0.0).setStandardization(true).setMaxIter(100) - val trainer2 = (new LogisticRegression).setFitIntercept(true) - .setElasticNetParam(0.0).setRegParam(0.0).setStandardization(false) - - val model1 = trainer1.fit(multinomialDataset) - val model2 = trainer2.fit(multinomialDataset) - - /* - Using the following R code to load the data and train the model using glmnet package. 
- > library("glmnet") - > data <- read.csv("path", header=FALSE) - > label = as.factor(data$V1) - > features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5)) - > coefficients = coef(glmnet(features, label, family="multinomial", alpha = 0, lambda = 0)) - > coefficients - $`0` - 5 x 1 sparse Matrix of class "dgCMatrix" - s0 - -2.24493379 - V2 0.25096771 - V3 -0.03915938 - V4 0.14766639 - V5 0.36810817 - $`1` - 5 x 1 sparse Matrix of class "dgCMatrix" - s0 - 0.3778931 - V2 -0.3327489 - V3 0.8893666 - V4 -0.2306948 - V5 -0.4442330 - $`2` - 5 x 1 sparse Matrix of class "dgCMatrix" - s0 - 1.86704066 - V2 0.08178121 - V3 -0.85020722 - V4 0.08302840 - V5 0.07612480 - */ - - val coefficientsR = new DenseMatrix(3, 4, Array( - 0.2509677, -0.0391594, 0.1476664, 0.3681082, - -0.3327489, 0.8893666, -0.2306948, -0.4442330, - 0.0817812, -0.8502072, 0.0830284, 0.0761248), isTransposed = true) - val interceptsR = Vectors.dense(-2.2449338, 0.3778931, 1.8670407) - - assert(model1.coefficientMatrix ~== coefficientsR relTol 0.05) - assert(model1.coefficientMatrix.toArray.sum ~== 0.0 absTol eps) - assert(model1.interceptVector ~== interceptsR relTol 0.05) - assert(model1.interceptVector.toArray.sum ~== 0.0 absTol eps) - assert(model2.coefficientMatrix ~== coefficientsR relTol 0.05) - assert(model2.coefficientMatrix.toArray.sum ~== 0.0 absTol eps) - assert(model2.interceptVector ~== interceptsR relTol 0.05) - assert(model2.interceptVector.toArray.sum ~== 0.0 absTol eps) - } - - test("multinomial logistic regression without intercept without regularization") { - - val trainer1 = (new LogisticRegression).setFitIntercept(false) - .setElasticNetParam(0.0).setRegParam(0.0).setStandardization(true) - val trainer2 = (new LogisticRegression).setFitIntercept(false) - .setElasticNetParam(0.0).setRegParam(0.0).setStandardization(false) - - val model1 = trainer1.fit(multinomialDataset) - val model2 = trainer2.fit(multinomialDataset) - - /* - Using the following R code to load the data and train the model using glmnet package. - library("glmnet") - data <- read.csv("path", header=FALSE) - label = as.factor(data$V1) - features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5)) - coefficients = coef(glmnet(features, label, family="multinomial", alpha = 0, lambda = 0, - intercept=F)) - > coefficients - $`0` - 5 x 1 sparse Matrix of class "dgCMatrix" - s0 - . - V2 0.06992464 - V3 -0.36562784 - V4 0.12142680 - V5 0.32052211 - $`1` - 5 x 1 sparse Matrix of class "dgCMatrix" - s0 - . - V2 -0.3036269 - V3 0.9449630 - V4 -0.2271038 - V5 -0.4364839 - $`2` - 5 x 1 sparse Matrix of class "dgCMatrix" - s0 - . 
- V2 0.2337022 - V3 -0.5793351 - V4 0.1056770 - V5 0.1159618 - */ - - val coefficientsR = new DenseMatrix(3, 4, Array( - 0.0699246, -0.3656278, 0.1214268, 0.3205221, - -0.3036269, 0.9449630, -0.2271038, -0.4364839, - 0.2337022, -0.5793351, 0.1056770, 0.1159618), isTransposed = true) - - assert(model1.coefficientMatrix ~== coefficientsR relTol 0.05) - assert(model1.coefficientMatrix.toArray.sum ~== 0.0 absTol eps) - assert(model1.interceptVector.toArray === Array.fill(3)(0.0)) - assert(model1.interceptVector.toArray.sum ~== 0.0 absTol eps) - assert(model2.coefficientMatrix ~== coefficientsR relTol 0.05) - assert(model2.coefficientMatrix.toArray.sum ~== 0.0 absTol eps) - assert(model2.interceptVector.toArray === Array.fill(3)(0.0)) - assert(model2.interceptVector.toArray.sum ~== 0.0 absTol eps) - } - - test("multinomial logistic regression with intercept with L1 regularization") { - - // use tighter constraints because OWL-QN solver takes longer to converge - val trainer1 = (new LogisticRegression).setFitIntercept(true) - .setElasticNetParam(1.0).setRegParam(0.05).setStandardization(true) - .setMaxIter(300).setTol(1e-10) - val trainer2 = (new LogisticRegression).setFitIntercept(true) - .setElasticNetParam(1.0).setRegParam(0.05).setStandardization(false) - .setMaxIter(300).setTol(1e-10) - - val model1 = trainer1.fit(multinomialDataset) - val model2 = trainer2.fit(multinomialDataset) - - /* - Use the following R code to load the data and train the model using glmnet package. - library("glmnet") - data <- read.csv("path", header=FALSE) - label = as.factor(data$V1) - features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5)) - coefficientsStd = coef(glmnet(features, label, family="multinomial", alpha = 1, - lambda = 0.05, standardization=T)) - coefficients = coef(glmnet(features, label, family="multinomial", alpha = 1, lambda = 0.05, - standardization=F)) - > coefficientsStd - $`0` - 5 x 1 sparse Matrix of class "dgCMatrix" - s0 - -0.68988825 - V2 . - V3 . - V4 . - V5 0.09404023 - - $`1` - 5 x 1 sparse Matrix of class "dgCMatrix" - s0 - -0.2303499 - V2 -0.1232443 - V3 0.3258380 - V4 -0.1564688 - V5 -0.2053965 - - $`2` - 5 x 1 sparse Matrix of class "dgCMatrix" - s0 - 0.9202381 - V2 . - V3 -0.4803856 - V4 . - V5 . - - > coefficients - $`0` - 5 x 1 sparse Matrix of class "dgCMatrix" - s0 - -0.44893320 - V2 . - V3 . - V4 0.01933812 - V5 0.03666044 - - $`1` - 5 x 1 sparse Matrix of class "dgCMatrix" - s0 - 0.7376760 - V2 -0.0577182 - V3 . - V4 -0.2081718 - V5 -0.1304592 - - $`2` - 5 x 1 sparse Matrix of class "dgCMatrix" - s0 - -0.2887428 - V2 . - V3 . - V4 . - V5 . 
- */ - - val coefficientsRStd = new DenseMatrix(3, 4, Array( - 0.0, 0.0, 0.0, 0.09404023, - -0.1232443, 0.3258380, -0.1564688, -0.2053965, - 0.0, -0.4803856, 0.0, 0.0), isTransposed = true) - val interceptsRStd = Vectors.dense(-0.68988825, -0.2303499, 0.9202381) - - val coefficientsR = new DenseMatrix(3, 4, Array( - 0.0, 0.0, 0.01933812, 0.03666044, - -0.0577182, 0.0, -0.2081718, -0.1304592, - 0.0, 0.0, 0.0, 0.0), isTransposed = true) - val interceptsR = Vectors.dense(-0.44893320, 0.7376760, -0.2887428) - - assert(model1.coefficientMatrix ~== coefficientsRStd absTol 0.02) - assert(model1.interceptVector ~== interceptsRStd relTol 0.1) - assert(model1.interceptVector.toArray.sum ~== 0.0 absTol eps) - assert(model2.coefficientMatrix ~== coefficientsR absTol 0.02) - assert(model2.interceptVector ~== interceptsR relTol 0.1) - assert(model2.interceptVector.toArray.sum ~== 0.0 absTol eps) - } - - test("multinomial logistic regression without intercept with L1 regularization") { - val trainer1 = (new LogisticRegression).setFitIntercept(false) - .setElasticNetParam(1.0).setRegParam(0.05).setStandardization(true) - val trainer2 = (new LogisticRegression).setFitIntercept(false) - .setElasticNetParam(1.0).setRegParam(0.05).setStandardization(false) - - val model1 = trainer1.fit(multinomialDataset) - val model2 = trainer2.fit(multinomialDataset) - /* - Use the following R code to load the data and train the model using glmnet package. - library("glmnet") - data <- read.csv("path", header=FALSE) - label = as.factor(data$V1) - features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5)) - coefficientsStd = coef(glmnet(features, label, family="multinomial", alpha = 1, - lambda = 0.05, intercept=F, standardization=T)) - coefficients = coef(glmnet(features, label, family="multinomial", alpha = 1, lambda = 0.05, - intercept=F, standardization=F)) - > coefficientsStd - $`0` - 5 x 1 sparse Matrix of class "dgCMatrix" - s0 - . - V2 . - V3 . - V4 . - V5 0.01525105 - - $`1` - 5 x 1 sparse Matrix of class "dgCMatrix" - s0 - . - V2 -0.1502410 - V3 0.5134658 - V4 -0.1601146 - V5 -0.2500232 - - $`2` - 5 x 1 sparse Matrix of class "dgCMatrix" - s0 - . - V2 0.003301875 - V3 . - V4 . - V5 . - - > coefficients - $`0` - 5 x 1 sparse Matrix of class "dgCMatrix" - s0 - . - V2 . - V3 . - V4 . - V5 . - - $`1` - 5 x 1 sparse Matrix of class "dgCMatrix" - s0 - . - V2 . - V3 0.1943624 - V4 -0.1902577 - V5 -0.1028789 - - $`2` - 5 x 1 sparse Matrix of class "dgCMatrix" - s0 - . - V2 . - V3 . - V4 . - V5 . 
- */ - - val coefficientsRStd = new DenseMatrix(3, 4, Array( - 0.0, 0.0, 0.0, 0.01525105, - -0.1502410, 0.5134658, -0.1601146, -0.2500232, - 0.003301875, 0.0, 0.0, 0.0), isTransposed = true) - - val coefficientsR = new DenseMatrix(3, 4, Array( - 0.0, 0.0, 0.0, 0.0, - 0.0, 0.1943624, -0.1902577, -0.1028789, - 0.0, 0.0, 0.0, 0.0), isTransposed = true) - - assert(model1.coefficientMatrix ~== coefficientsRStd absTol 0.01) - assert(model1.interceptVector.toArray === Array.fill(3)(0.0)) - assert(model1.interceptVector.toArray.sum ~== 0.0 absTol eps) - assert(model2.coefficientMatrix ~== coefficientsR absTol 0.01) - assert(model2.interceptVector.toArray === Array.fill(3)(0.0)) - assert(model2.interceptVector.toArray.sum ~== 0.0 absTol eps) - } - - test("multinomial logistic regression with intercept with L2 regularization") { - val trainer1 = (new LogisticRegression).setFitIntercept(true) - .setElasticNetParam(0.0).setRegParam(0.1).setStandardization(true) - val trainer2 = (new LogisticRegression).setFitIntercept(true) - .setElasticNetParam(0.0).setRegParam(0.1).setStandardization(false) - - val model1 = trainer1.fit(multinomialDataset) - val model2 = trainer2.fit(multinomialDataset) - /* - Use the following R code to load the data and train the model using glmnet package. - library("glmnet") - data <- read.csv("path", header=FALSE) - label = as.factor(data$V1) - features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5)) - coefficientsStd = coef(glmnet(features, label, family="multinomial", alpha = 0, - lambda = 0.1, intercept=T, standardization=T)) - coefficients = coef(glmnet(features, label, family="multinomial", alpha = 0, - lambda = 0.1, intercept=T, standardization=F)) - > coefficientsStd - $`0` - 5 x 1 sparse Matrix of class "dgCMatrix" - s0 - -1.70040424 - V2 0.17576070 - V3 0.01527894 - V4 0.10216108 - V5 0.26099531 - - $`1` - 5 x 1 sparse Matrix of class "dgCMatrix" - s0 - 0.2438590 - V2 -0.2238875 - V3 0.5967610 - V4 -0.1555496 - V5 -0.3010479 - - $`2` - 5 x 1 sparse Matrix of class "dgCMatrix" - s0 - 1.45654525 - V2 0.04812679 - V3 -0.61203992 - V4 0.05338850 - V5 0.04005258 - - > coefficients - $`0` - 5 x 1 sparse Matrix of class "dgCMatrix" - s0 - -1.65488543 - V2 0.15715048 - V3 0.01992903 - V4 0.12428858 - V5 0.22130317 - - $`1` - 5 x 1 sparse Matrix of class "dgCMatrix" - s0 - 1.1297533 - V2 -0.1974768 - V3 0.2776373 - V4 -0.1869445 - V5 -0.2510320 - - $`2` - 5 x 1 sparse Matrix of class "dgCMatrix" - s0 - 0.52513212 - V2 0.04032627 - V3 -0.29756637 - V4 0.06265594 - V5 0.02972883 - */ - - val coefficientsRStd = new DenseMatrix(3, 4, Array( - 0.17576070, 0.01527894, 0.10216108, 0.26099531, - -0.2238875, 0.5967610, -0.1555496, -0.3010479, - 0.04812679, -0.61203992, 0.05338850, 0.04005258), isTransposed = true) - val interceptsRStd = Vectors.dense(-1.70040424, 0.2438590, 1.45654525) - - val coefficientsR = new DenseMatrix(3, 4, Array( - 0.15715048, 0.01992903, 0.12428858, 0.22130317, - -0.1974768, 0.2776373, -0.1869445, -0.2510320, - 0.04032627, -0.29756637, 0.06265594, 0.02972883), isTransposed = true) - val interceptsR = Vectors.dense(-1.65488543, 1.1297533, 0.52513212) - - assert(model1.coefficientMatrix ~== coefficientsRStd relTol 0.05) - assert(model1.interceptVector ~== interceptsRStd relTol 0.05) - assert(model1.interceptVector.toArray.sum ~== 0.0 absTol eps) - assert(model2.coefficientMatrix ~== coefficientsR relTol 0.05) - assert(model2.interceptVector ~== interceptsR relTol 0.05) - assert(model2.interceptVector.toArray.sum ~== 0.0 absTol eps) - } - - 
test("multinomial logistic regression without intercept with L2 regularization") { - val trainer1 = (new LogisticRegression).setFitIntercept(false) - .setElasticNetParam(0.0).setRegParam(0.1).setStandardization(true) - val trainer2 = (new LogisticRegression).setFitIntercept(false) - .setElasticNetParam(0.0).setRegParam(0.1).setStandardization(false) - - val model1 = trainer1.fit(multinomialDataset) - val model2 = trainer2.fit(multinomialDataset) - /* - Use the following R code to load the data and train the model using glmnet package. - library("glmnet") - data <- read.csv("path", header=FALSE) - label = as.factor(data$V1) - features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5)) - coefficientsStd = coef(glmnet(features, label, family="multinomial", alpha = 0, - lambda = 0.1, intercept=F, standardization=T)) - coefficients = coef(glmnet(features, label, family="multinomial", alpha = 0, - lambda = 0.1, intercept=F, standardization=F)) - > coefficientsStd - $`0` - 5 x 1 sparse Matrix of class "dgCMatrix" - s0 - . - V2 0.03904171 - V3 -0.23354322 - V4 0.08288096 - V5 0.22706393 - - $`1` - 5 x 1 sparse Matrix of class "dgCMatrix" - s0 - . - V2 -0.2061848 - V3 0.6341398 - V4 -0.1530059 - V5 -0.2958455 - - $`2` - 5 x 1 sparse Matrix of class "dgCMatrix" - s0 - . - V2 0.16714312 - V3 -0.40059658 - V4 0.07012496 - V5 0.06878158 - > coefficients - $`0` - 5 x 1 sparse Matrix of class "dgCMatrix" - s0 - . - V2 -0.005704542 - V3 -0.144466409 - V4 0.092080736 - V5 0.182927657 - - $`1` - 5 x 1 sparse Matrix of class "dgCMatrix" - s0 - . - V2 -0.08469036 - V3 0.38996748 - V4 -0.16468436 - V5 -0.22522976 - - $`2` - 5 x 1 sparse Matrix of class "dgCMatrix" - s0 - . - V2 0.09039490 - V3 -0.24550107 - V4 0.07260362 - V5 0.04230210 - */ - val coefficientsRStd = new DenseMatrix(3, 4, Array( - 0.03904171, -0.23354322, 0.08288096, 0.2270639, - -0.2061848, 0.6341398, -0.1530059, -0.2958455, - 0.16714312, -0.40059658, 0.07012496, 0.06878158), isTransposed = true) - - val coefficientsR = new DenseMatrix(3, 4, Array( - -0.005704542, -0.144466409, 0.092080736, 0.182927657, - -0.08469036, 0.38996748, -0.16468436, -0.22522976, - 0.0903949, -0.24550107, 0.07260362, 0.0423021), isTransposed = true) - - assert(model1.coefficientMatrix ~== coefficientsRStd absTol 0.01) - assert(model1.interceptVector.toArray === Array.fill(3)(0.0)) - assert(model1.interceptVector.toArray.sum ~== 0.0 absTol eps) - assert(model2.coefficientMatrix ~== coefficientsR absTol 0.01) - assert(model2.interceptVector.toArray === Array.fill(3)(0.0)) - assert(model2.interceptVector.toArray.sum ~== 0.0 absTol eps) - } - - test("multinomial logistic regression with intercept with elasticnet regularization") { - val trainer1 = (new LogisticRegression).setFitIntercept(true) - .setElasticNetParam(0.5).setRegParam(0.1).setStandardization(true) - .setMaxIter(300).setTol(1e-10) - val trainer2 = (new LogisticRegression).setFitIntercept(true) - .setElasticNetParam(0.5).setRegParam(0.1).setStandardization(false) - .setMaxIter(300).setTol(1e-10) - - val model1 = trainer1.fit(multinomialDataset) - val model2 = trainer2.fit(multinomialDataset) - /* - Use the following R code to load the data and train the model using glmnet package. 
- library("glmnet") - data <- read.csv("path", header=FALSE) - label = as.factor(data$V1) - features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5)) - coefficientsStd = coef(glmnet(features, label, family="multinomial", alpha = 0.5, - lambda = 0.1, intercept=T, standardization=T)) - coefficients = coef(glmnet(features, label, family="multinomial", alpha = 0.5, - lambda = 0.1, intercept=T, standardization=F)) - > coefficientsStd - $`0` - 5 x 1 sparse Matrix of class "dgCMatrix" - s0 - -0.5521819483 - V2 0.0003092611 - V3 . - V4 . - V5 0.0913818490 - - $`1` - 5 x 1 sparse Matrix of class "dgCMatrix" - s0 - -0.27531989 - V2 -0.09790029 - V3 0.28502034 - V4 -0.12416487 - V5 -0.16513373 - - $`2` - 5 x 1 sparse Matrix of class "dgCMatrix" - s0 - 0.8275018 - V2 . - V3 -0.4044859 - V4 . - V5 . - - > coefficients - $`0` - 5 x 1 sparse Matrix of class "dgCMatrix" - s0 - -0.39876213 - V2 . - V3 . - V4 0.02547520 - V5 0.03893991 - - $`1` - 5 x 1 sparse Matrix of class "dgCMatrix" - s0 - 0.61089869 - V2 -0.04224269 - V3 . - V4 -0.18923970 - V5 -0.09104249 - - $`2` - 5 x 1 sparse Matrix of class "dgCMatrix" - s0 - -0.2121366 - V2 . - V3 . - V4 . - V5 . - */ - - val coefficientsRStd = new DenseMatrix(3, 4, Array( - 0.0003092611, 0.0, 0.0, 0.091381849, - -0.09790029, 0.28502034, -0.12416487, -0.16513373, - 0.0, -0.4044859, 0.0, 0.0), isTransposed = true) - val interceptsRStd = Vectors.dense(-0.5521819483, -0.27531989, 0.8275018) - - val coefficientsR = new DenseMatrix(3, 4, Array( - 0.0, 0.0, 0.0254752, 0.03893991, - -0.04224269, 0.0, -0.1892397, -0.09104249, - 0.0, 0.0, 0.0, 0.0), isTransposed = true) - val interceptsR = Vectors.dense(-0.39876213, 0.61089869, -0.2121366) - - assert(model1.coefficientMatrix ~== coefficientsRStd absTol 0.01) - assert(model1.interceptVector ~== interceptsRStd absTol 0.01) - assert(model1.interceptVector.toArray.sum ~== 0.0 absTol eps) - assert(model2.coefficientMatrix ~== coefficientsR absTol 0.01) - assert(model2.interceptVector ~== interceptsR absTol 0.01) - assert(model2.interceptVector.toArray.sum ~== 0.0 absTol eps) - } - - test("multinomial logistic regression without intercept with elasticnet regularization") { - val trainer1 = (new LogisticRegression).setFitIntercept(false) - .setElasticNetParam(0.5).setRegParam(0.1).setStandardization(true) - .setMaxIter(300).setTol(1e-10) - val trainer2 = (new LogisticRegression).setFitIntercept(false) - .setElasticNetParam(0.5).setRegParam(0.1).setStandardization(false) - .setMaxIter(300).setTol(1e-10) - - val model1 = trainer1.fit(multinomialDataset) - val model2 = trainer2.fit(multinomialDataset) - /* - Use the following R code to load the data and train the model using glmnet package. - library("glmnet") - data <- read.csv("path", header=FALSE) - label = as.factor(data$V1) - features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5)) - coefficientsStd = coef(glmnet(features, label, family="multinomial", alpha = 0.5, - lambda = 0.1, intercept=F, standardization=T)) - coefficients = coef(glmnet(features, label, family="multinomial", alpha = 0.5, - lambda = 0.1, intercept=F, standardization=F)) - > coefficientsStd - $`0` - 5 x 1 sparse Matrix of class "dgCMatrix" - s0 - . - V2 . - V3 . - V4 . - V5 0.03543706 - - $`1` - 5 x 1 sparse Matrix of class "dgCMatrix" - s0 - . - V2 -0.1187387 - V3 0.4025482 - V4 -0.1270969 - V5 -0.1918386 - - $`2` - 5 x 1 sparse Matrix of class "dgCMatrix" - s0 - . - V2 0.00774365 - V3 . - V4 . - V5 . - - > coefficients - $`0` - 5 x 1 sparse Matrix of class "dgCMatrix" - s0 - . - V2 . 
- V3 . - V4 . - V5 . - - $`1` - 5 x 1 sparse Matrix of class "dgCMatrix" - s0 - . - V2 . - V3 0.14666497 - V4 -0.16570638 - V5 -0.05982875 - - $`2` - 5 x 1 sparse Matrix of class "dgCMatrix" - s0 - . - V2 . - V3 . - V4 . - V5 . - */ - val coefficientsRStd = new DenseMatrix(3, 4, Array( - 0.0, 0.0, 0.0, 0.03543706, - -0.1187387, 0.4025482, -0.1270969, -0.1918386, - 0.0, 0.0, 0.0, 0.00774365), isTransposed = true) - - val coefficientsR = new DenseMatrix(3, 4, Array( - 0.0, 0.0, 0.0, 0.0, - 0.0, 0.14666497, -0.16570638, -0.05982875, - 0.0, 0.0, 0.0, 0.0), isTransposed = true) - - assert(model1.coefficientMatrix ~== coefficientsRStd absTol 0.01) - assert(model1.interceptVector.toArray === Array.fill(3)(0.0)) - assert(model1.interceptVector.toArray.sum ~== 0.0 absTol eps) - assert(model2.coefficientMatrix ~== coefficientsR absTol 0.01) - assert(model2.interceptVector.toArray === Array.fill(3)(0.0)) - assert(model2.interceptVector.toArray.sum ~== 0.0 absTol eps) - } - -// test("read/write") { -// def checkModelData( -// model: LogisticRegressionModel, -// model2: LogisticRegressionModel): Unit = { -// assert(model.interceptVector === model2.interceptVector) -// assert(model.coefficientMatrix.toArray === model2.coefficients.toArray) -// assert(model.numClasses === model2.numClasses) -// assert(model.numFeatures === model2.numFeatures) -// } -// val mlr = new LogisticRegression() -// testEstimatorAndModelReadWrite(mlr, dataset, -// MultinomialLogisticRegressionSuite.allParamSettings, -// checkModelData) -// } -} - -object MultinomialLogisticRegressionSuite { - - /** - * Mapping from all Params to valid settings which differ from the defaults. - * This is useful for tests which need to exercise all Params, such as save/load. - * This excludes input columns to simplify some tests. - */ - val allParamSettings: Map[String, Any] = ProbabilisticClassifierSuite.allParamSettings ++ Map( - "probabilityCol" -> "myProbability", - "thresholds" -> Array(0.4, 0.6), - "regParam" -> 0.01, - "elasticNetParam" -> 0.1, - "maxIter" -> 2, // intentionally small - "fitIntercept" -> true, - "tol" -> 0.8, - "standardization" -> false - ) -} From 262bc996063f4d07b9440d6164be01f497d180ef Mon Sep 17 00:00:00 2001 From: sethah Date: Fri, 26 Aug 2016 09:36:05 -0700 Subject: [PATCH 10/24] some small fixes --- .../classification/LogisticRegression.scala | 24 +++++++++---------- .../ProbabilisticClassifier.scala | 6 ----- .../classification/LogisticRegression.scala | 4 ++-- .../LogisticRegressionSuite.scala | 15 ++++++------ .../ml/classification/OneVsRestSuite.scala | 4 ++-- .../spark/ml/tuning/CrossValidatorSuite.scala | 4 ++-- .../ml/tuning/TrainValidationSplitSuite.scala | 4 ++-- 7 files changed, 28 insertions(+), 33 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index 80426fc019e83..1a7d6a2aa68a5 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -68,6 +68,7 @@ private[classification] trait LogisticRegressionParams extends ProbabilisticClas * * @group setParam */ + // TODO: Implement SPARK-11543? 
def setThreshold(value: Double): this.type = { if (isSet(thresholds)) clear(thresholds) set(threshold, value) @@ -88,14 +89,14 @@ private[classification] trait LogisticRegressionParams extends ProbabilisticClas * * @group param */ - @Since("2.0.0") + @Since("2.1.0") final val family: Param[String] = new Param(this, "family", "The name of family which is a description of the label distribution to be used in the " + s"model. Supported options: ${supportedFamilyNames.mkString(", ")}.", ParamValidators.inArray[String](supportedFamilyNames)) /** @group getParam */ - @Since("2.0.0") + @Since("2.1.0") def getFamily: String = $(family) /** @@ -252,7 +253,7 @@ class LogisticRegression @Since("1.2.0") ( * * @group setParam */ - @Since("2.0.0") + @Since("2.1.0") def setFamily(value: String): this.type = set(family, value) setDefault(family -> "auto") @@ -271,7 +272,6 @@ class LogisticRegression @Since("1.2.0") ( setDefault(standardization -> true) @Since("1.5.0") - // TODO: Check this behavior override def setThreshold(value: Double): this.type = super.setThreshold(value) @Since("1.5.0") @@ -354,18 +354,18 @@ class LogisticRegression @Since("1.2.0") ( val numClasses = MetadataUtils.getNumClasses(dataset.schema($(labelCol))) match { case Some(n: Int) => require(n >= histogram.length, s"Specified number of classes $n was " + - s"less than the number of unique labels ${histogram.length}") + s"less than the number of unique labels ${histogram.length}.") n case None => histogram.length } val isBinaryClassification = numClasses == 1 || numClasses == 2 - val isMultinomial = ($(family) == LogisticRegression.auto && !isBinaryClassification) || - ($(family) == LogisticRegression.multinomial) + val isMultinomial = ($(family) == LogisticRegression.Auto && !isBinaryClassification) || + ($(family) == LogisticRegression.Multinomial) val numCoefficientSets = if (isMultinomial) numClasses else 1 if (!isMultinomial) { require(isBinaryClassification, s"Binomial family only supports 1 or 2 " + - s"outcome classes but found $numClasses") + s"outcome classes but found $numClasses.") } if (isDefined(thresholds)) { @@ -646,11 +646,11 @@ object LogisticRegression extends DefaultParamsReadable[LogisticRegression] { @Since("1.6.0") override def load(path: String): LogisticRegression = super.load(path) - private val multinomial = "multinomial" - private val binomial = "binomial" - private val auto = "auto" + private val Multinomial = "multinomial" + private val Binomial = "binomial" + private val Auto = "auto" - private[classification] lazy val supportedFamilyNames = Array(auto, binomial, multinomial) + private[classification] val supportedFamilyNames = Array(Auto, Binomial, Multinomial) } /** diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/ProbabilisticClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/ProbabilisticClassifier.scala index 989bd19528a97..1a07aab663030 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/ProbabilisticClassifier.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/ProbabilisticClassifier.scala @@ -219,12 +219,6 @@ abstract class ProbabilisticClassificationModel[ i += 1 } argMax -// val thresholds: Array[Double] = getThresholds -// val scaledProbability: Array[Double] = -// probability.toArray.zip(thresholds).map { case (p, t) => -// if (t == 0.0) Double.PositiveInfinity else p / t -// } -// Vectors.dense(scaledProbability).argmax } } } diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala index c3770dd0a12df..d851b983349c9 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala @@ -432,8 +432,8 @@ class LogisticRegressionWithLBFGS if (userSuppliedWeights) { val uid = Identifiable.randomUID("logreg-static") lr.setInitialModel(new org.apache.spark.ml.classification.LogisticRegressionModel(uid, - new DenseMatrix(1, initialWeights.size, initialWeights.toArray, isTransposed = true), - Vectors.dense(0.0).asML, 2, false)) + new DenseMatrix(1, initialWeights.size, initialWeights.toArray), + Vectors.dense(1.0).asML, 2, false)) } lr.setFitIntercept(addIntercept) lr.setMaxIter(optimizer.getNumIterations()) diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala index 47c1a7218fcbd..31f991b3fd5e4 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala @@ -17,22 +17,21 @@ package org.apache.spark.ml.classification -import org.apache.spark.ml.attribute.NominalAttribute - import scala.collection.JavaConverters._ import scala.language.existentials import scala.util.Random import scala.util.control.Breaks._ import org.apache.spark.{SparkException, SparkFunSuite} +import org.apache.spark.ml.attribute.NominalAttribute import org.apache.spark.ml.classification.LogisticRegressionSuite._ -import org.apache.spark.ml.feature.{Instance, LabeledPoint} -import org.apache.spark.ml.linalg.{Matrices, DenseMatrix, Vector, Vectors} +import org.apache.spark.ml.feature.LabeledPoint +import org.apache.spark.ml.linalg.{DenseMatrix, Matrices, Vector, Vectors} import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils} import org.apache.spark.ml.util.TestingUtils._ import org.apache.spark.mllib.util.MLlibTestSparkContext -import org.apache.spark.sql.{DataFrame, Dataset, Row} +import org.apache.spark.sql.{Dataset, Row} import org.apache.spark.sql.functions.lit class LogisticRegressionSuite @@ -99,7 +98,7 @@ class LogisticRegressionSuite } /** - * Enable the ignored test to export the smallBinaryDataset into CSV format, + * Enable the ignored test to export the dataset into CSV format, * so we can validate the training accuracy compared with R's glmnet package. 
*/ ignore("export test data into CSV format") { @@ -114,7 +113,7 @@ class LogisticRegressionSuite test("params") { ParamsSuite.checkParams(new LogisticRegression) val model = new LogisticRegressionModel("logReg", - new DenseMatrix(1, 1, Array(0.0), isTransposed = false), Vectors.dense(0.0), 2, false) + new DenseMatrix(1, 1, Array(0.0)), Vectors.dense(0.0), 2, isMultinomial = false) ParamsSuite.checkParams(model) } @@ -1839,6 +1838,7 @@ class LogisticRegressionSuite predictions1.zip(predictions2).foreach { case (Row(p1: Double), Row(p2: Double)) => assert(p1 === p2) } + assert(model2.summary.totalIterations === 1) val lr3 = new LogisticRegression() val model3 = lr3.fit(smallMultinomialDataset) @@ -1849,6 +1849,7 @@ class LogisticRegressionSuite predictions3.zip(predictions4).foreach { case (Row(p1: Double), Row(p2: Double)) => assert(p1 === p2) } + // TODO: check that it converges in a single iteration when initial model is available } test("logistic regression with all labels the same") { diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/OneVsRestSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/OneVsRestSuite.scala index 09e38786aa002..3ae47029c8dd8 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/OneVsRestSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/OneVsRestSuite.scala @@ -22,7 +22,7 @@ import org.apache.spark.ml.attribute.NominalAttribute import org.apache.spark.ml.classification.LogisticRegressionSuite._ import org.apache.spark.ml.feature.LabeledPoint import org.apache.spark.ml.feature.StringIndexer -import org.apache.spark.ml.linalg.{DenseMatrix, Vectors} +import org.apache.spark.ml.linalg.{Matrices, Vectors} import org.apache.spark.ml.param.{ParamMap, ParamsSuite} import org.apache.spark.ml.util.{DefaultReadWriteTest, MetadataUtils, MLTestingUtils} import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS @@ -61,7 +61,7 @@ class OneVsRestSuite extends SparkFunSuite with MLlibTestSparkContext with Defau test("params") { ParamsSuite.checkParams(new OneVsRest) val lrModel = new LogisticRegressionModel("logReg", - new DenseMatrix(1, 1, Array(0.0), isTransposed = false), Vectors.dense(0.0), 2, false) + Matrices.dense(1, 1, Array(0.0)), Vectors.dense(0.0), 2, false) val model = new OneVsRestModel("ovr", Metadata.empty, Array(lrModel)) ParamsSuite.checkParams(model) } diff --git a/mllib/src/test/scala/org/apache/spark/ml/tuning/CrossValidatorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/tuning/CrossValidatorSuite.scala index 0fb26f26e7792..87c7c82e4c3b2 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/tuning/CrossValidatorSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/tuning/CrossValidatorSuite.scala @@ -23,7 +23,7 @@ import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressio import org.apache.spark.ml.classification.LogisticRegressionSuite.generateLogisticInput import org.apache.spark.ml.evaluation.{BinaryClassificationEvaluator, Evaluator, RegressionEvaluator} import org.apache.spark.ml.feature.HashingTF -import org.apache.spark.ml.linalg.{DenseMatrix, Vectors} +import org.apache.spark.ml.linalg.{Matrices, Vectors} import org.apache.spark.ml.param.{ParamMap, ParamPair} import org.apache.spark.ml.param.shared.HasInputCol import org.apache.spark.ml.regression.LinearRegression @@ -245,7 +245,7 @@ class CrossValidatorSuite val lr = new LogisticRegression() .setThreshold(0.6) val lrModel = new LogisticRegressionModel(lr.uid, - new 
DenseMatrix(1, 1, Array(0.0), isTransposed = false), Vectors.dense(0.0), 2, false) + Matrices.dense(1, 1, Array(0.0)), Vectors.dense(0.0), 2, false) .setThreshold(0.6) val evaluator = new BinaryClassificationEvaluator() .setMetricName("areaUnderPR") // not default metric diff --git a/mllib/src/test/scala/org/apache/spark/ml/tuning/TrainValidationSplitSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/tuning/TrainValidationSplitSuite.scala index a05a1d641f1bb..6c58bed9812c1 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/tuning/TrainValidationSplitSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/tuning/TrainValidationSplitSuite.scala @@ -22,7 +22,7 @@ import org.apache.spark.ml.{Estimator, Model} import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel} import org.apache.spark.ml.classification.LogisticRegressionSuite.generateLogisticInput import org.apache.spark.ml.evaluation.{BinaryClassificationEvaluator, Evaluator, RegressionEvaluator} -import org.apache.spark.ml.linalg.{DenseMatrix, Vectors} +import org.apache.spark.ml.linalg.{Matrices, Vectors} import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.param.shared.HasInputCol import org.apache.spark.ml.regression.LinearRegression @@ -134,7 +134,7 @@ class TrainValidationSplitSuite val lr = new LogisticRegression() .setThreshold(0.6) val lrModel = new LogisticRegressionModel(lr.uid, - new DenseMatrix(1, 1, Array(0.0), isTransposed = false), Vectors.dense(0.0), 2, false) + Matrices.dense(1, 1, Array(0.0)), Vectors.dense(0.0), 2, false) .setThreshold(0.6) val evaluator = new BinaryClassificationEvaluator() val paramMaps = new ParamGridBuilder() From b64ffad60d8f344a576227bf5f150eea5679aaa9 Mon Sep 17 00:00:00 2001 From: sethah Date: Fri, 26 Aug 2016 10:52:29 -0700 Subject: [PATCH 11/24] use _coefficients --- .../org/apache/spark/ml/classification/LogisticRegression.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index 1a7d6a2aa68a5..314fde435eb5a 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -705,7 +705,7 @@ class LogisticRegressionModel private[spark] ( /** Margin (rawPrediction) for class label 1. For binary classification only. */ private val margin: Vector => Double = (features) => { - BLAS.dot(features, coefficients) + _intercept + BLAS.dot(features, _coefficients) + _intercept } /** Margin (rawPrediction) for each class label. 
*/ From 7895c8176df4fccc144ddb29079af8dc8a9a1942 Mon Sep 17 00:00:00 2001 From: sethah Date: Fri, 26 Aug 2016 11:15:58 -0700 Subject: [PATCH 12/24] use strings in supported families --- .../spark/ml/classification/LogisticRegression.scala | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index 314fde435eb5a..75fab6fc81094 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -359,8 +359,8 @@ class LogisticRegression @Since("1.2.0") ( case None => histogram.length } val isBinaryClassification = numClasses == 1 || numClasses == 2 - val isMultinomial = ($(family) == LogisticRegression.Auto && !isBinaryClassification) || - ($(family) == LogisticRegression.Multinomial) + val isMultinomial = ($(family) == "auto" && !isBinaryClassification) || + ($(family) == "multinomial") val numCoefficientSets = if (isMultinomial) numClasses else 1 if (!isMultinomial) { @@ -646,11 +646,8 @@ object LogisticRegression extends DefaultParamsReadable[LogisticRegression] { @Since("1.6.0") override def load(path: String): LogisticRegression = super.load(path) - private val Multinomial = "multinomial" - private val Binomial = "binomial" - private val Auto = "auto" - - private[classification] val supportedFamilyNames = Array(Auto, Binomial, Multinomial) + private[classification] val supportedFamilyNames = + Array("auto", "binomial", "multinomial").map(_.toLowerCase) } /** From c9b6d970a625fff921d0c512bb7a1dd4f7a10bf1 Mon Sep 17 00:00:00 2001 From: sethah Date: Thu, 1 Sep 2016 21:43:32 -0700 Subject: [PATCH 13/24] mima exclusion for lr model constructor --- project/MimaExcludes.scala | 3 +++ 1 file changed, 3 insertions(+) diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala index 1bdcf9a623dc9..d4cbf510b9a5c 100644 --- a/project/MimaExcludes.scala +++ b/project/MimaExcludes.scala @@ -812,6 +812,9 @@ object MimaExcludes { ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.ui.exec.ExecutorsListener.executorToTotalCores"), ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.ui.exec.ExecutorsListener.executorToTasksMax"), ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.ui.exec.ExecutorsListener.executorToJvmGCTime") + ) ++ Seq( + // [SPARK-17163] Unify logistic regression interface. Private constructor has new signature. 
+ ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.ml.classification.LogisticRegressionModel.this") ) } From b532692f4f63c414dbd4bec38c0adecb5d83d853 Mon Sep 17 00:00:00 2001 From: sethah Date: Fri, 9 Sep 2016 13:24:33 -0700 Subject: [PATCH 14/24] address initial review --- .../classification/LogisticRegression.scala | 56 +++++++++++-------- .../LogisticRegressionSuite.scala | 24 +++++++- 2 files changed, 54 insertions(+), 26 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index 75fab6fc81094..b5ef73cfa8356 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -50,7 +50,7 @@ private[classification] trait LogisticRegressionParams extends ProbabilisticClas with HasRegParam with HasElasticNetParam with HasMaxIter with HasFitIntercept with HasTol with HasStandardization with HasWeightCol with HasThreshold with HasAggregationDepth { - import LogisticRegression._ + import org.apache.spark.ml.classification.LogisticRegression.supportedFamilyNames /** * Set threshold in binary classification, in range [0, 1]. @@ -377,7 +377,7 @@ class LogisticRegression @Since("1.2.0") ( instr.logNumClasses(numClasses) instr.logNumFeatures(numFeatures) - val (coefficients, intercept, objectiveHistory) = { + val (coefficientMatrix, interceptVector, objectiveHistory) = { if (numInvalid != 0) { val msg = s"Classification labels should be in [0 to ${numClasses - 1}]. " + s"Found $numInvalid invalid labels." @@ -385,20 +385,25 @@ class LogisticRegression @Since("1.2.0") ( throw new SparkException(msg) } - val isConstantLabel = histogram.count(_ != 0) == 1 + val isConstantLabel = histogram.count(_ != 0.0) == 1 if ($(fitIntercept) && isConstantLabel) { logWarning(s"All labels are the same value and fitIntercept=true, so the coefficients " + s"will be zeros. Training is not needed.") val constantLabelIndex = Vectors.dense(histogram).argmax - val coefficientMatrix = Matrices.sparse(numCoefficientSets, numFeatures, - Array.fill(numFeatures + 1)(0), Array.empty[Int], Array.empty[Double]) - val interceptVector = if (isMultinomial) { + val coefMatrix = if (numFeatures < numClasses) { + new SparseMatrix(numCoefficientSets, numFeatures, + Array.fill(numFeatures + 1)(0), Array.empty[Int], Array.empty[Double]) + } else { + new SparseMatrix(numCoefficientSets, numFeatures, Array.fill(numClasses + 1)(0), + Array.empty[Int], Array.empty[Double], isTransposed = true) + } + val interceptVec = if (isMultinomial) { Vectors.sparse(numClasses, Seq((constantLabelIndex, Double.PositiveInfinity))) } else { Vectors.dense(if (numClasses == 2) Double.PositiveInfinity else Double.NegativeInfinity) } - (coefficientMatrix, interceptVector, Array.empty[Double]) + (coefMatrix, interceptVec, Array.empty[Double]) } else { if (!$(fitIntercept) && isConstantLabel) { logWarning(s"All labels belong to a single class and fitIntercept=false. 
It's a " + @@ -460,31 +465,34 @@ class LogisticRegression @Since("1.2.0") ( val initialCoefficientsWithIntercept = Vectors.zeros(numCoefficientSets * numFeaturesPlusIntercept) - val initialModelIsValid = optInitialModel.exists { model => - val providedCoefs = model.coefficientMatrix - val modelValid = (providedCoefs.numRows == numCoefficientSets) && - (providedCoefs.numCols == numFeatures) && - (model.interceptVector.size == numCoefficientSets) - if (!modelValid) { - logWarning(s"Initial coefficients will be ignored! Its dimensions " + - s"(${providedCoefs.numRows}, ${providedCoefs.numCols}) did not match the expected " + - s"size ($numCoefficientSets, $numFeatures)") - } - modelValid + val initialModelIsValid = optInitialModel match { + case Some(_initialModel) => + val providedCoefs = _initialModel.coefficientMatrix + val modelIsValid = (providedCoefs.numRows == numCoefficientSets) && + (providedCoefs.numCols == numFeatures) && + (_initialModel.interceptVector.size == numCoefficientSets) && + (_initialModel.getFitIntercept == $(fitIntercept)) + if (!modelIsValid) { + logWarning(s"Initial coefficients will be ignored! Its dimensions " + + s"(${providedCoefs.numRows}, ${providedCoefs.numCols}) did not match the " + + s"expected size ($numCoefficientSets, $numFeatures)") + } + modelIsValid + case None => false } if (initialModelIsValid) { - val initialCoefArray = initialCoefficientsWithIntercept.toArray + val initialCoefWithInterceptArray = initialCoefficientsWithIntercept.toArray val providedCoef = optInitialModel.get.coefficientMatrix providedCoef.foreachActive { (row, col, value) => val flatIndex = row * numFeaturesPlusIntercept + col // We need to scale the coefficients since they will be trained in the scaled space - initialCoefArray(flatIndex) = value * featuresStd(col) + initialCoefWithInterceptArray(flatIndex) = value * featuresStd(col) } if ($(fitIntercept)) { optInitialModel.get.interceptVector.foreachActive { (index, value) => val coefIndex = (index + 1) * numFeaturesPlusIntercept - 1 - initialCoefArray(coefIndex) = value + initialCoefWithInterceptArray(coefIndex) = value } } } else if ($(fitIntercept) && isMultinomial) { @@ -549,13 +557,13 @@ class LogisticRegression @Since("1.2.0") ( state = states.next() arrayBuilder += state.adjustedValue } + bcFeaturesStd.destroy(blocking = false) if (state == null) { val msg = s"${optimizer.getClass.getName} failed." 
logError(msg) throw new SparkException(msg) } - bcFeaturesStd.destroy(blocking = false) /* The coefficients are trained in the scaled space; we're converting them back to @@ -617,8 +625,8 @@ class LogisticRegression @Since("1.2.0") ( if (handlePersistence) instances.unpersist() - val model = copyValues(new LogisticRegressionModel(uid, coefficients, intercept, numClasses, - isMultinomial)) + val model = copyValues(new LogisticRegressionModel(uid, coefficientMatrix, interceptVector, + numClasses, isMultinomial)) // TODO: implement summary model for multinomial case val m = if (!isMultinomial) { val (summaryModel, probabilityColName) = model.findSummaryModelAndProbabilityCol() diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala index 31f991b3fd5e4..5af825ca0c0c3 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala @@ -26,7 +26,7 @@ import org.apache.spark.{SparkException, SparkFunSuite} import org.apache.spark.ml.attribute.NominalAttribute import org.apache.spark.ml.classification.LogisticRegressionSuite._ import org.apache.spark.ml.feature.LabeledPoint -import org.apache.spark.ml.linalg.{DenseMatrix, Matrices, Vector, Vectors} +import org.apache.spark.ml.linalg._ import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils} import org.apache.spark.ml.util.TestingUtils._ @@ -1849,7 +1849,7 @@ class LogisticRegressionSuite predictions3.zip(predictions4).foreach { case (Row(p1: Double), Row(p2: Double)) => assert(p1 === p2) } - // TODO: check that it converges in a single iteration when initial model is available + // TODO: check that it converges in a single iteration when model summary is available } test("logistic regression with all labels the same") { @@ -1894,6 +1894,26 @@ class LogisticRegressionSuite assert(allOneNoInterceptModel.summary.totalIterations > 0) } + test("compressed storage") { + val moreClassesThanFeatures = spark.createDataFrame(Seq( + LabeledPoint(4.0, Vectors.dense(0.0, 0.0, 0.0)), + LabeledPoint(4.0, Vectors.dense(1.0, 1.0, 1.0)), + LabeledPoint(4.0, Vectors.dense(2.0, 2.0, 2.0))) + ) + val mlr = new LogisticRegression().setFamily("multinomial") + val model = mlr.fit(moreClassesThanFeatures) + assert(model.coefficientMatrix.isInstanceOf[SparseMatrix]) + assert(model.coefficientMatrix.asInstanceOf[SparseMatrix].colPtrs.length === 4) + val moreFeaturesThanClasses = spark.createDataFrame(Seq( + LabeledPoint(1.0, Vectors.dense(0.0, 0.0, 0.0)), + LabeledPoint(1.0, Vectors.dense(1.0, 1.0, 1.0)), + LabeledPoint(1.0, Vectors.dense(2.0, 2.0, 2.0))) + ) + val model2 = mlr.fit(moreFeaturesThanClasses) + assert(model2.coefficientMatrix.isInstanceOf[SparseMatrix]) + assert(model2.coefficientMatrix.asInstanceOf[SparseMatrix].colPtrs.length === 3) + } + test("multiclass logistic regression with all labels the same") { val constantData = spark.createDataFrame(Seq( LabeledPoint(4.0, Vectors.dense(0.0)), From af8fb453e86b08956d06ee1f37ef3eb393287b74 Mon Sep 17 00:00:00 2001 From: sethah Date: Fri, 9 Sep 2016 13:49:49 -0700 Subject: [PATCH 15/24] rewriting family detection logic --- .../ml/classification/LogisticRegression.scala | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index b5ef73cfa8356..c4f283be0b4f4 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -359,14 +359,16 @@ class LogisticRegression @Since("1.2.0") ( case None => histogram.length } val isBinaryClassification = numClasses == 1 || numClasses == 2 - val isMultinomial = ($(family) == "auto" && !isBinaryClassification) || - ($(family) == "multinomial") - val numCoefficientSets = if (isMultinomial) numClasses else 1 - - if (!isMultinomial) { - require(isBinaryClassification, s"Binomial family only supports 1 or 2 " + + val isMultinomial = $(family) match { + case "binomial" => + require(isBinaryClassification, s"Binomial family only supports 1 or 2 " + s"outcome classes but found $numClasses.") + false + case "multinomial" => true + case "auto" => !isBinaryClassification + case other => throw new IllegalArgumentException(s"Unsupported family: $other") } + val numCoefficientSets = if (isMultinomial) numClasses else 1 if (isDefined(thresholds)) { require($(thresholds).length == numClasses, this.getClass.getSimpleName + From b27cb2c190f4cdc7bd8540c8cd83e55814e52378 Mon Sep 17 00:00:00 2001 From: sethah Date: Fri, 9 Sep 2016 14:32:22 -0700 Subject: [PATCH 16/24] set family explicitly in tests --- .../LogisticRegressionSuite.scala | 78 ++++++++++--------- 1 file changed, 43 insertions(+), 35 deletions(-) diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala index 5af825ca0c0c3..2060d7d113376 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala @@ -153,7 +153,7 @@ class LogisticRegressionSuite } test("setThreshold, getThreshold") { - val lr = new LogisticRegression + val lr = new LogisticRegression().setFamily("binomial") // default assert(lr.getThreshold === 0.5, "LogisticRegression.threshold should default to 0.5") withClue("LogisticRegression should not have thresholds set by default.") { @@ -170,7 +170,7 @@ class LogisticRegressionSuite lr.setThreshold(0.5) assert(lr.getThresholds === Array(0.5, 0.5)) // Set via thresholds - val lr2 = new LogisticRegression + val lr2 = new LogisticRegression().setFamily("binomial") lr2.setThresholds(Array(0.3, 0.7)) val expectedThreshold = 1.0 / (1.0 + 0.3 / 0.7) assert(lr2.getThreshold ~== expectedThreshold relTol 1E-7) @@ -234,11 +234,16 @@ class LogisticRegressionSuite } test("logistic regression doesn't fit intercept when fitIntercept is off") { - val lr = new LogisticRegression + val lr = new LogisticRegression().setFamily("binomial") lr.setFitIntercept(false) val model = lr.fit(smallBinaryDataset) assert(model.intercept === 0.0) + val mlr = new LogisticRegression().setFamily("multinomial") + mlr.setFitIntercept(false) + val mlrModel = mlr.fit(smallMultinomialDataset) + assert(mlrModel.interceptVector === Vectors.sparse(3, Seq())) + // copied model must have the same parent. 
MLTestingUtils.checkCopy(model) } @@ -288,7 +293,7 @@ class LogisticRegressionSuite } test("multinomial logistic regression: Predictor, Classifier methods") { - val mlr = new LogisticRegression + val mlr = new LogisticRegression().setFamily("multinomial") val model = mlr.fit(smallMultinomialDataset) assert(model.numClasses === 3) @@ -335,7 +340,7 @@ class LogisticRegressionSuite } test("binary logistic regression: Predictor, Classifier methods") { - val lr = new LogisticRegression + val lr = new LogisticRegression().setFamily("binomial") val model = lr.fit(smallBinaryDataset) assert(model.numClasses === 2) @@ -364,7 +369,7 @@ class LogisticRegressionSuite } test("coefficients and intercept methods") { - val mlr = new LogisticRegression().setMaxIter(1) + val mlr = new LogisticRegression().setMaxIter(1).setFamily("multinomial") val mlrModel = mlr.fit(smallMultinomialDataset) val thrownCoef = intercept[SparkException] { mlrModel.coefficients @@ -375,7 +380,7 @@ class LogisticRegressionSuite assert(thrownCoef.getMessage().contains("use coefficientMatrix instead")) assert(thrownIntercept.getMessage().contains("use interceptVector instead")) - val blr = new LogisticRegression().setMaxIter(1) + val blr = new LogisticRegression().setMaxIter(1).setFamily("binomial") val blrModel = blr.fit(smallBinaryDataset) assert(blrModel.coefficients.size === 1) assert(blrModel.intercept !== 0.0) @@ -1751,7 +1756,7 @@ class LogisticRegressionSuite val testData = spark.createDataFrame(Array.tabulate[LabeledPoint](numClasses) { i => LabeledPoint(i.toDouble, Vectors.dense(i.toDouble)) }) - val lr = new LogisticRegression().setWeightCol("weight") + val lr = new LogisticRegression().setFamily("binomial").setWeightCol("weight") val model = lr.fit(outlierData) val results = model.transform(testData).select("label", "prediction").collect() @@ -1775,7 +1780,7 @@ class LogisticRegressionSuite val testData = spark.createDataFrame(Array.tabulate[LabeledPoint](numClasses) { i => LabeledPoint(i.toDouble, Vectors.dense(i.toDouble)) }) - val mlr = new LogisticRegression().setWeightCol("weight") + val mlr = new LogisticRegression().setFamily("multinomial").setWeightCol("weight") val model = mlr.fit(outlierData) val results = model.transform(testData).select("label", "prediction").collect() @@ -1829,9 +1834,9 @@ class LogisticRegressionSuite } test("set initial model") { - val lr = new LogisticRegression() + val lr = new LogisticRegression().setFamily("binomial") val model1 = lr.fit(smallBinaryDataset) - val lr2 = new LogisticRegression().setInitialModel(model1).setMaxIter(5) + val lr2 = new LogisticRegression().setInitialModel(model1).setMaxIter(5).setFamily("binomial") val model2 = lr2.fit(smallBinaryDataset) val predictions1 = model1.transform(smallBinaryDataset).select("prediction").collect() val predictions2 = model2.transform(smallBinaryDataset).select("prediction").collect() @@ -1840,9 +1845,10 @@ class LogisticRegressionSuite } assert(model2.summary.totalIterations === 1) - val lr3 = new LogisticRegression() + val lr3 = new LogisticRegression().setFamily("multinomial") val model3 = lr3.fit(smallMultinomialDataset) - val lr4 = new LogisticRegression().setInitialModel(model3).setMaxIter(5) + val lr4 = new LogisticRegression() + .setInitialModel(model3).setMaxIter(5).setFamily("multinomial") val model4 = lr4.fit(smallMultinomialDataset) val predictions3 = model3.transform(smallMultinomialDataset).select("prediction").collect() val predictions4 = model4.transform(smallMultinomialDataset).select("prediction").collect() @@ 
-1852,7 +1858,7 @@ class LogisticRegressionSuite // TODO: check that it converges in a single iteration when model summary is available } - test("logistic regression with all labels the same") { + test("binary logistic regression with all labels the same") { val sameLabels = smallBinaryDataset .withColumn("zeroLabel", lit(0.0)) .withColumn("oneLabel", lit(1.0)) @@ -1861,6 +1867,7 @@ class LogisticRegressionSuite val lrIntercept = new LogisticRegression() .setFitIntercept(true) .setMaxIter(3) + .setFamily("binomial") val allZeroInterceptModel = lrIntercept .setLabelCol("zeroLabel") @@ -1880,6 +1887,7 @@ class LogisticRegressionSuite val lrNoIntercept = new LogisticRegression() .setFitIntercept(false) .setMaxIter(3) + .setFamily("binomial") val allZeroNoInterceptModel = lrNoIntercept .setLabelCol("zeroLabel") @@ -1894,26 +1902,6 @@ class LogisticRegressionSuite assert(allOneNoInterceptModel.summary.totalIterations > 0) } - test("compressed storage") { - val moreClassesThanFeatures = spark.createDataFrame(Seq( - LabeledPoint(4.0, Vectors.dense(0.0, 0.0, 0.0)), - LabeledPoint(4.0, Vectors.dense(1.0, 1.0, 1.0)), - LabeledPoint(4.0, Vectors.dense(2.0, 2.0, 2.0))) - ) - val mlr = new LogisticRegression().setFamily("multinomial") - val model = mlr.fit(moreClassesThanFeatures) - assert(model.coefficientMatrix.isInstanceOf[SparseMatrix]) - assert(model.coefficientMatrix.asInstanceOf[SparseMatrix].colPtrs.length === 4) - val moreFeaturesThanClasses = spark.createDataFrame(Seq( - LabeledPoint(1.0, Vectors.dense(0.0, 0.0, 0.0)), - LabeledPoint(1.0, Vectors.dense(1.0, 1.0, 1.0)), - LabeledPoint(1.0, Vectors.dense(2.0, 2.0, 2.0))) - ) - val model2 = mlr.fit(moreFeaturesThanClasses) - assert(model2.coefficientMatrix.isInstanceOf[SparseMatrix]) - assert(model2.coefficientMatrix.asInstanceOf[SparseMatrix].colPtrs.length === 3) - } - test("multiclass logistic regression with all labels the same") { val constantData = spark.createDataFrame(Seq( LabeledPoint(4.0, Vectors.dense(0.0)), @@ -1959,8 +1947,28 @@ class LogisticRegressionSuite // TODO: check num iters is zero when it become available in the model } + test("compressed storage") { + val moreClassesThanFeatures = spark.createDataFrame(Seq( + LabeledPoint(4.0, Vectors.dense(0.0, 0.0, 0.0)), + LabeledPoint(4.0, Vectors.dense(1.0, 1.0, 1.0)), + LabeledPoint(4.0, Vectors.dense(2.0, 2.0, 2.0))) + ) + val mlr = new LogisticRegression().setFamily("multinomial") + val model = mlr.fit(moreClassesThanFeatures) + assert(model.coefficientMatrix.isInstanceOf[SparseMatrix]) + assert(model.coefficientMatrix.asInstanceOf[SparseMatrix].colPtrs.length === 4) + val moreFeaturesThanClasses = spark.createDataFrame(Seq( + LabeledPoint(1.0, Vectors.dense(0.0, 0.0, 0.0)), + LabeledPoint(1.0, Vectors.dense(1.0, 1.0, 1.0)), + LabeledPoint(1.0, Vectors.dense(2.0, 2.0, 2.0))) + ) + val model2 = mlr.fit(moreFeaturesThanClasses) + assert(model2.coefficientMatrix.isInstanceOf[SparseMatrix]) + assert(model2.coefficientMatrix.asInstanceOf[SparseMatrix].colPtrs.length === 3) + } + test("numClasses specified in metadata/inferred") { - val lr = new LogisticRegression().setMaxIter(1) + val lr = new LogisticRegression().setMaxIter(1).setFamily("multinomial") // specify more classes than unique label values val labelMeta = NominalAttribute.defaultAttr.withName("label").withNumValues(4).toMetadata() From be030b5269518fd5c018e9e172cea7685addcb03 Mon Sep 17 00:00:00 2001 From: sethah Date: Fri, 9 Sep 2016 16:01:32 -0700 Subject: [PATCH 17/24] fix compression bug --- 
.../apache/spark/ml/classification/LogisticRegression.scala | 5 +++-- .../spark/ml/classification/LogisticRegressionSuite.scala | 5 +++++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index c4f283be0b4f4..fb491dd9d60be 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -358,6 +358,7 @@ class LogisticRegression @Since("1.2.0") ( n case None => histogram.length } + val isBinaryClassification = numClasses == 1 || numClasses == 2 val isMultinomial = $(family) match { case "binomial" => @@ -393,11 +394,11 @@ class LogisticRegression @Since("1.2.0") ( logWarning(s"All labels are the same value and fitIntercept=true, so the coefficients " + s"will be zeros. Training is not needed.") val constantLabelIndex = Vectors.dense(histogram).argmax - val coefMatrix = if (numFeatures < numClasses) { + val coefMatrix = if (numFeatures < numCoefficientSets) { new SparseMatrix(numCoefficientSets, numFeatures, Array.fill(numFeatures + 1)(0), Array.empty[Int], Array.empty[Double]) } else { - new SparseMatrix(numCoefficientSets, numFeatures, Array.fill(numClasses + 1)(0), + new SparseMatrix(numCoefficientSets, numFeatures, Array.fill(numCoefficientSets + 1)(0), Array.empty[Int], Array.empty[Double], isTransposed = true) } val interceptVec = if (isMultinomial) { diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala index 2060d7d113376..e3e3000018a12 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala @@ -1965,6 +1965,11 @@ class LogisticRegressionSuite val model2 = mlr.fit(moreFeaturesThanClasses) assert(model2.coefficientMatrix.isInstanceOf[SparseMatrix]) assert(model2.coefficientMatrix.asInstanceOf[SparseMatrix].colPtrs.length === 3) + + val blr = new LogisticRegression().setFamily("binomial") + val blrModel = blr.fit(moreFeaturesThanClasses) + assert(blrModel.coefficientMatrix.isInstanceOf[SparseMatrix]) + assert(blrModel.coefficientMatrix.asInstanceOf[SparseMatrix].colPtrs.length === 2) } test("numClasses specified in metadata/inferred") { From 73158e5b24e5e58de8284aef84297bdefa75e8ca Mon Sep 17 00:00:00 2001 From: sethah Date: Fri, 9 Sep 2016 19:18:56 -0700 Subject: [PATCH 18/24] use regex util --- .../apache/spark/ml/classification/LogisticRegression.scala | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index fb491dd9d60be..4eb9dfd9cc128 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -42,6 +42,7 @@ import org.apache.spark.sql.{DataFrame, Dataset, Row} import org.apache.spark.sql.functions.{col, lit} import org.apache.spark.sql.types.DoubleType import org.apache.spark.storage.StorageLevel +import org.apache.spark.util.VersionUtils /** * Params for logistic regression. 
@@ -298,6 +299,7 @@ class LogisticRegression @Since("1.2.0") ( * If the dimensions of features or the number of partitions are large, * this param could be adjusted to a larger size. * Default is 2. + * * @group expertSetParam */ @Since("2.1.0") @@ -966,8 +968,7 @@ object LogisticRegressionModel extends MLReadable[LogisticRegressionModel] { override def load(path: String): LogisticRegressionModel = { val metadata = DefaultParamsReader.loadMetadata(path, sc, className) - val versionRegex = "([0-9]+)\\.([0-9]+)\\.(.+)".r - val versionRegex(major, minor, _) = metadata.sparkVersion + val (major, minor) = VersionUtils.majorMinorVersion(metadata.sparkVersion) val dataPath = new Path(path, "data").toString val data = sparkSession.read.format("parquet").load(dataPath) @@ -1386,6 +1387,7 @@ class BinaryLogisticRegressionSummary private[classification] ( * $$ *

* + * * @param bcCoefficients The broadcast coefficients corresponding to the features. * @param bcFeaturesStd The broadcast standard deviation values of the features. * @param numClasses the number of possible outcomes for k classes classification problem in From f538e1e36c6be6201b4408afbc89f2a9daf6cea7 Mon Sep 17 00:00:00 2001 From: sethah Date: Mon, 12 Sep 2016 15:09:18 -0700 Subject: [PATCH 19/24] sparse storage for binary lor --- .../classification/LogisticRegression.scala | 49 ++++++++++++------- .../LogisticRegressionSuite.scala | 4 +- 2 files changed, 35 insertions(+), 18 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index 4eb9dfd9cc128..ed730230c6d29 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -299,7 +299,7 @@ class LogisticRegression @Since("1.2.0") ( * If the dimensions of features or the number of partitions are large, * this param could be adjusted to a larger size. * Default is 2. - * + * * @group expertSetParam */ @Since("2.1.0") @@ -361,14 +361,13 @@ class LogisticRegression @Since("1.2.0") ( case None => histogram.length } - val isBinaryClassification = numClasses == 1 || numClasses == 2 val isMultinomial = $(family) match { case "binomial" => - require(isBinaryClassification, s"Binomial family only supports 1 or 2 " + + require(numClasses == 1 || numClasses == 2, s"Binomial family only supports 1 or 2 " + s"outcome classes but found $numClasses.") false case "multinomial" => true - case "auto" => !isBinaryClassification + case "auto" => numClasses > 2 case other => throw new IllegalArgumentException(s"Unsupported family: $other") } val numCoefficientSets = if (isMultinomial) numClasses else 1 @@ -396,6 +395,7 @@ class LogisticRegression @Since("1.2.0") ( logWarning(s"All labels are the same value and fitIntercept=true, so the coefficients " + s"will be zeros. Training is not needed.") val constantLabelIndex = Vectors.dense(histogram).argmax + // TODO: use `compressed` after SPARK-17471 val coefMatrix = if (numFeatures < numCoefficientSets) { new SparseMatrix(numCoefficientSets, numFeatures, Array.fill(numFeatures + 1)(0), Array.empty[Int], Array.empty[Double]) @@ -587,21 +587,34 @@ class LogisticRegression @Since("1.2.0") ( 0.0 } } - val coefficientMatrix = - new DenseMatrix(numCoefficientSets, numFeatures, coefficientArray, isTransposed = true) if ($(regParam) == 0.0 && isMultinomial) { /* - When no regularization is applied, the coefficients lack identifiability because - we do not use a pivot class. We can add any constant value to the coefficients and - get the same likelihood. So here, we choose the mean centered coefficients for + When no regularization is applied, the multinomial coefficients lack identifiability + because we do not use a pivot class. We can add any constant value to the coefficients + and get the same likelihood. So here, we choose the mean centered coefficients for reproducibility. This method follows the approach in glmnet, described here: Friedman, et al. 
"Regularization Paths for Generalized Linear Models via Coordinate Descent," https://core.ac.uk/download/files/153/6287975.pdf */ - val coefficientMean = coefficientMatrix.values.sum / coefficientMatrix.values.length - coefficientMatrix.update(_ - coefficientMean) + val coefficientMean = coefficientArray.sum / coefficientArray.length + coefficientArray.indices.foreach { i => coefficientArray(i) -= coefficientMean} + } + + val denseCoefficientMatrix = + new DenseMatrix(numCoefficientSets, numFeatures, coefficientArray, isTransposed = true) + // TODO: use `denseCoefficientMatrix.compressed` after SPARK-17471 + val compressedCoefficientMatrix = if (isMultinomial) { + denseCoefficientMatrix + } else { + val compressedVector = Vectors.dense(coefficientArray).compressed + compressedVector match { + case dv: DenseVector => denseCoefficientMatrix + case sv: SparseVector => + new SparseMatrix(1, numFeatures, Array(0, sv.indices.length), sv.indices, sv.values, + isTransposed = true) + } } val interceptsArray: Array[Double] = if ($(fitIntercept)) { @@ -612,10 +625,8 @@ class LogisticRegression @Since("1.2.0") ( } else { Array[Double]() } - /* - The intercepts are never regularized, so we always center the mean. - */ val interceptVector = if (interceptsArray.nonEmpty && isMultinomial) { + // The intercepts are never regularized, so we always center the mean. val interceptMean = interceptsArray.sum / numClasses interceptsArray.indices.foreach { i => interceptsArray(i) -= interceptMean } Vectors.dense(interceptsArray) @@ -624,7 +635,7 @@ class LogisticRegression @Since("1.2.0") ( } else { Vectors.sparse(numCoefficientSets, Seq()) } - (coefficientMatrix, interceptVector, arrayBuilder.result()) + (compressedCoefficientMatrix, interceptVector.compressed, arrayBuilder.result()) } } @@ -687,8 +698,12 @@ class LogisticRegressionModel private[spark] ( // convert to appropriate vector representation without replicating data private lazy val _coefficients: Vector = coefficientMatrix match { case dm: DenseMatrix => Vectors.dense(dm.values) - // TODO: better way to flatten sparse matrix? 
- case sm: SparseMatrix => Vectors.fromBreeze(sm.asBreeze.flatten(View.Require)) + case sm: SparseMatrix => + if (coefficientMatrix.isTransposed) { + Vectors.sparse(coefficientMatrix.numCols, sm.rowIndices, sm.values) + } else { + throw new IllegalStateException("LogisticRegressionModel coefficients should be row major.") + } } @Since("1.3.0") diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala index e3e3000018a12..e7304401f324b 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala @@ -604,7 +604,9 @@ class LogisticRegressionSuite val coefficientsR2 = Vectors.dense(0.0, 0.0, -0.1665453, 0.0) assert(model2.intercept ~== interceptR2 relTol 1E-2) - assert(model2.coefficients ~= coefficientsR2 absTol 1E-3) + assert(model2.coefficients ~== coefficientsR2 absTol 1E-3) + // TODO: move this to a standalone test of compression after SPARK-17471 + assert(model2.coefficients.isInstanceOf[SparseVector]) } test("binary logistic regression without intercept with L1 regularization") { From a3a7d20414ff9a2c9df1fdf60417a4c307dd472f Mon Sep 17 00:00:00 2001 From: sethah Date: Tue, 13 Sep 2016 22:41:45 -0700 Subject: [PATCH 20/24] remove scores and address some review --- .../classification/LogisticRegression.scala | 46 ++++--------------- .../ProbabilisticClassifier.scala | 3 +- 2 files changed, 10 insertions(+), 39 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index ed730230c6d29..151002b37df65 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -19,7 +19,7 @@ package org.apache.spark.ml.classification import scala.collection.mutable -import breeze.linalg.{DenseVector => BDV, View} +import breeze.linalg.{DenseVector => BDV} import breeze.optimize.{CachedDiffFunction, DiffFunction, LBFGS => BreezeLBFGS, OWLQN => BreezeOWLQN} import org.apache.hadoop.fs.Path @@ -696,14 +696,13 @@ class LogisticRegressionModel private[spark] ( } // convert to appropriate vector representation without replicating data - private lazy val _coefficients: Vector = coefficientMatrix match { - case dm: DenseMatrix => Vectors.dense(dm.values) - case sm: SparseMatrix => - if (coefficientMatrix.isTransposed) { - Vectors.sparse(coefficientMatrix.numCols, sm.rowIndices, sm.values) - } else { - throw new IllegalStateException("LogisticRegressionModel coefficients should be row major.") - } + private lazy val _coefficients: Vector = { + require(coefficientMatrix.isTransposed, + "LogisticRegressionModel coefficients should be row major.") + coefficientMatrix match { + case dm: DenseMatrix => Vectors.dense(dm.values) + case sm: SparseMatrix => Vectors.sparse(coefficientMatrix.numCols, sm.rowIndices, sm.values) + } } @Since("1.3.0") @@ -746,35 +745,6 @@ class LogisticRegressionModel private[spark] ( 1.0 / (1.0 + math.exp(-m)) } - /** Score (probability) for each class label. */ - // TODO: do we need this anymore? 
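An aside on why the transposed branch added above can flatten so cheaply: for a CSR matrix (isTransposed = true) the rowIndices field actually stores column positions, so a 1 x numFeatures coefficient matrix is already a sparse vector in disguise and no Breeze round-trip is needed. A small sketch of that layout (the values are arbitrary):

    import org.apache.spark.ml.linalg.{SparseMatrix, Vectors}

    // One row, four columns, non-zeros at columns 1 and 3, stored row-major.
    val sm = new SparseMatrix(1, 4, Array(0, 2), Array(1, 3), Array(0.5, -0.25),
      isTransposed = true)
    // In CSR layout "rowIndices" holds column indices, so the single row
    // flattens without copying any structure:
    val flat = Vectors.sparse(sm.numCols, sm.rowIndices, sm.values)
    // flat == Vectors.sparse(4, Array(1, 3), Array(0.5, -0.25))

The non-transposed branch throws instead, because a column-major 1 x n matrix would need its indices re-derived from colPtrs; since the model always builds its matrix row-major, that branch should be unreachable, which the rewrite above makes explicit by replacing the IllegalStateException with an up-front require.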
-  private val scores: Vector => Vector = (features) => {
-    val m = margins(features)
-    val maxMarginIndex = m.argmax
-    val marginArray = m.toArray
-    val maxMargin = marginArray(maxMarginIndex)
-
-    // adjust margins for overflow
-    val sum = {
-      var temp = 0.0
-      var k = 0
-      while (k < numClasses) {
-        marginArray(k) = if (maxMargin > 0) {
-          math.exp(marginArray(k) - maxMargin)
-        } else {
-          math.exp(marginArray(k))
-        }
-        temp += marginArray(k)
-        k += 1
-      }
-      temp
-    }
-
-    val scores = Vectors.dense(marginArray)
-    BLAS.scal(1 / sum, scores)
-    scores
-  }
-
   @Since("1.6.0")
   override val numFeatures: Int = coefficientMatrix.numCols

diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/ProbabilisticClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/ProbabilisticClassifier.scala
index 1a07aab663030..1b6e77542cc80 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/ProbabilisticClassifier.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/ProbabilisticClassifier.scala
@@ -205,7 +205,8 @@ abstract class ProbabilisticClassificationModel[
     var argMax = 0
     var max = Double.NegativeInfinity
     var i = 0
-    while (i < probability.size) {
+    val probabilitySize = probability.size
+    while (i < probabilitySize) {
       if (thresholds(i) == 0.0) {
         max = Double.PositiveInfinity
         argMax = i

From cb1666e43ba604de780f937cc5d3eaac28e7a0f8 Mon Sep 17 00:00:00 2001
From: sethah
Date: Wed, 14 Sep 2016 07:42:16 -0700
Subject: [PATCH 21/24] transposed error in test suites

---
 .../org/apache/spark/ml/classification/OneVsRestSuite.scala | 4 ++--
 .../org/apache/spark/ml/tuning/CrossValidatorSuite.scala    | 4 ++--
 .../apache/spark/ml/tuning/TrainValidationSplitSuite.scala  | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/OneVsRestSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/OneVsRestSuite.scala
index 3ae47029c8dd8..01a043195ad3f 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/classification/OneVsRestSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/classification/OneVsRestSuite.scala
@@ -22,7 +22,7 @@ import org.apache.spark.ml.attribute.NominalAttribute
 import org.apache.spark.ml.classification.LogisticRegressionSuite._
 import org.apache.spark.ml.feature.LabeledPoint
 import org.apache.spark.ml.feature.StringIndexer
-import org.apache.spark.ml.linalg.{Matrices, Vectors}
+import org.apache.spark.ml.linalg.{DenseMatrix, Vectors}
 import org.apache.spark.ml.param.{ParamMap, ParamsSuite}
 import org.apache.spark.ml.util.{DefaultReadWriteTest, MetadataUtils, MLTestingUtils}
 import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS
@@ -61,7 +61,7 @@ class OneVsRestSuite extends SparkFunSuite with MLlibTestSparkContext with Defau
   test("params") {
     ParamsSuite.checkParams(new OneVsRest)
     val lrModel = new LogisticRegressionModel("logReg",
-      Matrices.dense(1, 1, Array(0.0)), Vectors.dense(0.0), 2, false)
+      new DenseMatrix(1, 1, Array(0.0), isTransposed = true), Vectors.dense(0.0), 2, false)
     val model = new OneVsRestModel("ovr", Metadata.empty, Array(lrModel))
     ParamsSuite.checkParams(model)
   }
diff --git a/mllib/src/test/scala/org/apache/spark/ml/tuning/CrossValidatorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/tuning/CrossValidatorSuite.scala
index 87c7c82e4c3b2..a0a2e87b10edf 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/tuning/CrossValidatorSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/tuning/CrossValidatorSuite.scala
@@ -23,7 +23,7 @@ import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressio
 import org.apache.spark.ml.classification.LogisticRegressionSuite.generateLogisticInput
 import org.apache.spark.ml.evaluation.{BinaryClassificationEvaluator, Evaluator, RegressionEvaluator}
 import org.apache.spark.ml.feature.HashingTF
-import org.apache.spark.ml.linalg.{Matrices, Vectors}
+import org.apache.spark.ml.linalg.{DenseMatrix, Vectors}
 import org.apache.spark.ml.param.{ParamMap, ParamPair}
 import org.apache.spark.ml.param.shared.HasInputCol
 import org.apache.spark.ml.regression.LinearRegression
@@ -245,7 +245,7 @@ class CrossValidatorSuite
     val lr = new LogisticRegression()
       .setThreshold(0.6)
     val lrModel = new LogisticRegressionModel(lr.uid,
-      Matrices.dense(1, 1, Array(0.0)), Vectors.dense(0.0), 2, false)
+      new DenseMatrix(1, 1, Array(0.0), isTransposed = true), Vectors.dense(0.0), 2, false)
       .setThreshold(0.6)
     val evaluator = new BinaryClassificationEvaluator()
       .setMetricName("areaUnderPR") // not default metric
diff --git a/mllib/src/test/scala/org/apache/spark/ml/tuning/TrainValidationSplitSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/tuning/TrainValidationSplitSuite.scala
index 6c58bed9812c1..39e23e6c45dbb 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/tuning/TrainValidationSplitSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/tuning/TrainValidationSplitSuite.scala
@@ -22,7 +22,7 @@ import org.apache.spark.ml.{Estimator, Model}
 import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel}
 import org.apache.spark.ml.classification.LogisticRegressionSuite.generateLogisticInput
 import org.apache.spark.ml.evaluation.{BinaryClassificationEvaluator, Evaluator, RegressionEvaluator}
-import org.apache.spark.ml.linalg.{Matrices, Vectors}
+import org.apache.spark.ml.linalg.{DenseMatrix, Vectors}
 import org.apache.spark.ml.param.ParamMap
 import org.apache.spark.ml.param.shared.HasInputCol
 import org.apache.spark.ml.regression.LinearRegression
@@ -134,7 +134,7 @@ class TrainValidationSplitSuite
     val lr = new LogisticRegression()
       .setThreshold(0.6)
     val lrModel = new LogisticRegressionModel(lr.uid,
-      Matrices.dense(1, 1, Array(0.0)), Vectors.dense(0.0), 2, false)
+      new DenseMatrix(1, 1, Array(0.0), isTransposed = true), Vectors.dense(0.0), 2, false)
       .setThreshold(0.6)
     val evaluator = new BinaryClassificationEvaluator()
     val paramMaps = new ParamGridBuilder()

From bd7fca10e2081372574a6c9dd59da4aca9aaf13e Mon Sep 17 00:00:00 2001
From: sethah
Date: Wed, 14 Sep 2016 14:02:26 -0700
Subject: [PATCH 22/24] update scaladoc and correct predict method

---
 .../classification/LogisticRegression.scala | 44 ++++++++++++++++++-
 1 file changed, 43 insertions(+), 1 deletion(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
index 151002b37df65..de5e23780c86a 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
@@ -687,6 +687,11 @@ class LogisticRegressionModel private[spark] (
   extends ProbabilisticClassificationModel[Vector, LogisticRegressionModel]
   with LogisticRegressionParams with MLWritable {

+  /**
+   * A vector of model coefficients for "binomial" logistic regression. If this model was trained
+   * using the "multinomial" family then an exception is thrown.
+ * @return Vector + */ @Since("2.0.0") def coefficients: Vector = if (isMultinomial) { throw new SparkException("Multinomial models contain a matrix of coefficients, use " + @@ -705,6 +710,11 @@ class LogisticRegressionModel private[spark] ( } } + /** + * The model intercept for "binomial" logistic regression. If this model was fit with the + * "multinomial" family then an exception is thrown. + * @return Double + */ @Since("1.3.0") def intercept: Double = if (isMultinomial) { throw new SparkException("Multinomial models contain a vector of intercepts, use " + @@ -745,6 +755,34 @@ class LogisticRegressionModel private[spark] ( 1.0 / (1.0 + math.exp(-m)) } + /** Score (probability) for each class label. */ + private val scores: Vector => Vector = (features) => { + val m = margins(features) + val maxMarginIndex = m.argmax + val marginArray = m.toArray + val maxMargin = marginArray(maxMarginIndex) + + // adjust margins for overflow + val sum = { + var temp = 0.0 + var k = 0 + while (k < numClasses) { + marginArray(k) = if (maxMargin > 0) { + math.exp(marginArray(k) - maxMargin) + } else { + math.exp(marginArray(k)) + } + temp += marginArray(k) + k += 1 + } + temp + } + + val scores = Vectors.dense(marginArray) + BLAS.scal(1 / sum, scores) + scores + } + @Since("1.6.0") override val numFeatures: Int = coefficientMatrix.numCols @@ -802,7 +840,11 @@ class LogisticRegressionModel private[spark] ( * The behavior of this can be adjusted using [[thresholds]]. */ override protected def predict(features: Vector): Double = if (isMultinomial) { - super.predict(features) + if (isDefined(thresholds)) { + probability2prediction(scores(features)) + } else { + super.predict(features) + } } else { // Note: We should use getThreshold instead of $(threshold) since getThreshold is overridden. if (score(features) > getThreshold) 1 else 0 From 38fad988956458aac59109613c7d468855a0faf8 Mon Sep 17 00:00:00 2001 From: sethah Date: Wed, 14 Sep 2016 17:05:19 -0700 Subject: [PATCH 23/24] revert predict changes and add tests --- .../classification/LogisticRegression.scala | 34 +-------------- .../LogisticRegressionSuite.scala | 43 ++++++++++++++++++- 2 files changed, 43 insertions(+), 34 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index de5e23780c86a..2229009571a0d 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -755,34 +755,6 @@ class LogisticRegressionModel private[spark] ( 1.0 / (1.0 + math.exp(-m)) } - /** Score (probability) for each class label. */ - private val scores: Vector => Vector = (features) => { - val m = margins(features) - val maxMarginIndex = m.argmax - val marginArray = m.toArray - val maxMargin = marginArray(maxMarginIndex) - - // adjust margins for overflow - val sum = { - var temp = 0.0 - var k = 0 - while (k < numClasses) { - marginArray(k) = if (maxMargin > 0) { - math.exp(marginArray(k) - maxMargin) - } else { - math.exp(marginArray(k)) - } - temp += marginArray(k) - k += 1 - } - temp - } - - val scores = Vectors.dense(marginArray) - BLAS.scal(1 / sum, scores) - scores - } - @Since("1.6.0") override val numFeatures: Int = coefficientMatrix.numCols @@ -840,11 +812,7 @@ class LogisticRegressionModel private[spark] ( * The behavior of this can be adjusted using [[thresholds]]. 
   */
  override protected def predict(features: Vector): Double = if (isMultinomial) {
-    if (isDefined(thresholds)) {
-      probability2prediction(scores(features))
-    } else {
-      super.predict(features)
-    }
+    super.predict(features)
  } else {
    // Note: We should use getThreshold instead of $(threshold) since getThreshold is overridden.
    if (score(features) > getThreshold) 1 else 0

diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
index e7304401f324b..e6d520f69bd7c 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
@@ -231,6 +231,12 @@ class LogisticRegressionSuite
     assert(scaledPredictions.zip(basePredictions).forall { case (scaled, base) =>
       scaled.getDouble(0) === base.getDouble(0)
     })
+
+    // force it to use the predict method
+    model.setRawPredictionCol("").setProbabilityCol("").setThresholds(Array(0, 1, 1))
+    val predictionsWithPredict =
+      model.transform(smallMultinomialDataset).select("prediction").collect()
+    assert(predictionsWithPredict.forall(_.getDouble(0) === 0.0))
   }

   test("logistic regression doesn't fit intercept when fitIntercept is off") {
@@ -293,6 +299,8 @@ class LogisticRegressionSuite
   }

   test("multinomial logistic regression: Predictor, Classifier methods") {
+    val sqlContext = smallMultinomialDataset.sqlContext
+    import sqlContext.implicits._
     val mlr = new LogisticRegression().setFamily("multinomial")

     val model = mlr.fit(smallMultinomialDataset)
@@ -337,9 +345,27 @@ class LogisticRegressionSuite
       val predFromProb = prob.toArray.zipWithIndex.maxBy(_._1)._2
       assert(pred == predFromProb)
     }
+
+    // force it to use probability2prediction
+    model.setProbabilityCol("")
+    val resultsUsingProb2Predict =
+      model.transform(smallMultinomialDataset).select("prediction").as[Double].collect()
+    resultsUsingProb2Predict.zip(results.select("prediction").as[Double].collect()).foreach {
+      case (pred1, pred2) => assert(pred1 === pred2)
+    }
+
+    // force it to use predict
+    model.setRawPredictionCol("").setProbabilityCol("")
+    val resultsUsingPredict =
+      model.transform(smallMultinomialDataset).select("prediction").as[Double].collect()
+    resultsUsingPredict.zip(results.select("prediction").as[Double].collect()).foreach {
+      case (pred1, pred2) => assert(pred1 === pred2)
+    }
   }

   test("binary logistic regression: Predictor, Classifier methods") {
+    val sqlContext = smallBinaryDataset.sqlContext
+    import sqlContext.implicits._
     val lr = new LogisticRegression().setFamily("binomial")

     val model = lr.fit(smallBinaryDataset)
@@ -347,7 +373,6 @@ class LogisticRegressionSuite

     val numFeatures = smallBinaryDataset.select("features").first().getAs[Vector](0).size
     assert(model.numFeatures === numFeatures)

-    val threshold = model.getThreshold
     val results = model.transform(smallBinaryDataset)

     // Compare rawPrediction with probability
@@ -366,6 +391,22 @@ class LogisticRegressionSuite
       val predFromProb = prob.toArray.zipWithIndex.maxBy(_._1)._2
       assert(pred == predFromProb)
     }
+
+    // force it to use probability2prediction
+    model.setProbabilityCol("")
+    val resultsUsingProb2Predict =
+      model.transform(smallBinaryDataset).select("prediction").as[Double].collect()
+    resultsUsingProb2Predict.zip(results.select("prediction").as[Double].collect()).foreach {
+      case (pred1, pred2) => assert(pred1 === pred2)
+    }
+
+    // force it to use predict
+    model.setRawPredictionCol("").setProbabilityCol("")
+    val resultsUsingPredict =
+      model.transform(smallBinaryDataset).select("prediction").as[Double].collect()
+    resultsUsingPredict.zip(results.select("prediction").as[Double].collect()).foreach {
+      case (pred1, pred2) => assert(pred1 === pred2)
+    }
   }

   test("coefficients and intercept methods") {

From 4dae59569732ace5cb2cf583d6db315fb3eda596 Mon Sep 17 00:00:00 2001
From: sethah
Date: Mon, 19 Sep 2016 17:47:56 -0700
Subject: [PATCH 24/24] code review, add secondary constructor

---
 .../spark/ml/classification/LogisticRegression.scala | 10 ++++++++--
 .../ml/classification/LogisticRegressionSuite.scala  |  5 ++---
 .../spark/ml/classification/OneVsRestSuite.scala     |  3 +--
 .../apache/spark/ml/tuning/CrossValidatorSuite.scala |  3 +--
 .../spark/ml/tuning/TrainValidationSplitSuite.scala  |  3 +--
 5 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
index 2229009571a0d..343d50c790e85 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
@@ -75,7 +75,6 @@ private[classification] trait LogisticRegressionParams extends ProbabilisticClas
     set(threshold, value)
   }

-
   /**
    * Param for the name of family which is a description of the label distribution
    * to be used in the model.
@@ -687,6 +686,14 @@ class LogisticRegressionModel private[spark] (
   extends ProbabilisticClassificationModel[Vector, LogisticRegressionModel]
   with LogisticRegressionParams with MLWritable {

+  require(coefficientMatrix.numRows == interceptVector.size, s"Dimension mismatch! Expected " +
+    s"coefficientMatrix.numRows == interceptVector.size, but ${coefficientMatrix.numRows} != " +
+    s"${interceptVector.size}")
+
+  private[spark] def this(uid: String, coefficients: Vector, intercept: Double) =
+    this(uid, new DenseMatrix(1, coefficients.size, coefficients.toArray, isTransposed = true),
+      Vectors.dense(intercept), 2, isMultinomial = false)
+
   /**
    * A vector of model coefficients for "binomial" logistic regression. If this model was trained
    * using the "multinomial" family then an exception is thrown.
@@ -1382,7 +1389,6 @@ class BinaryLogisticRegressionSummary private[classification] (
 * $$
 *

 *
- *
 * @param bcCoefficients The broadcast coefficients corresponding to the features.
 * @param bcFeaturesStd The broadcast standard deviation values of the features.
 * @param numClasses the number of possible outcomes for k classes classification problem in
diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
index e6d520f69bd7c..2623759f24d91 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
@@ -26,7 +26,7 @@ import org.apache.spark.{SparkException, SparkFunSuite}
 import org.apache.spark.ml.attribute.NominalAttribute
 import org.apache.spark.ml.classification.LogisticRegressionSuite._
 import org.apache.spark.ml.feature.LabeledPoint
-import org.apache.spark.ml.linalg._
+import org.apache.spark.ml.linalg.{DenseMatrix, Matrices, SparseMatrix, SparseVector, Vector, Vectors}
 import org.apache.spark.ml.param.ParamsSuite
 import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils}
 import org.apache.spark.ml.util.TestingUtils._
@@ -112,8 +112,7 @@ class LogisticRegressionSuite

   test("params") {
     ParamsSuite.checkParams(new LogisticRegression)
-    val model = new LogisticRegressionModel("logReg",
-      new DenseMatrix(1, 1, Array(0.0)), Vectors.dense(0.0), 2, isMultinomial = false)
+    val model = new LogisticRegressionModel("logReg", Vectors.dense(0.0), 0.0)
     ParamsSuite.checkParams(model)
   }

diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/OneVsRestSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/OneVsRestSuite.scala
index 01a043195ad3f..99dd5854ff649 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/classification/OneVsRestSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/classification/OneVsRestSuite.scala
@@ -60,8 +60,7 @@ class OneVsRestSuite extends SparkFunSuite with MLlibTestSparkContext with Defau

   test("params") {
     ParamsSuite.checkParams(new OneVsRest)
-    val lrModel = new LogisticRegressionModel("logReg",
-      new DenseMatrix(1, 1, Array(0.0), isTransposed = true), Vectors.dense(0.0), 2, false)
+    val lrModel = new LogisticRegressionModel("lr", Vectors.dense(0.0), 0.0)
     val model = new OneVsRestModel("ovr", Metadata.empty, Array(lrModel))
     ParamsSuite.checkParams(model)
   }
diff --git a/mllib/src/test/scala/org/apache/spark/ml/tuning/CrossValidatorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/tuning/CrossValidatorSuite.scala
index a0a2e87b10edf..750dc5bf01e6a 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/tuning/CrossValidatorSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/tuning/CrossValidatorSuite.scala
@@ -244,8 +244,7 @@ class CrossValidatorSuite
   test("read/write: CrossValidatorModel") {
     val lr = new LogisticRegression()
       .setThreshold(0.6)
-    val lrModel = new LogisticRegressionModel(lr.uid,
-      new DenseMatrix(1, 1, Array(0.0), isTransposed = true), Vectors.dense(0.0), 2, false)
+    val lrModel = new LogisticRegressionModel(lr.uid, Vectors.dense(1.0, 2.0), 1.2)
       .setThreshold(0.6)
     val evaluator = new BinaryClassificationEvaluator()
       .setMetricName("areaUnderPR") // not default metric
diff --git a/mllib/src/test/scala/org/apache/spark/ml/tuning/TrainValidationSplitSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/tuning/TrainValidationSplitSuite.scala
index 39e23e6c45dbb..9971371e47288 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/tuning/TrainValidationSplitSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/tuning/TrainValidationSplitSuite.scala
@@ -133,8 +133,7 @@ class TrainValidationSplitSuite
   test("read/write: TrainValidationSplitModel") {
     val lr = new LogisticRegression()
       .setThreshold(0.6)
-    val lrModel = new LogisticRegressionModel(lr.uid,
-      new DenseMatrix(1, 1, Array(0.0), isTransposed = true), Vectors.dense(0.0), 2, false)
+    val lrModel = new LogisticRegressionModel(lr.uid, Vectors.dense(1.0, 2.0), 1.2)
       .setThreshold(0.6)
     val evaluator = new BinaryClassificationEvaluator()
     val paramMaps = new ParamGridBuilder()
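The simplified model constructions in the last three hunks rely on the secondary constructor added at the top of this patch. A sketch of the equivalence (the uid and values are arbitrary; the constructor is private[spark], so this only compiles inside Spark's own packages):

    import org.apache.spark.ml.classification.LogisticRegressionModel
    import org.apache.spark.ml.linalg.{DenseMatrix, Vectors}

    // new LogisticRegressionModel("lr", Vectors.dense(1.0, 2.0), 1.2)
    // expands to the primary constructor as:
    val model = new LogisticRegressionModel("lr",
      new DenseMatrix(1, 2, Array(1.0, 2.0), isTransposed = true), // 1 x numFeatures, row-major
      Vectors.dense(1.2), // a single intercept
      2,                  // numClasses
      isMultinomial = false)

This shape satisfies the new require by construction: coefficientMatrix.numRows == 1 == interceptVector.size, one row of coefficients paired with exactly one intercept.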