From fbf6b5cab51544c9230567e9479528a9bd8960c5 Mon Sep 17 00:00:00 2001 From: Feynman Liang Date: Wed, 13 Jan 2016 17:52:56 +0000 Subject: [PATCH 1/8] Initial fix and println unit test --- .../ml/classification/LogisticRegression.scala | 8 +++++--- .../classification/LogisticRegressionSuite.scala | 14 ++++++++++++++ 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index 486043e8d9741..3040081df9bc7 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -339,9 +339,11 @@ class LogisticRegression @Since("1.2.0") ( b = \log{P(1) / P(0)} = \log{count_1 / count_0} }}} */ - initialCoefficientsWithIntercept.toArray(numFeatures) - = math.log(histogram(1) / histogram(0)) - } + if (histogram.length >= 2) { // check to make sure indexing into histogram(1) is safe + initialCoefficientsWithIntercept.toArray(numFeatures) = + math.log(histogram(1) / histogram(0)) + } + } val states = optimizer.iterations(new CachedDiffFunction(costFun), initialCoefficientsWithIntercept.toBreeze.toDenseVector) diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala index ff0d0ff771042..d41540822449d 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala @@ -883,6 +883,20 @@ class LogisticRegressionSuite assert(model1a0.intercept ~== model1b.intercept absTol 1E-3) } + test("logistic regression with all labels the same") { + val lr = new LogisticRegression() + .setFitIntercept(true) + .setMaxIter(3) + .setLabelCol("sameLabel") + val sameLabelDataset = dataset.withColumn(lit(0.0), "sameLabel") + val model = lr.fit(sameLabelDataset) + val predictions = model.transform(sameLabelDataset) + + predictions.show() + println(model.weights) + println(model.intercept) + } + test("read/write") { def checkModelData(model: LogisticRegressionModel, model2: LogisticRegressionModel): Unit = { assert(model.intercept === model2.intercept) From e4c13d4a89abc8160f1c2fa906cb3e3d1affd473 Mon Sep 17 00:00:00 2001 From: Feynman Liang Date: Wed, 13 Jan 2016 19:23:49 +0000 Subject: [PATCH 2/8] Cleans up test --- .../classification/LogisticRegression.scala | 4 +- .../LogisticRegressionSuite.scala | 51 ++++++++++--------- 2 files changed, 29 insertions(+), 26 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index 3040081df9bc7..cf7da12286b7f 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -340,8 +340,8 @@ class LogisticRegression @Since("1.2.0") ( }}} */ if (histogram.length >= 2) { // check to make sure indexing into histogram(1) is safe - initialCoefficientsWithIntercept.toArray(numFeatures) = - math.log(histogram(1) / histogram(0)) + initialCoefficientsWithIntercept.toArray(numFeatures) = math.log( + histogram(1) / histogram(0)) } } diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala index d41540822449d..a0d425f22dbed 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala @@ -30,6 +30,7 @@ import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ import org.apache.spark.sql.{DataFrame, Row} +import org.apache.spark.sql.functions.lit class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { @@ -302,7 +303,7 @@ class LogisticRegressionSuite val model1 = trainer1.fit(binaryDataset) val model2 = trainer2.fit(binaryDataset) - /* + [> Using the following R code to load the data and train the model using glmnet package. library("glmnet") @@ -338,7 +339,7 @@ class LogisticRegressionSuite val model1 = trainer1.fit(binaryDataset) val model2 = trainer2.fit(binaryDataset) - /* + [> Using the following R code to load the data and train the model using glmnet package. library("glmnet") @@ -377,7 +378,7 @@ class LogisticRegressionSuite val model1 = trainer1.fit(binaryDataset) val model2 = trainer2.fit(binaryDataset) - /* + [> Using the following R code to load the data and train the model using glmnet package. library("glmnet") @@ -401,7 +402,7 @@ class LogisticRegressionSuite assert(model1.intercept ~== interceptR1 relTol 1E-2) assert(model1.coefficients ~= coefficientsR1 absTol 2E-2) - /* + [> Using the following R code to load the data and train the model using glmnet package. library("glmnet") @@ -436,7 +437,7 @@ class LogisticRegressionSuite val model1 = trainer1.fit(binaryDataset) val model2 = trainer2.fit(binaryDataset) - /* + [> Using the following R code to load the data and train the model using glmnet package. library("glmnet") @@ -461,7 +462,7 @@ class LogisticRegressionSuite assert(model1.intercept ~== interceptR1 relTol 1E-3) assert(model1.coefficients ~= coefficientsR1 absTol 1E-3) - /* + [> Using the following R code to load the data and train the model using glmnet package. library("glmnet") @@ -496,7 +497,7 @@ class LogisticRegressionSuite val model1 = trainer1.fit(binaryDataset) val model2 = trainer2.fit(binaryDataset) - /* + [> Using the following R code to load the data and train the model using glmnet package. library("glmnet") @@ -520,7 +521,7 @@ class LogisticRegressionSuite assert(model1.intercept ~== interceptR1 relTol 1E-3) assert(model1.coefficients ~= coefficientsR1 relTol 1E-3) - /* + [> Using the following R code to load the data and train the model using glmnet package. library("glmnet") @@ -555,7 +556,7 @@ class LogisticRegressionSuite val model1 = trainer1.fit(binaryDataset) val model2 = trainer2.fit(binaryDataset) - /* + [> Using the following R code to load the data and train the model using glmnet package. library("glmnet") @@ -580,7 +581,7 @@ class LogisticRegressionSuite assert(model1.intercept ~== interceptR1 absTol 1E-3) assert(model1.coefficients ~= coefficientsR1 relTol 1E-2) - /* + [> Using the following R code to load the data and train the model using glmnet package. library("glmnet") @@ -615,7 +616,7 @@ class LogisticRegressionSuite val model1 = trainer1.fit(binaryDataset) val model2 = trainer2.fit(binaryDataset) - /* + [> Using the following R code to load the data and train the model using glmnet package. library("glmnet") @@ -639,7 +640,7 @@ class LogisticRegressionSuite assert(model1.intercept ~== interceptR1 relTol 6E-3) assert(model1.coefficients ~== coefficientsR1 absTol 5E-3) - /* + [> Using the following R code to load the data and train the model using glmnet package. library("glmnet") @@ -674,7 +675,7 @@ class LogisticRegressionSuite val model1 = trainer1.fit(binaryDataset) val model2 = trainer2.fit(binaryDataset) - /* + [> Using the following R code to load the data and train the model using glmnet package. library("glmnet") @@ -699,7 +700,7 @@ class LogisticRegressionSuite assert(model1.intercept ~== interceptR1 relTol 1E-3) assert(model1.coefficients ~= coefficientsR1 absTol 1E-2) - /* + [> Using the following R code to load the data and train the model using glmnet package. library("glmnet") @@ -744,7 +745,7 @@ class LogisticRegressionSuite classSummarizer1.merge(classSummarizer2) }).histogram - /* + [> For binary logistic regression with strong L1 regularization, all the coefficients will be zeros. As a result, {{{ @@ -764,7 +765,7 @@ class LogisticRegressionSuite assert(model2.intercept ~== interceptTheory relTol 1E-5) assert(model2.coefficients ~= coefficientsTheory absTol 1E-6) - /* + [> Using the following R code to load the data and train the model using glmnet package. library("glmnet") @@ -887,14 +888,16 @@ class LogisticRegressionSuite val lr = new LogisticRegression() .setFitIntercept(true) .setMaxIter(3) - .setLabelCol("sameLabel") - val sameLabelDataset = dataset.withColumn(lit(0.0), "sameLabel") - val model = lr.fit(sameLabelDataset) - val predictions = model.transform(sameLabelDataset) - - predictions.show() - println(model.weights) - println(model.intercept) + val sameLabels = dataset + .withColumn("zeroLabel", lit(0.0)) + .withColumn("oneLabel", lit(1.0)) + + val model = lr + .setLabelCol("oneLabel") + .fit(sameLabels) + + assert(model.coefficients ~== Vectors.dense(0.0) absTol 1E-3) + assert(model.intercept === Double.PositiveInfinity) } test("read/write") { From caf7a1b2cd4336134d1f29e0fb4432a67d44288e Mon Sep 17 00:00:00 2001 From: Feynman Liang Date: Wed, 13 Jan 2016 19:26:04 +0000 Subject: [PATCH 3/8] Fix R comments --- .../LogisticRegressionSuite.scala | 32 +++++++++---------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala index a0d425f22dbed..8ac7bb7db0c1b 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala @@ -303,7 +303,7 @@ class LogisticRegressionSuite val model1 = trainer1.fit(binaryDataset) val model2 = trainer2.fit(binaryDataset) - [> + /* Using the following R code to load the data and train the model using glmnet package. library("glmnet") @@ -339,7 +339,7 @@ class LogisticRegressionSuite val model1 = trainer1.fit(binaryDataset) val model2 = trainer2.fit(binaryDataset) - [> + /* Using the following R code to load the data and train the model using glmnet package. library("glmnet") @@ -378,7 +378,7 @@ class LogisticRegressionSuite val model1 = trainer1.fit(binaryDataset) val model2 = trainer2.fit(binaryDataset) - [> + /* Using the following R code to load the data and train the model using glmnet package. library("glmnet") @@ -402,7 +402,7 @@ class LogisticRegressionSuite assert(model1.intercept ~== interceptR1 relTol 1E-2) assert(model1.coefficients ~= coefficientsR1 absTol 2E-2) - [> + /* Using the following R code to load the data and train the model using glmnet package. library("glmnet") @@ -437,7 +437,7 @@ class LogisticRegressionSuite val model1 = trainer1.fit(binaryDataset) val model2 = trainer2.fit(binaryDataset) - [> + /* Using the following R code to load the data and train the model using glmnet package. library("glmnet") @@ -462,7 +462,7 @@ class LogisticRegressionSuite assert(model1.intercept ~== interceptR1 relTol 1E-3) assert(model1.coefficients ~= coefficientsR1 absTol 1E-3) - [> + /* Using the following R code to load the data and train the model using glmnet package. library("glmnet") @@ -497,7 +497,7 @@ class LogisticRegressionSuite val model1 = trainer1.fit(binaryDataset) val model2 = trainer2.fit(binaryDataset) - [> + /* Using the following R code to load the data and train the model using glmnet package. library("glmnet") @@ -521,7 +521,7 @@ class LogisticRegressionSuite assert(model1.intercept ~== interceptR1 relTol 1E-3) assert(model1.coefficients ~= coefficientsR1 relTol 1E-3) - [> + /* Using the following R code to load the data and train the model using glmnet package. library("glmnet") @@ -556,7 +556,7 @@ class LogisticRegressionSuite val model1 = trainer1.fit(binaryDataset) val model2 = trainer2.fit(binaryDataset) - [> + /* Using the following R code to load the data and train the model using glmnet package. library("glmnet") @@ -581,7 +581,7 @@ class LogisticRegressionSuite assert(model1.intercept ~== interceptR1 absTol 1E-3) assert(model1.coefficients ~= coefficientsR1 relTol 1E-2) - [> + /* Using the following R code to load the data and train the model using glmnet package. library("glmnet") @@ -616,7 +616,7 @@ class LogisticRegressionSuite val model1 = trainer1.fit(binaryDataset) val model2 = trainer2.fit(binaryDataset) - [> + /* Using the following R code to load the data and train the model using glmnet package. library("glmnet") @@ -640,7 +640,7 @@ class LogisticRegressionSuite assert(model1.intercept ~== interceptR1 relTol 6E-3) assert(model1.coefficients ~== coefficientsR1 absTol 5E-3) - [> + /* Using the following R code to load the data and train the model using glmnet package. library("glmnet") @@ -675,7 +675,7 @@ class LogisticRegressionSuite val model1 = trainer1.fit(binaryDataset) val model2 = trainer2.fit(binaryDataset) - [> + /* Using the following R code to load the data and train the model using glmnet package. library("glmnet") @@ -700,7 +700,7 @@ class LogisticRegressionSuite assert(model1.intercept ~== interceptR1 relTol 1E-3) assert(model1.coefficients ~= coefficientsR1 absTol 1E-2) - [> + /* Using the following R code to load the data and train the model using glmnet package. library("glmnet") @@ -745,7 +745,7 @@ class LogisticRegressionSuite classSummarizer1.merge(classSummarizer2) }).histogram - [> + /* For binary logistic regression with strong L1 regularization, all the coefficients will be zeros. As a result, {{{ @@ -765,7 +765,7 @@ class LogisticRegressionSuite assert(model2.intercept ~== interceptTheory relTol 1E-5) assert(model2.coefficients ~= coefficientsTheory absTol 1E-6) - [> + /* Using the following R code to load the data and train the model using glmnet package. library("glmnet") From d676f6245622cedf61df3875ca0873d77f72a857 Mon Sep 17 00:00:00 2001 From: Feynman Liang Date: Sun, 17 Jan 2016 14:14:55 +0000 Subject: [PATCH 4/8] Returns model immediately for same labels --- .../classification/LogisticRegression.scala | 202 +++++++++--------- .../LogisticRegressionSuite.scala | 15 +- 2 files changed, 115 insertions(+), 102 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index cf7da12286b7f..10692506cd69b 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -276,115 +276,123 @@ class LogisticRegression @Since("1.2.0") ( val numClasses = histogram.length val numFeatures = summarizer.mean.size - if (numInvalid != 0) { - val msg = s"Classification labels should be in {0 to ${numClasses - 1} " + - s"Found $numInvalid invalid labels." - logError(msg) - throw new SparkException(msg) - } - - if (numClasses > 2) { - val msg = s"Currently, LogisticRegression with ElasticNet in ML package only supports " + - s"binary classification. Found $numClasses in the input dataset." - logError(msg) - throw new SparkException(msg) - } + val (coefficients, intercept, objectiveHistory) = { + if (numInvalid != 0) { + val msg = s"Classification labels should be in {0 to ${numClasses - 1} " + + s"Found $numInvalid invalid labels." + logError(msg) + throw new SparkException(msg) + } - val featuresMean = summarizer.mean.toArray - val featuresStd = summarizer.variance.toArray.map(math.sqrt) + if (numClasses > 2) { + val msg = s"Currently, LogisticRegression with ElasticNet in ML package only supports " + + s"binary classification. Found $numClasses in the input dataset." + logError(msg) + throw new SparkException(msg) + } else if ($(fitIntercept) && numClasses == 2 && histogram(0) == 0.0) { + logWarning(s"All labels are one and fitIntercept=true, so the coefficients will be " + + s"zeros and the intercept will be positive infinity; as a result, " + + s"training is not needed.") + (Vectors.sparse(numFeatures, Seq()), Double.PositiveInfinity, Array.empty[Double]) + } else if ($(fitIntercept) && numClasses == 1) { + logWarning(s"All labels are one and fitIntercept=true, so the coefficients will be " + + s"zeros and the intercept will be negative infinity; as a result, " + + s"training is not needed.") + (Vectors.sparse(numFeatures, Seq()), Double.NegativeInfinity, Array.empty[Double]) + } else { + val featuresMean = summarizer.mean.toArray + val featuresStd = summarizer.variance.toArray.map(math.sqrt) - val regParamL1 = $(elasticNetParam) * $(regParam) - val regParamL2 = (1.0 - $(elasticNetParam)) * $(regParam) + val regParamL1 = $(elasticNetParam) * $(regParam) + val regParamL2 = (1.0 - $(elasticNetParam)) * $(regParam) - val costFun = new LogisticCostFun(instances, numClasses, $(fitIntercept), $(standardization), - featuresStd, featuresMean, regParamL2) + val costFun = new LogisticCostFun(instances, numClasses, $(fitIntercept), + $(standardization), featuresStd, featuresMean, regParamL2) - val optimizer = if ($(elasticNetParam) == 0.0 || $(regParam) == 0.0) { - new BreezeLBFGS[BDV[Double]]($(maxIter), 10, $(tol)) - } else { - def regParamL1Fun = (index: Int) => { - // Remove the L1 penalization on the intercept - if (index == numFeatures) { - 0.0 + val optimizer = if ($(elasticNetParam) == 0.0 || $(regParam) == 0.0) { + new BreezeLBFGS[BDV[Double]]($(maxIter), 10, $(tol)) } else { - if ($(standardization)) { - regParamL1 - } else { - // If `standardization` is false, we still standardize the data - // to improve the rate of convergence; as a result, we have to - // perform this reverse standardization by penalizing each component - // differently to get effectively the same objective function when - // the training dataset is not standardized. - if (featuresStd(index) != 0.0) regParamL1 / featuresStd(index) else 0.0 + def regParamL1Fun = (index: Int) => { + // Remove the L1 penalization on the intercept + if (index == numFeatures) { + 0.0 + } else { + if ($(standardization)) { + regParamL1 + } else { + // If `standardization` is false, we still standardize the data + // to improve the rate of convergence; as a result, we have to + // perform this reverse standardization by penalizing each component + // differently to get effectively the same objective function when + // the training dataset is not standardized. + if (featuresStd(index) != 0.0) regParamL1 / featuresStd(index) else 0.0 + } + } } + new BreezeOWLQN[Int, BDV[Double]]($(maxIter), 10, regParamL1Fun, $(tol)) } - } - new BreezeOWLQN[Int, BDV[Double]]($(maxIter), 10, regParamL1Fun, $(tol)) - } - val initialCoefficientsWithIntercept = - Vectors.zeros(if ($(fitIntercept)) numFeatures + 1 else numFeatures) - - if ($(fitIntercept)) { - /* - For binary logistic regression, when we initialize the coefficients as zeros, - it will converge faster if we initialize the intercept such that - it follows the distribution of the labels. - - {{{ - P(0) = 1 / (1 + \exp(b)), and - P(1) = \exp(b) / (1 + \exp(b)) - }}}, hence - {{{ - b = \log{P(1) / P(0)} = \log{count_1 / count_0} - }}} - */ - if (histogram.length >= 2) { // check to make sure indexing into histogram(1) is safe - initialCoefficientsWithIntercept.toArray(numFeatures) = math.log( - histogram(1) / histogram(0)) - } - } - - val states = optimizer.iterations(new CachedDiffFunction(costFun), - initialCoefficientsWithIntercept.toBreeze.toDenseVector) + val initialCoefficientsWithIntercept = + Vectors.zeros(if ($(fitIntercept)) numFeatures + 1 else numFeatures) + + if ($(fitIntercept)) { + /* + For binary logistic regression, when we initialize the coefficients as zeros, + it will converge faster if we initialize the intercept such that + it follows the distribution of the labels. + + {{{ + P(0) = 1 / (1 + \exp(b)), and + P(1) = \exp(b) / (1 + \exp(b)) + }}}, hence + {{{ + b = \log{P(1) / P(0)} = \log{count_1 / count_0} + }}} + */ + initialCoefficientsWithIntercept.toArray(numFeatures) = math.log( + histogram(1) / histogram(0)) + } - val (coefficients, intercept, objectiveHistory) = { - /* - Note that in Logistic Regression, the objective history (loss + regularization) - is log-likelihood which is invariance under feature standardization. As a result, - the objective history from optimizer is the same as the one in the original space. - */ - val arrayBuilder = mutable.ArrayBuilder.make[Double] - var state: optimizer.State = null - while (states.hasNext) { - state = states.next() - arrayBuilder += state.adjustedValue - } + val states = optimizer.iterations(new CachedDiffFunction(costFun), + initialCoefficientsWithIntercept.toBreeze.toDenseVector) + + /* + Note that in Logistic Regression, the objective history (loss + regularization) + is log-likelihood which is invariance under feature standardization. As a result, + the objective history from optimizer is the same as the one in the original space. + */ + val arrayBuilder = mutable.ArrayBuilder.make[Double] + var state: optimizer.State = null + while (states.hasNext) { + state = states.next() + arrayBuilder += state.adjustedValue + } - if (state == null) { - val msg = s"${optimizer.getClass.getName} failed." - logError(msg) - throw new SparkException(msg) - } + if (state == null) { + val msg = s"${optimizer.getClass.getName} failed." + logError(msg) + throw new SparkException(msg) + } - /* - The coefficients are trained in the scaled space; we're converting them back to - the original space. - Note that the intercept in scaled space and original space is the same; - as a result, no scaling is needed. - */ - val rawCoefficients = state.x.toArray.clone() - var i = 0 - while (i < numFeatures) { - rawCoefficients(i) *= { if (featuresStd(i) != 0.0) 1.0 / featuresStd(i) else 0.0 } - i += 1 - } + /* + The coefficients are trained in the scaled space; we're converting them back to + the original space. + Note that the intercept in scaled space and original space is the same; + as a result, no scaling is needed. + */ + val rawCoefficients = state.x.toArray.clone() + var i = 0 + while (i < numFeatures) { + rawCoefficients(i) *= { if (featuresStd(i) != 0.0) 1.0 / featuresStd(i) else 0.0 } + i += 1 + } - if ($(fitIntercept)) { - (Vectors.dense(rawCoefficients.dropRight(1)).compressed, rawCoefficients.last, - arrayBuilder.result()) - } else { - (Vectors.dense(rawCoefficients).compressed, 0.0, arrayBuilder.result()) + if ($(fitIntercept)) { + (Vectors.dense(rawCoefficients.dropRight(1)).compressed, rawCoefficients.last, + arrayBuilder.result()) + } else { + (Vectors.dense(rawCoefficients).compressed, 0.0, arrayBuilder.result()) + } } } diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala index 8ac7bb7db0c1b..5f9a2371cd07f 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala @@ -884,7 +884,7 @@ class LogisticRegressionSuite assert(model1a0.intercept ~== model1b.intercept absTol 1E-3) } - test("logistic regression with all labels the same") { + test("logistic regression with fitIntercept=true and all labels the same") { val lr = new LogisticRegression() .setFitIntercept(true) .setMaxIter(3) @@ -892,12 +892,17 @@ class LogisticRegressionSuite .withColumn("zeroLabel", lit(0.0)) .withColumn("oneLabel", lit(1.0)) - val model = lr - .setLabelCol("oneLabel") + val allZeroModel = lr + .setLabelCol("zeroLabel") .fit(sameLabels) + assert(allZeroModel.coefficients ~== Vectors.dense(0.0) absTol 1E-3) + assert(allZeroModel.intercept === Double.NegativeInfinity) - assert(model.coefficients ~== Vectors.dense(0.0) absTol 1E-3) - assert(model.intercept === Double.PositiveInfinity) + val allOneModel = lr + .setLabelCol("oneLabel") + .fit(sameLabels) + assert(allOneModel.coefficients ~== Vectors.dense(0.0) absTol 1E-3) + assert(allOneModel.intercept === Double.PositiveInfinity) } test("read/write") { From 0f4824d9f358f451968aa2a6ab2b31afd85cda64 Mon Sep 17 00:00:00 2001 From: Feynman Liang Date: Mon, 18 Jan 2016 15:15:41 +0000 Subject: [PATCH 5/8] Tests and cleanup from code review --- .../classification/LogisticRegression.scala | 24 +++++------ .../LogisticRegressionSuite.scala | 41 ++++++++++++++----- 2 files changed, 43 insertions(+), 22 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index 10692506cd69b..0b3ba09bf698f 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -295,7 +295,7 @@ class LogisticRegression @Since("1.2.0") ( s"training is not needed.") (Vectors.sparse(numFeatures, Seq()), Double.PositiveInfinity, Array.empty[Double]) } else if ($(fitIntercept) && numClasses == 1) { - logWarning(s"All labels are one and fitIntercept=true, so the coefficients will be " + + logWarning(s"All labels are zero and fitIntercept=true, so the coefficients will be " + s"zeros and the intercept will be negative infinity; as a result, " + s"training is not needed.") (Vectors.sparse(numFeatures, Seq()), Double.NegativeInfinity, Array.empty[Double]) @@ -337,17 +337,17 @@ class LogisticRegression @Since("1.2.0") ( if ($(fitIntercept)) { /* - For binary logistic regression, when we initialize the coefficients as zeros, - it will converge faster if we initialize the intercept such that - it follows the distribution of the labels. - - {{{ - P(0) = 1 / (1 + \exp(b)), and - P(1) = \exp(b) / (1 + \exp(b)) - }}}, hence - {{{ - b = \log{P(1) / P(0)} = \log{count_1 / count_0} - }}} + For binary logistic regression, when we initialize the coefficients as zeros, + it will converge faster if we initialize the intercept such that + it follows the distribution of the labels. + + {{{ + P(0) = 1 / (1 + \exp(b)), and + P(1) = \exp(b) / (1 + \exp(b)) + }}}, hence + {{{ + b = \log{P(1) / P(0)} = \log{count_1 / count_0} + }}} */ initialCoefficientsWithIntercept.toArray(numFeatures) = math.log( histogram(1) / histogram(0)) diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala index 5f9a2371cd07f..fb79a7c1f5f72 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala @@ -884,25 +884,46 @@ class LogisticRegressionSuite assert(model1a0.intercept ~== model1b.intercept absTol 1E-3) } - test("logistic regression with fitIntercept=true and all labels the same") { - val lr = new LogisticRegression() - .setFitIntercept(true) - .setMaxIter(3) + test("logistic regression with all labels the same") { val sameLabels = dataset .withColumn("zeroLabel", lit(0.0)) .withColumn("oneLabel", lit(1.0)) - val allZeroModel = lr + // fitIntercept=true + val lrIntercept = new LogisticRegression() + .setFitIntercept(true) + .setMaxIter(3) + + val allZeroInterceptModel = lrIntercept + .setLabelCol("zeroLabel") + .fit(sameLabels) + assert(allZeroInterceptModel.coefficients ~== Vectors.dense(0.0) absTol 1E-3) + assert(allZeroInterceptModel.intercept === Double.NegativeInfinity) + assert(allZeroInterceptModel.summary.totalIterations === 0) + + val allOneInterceptModel = lrIntercept + .setLabelCol("oneLabel") + .fit(sameLabels) + assert(allOneInterceptModel.coefficients ~== Vectors.dense(0.0) absTol 1E-3) + assert(allOneInterceptModel.intercept === Double.PositiveInfinity) + assert(allOneInterceptModel.summary.totalIterations === 0) + + // fitIntercept=false + val lrNoIntercept= new LogisticRegression() + .setFitIntercept(false) + .setMaxIter(3) + + val allZeroNoInterceptModel = lrNoIntercept .setLabelCol("zeroLabel") .fit(sameLabels) - assert(allZeroModel.coefficients ~== Vectors.dense(0.0) absTol 1E-3) - assert(allZeroModel.intercept === Double.NegativeInfinity) + assert(allZeroNoInterceptModel.intercept === 0.0) + println(allZeroNoInterceptModel.summary.totalIterations > 0) - val allOneModel = lr + val allOneNoInterceptModel = lrNoIntercept .setLabelCol("oneLabel") .fit(sameLabels) - assert(allOneModel.coefficients ~== Vectors.dense(0.0) absTol 1E-3) - assert(allOneModel.intercept === Double.PositiveInfinity) + assert(allOneNoInterceptModel.intercept === 0.0) + println(allOneNoInterceptModel.summary.totalIterations > 0) } test("read/write") { From 3d227c0dd3e89ff286ae6ed6df94bad706e569af Mon Sep 17 00:00:00 2001 From: Feynman Liang Date: Mon, 18 Jan 2016 16:57:04 +0000 Subject: [PATCH 6/8] Fix println --- .../spark/ml/classification/LogisticRegressionSuite.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala index fb79a7c1f5f72..66e5d0e28dad1 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala @@ -917,13 +917,13 @@ class LogisticRegressionSuite .setLabelCol("zeroLabel") .fit(sameLabels) assert(allZeroNoInterceptModel.intercept === 0.0) - println(allZeroNoInterceptModel.summary.totalIterations > 0) + assert(allZeroNoInterceptModel.summary.totalIterations > 0) val allOneNoInterceptModel = lrNoIntercept .setLabelCol("oneLabel") .fit(sameLabels) assert(allOneNoInterceptModel.intercept === 0.0) - println(allOneNoInterceptModel.summary.totalIterations > 0) + assert(allOneNoInterceptModel.summary.totalIterations > 0) } test("read/write") { From c8c1586ffd5e5076c6aed564b41d99314099a5a0 Mon Sep 17 00:00:00 2001 From: Feynman Liang Date: Mon, 18 Jan 2016 16:57:39 +0000 Subject: [PATCH 7/8] Style fix --- .../spark/ml/classification/LogisticRegressionSuite.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala index 66e5d0e28dad1..972c0868a454a 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala @@ -909,7 +909,7 @@ class LogisticRegressionSuite assert(allOneInterceptModel.summary.totalIterations === 0) // fitIntercept=false - val lrNoIntercept= new LogisticRegression() + val lrNoIntercept = new LogisticRegression() .setFitIntercept(false) .setMaxIter(3) From 95816d4c158d7e11cab8ac7bf28b9bb84026d33a Mon Sep 17 00:00:00 2001 From: Feynman Liang Date: Mon, 18 Jan 2016 20:30:27 +0000 Subject: [PATCH 8/8] Fix indentation --- .../classification/LogisticRegression.scala | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index 0b3ba09bf698f..dad8dfc84ec15 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -337,18 +337,18 @@ class LogisticRegression @Since("1.2.0") ( if ($(fitIntercept)) { /* - For binary logistic regression, when we initialize the coefficients as zeros, - it will converge faster if we initialize the intercept such that - it follows the distribution of the labels. - - {{{ - P(0) = 1 / (1 + \exp(b)), and - P(1) = \exp(b) / (1 + \exp(b)) - }}}, hence - {{{ - b = \log{P(1) / P(0)} = \log{count_1 / count_0} - }}} - */ + For binary logistic regression, when we initialize the coefficients as zeros, + it will converge faster if we initialize the intercept such that + it follows the distribution of the labels. + + {{{ + P(0) = 1 / (1 + \exp(b)), and + P(1) = \exp(b) / (1 + \exp(b)) + }}}, hence + {{{ + b = \log{P(1) / P(0)} = \log{count_1 / count_0} + }}} + */ initialCoefficientsWithIntercept.toArray(numFeatures) = math.log( histogram(1) / histogram(0)) }