Skip to content

Commit

Permalink
[SPARK-13379][MLLIB] Fix MLlib LogisticRegressionWithLBFGS set regula…
Browse files Browse the repository at this point in the history
…rization incorrectly

## What changes were proposed in this pull request?
Fix MLlib LogisticRegressionWithLBFGS regularization map as:
```SquaredL2Updater``` -> ```elasticNetParam = 0.0```
```L1Updater``` -> ```elasticNetParam = 1.0```
cc dbtsai
## How was this patch tested?
unit tests

Author: Yanbo Liang <ybliang8@gmail.com>

Closes #11258 from yanboliang/spark-13379.
  • Loading branch information
yanboliang authored and DB Tsai committed Feb 22, 2016
1 parent 9bf6a92 commit 8a4ed78
Show file tree
Hide file tree
Showing 2 changed files with 350 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -444,8 +444,8 @@ class LogisticRegressionWithLBFGS
createModel(weights, mlLogisticRegresionModel.intercept)
}
optimizer.getUpdater() match {
case x: SquaredL2Updater => runWithMlLogisitcRegression(1.0)
case x: L1Updater => runWithMlLogisitcRegression(0.0)
case x: SquaredL2Updater => runWithMlLogisitcRegression(0.0)
case x: L1Updater => runWithMlLogisitcRegression(1.0)
case _ => super.run(input, initialWeights)
}
} else {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ import org.apache.spark.mllib.optimization._
import org.apache.spark.mllib.regression._
import org.apache.spark.mllib.util.{LocalClusterSparkContext, MLlibTestSparkContext}
import org.apache.spark.mllib.util.TestingUtils._
import org.apache.spark.rdd.RDD
import org.apache.spark.util.Utils


Expand Down Expand Up @@ -171,6 +172,37 @@ object LogisticRegressionSuite {


class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext with Matchers {

// Shared binary-classification dataset for the glmnet-comparison tests; assigned in beforeAll().
@transient var binaryDataset: RDD[LabeledPoint] = _

override def beforeAll(): Unit = {
  super.beforeAll()

  /*
     The synthetic data built below can be exported to CSV so that the training accuracy can be
     validated against R's glmnet package. To reproduce the export:

     val nPoints = 10000
     val coefficients = Array(-0.57997, 0.912083, -0.371077, -0.819866, 2.688191)
     val xMean = Array(5.843, 3.057, 3.758, 1.199)
     val xVariance = Array(0.6856, 0.1899, 3.116, 0.581)
     val data = sc.parallelize(LogisticRegressionSuite.generateMultinomialLogisticInput(
       coefficients, xMean, xVariance, true, nPoints, 42), 1)
     data.map(x=> x.label + ", " + x.features(0) + ", " + x.features(1) + ", "
       + x.features(2) + ", " + x.features(3)).saveAsTextFile("path")
   */
  val sampleCount = 10000
  val trueCoefficients = Array(-0.57997, 0.912083, -0.371077, -0.819866, 2.688191)
  val featureMeans = Array(5.843, 3.057, 3.758, 1.199)
  val featureVariances = Array(0.6856, 0.1899, 3.116, 0.581)

  // NOTE(review): the positional `true` and `42` are presumably the add-intercept flag and
  // the RNG seed of generateMultinomialLogisticInput -- confirm against its signature.
  val generatedPoints = LogisticRegressionSuite.generateMultinomialLogisticInput(
    trueCoefficients, featureMeans, featureVariances, true, sampleCount, 42)

  // The tests share this RDD, which is split into 2 slices.
  binaryDataset = sc.parallelize(generatedPoints, 2)
}

def validatePrediction(
predictions: Seq[Double],
input: Seq[LabeledPoint],
Expand Down Expand Up @@ -555,6 +587,322 @@ class LogisticRegressionSuite extends SparkFunSuite with MLlibTestSparkContext w
}
}

/**
* From Spark 2.0, MLlib LogisticRegressionWithLBFGS will call the LogisticRegression
* implementation in ML to train the model. We copy the test cases from ML to guarantee
* that both implementations produce the same results.
*/
test("binary logistic regression with intercept without regularization") {
  // Train the same unregularized model twice; only the feature-scaling flag differs.
  val trainer1 = new LogisticRegressionWithLBFGS().setIntercept(true).setFeatureScaling(true)
  val trainer2 = new LogisticRegressionWithLBFGS().setIntercept(true).setFeatureScaling(false)

  val model1 = trainer1.run(binaryDataset)
  val model2 = trainer2.run(binaryDataset)

  /*
     Using the following R code to load the data and train the model using glmnet package.

     library("glmnet")
     data <- read.csv("path", header=FALSE)
     label = factor(data$V1)
     features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
     coefficients = coef(glmnet(features,label, family="binomial", alpha = 0, lambda = 0))
     coefficients

     5 x 1 sparse Matrix of class "dgCMatrix"
                         s0
     (Intercept)  2.8366423
     data.V2     -0.5895848
     data.V3      0.8931147
     data.V4     -0.3925051
     data.V5     -0.7996864
   */
  val interceptR = 2.8366423
  val coefficientsR = Vectors.dense(-0.5895848, 0.8931147, -0.3925051, -0.7996864)

  // Use `~==` for vectors as well as scalars: unlike the boolean `~=`, it fails with a
  // message describing the mismatching values.
  assert(model1.intercept ~== interceptR relTol 1E-3)
  assert(model1.weights ~== coefficientsR relTol 1E-3)

  // Without regularization, training with or without feature scaling should converge to the
  // same solution.
  assert(model2.intercept ~== interceptR relTol 1E-3)
  assert(model2.weights ~== coefficientsR relTol 1E-3)
}

test("binary logistic regression without intercept without regularization") {
  // Train the same unregularized, intercept-free model twice; only feature scaling differs.
  val trainer1 = new LogisticRegressionWithLBFGS().setIntercept(false).setFeatureScaling(true)
  val trainer2 = new LogisticRegressionWithLBFGS().setIntercept(false).setFeatureScaling(false)

  val model1 = trainer1.run(binaryDataset)
  val model2 = trainer2.run(binaryDataset)

  /*
     Using the following R code to load the data and train the model using glmnet package.

     library("glmnet")
     data <- read.csv("path", header=FALSE)
     label = factor(data$V1)
     features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
     coefficients =
       coef(glmnet(features,label, family="binomial", alpha = 0, lambda = 0, intercept=FALSE))
     coefficients

     5 x 1 sparse Matrix of class "dgCMatrix"
                         s0
     (Intercept)   .
     data.V2     -0.3534996
     data.V3      1.2964482
     data.V4     -0.3571741
     data.V5     -0.7407946
   */
  val interceptR = 0.0
  val coefficientsR = Vectors.dense(-0.3534996, 1.2964482, -0.3571741, -0.7407946)

  // The expected intercept is exactly 0.0, so compare with an absolute tolerance (as the
  // other intercept-free tests in this suite do); a relative tolerance against zero is not
  // meaningful.
  assert(model1.intercept ~== interceptR absTol 1E-3)
  // `~==` (not the boolean `~=`) so that a failure describes the mismatching values.
  assert(model1.weights ~== coefficientsR relTol 1E-2)

  // Without regularization, with or without feature scaling should converge to the same solution.
  assert(model2.intercept ~== interceptR absTol 1E-3)
  assert(model2.weights ~== coefficientsR relTol 1E-2)
}

test("binary logistic regression with intercept with L1 regularization") {
  // Both trainers use an L1Updater with regParam 0.12; only the feature-scaling flag differs.
  // The expected values come from glmnet with alpha = 1, i.e. the L1Updater case that this
  // suite expects to behave like elasticNetParam = 1.0.
  val trainer1 = new LogisticRegressionWithLBFGS().setIntercept(true).setFeatureScaling(true)
  trainer1.optimizer.setUpdater(new L1Updater).setRegParam(0.12).setConvergenceTol(1E-6)
  val trainer2 = new LogisticRegressionWithLBFGS().setIntercept(true).setFeatureScaling(false)
  trainer2.optimizer.setUpdater(new L1Updater).setRegParam(0.12).setConvergenceTol(1E-6)

  val model1 = trainer1.run(binaryDataset)
  val model2 = trainer2.run(binaryDataset)

  /*
     Using the following R code to load the data and train the model using glmnet package.

     library("glmnet")
     data <- read.csv("path", header=FALSE)
     label = factor(data$V1)
     features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
     coefficients = coef(glmnet(features,label, family="binomial", alpha = 1, lambda = 0.12))
     coefficients

     5 x 1 sparse Matrix of class "dgCMatrix"
                          s0
     (Intercept) -0.05627428
     data.V2       .
     data.V3       .
     data.V4     -0.04325749
     data.V5     -0.02481551
   */
  val interceptR1 = -0.05627428
  val coefficientsR1 = Vectors.dense(0.0, 0.0, -0.04325749, -0.02481551)

  // Coefficients that glmnet shrinks to zero are compared with an absolute tolerance.
  // `~==` (not the boolean `~=`) so that a failure describes the mismatching values.
  assert(model1.intercept ~== interceptR1 relTol 1E-2)
  assert(model1.weights ~== coefficientsR1 absTol 2E-2)

  /*
     Using the following R code to load the data and train the model using glmnet package.

     library("glmnet")
     data <- read.csv("path", header=FALSE)
     label = factor(data$V1)
     features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
     coefficients = coef(glmnet(features,label, family="binomial", alpha = 1, lambda = 0.12,
       standardize=FALSE))
     coefficients

     5 x 1 sparse Matrix of class "dgCMatrix"
                         s0
     (Intercept)  0.3722152
     data.V2       .
     data.V3       .
     data.V4     -0.1665453
     data.V5       .
   */
  val interceptR2 = 0.3722152
  val coefficientsR2 = Vectors.dense(0.0, 0.0, -0.1665453, 0.0)

  assert(model2.intercept ~== interceptR2 relTol 1E-2)
  assert(model2.weights ~== coefficientsR2 absTol 1E-3)
}

test("binary logistic regression without intercept with L1 regularization") {
  // Both trainers use an L1Updater with regParam 0.12 and no intercept; only the
  // feature-scaling flag differs. Expected values come from glmnet with alpha = 1.
  val trainer1 = new LogisticRegressionWithLBFGS().setIntercept(false).setFeatureScaling(true)
  trainer1.optimizer.setUpdater(new L1Updater).setRegParam(0.12).setConvergenceTol(1E-6)
  val trainer2 = new LogisticRegressionWithLBFGS().setIntercept(false).setFeatureScaling(false)
  trainer2.optimizer.setUpdater(new L1Updater).setRegParam(0.12).setConvergenceTol(1E-6)

  val model1 = trainer1.run(binaryDataset)
  val model2 = trainer2.run(binaryDataset)

  /*
     Using the following R code to load the data and train the model using glmnet package.

     library("glmnet")
     data <- read.csv("path", header=FALSE)
     label = factor(data$V1)
     features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
     coefficients = coef(glmnet(features,label, family="binomial", alpha = 1, lambda = 0.12,
       intercept=FALSE))
     coefficients

     5 x 1 sparse Matrix of class "dgCMatrix"
                          s0
     (Intercept)   .
     data.V2       .
     data.V3       .
     data.V4     -0.05189203
     data.V5     -0.03891782
   */
  val interceptR1 = 0.0
  val coefficientsR1 = Vectors.dense(0.0, 0.0, -0.05189203, -0.03891782)

  // The expected intercept is exactly 0.0, so compare with an absolute tolerance, matching
  // the model2 check below; a relative tolerance against zero is not meaningful.
  assert(model1.intercept ~== interceptR1 absTol 1E-3)
  // `~==` (not the boolean `~=`) so that a failure describes the mismatching values.
  assert(model1.weights ~== coefficientsR1 absTol 1E-3)

  /*
     Using the following R code to load the data and train the model using glmnet package.

     library("glmnet")
     data <- read.csv("path", header=FALSE)
     label = factor(data$V1)
     features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
     coefficients = coef(glmnet(features,label, family="binomial", alpha = 1, lambda = 0.12,
       intercept=FALSE, standardize=FALSE))
     coefficients

     5 x 1 sparse Matrix of class "dgCMatrix"
                          s0
     (Intercept)   .
     data.V2       .
     data.V3       .
     data.V4     -0.08420782
     data.V5       .
   */
  val interceptR2 = 0.0
  val coefficientsR2 = Vectors.dense(0.0, 0.0, -0.08420782, 0.0)

  assert(model2.intercept ~== interceptR2 absTol 1E-3)
  assert(model2.weights ~== coefficientsR2 absTol 1E-3)
}

test("binary logistic regression with intercept with L2 regularization") {
  // Both trainers use a SquaredL2Updater with regParam 1.37; only the feature-scaling flag
  // differs. The expected values come from glmnet with alpha = 0, i.e. the SquaredL2Updater
  // case that this suite expects to behave like elasticNetParam = 0.0.
  val trainer1 = new LogisticRegressionWithLBFGS().setIntercept(true).setFeatureScaling(true)
  trainer1.optimizer.setUpdater(new SquaredL2Updater).setRegParam(1.37).setConvergenceTol(1E-6)
  val trainer2 = new LogisticRegressionWithLBFGS().setIntercept(true).setFeatureScaling(false)
  trainer2.optimizer.setUpdater(new SquaredL2Updater).setRegParam(1.37).setConvergenceTol(1E-6)

  val model1 = trainer1.run(binaryDataset)
  val model2 = trainer2.run(binaryDataset)

  /*
     Using the following R code to load the data and train the model using glmnet package.

     library("glmnet")
     data <- read.csv("path", header=FALSE)
     label = factor(data$V1)
     features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
     coefficients = coef(glmnet(features,label, family="binomial", alpha = 0, lambda = 1.37))
     coefficients

     5 x 1 sparse Matrix of class "dgCMatrix"
                          s0
     (Intercept)  0.15021751
     data.V2     -0.07251837
     data.V3      0.10724191
     data.V4     -0.04865309
     data.V5     -0.10062872
   */
  val interceptR1 = 0.15021751
  val coefficientsR1 = Vectors.dense(-0.07251837, 0.10724191, -0.04865309, -0.10062872)

  // `~==` (not the boolean `~=`) so that a failure describes the mismatching values.
  assert(model1.intercept ~== interceptR1 relTol 1E-3)
  assert(model1.weights ~== coefficientsR1 relTol 1E-3)

  /*
     Using the following R code to load the data and train the model using glmnet package.

     library("glmnet")
     data <- read.csv("path", header=FALSE)
     label = factor(data$V1)
     features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
     coefficients = coef(glmnet(features,label, family="binomial", alpha = 0, lambda = 1.37,
       standardize=FALSE))
     coefficients

     5 x 1 sparse Matrix of class "dgCMatrix"
                          s0
     (Intercept)  0.48657516
     data.V2     -0.05155371
     data.V3      0.02301057
     data.V4     -0.11482896
     data.V5     -0.06266838
   */
  val interceptR2 = 0.48657516
  val coefficientsR2 = Vectors.dense(-0.05155371, 0.02301057, -0.11482896, -0.06266838)

  assert(model2.intercept ~== interceptR2 relTol 1E-3)
  assert(model2.weights ~== coefficientsR2 relTol 1E-3)
}

test("binary logistic regression without intercept with L2 regularization") {
  // Both trainers use a SquaredL2Updater with regParam 1.37 and no intercept; only the
  // feature-scaling flag differs. Expected values come from glmnet with alpha = 0.
  val trainer1 = new LogisticRegressionWithLBFGS().setIntercept(false).setFeatureScaling(true)
  trainer1.optimizer.setUpdater(new SquaredL2Updater).setRegParam(1.37).setConvergenceTol(1E-6)
  val trainer2 = new LogisticRegressionWithLBFGS().setIntercept(false).setFeatureScaling(false)
  trainer2.optimizer.setUpdater(new SquaredL2Updater).setRegParam(1.37).setConvergenceTol(1E-6)

  val model1 = trainer1.run(binaryDataset)
  val model2 = trainer2.run(binaryDataset)

  /*
     Using the following R code to load the data and train the model using glmnet package.

     library("glmnet")
     data <- read.csv("path", header=FALSE)
     label = factor(data$V1)
     features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
     coefficients = coef(glmnet(features,label, family="binomial", alpha = 0, lambda = 1.37,
       intercept=FALSE))
     coefficients

     5 x 1 sparse Matrix of class "dgCMatrix"
                          s0
     (Intercept)   .
     data.V2     -0.06099165
     data.V3      0.12857058
     data.V4     -0.04708770
     data.V5     -0.09799775
   */
  val interceptR1 = 0.0
  val coefficientsR1 = Vectors.dense(-0.06099165, 0.12857058, -0.04708770, -0.09799775)

  // The expected intercept is 0.0, hence the absolute tolerance.
  assert(model1.intercept ~== interceptR1 absTol 1E-3)
  // `~==` (not the boolean `~=`) so that a failure describes the mismatching values.
  assert(model1.weights ~== coefficientsR1 relTol 1E-2)

  /*
     Using the following R code to load the data and train the model using glmnet package.

     library("glmnet")
     data <- read.csv("path", header=FALSE)
     label = factor(data$V1)
     features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
     coefficients = coef(glmnet(features,label, family="binomial", alpha = 0, lambda = 1.37,
       intercept=FALSE, standardize=FALSE))
     coefficients

     5 x 1 sparse Matrix of class "dgCMatrix"
                           s0
     (Intercept)   .
     data.V2     -0.005679651
     data.V3      0.048967094
     data.V4     -0.093714016
     data.V5     -0.053314311
   */
  val interceptR2 = 0.0
  val coefficientsR2 = Vectors.dense(-0.005679651, 0.048967094, -0.093714016, -0.053314311)

  assert(model2.intercept ~== interceptR2 absTol 1E-3)
  assert(model2.weights ~== coefficientsR2 relTol 1E-2)
}

}

class LogisticRegressionClusterSuite extends SparkFunSuite with LocalClusterSparkContext {
Expand Down

0 comments on commit 8a4ed78

Please sign in to comment.