Skip to content

Commit

Permalink
[SPARK-20906][SPARKR] Constrained Logistic Regression for SparkR
Browse files Browse the repository at this point in the history
## What changes were proposed in this pull request?

PR #17715 Added Constrained Logistic Regression for ML. We should add it to SparkR.

## How was this patch tested?

Add new unit tests.

Author: wangmiao1981 <wm624@hotmail.com>

Closes #18128 from wangmiao1981/test.
  • Loading branch information
wangmiao1981 authored and Felix Cheung committed Jun 22, 2017
1 parent 215281d commit 5354337
Show file tree
Hide file tree
Showing 4 changed files with 135 additions and 8 deletions.
61 changes: 59 additions & 2 deletions R/pkg/R/mllib_classification.R
Expand Up @@ -204,6 +204,20 @@ function(object, path, overwrite = FALSE) {
#' @param aggregationDepth The depth for treeAggregate (greater than or equal to 2). If the dimensions of features
#' or the number of partitions are large, this param could be adjusted to a larger size.
#' This is an expert parameter. Default value should be good for most cases.
#' @param lowerBoundsOnCoefficients The lower bounds on coefficients if fitting under bound constrained optimization.
#' The bound matrix must be compatible with the shape (1, number of features) for binomial
#' regression, or (number of classes, number of features) for multinomial regression.
#' It is an R matrix.
#' @param upperBoundsOnCoefficients The upper bounds on coefficients if fitting under bound constrained optimization.
#' The bound matrix must be compatible with the shape (1, number of features) for binomial
#' regression, or (number of classes, number of features) for multinomial regression.
#' It is an R matrix.
#' @param lowerBoundsOnIntercepts The lower bounds on intercepts if fitting under bound constrained optimization.
#' The bounds vector size must be equal to 1 for binomial regression, or the number
#' of classes for multinomial regression.
#' @param upperBoundsOnIntercepts The upper bounds on intercepts if fitting under bound constrained optimization.
#' The bound vector size must be equal to 1 for binomial regression, or the number
#' of classes for multinomial regression.
#' @param ... additional arguments passed to the method.
#' @return \code{spark.logit} returns a fitted logistic regression model.
#' @rdname spark.logit
Expand Down Expand Up @@ -241,21 +255,64 @@ function(object, path, overwrite = FALSE) {
setMethod("spark.logit", signature(data = "SparkDataFrame", formula = "formula"),
          function(data, formula, regParam = 0.0, elasticNetParam = 0.0, maxIter = 100,
                   tol = 1E-6, family = "auto", standardization = TRUE,
                   thresholds = 0.5, weightCol = NULL, aggregationDepth = 2,
                   lowerBoundsOnCoefficients = NULL, upperBoundsOnCoefficients = NULL,
                   lowerBoundsOnIntercepts = NULL, upperBoundsOnIntercepts = NULL) {
            # Serialize the R formula so it can be passed to the JVM wrapper.
            formula <- paste(deparse(formula), collapse = "")
            # Dimensions of the coefficient-bound matrices; 0 signals to the JVM
            # side that no coefficient bounds were supplied.
            row <- 0
            col <- 0

            # An empty weight-column name is treated the same as NULL (no weights).
            if (!is.null(weightCol) && weightCol == "") {
              weightCol <- NULL
            } else if (!is.null(weightCol)) {
              weightCol <- as.character(weightCol)
            }

            if (!is.null(lowerBoundsOnIntercepts)) {
              lowerBoundsOnIntercepts <- as.array(lowerBoundsOnIntercepts)
            }

            if (!is.null(upperBoundsOnIntercepts)) {
              upperBoundsOnIntercepts <- as.array(upperBoundsOnIntercepts)
            }

            if (!is.null(lowerBoundsOnCoefficients)) {
              # Use is.matrix() rather than comparing class(): under R >= 4.0 a
              # matrix has class c("matrix", "array"), so class(x) != "matrix"
              # yields a length-2 logical and breaks inside if().
              if (!is.matrix(lowerBoundsOnCoefficients)) {
                stop("lowerBoundsOnCoefficients must be a matrix.")
              }
              row <- nrow(lowerBoundsOnCoefficients)
              col <- ncol(lowerBoundsOnCoefficients)
              # Flatten (column-major) for the JVM, which rebuilds the matrix
              # from (row, col) and the flat array.
              lowerBoundsOnCoefficients <- as.array(as.vector(lowerBoundsOnCoefficients))
            }

            if (!is.null(upperBoundsOnCoefficients)) {
              if (!is.matrix(upperBoundsOnCoefficients)) {
                stop("upperBoundsOnCoefficients must be a matrix.")
              }

              # When both bound matrices are given, their shapes must agree.
              # Note: stop() concatenates its character arguments itself, so no
              # paste0() (whose bogus sep = "" arg was silently pasted) is needed.
              if (!is.null(lowerBoundsOnCoefficients) && (row != nrow(upperBoundsOnCoefficients)
                || col != ncol(upperBoundsOnCoefficients))) {
                stop("dimension of upperBoundsOnCoefficients ",
                     "is not the same as lowerBoundsOnCoefficients")
              }

              # Only upper bounds supplied: take the dimensions from them.
              if (is.null(lowerBoundsOnCoefficients)) {
                row <- nrow(upperBoundsOnCoefficients)
                col <- ncol(upperBoundsOnCoefficients)
              }

              upperBoundsOnCoefficients <- as.array(as.vector(upperBoundsOnCoefficients))
            }

            jobj <- callJStatic("org.apache.spark.ml.r.LogisticRegressionWrapper", "fit",
                                data@sdf, formula, as.numeric(regParam),
                                as.numeric(elasticNetParam), as.integer(maxIter),
                                as.numeric(tol), as.character(family),
                                as.logical(standardization), as.array(thresholds),
                                weightCol, as.integer(aggregationDepth),
                                as.integer(row), as.integer(col),
                                lowerBoundsOnCoefficients, upperBoundsOnCoefficients,
                                lowerBoundsOnIntercepts, upperBoundsOnIntercepts)
            new("LogisticRegressionModel", jobj = jobj)
          })

Expand Down
40 changes: 40 additions & 0 deletions R/pkg/tests/fulltests/test_mllib_classification.R
Expand Up @@ -223,6 +223,46 @@ test_that("spark.logit", {
model2 <- spark.logit(df2, label ~ feature, weightCol = "weight")
prediction2 <- collect(select(predict(model2, df2), "prediction"))
expect_equal(sort(prediction2$prediction), c("0.0", "0.0", "0.0", "0.0", "0.0"))

# Test binomial logistic regression against two classes with upperBoundsOnCoefficients
# and upperBoundsOnIntercepts
u <- matrix(c(1.0, 0.0, 1.0, 0.0), nrow = 1, ncol = 4)
model <- spark.logit(training, Species ~ ., upperBoundsOnCoefficients = u,
upperBoundsOnIntercepts = 1.0)
summary <- summary(model)
coefsR <- c(-11.13331, 1.00000, 0.00000, 1.00000, 0.00000)
coefs <- summary$coefficients[, "Estimate"]
expect_true(all(abs(coefsR - coefs) < 0.1))
# Test upperBoundsOnCoefficients should be matrix
expect_error(spark.logit(training, Species ~ ., upperBoundsOnCoefficients = as.array(c(1, 2)),
upperBoundsOnIntercepts = 1.0))

# Test binomial logistic regression against two classes with lowerBoundsOnCoefficients
# and lowerBoundsOnIntercepts
l <- matrix(c(0.0, -1.0, 0.0, -1.0), nrow = 1, ncol = 4)
model <- spark.logit(training, Species ~ ., lowerBoundsOnCoefficients = l,
lowerBoundsOnIntercepts = 0.0)
summary <- summary(model)
coefsR <- c(0, 0, -1, 0, 1.902192)
coefs <- summary$coefficients[, "Estimate"]
expect_true(all(abs(coefsR - coefs) < 0.1))
# Test lowerBoundsOnCoefficients should be matrix
expect_error(spark.logit(training, Species ~ ., lowerBoundsOnCoefficients = as.array(c(1, 2)),
lowerBoundsOnIntercepts = 0.0))

# Test multinomial logistic regression with lowerBoundsOnCoefficients
# and lowerBoundsOnIntercepts
l <- matrix(c(0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0), nrow = 2, ncol = 4)
model <- spark.logit(training, Species ~ ., family = "multinomial",
lowerBoundsOnCoefficients = l,
lowerBoundsOnIntercepts = as.array(c(0.0, 0.0)))
summary <- summary(model)
versicolorCoefsR <- c(42.639465, 7.258104, 14.330814, 16.298243, 11.716429)
virginicaCoefsR <- c(0.0002970796, 4.79274, 7.65047, 25.72793, 30.0021)
versicolorCoefs <- summary$coefficients[, "versicolor"]
virginicaCoefs <- summary$coefficients[, "virginica"]
expect_true(all(abs(versicolorCoefsR - versicolorCoefs) < 0.1))
expect_true(all(abs(virginicaCoefsR - virginicaCoefs) < 0.1))
})

test_that("spark.mlp", {
Expand Down
Expand Up @@ -214,7 +214,7 @@ private[classification] trait LogisticRegressionParams extends ProbabilisticClas

/**
* The lower bounds on intercepts if fitting under bound constrained optimization.
* The bounds vector size must be equal with 1 for binomial regression, or the number
* The bounds vector size must be equal to 1 for binomial regression, or the number
* of classes for multinomial regression. Otherwise, it throws exception.
* Default is none.
*
Expand All @@ -230,7 +230,7 @@ private[classification] trait LogisticRegressionParams extends ProbabilisticClas

/**
* The upper bounds on intercepts if fitting under bound constrained optimization.
* The bound vector size must be equal with 1 for binomial regression, or the number
* The bound vector size must be equal to 1 for binomial regression, or the number
* of classes for multinomial regression. Otherwise, it throws exception.
* Default is none.
*
Expand Down Expand Up @@ -451,12 +451,12 @@ class LogisticRegression @Since("1.2.0") (
}
if (isSet(lowerBoundsOnIntercepts)) {
require($(lowerBoundsOnIntercepts).size == numCoefficientSets, "The size of " +
"lowerBoundsOnIntercepts must be equal with 1 for binomial regression, or the number of " +
"lowerBoundsOnIntercepts must be equal to 1 for binomial regression, or the number of " +
s"classes for multinomial regression, but found: ${getLowerBoundsOnIntercepts.size}.")
}
if (isSet(upperBoundsOnIntercepts)) {
require($(upperBoundsOnIntercepts).size == numCoefficientSets, "The size of " +
"upperBoundsOnIntercepts must be equal with 1 for binomial regression, or the number of " +
"upperBoundsOnIntercepts must be equal to 1 for binomial regression, or the number of " +
s"classes for multinomial regression, but found: ${getUpperBoundsOnIntercepts.size}.")
}
if (isSet(lowerBoundsOnCoefficients) && isSet(upperBoundsOnCoefficients)) {
Expand Down
Expand Up @@ -25,7 +25,7 @@ import org.json4s.jackson.JsonMethods._
import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel}
import org.apache.spark.ml.feature.{IndexToString, RFormula}
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.ml.linalg.{Matrices, Vector, Vectors}
import org.apache.spark.ml.r.RWrapperUtils._
import org.apache.spark.ml.util._
import org.apache.spark.sql.{DataFrame, Dataset}
Expand Down Expand Up @@ -97,7 +97,13 @@ private[r] object LogisticRegressionWrapper
standardization: Boolean,
thresholds: Array[Double],
weightCol: String,
aggregationDepth: Int
aggregationDepth: Int,
numRowsOfBoundsOnCoefficients: Int,
numColsOfBoundsOnCoefficients: Int,
lowerBoundsOnCoefficients: Array[Double],
upperBoundsOnCoefficients: Array[Double],
lowerBoundsOnIntercepts: Array[Double],
upperBoundsOnIntercepts: Array[Double]
): LogisticRegressionWrapper = {

val rFormula = new RFormula()
Expand Down Expand Up @@ -133,6 +139,30 @@ private[r] object LogisticRegressionWrapper

if (weightCol != null) lr.setWeightCol(weightCol)

if (numRowsOfBoundsOnCoefficients != 0 &&
numColsOfBoundsOnCoefficients != 0 && lowerBoundsOnCoefficients != null) {
val coef = Matrices.dense(numRowsOfBoundsOnCoefficients,
numColsOfBoundsOnCoefficients, lowerBoundsOnCoefficients)
lr.setLowerBoundsOnCoefficients(coef)
}

if (numRowsOfBoundsOnCoefficients != 0 &&
numColsOfBoundsOnCoefficients != 0 && upperBoundsOnCoefficients != null) {
val coef = Matrices.dense(numRowsOfBoundsOnCoefficients,
numColsOfBoundsOnCoefficients, upperBoundsOnCoefficients)
lr.setUpperBoundsOnCoefficients(coef)
}

if (lowerBoundsOnIntercepts != null) {
val intercept = Vectors.dense(lowerBoundsOnIntercepts)
lr.setLowerBoundsOnIntercepts(intercept)
}

if (upperBoundsOnIntercepts != null) {
val intercept = Vectors.dense(upperBoundsOnIntercepts)
lr.setUpperBoundsOnIntercepts(intercept)
}

val idxToStr = new IndexToString()
.setInputCol(PREDICTED_LABEL_INDEX_COL)
.setOutputCol(PREDICTED_LABEL_COL)
Expand Down

0 comments on commit 5354337

Please sign in to comment.