From 67364abee7aae34a093828e843de5ba9315b45a9 Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Fri, 27 Jan 2017 11:56:49 -0800 Subject: [PATCH 01/28] start working on SparkR tweedie API --- R/pkg/R/mllib_regression.R | 20 +++++++++++++++++-- .../GeneralizedLinearRegressionWrapper.scala | 12 +++++++++-- 2 files changed, 28 insertions(+), 4 deletions(-) diff --git a/R/pkg/R/mllib_regression.R b/R/pkg/R/mllib_regression.R index 0e07d3bfd899c..2af0c81cc54ae 100644 --- a/R/pkg/R/mllib_regression.R +++ b/R/pkg/R/mllib_regression.R @@ -84,6 +84,12 @@ setClass("IsotonicRegressionModel", representation(jobj = "jobj")) #' # can also read back the saved model and print #' savedModel <- read.ml(path) #' summary(savedModel) +#' +#' # fit tweedie model +#' library(statmod) +#' model <- spark.glm(df, Sepal_Length ~ Sepal_Width, +#' family = tweedie(var.power = 1.5, link.power = 0)) +#' summary(model) #' } #' @note spark.glm since 2.0.0 #' @seealso \link{glm}, \link{read.ml} @@ -100,6 +106,15 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"), print(family) stop("'family' not recognized") } + # recover variancePower and linkPower from specified tweedie family + if (tolower(family$family) == "tweedie") { + variancePower <- log(family$variance(exp(1))) + linkPower <- log(family$linkfun(exp(1))) + } else { + # these values won't be used if family is not tweedie + variancePower <- 0.0 + linkPower <- 0.0 + } formula <- paste(deparse(formula), collapse = "") if (is.null(weightCol)) { @@ -109,7 +124,8 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"), # For known families, Gamma is upper-cased jobj <- callJStatic("org.apache.spark.ml.r.GeneralizedLinearRegressionWrapper", "fit", formula, data@sdf, tolower(family$family), family$link, - tol, as.integer(maxIter), as.character(weightCol), regParam) + tol, as.integer(maxIter), as.character(weightCol), regParam, + as.double(variancePower), as.double(linkPower)) new("GeneralizedLinearRegressionModel", jobj = jobj) }) @@ -124,7 +140,7 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"), #' the result of a call to a family function. Refer R family at #' \url{https://stat.ethz.ch/R-manual/R-devel/library/stats/html/family.html}. #' Currently these families are supported: \code{binomial}, \code{gaussian}, -#' \code{Gamma}, and \code{poisson}. +#' \code{poisson}, \code{Gamma}, and \code{tweedie} (\code{statmod} package). #' @param weightCol the weight column name. If this is not set or \code{NULL}, we treat all instance #' weights as 1.0. #' @param epsilon positive convergence tolerance of iterations. diff --git a/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala b/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala index 78f401f29b004..9586f2d1ffdb4 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala @@ -71,7 +71,9 @@ private[r] object GeneralizedLinearRegressionWrapper tol: Double, maxIter: Int, weightCol: String, - regParam: Double): GeneralizedLinearRegressionWrapper = { + regParam: Double, + variancePower: Double, + linkPower: Double): GeneralizedLinearRegressionWrapper = { val rFormula = new RFormula().setFormula(formula) checkDataColumns(rFormula, data) val rFormulaModel = rFormula.fit(data) @@ -81,7 +83,7 @@ private[r] object GeneralizedLinearRegressionWrapper .attributes.get val features = featureAttrs.map(_.name.get) // assemble and fit the pipeline - val glr = new GeneralizedLinearRegression() + var glr = new GeneralizedLinearRegression() .setFamily(family) .setLink(link) .setFitIntercept(rFormula.hasIntercept) @@ -90,6 +92,12 @@ private[r] object GeneralizedLinearRegressionWrapper .setWeightCol(weightCol) .setRegParam(regParam) .setFeaturesCol(rFormula.getFeaturesCol) + // set variancePower and linkPower in tweedie family and clear link + if (family.toLowerCase == "tweedie") { + glr = glr.clear(glr.link) + .setVariancePower(variancePower) + .setLinkPower(linkPower) + } val pipeline = new Pipeline() .setStages(Array(rFormulaModel, glr)) .fit(data) From 654551b207315b33fce00f98706360934f09c55a Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Fri, 27 Jan 2017 16:10:54 -0800 Subject: [PATCH 02/28] set link only for non-tweedie; fix issue on aic --- R/pkg/R/mllib_regression.R | 10 ++++++---- .../r/GeneralizedLinearRegressionWrapper.scala | 16 ++++++++++------ 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/R/pkg/R/mllib_regression.R b/R/pkg/R/mllib_regression.R index 2af0c81cc54ae..554afc2d610e1 100644 --- a/R/pkg/R/mllib_regression.R +++ b/R/pkg/R/mllib_regression.R @@ -106,14 +106,15 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"), print(family) stop("'family' not recognized") } - # recover variancePower and linkPower from specified tweedie family + + # recover variancePower and linkPower from the specified tweedie family if (tolower(family$family) == "tweedie") { variancePower <- log(family$variance(exp(1))) linkPower <- log(family$linkfun(exp(1))) } else { - # these values won't be used if family is not tweedie + # these default values are not used variancePower <- 0.0 - linkPower <- 0.0 + linkPower <- 1.0 } formula <- paste(deparse(formula), collapse = "") @@ -186,9 +187,10 @@ setMethod("summary", signature(object = "GeneralizedLinearRegressionModel"), deviance <- callJMethod(jobj, "rDeviance") df.null <- callJMethod(jobj, "rResidualDegreeOfFreedomNull") df.residual <- callJMethod(jobj, "rResidualDegreeOfFreedom") - aic <- callJMethod(jobj, "rAic") iter <- callJMethod(jobj, "rNumIterations") family <- callJMethod(jobj, "rFamily") + aic <- callJMethod(jobj, "rAic") + if (family == "tweedie" && aic == 0) aic <- NA deviance.resid <- if (is.loaded) { NULL } else { diff --git a/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala b/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala index 9586f2d1ffdb4..6b5867e488d9f 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala @@ -85,18 +85,17 @@ private[r] object GeneralizedLinearRegressionWrapper // assemble and fit the pipeline var glr = new GeneralizedLinearRegression() .setFamily(family) - .setLink(link) .setFitIntercept(rFormula.hasIntercept) .setTol(tol) .setMaxIter(maxIter) .setWeightCol(weightCol) .setRegParam(regParam) .setFeaturesCol(rFormula.getFeaturesCol) - // set variancePower and linkPower in tweedie family and clear link + // set variancePower and linkPower if family is tweedie; otherwise, set link function if (family.toLowerCase == "tweedie") { - glr = glr.clear(glr.link) - .setVariancePower(variancePower) - .setLinkPower(linkPower) + glr = glr.setVariancePower(variancePower).setLinkPower(linkPower) + } else { + glr = glr.setLink(link) } val pipeline = new Pipeline() .setStages(Array(rFormulaModel, glr)) @@ -151,7 +150,12 @@ private[r] object GeneralizedLinearRegressionWrapper val rDeviance: Double = summary.deviance val rResidualDegreeOfFreedomNull: Long = summary.residualDegreeOfFreedomNull val rResidualDegreeOfFreedom: Long = summary.residualDegreeOfFreedom - val rAic: Double = summary.aic + val rAic: Double = if (family.toLowerCase == "tweedie" && + !Array(0.0, 1.0, 2.0).contains(variancePower)) { + 0.0 + } else { + summary.aic + } val rNumIterations: Int = summary.numIterations new GeneralizedLinearRegressionWrapper(pipeline, rFeatures, rCoefficients, rDispersion, From 852dd6eec9ee960487f878dc332ea36acfd6102e Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Sat, 28 Jan 2017 12:05:57 -0800 Subject: [PATCH 03/28] add test for tweedie --- R/pkg/DESCRIPTION | 3 ++- R/pkg/R/mllib_regression.R | 4 ++-- R/pkg/inst/tests/testthat/test_mllib_regression.R | 12 ++++++++++++ 3 files changed, 16 insertions(+), 3 deletions(-) diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION index cc471edc376b3..af4bd5c3977d9 100644 --- a/R/pkg/DESCRIPTION +++ b/R/pkg/DESCRIPTION @@ -21,7 +21,8 @@ Suggests: rmarkdown, testthat, e1071, - survival + survival, + statmod Collate: 'schema.R' 'generics.R' diff --git a/R/pkg/R/mllib_regression.R b/R/pkg/R/mllib_regression.R index 554afc2d610e1..6d9c0f4d45616 100644 --- a/R/pkg/R/mllib_regression.R +++ b/R/pkg/R/mllib_regression.R @@ -86,9 +86,9 @@ setClass("IsotonicRegressionModel", representation(jobj = "jobj")) #' summary(savedModel) #' #' # fit tweedie model -#' library(statmod) +#' require(statmod) #' model <- spark.glm(df, Sepal_Length ~ Sepal_Width, -#' family = tweedie(var.power = 1.5, link.power = 0)) +#' family = tweedie(var.power = 1.2, link.power = 0)) #' summary(model) #' } #' @note spark.glm since 2.0.0 diff --git a/R/pkg/inst/tests/testthat/test_mllib_regression.R b/R/pkg/inst/tests/testthat/test_mllib_regression.R index c450a151713ff..32ae0f0368b87 100644 --- a/R/pkg/inst/tests/testthat/test_mllib_regression.R +++ b/R/pkg/inst/tests/testthat/test_mllib_regression.R @@ -77,6 +77,18 @@ test_that("spark.glm and predict", { out <- capture.output(print(summary(model))) expect_true(any(grepl("Dispersion parameter for gamma family", out))) + # tweedie family + require(statmod) + model <- spark.glm(training, Sepal_Width ~ Sepal_Length + Species, + family = tweedie(var.power = 1.2, link.power = 1.0)) + prediction <- predict(model, training) + expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double") + vals <- collect(select(prediction, "prediction")) + rVals <- suppressWarnings(predict( + glm(Sepal.Width ~ Sepal.Length + Species, data = iris, + family = tweedie(var.power = 1.2, link.power = 1.0)), iris)) + expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals) + # Test stats::predict is working x <- rnorm(15) y <- x + rnorm(15) From 5aa4ae79b62cd2039f8024f440c41f209a751090 Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Sat, 28 Jan 2017 12:18:06 -0800 Subject: [PATCH 04/28] fix style --- R/pkg/inst/tests/testthat/test_mllib_regression.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/pkg/inst/tests/testthat/test_mllib_regression.R b/R/pkg/inst/tests/testthat/test_mllib_regression.R index 32ae0f0368b87..3a4bff02c2574 100644 --- a/R/pkg/inst/tests/testthat/test_mllib_regression.R +++ b/R/pkg/inst/tests/testthat/test_mllib_regression.R @@ -85,9 +85,9 @@ test_that("spark.glm and predict", { expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double") vals <- collect(select(prediction, "prediction")) rVals <- suppressWarnings(predict( - glm(Sepal.Width ~ Sepal.Length + Species, data = iris, + glm(Sepal.Width ~ Sepal.Length + Species, data = iris, family = tweedie(var.power = 1.2, link.power = 1.0)), iris)) - expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals) + expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals) # Test stats::predict is working x <- rnorm(15) From 3682692752c1c332e29d2a92ca7fca23d8fbe2bc Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Sat, 28 Jan 2017 13:26:04 -0800 Subject: [PATCH 05/28] fix style issue --- R/pkg/R/mllib_regression.R | 2 +- R/pkg/inst/tests/testthat/test_mllib_regression.R | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/R/pkg/R/mllib_regression.R b/R/pkg/R/mllib_regression.R index 6d9c0f4d45616..e901088d3b5ea 100644 --- a/R/pkg/R/mllib_regression.R +++ b/R/pkg/R/mllib_regression.R @@ -106,7 +106,7 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"), print(family) stop("'family' not recognized") } - + # recover variancePower and linkPower from the specified tweedie family if (tolower(family$family) == "tweedie") { variancePower <- log(family$variance(exp(1))) diff --git a/R/pkg/inst/tests/testthat/test_mllib_regression.R b/R/pkg/inst/tests/testthat/test_mllib_regression.R index 3a4bff02c2574..79ada06a4da56 100644 --- a/R/pkg/inst/tests/testthat/test_mllib_regression.R +++ b/R/pkg/inst/tests/testthat/test_mllib_regression.R @@ -88,7 +88,7 @@ test_that("spark.glm and predict", { glm(Sepal.Width ~ Sepal.Length + Species, data = iris, family = tweedie(var.power = 1.2, link.power = 1.0)), iris)) expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals) - + # Test stats::predict is working x <- rnorm(15) y <- x + rnorm(15) From 3555afb370356f92e825f8dd5cbdb2f55baaac77 Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Sat, 28 Jan 2017 15:29:23 -0800 Subject: [PATCH 06/28] remove dependency on statmod --- R/pkg/R/mllib_regression.R | 33 ++++++++++--------- .../tests/testthat/test_mllib_regression.R | 14 +++++--- 2 files changed, 26 insertions(+), 21 deletions(-) diff --git a/R/pkg/R/mllib_regression.R b/R/pkg/R/mllib_regression.R index e901088d3b5ea..5c1db4c290076 100644 --- a/R/pkg/R/mllib_regression.R +++ b/R/pkg/R/mllib_regression.R @@ -53,12 +53,19 @@ setClass("IsotonicRegressionModel", representation(jobj = "jobj")) #' the result of a call to a family function. Refer R family at #' \url{https://stat.ethz.ch/R-manual/R-devel/library/stats/html/family.html}. #' Currently these families are supported: \code{binomial}, \code{gaussian}, -#' \code{Gamma}, and \code{poisson}. +#' \code{Gamma}, \code{poisson} and code{tweedie}. +#' The tweedie family must be specified using character string \code{"tweedie"}. The family +#' function \code{tweedie} in the \code{statmod} package is not supported. #' @param tol positive convergence tolerance of iterations. #' @param maxIter integer giving the maximal number of IRLS iterations. #' @param weightCol the weight column name. If this is not set or \code{NULL}, we treat all instance #' weights as 1.0. #' @param regParam regularization parameter for L2 regularization. +#' @param variancePower the power in the variance function of the Tweedie distribution which provides +#' the relationship between the variance and mean of the distribution. Refer to +#' \code{\link[statmod]{tweedie}} for details. +#' @param linkPower the index in the power link function. Only applicable for the Tweedie family. +#' Refer to \code{\link[statmod]{tweedie}} for details. #' @param ... additional arguments passed to the method. #' @aliases spark.glm,SparkDataFrame,formula-method #' @return \code{spark.glm} returns a fitted generalized linear model. @@ -86,18 +93,22 @@ setClass("IsotonicRegressionModel", representation(jobj = "jobj")) #' summary(savedModel) #' #' # fit tweedie model -#' require(statmod) -#' model <- spark.glm(df, Sepal_Length ~ Sepal_Width, -#' family = tweedie(var.power = 1.2, link.power = 0)) +#' model <- spark.glm(df, Sepal_Length ~ Sepal_Width, family = "tweedie", +#' variancePower = 1.2, linkPower = 0) #' summary(model) #' } #' @note spark.glm since 2.0.0 #' @seealso \link{glm}, \link{read.ml} setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"), function(data, formula, family = gaussian, tol = 1e-6, maxIter = 25, weightCol = NULL, - regParam = 0.0) { + regParam = 0.0, variancePower = 0.0, linkPower = 1.0) { if (is.character(family)) { - family <- get(family, mode = "function", envir = parent.frame()) + # recover variancePower and linkPower from the specified tweedie family + if (tolower(family) == "tweedie") { + family <- list(family = "tweedie", link = "identity") + } else { + family <- get(family, mode = "function", envir = parent.frame()) + } } if (is.function(family)) { family <- family() @@ -107,16 +118,6 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"), stop("'family' not recognized") } - # recover variancePower and linkPower from the specified tweedie family - if (tolower(family$family) == "tweedie") { - variancePower <- log(family$variance(exp(1))) - linkPower <- log(family$linkfun(exp(1))) - } else { - # these default values are not used - variancePower <- 0.0 - linkPower <- 1.0 - } - formula <- paste(deparse(formula), collapse = "") if (is.null(weightCol)) { weightCol <- "" diff --git a/R/pkg/inst/tests/testthat/test_mllib_regression.R b/R/pkg/inst/tests/testthat/test_mllib_regression.R index 79ada06a4da56..5225cad523ce6 100644 --- a/R/pkg/inst/tests/testthat/test_mllib_regression.R +++ b/R/pkg/inst/tests/testthat/test_mllib_regression.R @@ -78,15 +78,19 @@ test_that("spark.glm and predict", { expect_true(any(grepl("Dispersion parameter for gamma family", out))) # tweedie family - require(statmod) model <- spark.glm(training, Sepal_Width ~ Sepal_Length + Species, - family = tweedie(var.power = 1.2, link.power = 1.0)) + family = "tweedie", variancePower = 1.2, linkPower = 1.0) prediction <- predict(model, training) expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double") vals <- collect(select(prediction, "prediction")) - rVals <- suppressWarnings(predict( - glm(Sepal.Width ~ Sepal.Length + Species, data = iris, - family = tweedie(var.power = 1.2, link.power = 1.0)), iris)) + # manual calculation of the R predicted values to avoid dependence on statmod + # library(statmod) + # rModel <- glm(Sepal.Width ~ Sepal.Length + Species, data = iris, + # family = tweedie(var.power = 1.2, link.power = 1.0)) + # print(coef(rModel)) + rCoef <- c(1.7009682, 0.3436700, -0.9703189, -0.9852648) + rVals <- as.numeric(model.matrix(lm(Sepal.Width ~ Sepal.Length + Species, + data = iris)) %*% rCoef) expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals) # Test stats::predict is working From 56f6da0207b37da46e5fe3ae4b0b65951daba49f Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Sat, 28 Jan 2017 16:31:23 -0800 Subject: [PATCH 07/28] create model matix directly from formula --- R/pkg/inst/tests/testthat/test_mllib_regression.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/pkg/inst/tests/testthat/test_mllib_regression.R b/R/pkg/inst/tests/testthat/test_mllib_regression.R index 5225cad523ce6..4bbf181ede381 100644 --- a/R/pkg/inst/tests/testthat/test_mllib_regression.R +++ b/R/pkg/inst/tests/testthat/test_mllib_regression.R @@ -89,8 +89,8 @@ test_that("spark.glm and predict", { # family = tweedie(var.power = 1.2, link.power = 1.0)) # print(coef(rModel)) rCoef <- c(1.7009682, 0.3436700, -0.9703189, -0.9852648) - rVals <- as.numeric(model.matrix(lm(Sepal.Width ~ Sepal.Length + Species, - data = iris)) %*% rCoef) + rVals <- as.numeric(model.matrix(Sepal.Width ~ Sepal.Length + Species, + data = iris) %*% rCoef) expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals) # Test stats::predict is working From 083849c4bcfbe36a08968208b9b6c451bdbd03f5 Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Sun, 29 Jan 2017 12:10:07 -0800 Subject: [PATCH 08/28] update glmWrapper --- .../spark/ml/r/GeneralizedLinearRegressionWrapper.scala | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala b/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala index 6b5867e488d9f..7b66d8046f3ea 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala @@ -83,7 +83,7 @@ private[r] object GeneralizedLinearRegressionWrapper .attributes.get val features = featureAttrs.map(_.name.get) // assemble and fit the pipeline - var glr = new GeneralizedLinearRegression() + val glr = new GeneralizedLinearRegression() .setFamily(family) .setFitIntercept(rFormula.hasIntercept) .setTol(tol) @@ -93,9 +93,9 @@ private[r] object GeneralizedLinearRegressionWrapper .setFeaturesCol(rFormula.getFeaturesCol) // set variancePower and linkPower if family is tweedie; otherwise, set link function if (family.toLowerCase == "tweedie") { - glr = glr.setVariancePower(variancePower).setLinkPower(linkPower) + glr.setVariancePower(variancePower).setLinkPower(linkPower) } else { - glr = glr.setLink(link) + glr.setLink(link) } val pipeline = new Pipeline() .setStages(Array(rFormulaModel, glr)) @@ -151,7 +151,7 @@ private[r] object GeneralizedLinearRegressionWrapper val rResidualDegreeOfFreedomNull: Long = summary.residualDegreeOfFreedomNull val rResidualDegreeOfFreedom: Long = summary.residualDegreeOfFreedom val rAic: Double = if (family.toLowerCase == "tweedie" && - !Array(0.0, 1.0, 2.0).contains(variancePower)) { + !Array(0.0, 1.0, 2.0).exists(x => math.abs(x - variancePower) < 1e-8)) { 0.0 } else { summary.aic From fb66ce0bf2e4711a2679f0fef13eccdb27bb313c Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Sun, 29 Jan 2017 12:53:57 -0800 Subject: [PATCH 09/28] add comments --- R/pkg/R/mllib_regression.R | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/R/pkg/R/mllib_regression.R b/R/pkg/R/mllib_regression.R index 5c1db4c290076..5638df1742ef3 100644 --- a/R/pkg/R/mllib_regression.R +++ b/R/pkg/R/mllib_regression.R @@ -101,9 +101,10 @@ setClass("IsotonicRegressionModel", representation(jobj = "jobj")) #' @seealso \link{glm}, \link{read.ml} setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"), function(data, formula, family = gaussian, tol = 1e-6, maxIter = 25, weightCol = NULL, - regParam = 0.0, variancePower = 0.0, linkPower = 1.0) { + regParam = 0.0, variancePower = 0.0, linkPower = 1.0 - variancePower) { if (is.character(family)) { - # recover variancePower and linkPower from the specified tweedie family + # create a pseudo family for tweedie to avoid family validation + # only family name, variancePower and linkPower are used for the tweedie family if (tolower(family) == "tweedie") { family <- list(family = "tweedie", link = "identity") } else { From 0d722fd6fe608fc6362aed0c0610b93e9afb7811 Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Sun, 29 Jan 2017 13:12:38 -0800 Subject: [PATCH 10/28] fix style issue --- R/pkg/inst/tests/testthat/test_mllib_regression.R | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/R/pkg/inst/tests/testthat/test_mllib_regression.R b/R/pkg/inst/tests/testthat/test_mllib_regression.R index 4bbf181ede381..fd90de6ad611c 100644 --- a/R/pkg/inst/tests/testthat/test_mllib_regression.R +++ b/R/pkg/inst/tests/testthat/test_mllib_regression.R @@ -83,11 +83,13 @@ test_that("spark.glm and predict", { prediction <- predict(model, training) expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double") vals <- collect(select(prediction, "prediction")) + # manual calculation of the R predicted values to avoid dependence on statmod - # library(statmod) - # rModel <- glm(Sepal.Width ~ Sepal.Length + Species, data = iris, - # family = tweedie(var.power = 1.2, link.power = 1.0)) - # print(coef(rModel)) + #' library(statmod) + #' rModel <- glm(Sepal.Width ~ Sepal.Length + Species, data = iris, + #' family = tweedie(var.power = 1.2, link.power = 1.0)) + #' print(coef(rModel)) + rCoef <- c(1.7009682, 0.3436700, -0.9703189, -0.9852648) rVals <- as.numeric(model.matrix(Sepal.Width ~ Sepal.Length + Species, data = iris) %*% rCoef) From d11fc4b90ffabb2c1cb2a72a9cb536430c8e2bd5 Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Sun, 29 Jan 2017 15:15:31 -0800 Subject: [PATCH 11/28] remove statmod from suggest; update glm --- R/pkg/DESCRIPTION | 3 +-- R/pkg/R/mllib_regression.R | 8 +++++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION index af4bd5c3977d9..cc471edc376b3 100644 --- a/R/pkg/DESCRIPTION +++ b/R/pkg/DESCRIPTION @@ -21,8 +21,7 @@ Suggests: rmarkdown, testthat, e1071, - survival, - statmod + survival Collate: 'schema.R' 'generics.R' diff --git a/R/pkg/R/mllib_regression.R b/R/pkg/R/mllib_regression.R index 5638df1742ef3..618dfeb9f6a56 100644 --- a/R/pkg/R/mllib_regression.R +++ b/R/pkg/R/mllib_regression.R @@ -143,7 +143,7 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"), #' the result of a call to a family function. Refer R family at #' \url{https://stat.ethz.ch/R-manual/R-devel/library/stats/html/family.html}. #' Currently these families are supported: \code{binomial}, \code{gaussian}, -#' \code{poisson}, \code{Gamma}, and \code{tweedie} (\code{statmod} package). +#' \code{poisson}, \code{Gamma}, and \code{"tweedie"}. #' @param weightCol the weight column name. If this is not set or \code{NULL}, we treat all instance #' weights as 1.0. #' @param epsilon positive convergence tolerance of iterations. @@ -162,8 +162,10 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"), #' @note glm since 1.5.0 #' @seealso \link{spark.glm} setMethod("glm", signature(formula = "formula", family = "ANY", data = "SparkDataFrame"), - function(formula, family = gaussian, data, epsilon = 1e-6, maxit = 25, weightCol = NULL) { - spark.glm(data, formula, family, tol = epsilon, maxIter = maxit, weightCol = weightCol) + function(formula, family = gaussian, data, epsilon = 1e-6, maxit = 25, weightCol = NULL, + variancePower = 0.0, linkPower = 1.0 - variancePower) { + spark.glm(data, formula, family, tol = epsilon, maxIter = maxit, weightCol = weightCol, + variancePower = 0.0, linkPower = linkPower) }) # Returns the summary of a model produced by glm() or spark.glm(), similarly to R's summary(). From 4c24158c442ab54f5acb777612cbadf22b5e7dcd Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Sun, 29 Jan 2017 15:31:08 -0800 Subject: [PATCH 12/28] clean up doc --- R/pkg/R/mllib_regression.R | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/R/pkg/R/mllib_regression.R b/R/pkg/R/mllib_regression.R index 618dfeb9f6a56..c9c38bc75a617 100644 --- a/R/pkg/R/mllib_regression.R +++ b/R/pkg/R/mllib_regression.R @@ -53,7 +53,7 @@ setClass("IsotonicRegressionModel", representation(jobj = "jobj")) #' the result of a call to a family function. Refer R family at #' \url{https://stat.ethz.ch/R-manual/R-devel/library/stats/html/family.html}. #' Currently these families are supported: \code{binomial}, \code{gaussian}, -#' \code{Gamma}, \code{poisson} and code{tweedie}. +#' \code{Gamma}, \code{poisson} and \code{"tweedie"}. #' The tweedie family must be specified using character string \code{"tweedie"}. The family #' function \code{tweedie} in the \code{statmod} package is not supported. #' @param tol positive convergence tolerance of iterations. @@ -148,6 +148,8 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"), #' weights as 1.0. #' @param epsilon positive convergence tolerance of iterations. #' @param maxit integer giving the maximal number of IRLS iterations. +#' @param variancePower the index of the power variance function in the Tweedie family. +#' @param linkPower the index of the power link function in the Tweedie family. #' @return \code{glm} returns a fitted generalized linear model. #' @rdname glm #' @export From 295711d313dd4e6a7a728a21261a5109b7e6e4a9 Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Sun, 29 Jan 2017 18:55:40 -0800 Subject: [PATCH 13/28] remove link to statmod --- R/pkg/R/mllib_regression.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/pkg/R/mllib_regression.R b/R/pkg/R/mllib_regression.R index c9c38bc75a617..fd3d59bd7bde6 100644 --- a/R/pkg/R/mllib_regression.R +++ b/R/pkg/R/mllib_regression.R @@ -63,9 +63,9 @@ setClass("IsotonicRegressionModel", representation(jobj = "jobj")) #' @param regParam regularization parameter for L2 regularization. #' @param variancePower the power in the variance function of the Tweedie distribution which provides #' the relationship between the variance and mean of the distribution. Refer to -#' \code{\link[statmod]{tweedie}} for details. +#' \code{tweedie} in package \code{statmod} for details. #' @param linkPower the index in the power link function. Only applicable for the Tweedie family. -#' Refer to \code{\link[statmod]{tweedie}} for details. +#' Refer to \code{tweedie} in package \code{statmod} for details. #' @param ... additional arguments passed to the method. #' @aliases spark.glm,SparkDataFrame,formula-method #' @return \code{spark.glm} returns a fitted generalized linear model. From c315fb1da788f1e8479b730e22cf3d3bbf28fba7 Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Tue, 31 Jan 2017 21:35:06 -0800 Subject: [PATCH 14/28] allow R-like tweedie specification in family --- R/pkg/R/mllib_regression.R | 54 +++++++++++-------- .../tests/testthat/test_mllib_regression.R | 20 ++++++- 2 files changed, 50 insertions(+), 24 deletions(-) diff --git a/R/pkg/R/mllib_regression.R b/R/pkg/R/mllib_regression.R index fd3d59bd7bde6..621f77bb64c5b 100644 --- a/R/pkg/R/mllib_regression.R +++ b/R/pkg/R/mllib_regression.R @@ -53,19 +53,15 @@ setClass("IsotonicRegressionModel", representation(jobj = "jobj")) #' the result of a call to a family function. Refer R family at #' \url{https://stat.ethz.ch/R-manual/R-devel/library/stats/html/family.html}. #' Currently these families are supported: \code{binomial}, \code{gaussian}, -#' \code{Gamma}, \code{poisson} and \code{"tweedie"}. -#' The tweedie family must be specified using character string \code{"tweedie"}. The family -#' function \code{tweedie} in the \code{statmod} package is not supported. +#' \code{Gamma}, \code{poisson} and \code{tweedie}. +#' The tweedie family is specified in a similar way as in the \code{statmod} package, +#' e.g., \code{tweedie(var.power = 0, link.power = 1)} is gaussian with idenity link and +#' \code{tweedie(1, 0)} is poisson with log link. #' @param tol positive convergence tolerance of iterations. #' @param maxIter integer giving the maximal number of IRLS iterations. #' @param weightCol the weight column name. If this is not set or \code{NULL}, we treat all instance #' weights as 1.0. #' @param regParam regularization parameter for L2 regularization. -#' @param variancePower the power in the variance function of the Tweedie distribution which provides -#' the relationship between the variance and mean of the distribution. Refer to -#' \code{tweedie} in package \code{statmod} for details. -#' @param linkPower the index in the power link function. Only applicable for the Tweedie family. -#' Refer to \code{tweedie} in package \code{statmod} for details. #' @param ... additional arguments passed to the method. #' @aliases spark.glm,SparkDataFrame,formula-method #' @return \code{spark.glm} returns a fitted generalized linear model. @@ -93,20 +89,26 @@ setClass("IsotonicRegressionModel", representation(jobj = "jobj")) #' summary(savedModel) #' #' # fit tweedie model -#' model <- spark.glm(df, Sepal_Length ~ Sepal_Width, family = "tweedie", -#' variancePower = 1.2, linkPower = 0) +#' model <- spark.glm(df, Sepal_Length ~ Sepal_Width, family = tweedie(1.2, 0.0)) #' summary(model) #' } #' @note spark.glm since 2.0.0 #' @seealso \link{glm}, \link{read.ml} setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"), function(data, formula, family = gaussian, tol = 1e-6, maxIter = 25, weightCol = NULL, - regParam = 0.0, variancePower = 0.0, linkPower = 1.0 - variancePower) { + regParam = 0.0) { + + # create pseudo tweedie family to avoid dependence on statmod + tweedie <- function(var.power = 0.0, link.power = 1.0 - var.power) { + list(family = "tweedie", + variance = function(mu) mu^var.power, + linkfun = function(mu) mu^link.power) + } + family <- eval(substitute(family)) + if (is.character(family)) { - # create a pseudo family for tweedie to avoid family validation - # only family name, variancePower and linkPower are used for the tweedie family - if (tolower(family) == "tweedie") { - family <- list(family = "tweedie", link = "identity") + if (family == "tweedie") { + family <- tweedie } else { family <- get(family, mode = "function", envir = parent.frame()) } @@ -124,11 +126,21 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"), weightCol <- "" } + # recover variancePower and linkPower from the specified tweedie family + if (tolower(family$family) == "tweedie") { + variancePower <- log(family$variance(exp(1))) + linkPower <- log(family$linkfun(exp(1))) + } else { + # these default values are not used + variancePower <- 0.0 + linkPower <- 1.0 + } + # For known families, Gamma is upper-cased jobj <- callJStatic("org.apache.spark.ml.r.GeneralizedLinearRegressionWrapper", "fit", formula, data@sdf, tolower(family$family), family$link, tol, as.integer(maxIter), as.character(weightCol), regParam, - as.double(variancePower), as.double(linkPower)) + variancePower, linkPower) new("GeneralizedLinearRegressionModel", jobj = jobj) }) @@ -143,13 +155,11 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"), #' the result of a call to a family function. Refer R family at #' \url{https://stat.ethz.ch/R-manual/R-devel/library/stats/html/family.html}. #' Currently these families are supported: \code{binomial}, \code{gaussian}, -#' \code{poisson}, \code{Gamma}, and \code{"tweedie"}. +#' \code{poisson}, \code{Gamma}, and \code{tweedie}. #' @param weightCol the weight column name. If this is not set or \code{NULL}, we treat all instance #' weights as 1.0. #' @param epsilon positive convergence tolerance of iterations. #' @param maxit integer giving the maximal number of IRLS iterations. -#' @param variancePower the index of the power variance function in the Tweedie family. -#' @param linkPower the index of the power link function in the Tweedie family. #' @return \code{glm} returns a fitted generalized linear model. #' @rdname glm #' @export @@ -164,10 +174,8 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"), #' @note glm since 1.5.0 #' @seealso \link{spark.glm} setMethod("glm", signature(formula = "formula", family = "ANY", data = "SparkDataFrame"), - function(formula, family = gaussian, data, epsilon = 1e-6, maxit = 25, weightCol = NULL, - variancePower = 0.0, linkPower = 1.0 - variancePower) { - spark.glm(data, formula, family, tol = epsilon, maxIter = maxit, weightCol = weightCol, - variancePower = 0.0, linkPower = linkPower) + function(formula, family = gaussian, data, epsilon = 1e-6, maxit = 25, weightCol = NULL) { + spark.glm(data, formula, family, tol = epsilon, maxIter = maxit, weightCol = weightCol) }) # Returns the summary of a model produced by glm() or spark.glm(), similarly to R's summary(). diff --git a/R/pkg/inst/tests/testthat/test_mllib_regression.R b/R/pkg/inst/tests/testthat/test_mllib_regression.R index fd90de6ad611c..dc1f510e57715 100644 --- a/R/pkg/inst/tests/testthat/test_mllib_regression.R +++ b/R/pkg/inst/tests/testthat/test_mllib_regression.R @@ -79,7 +79,7 @@ test_that("spark.glm and predict", { # tweedie family model <- spark.glm(training, Sepal_Width ~ Sepal_Length + Species, - family = "tweedie", variancePower = 1.2, linkPower = 1.0) + family = tweedie(var.power = 1.2, link.power = 1.0)) prediction <- predict(model, training) expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double") vals <- collect(select(prediction, "prediction")) @@ -264,6 +264,24 @@ test_that("glm and predict", { data = iris, family = poisson(link = identity)), iris)) expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals) + # tweedie family + model <- glm(training, Sepal_Width ~ Sepal_Length + Species, + family = tweedie(var.power = 1.2, link.power = 1.0)) + prediction <- predict(model, training) + expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double") + vals <- collect(select(prediction, "prediction")) + + # manual calculation of the R predicted values to avoid dependence on statmod + #' library(statmod) + #' rModel <- glm(Sepal.Width ~ Sepal.Length + Species, data = iris, + #' family = tweedie(var.power = 1.2, link.power = 1.0)) + #' print(coef(rModel)) + + rCoef <- c(1.7009682, 0.3436700, -0.9703189, -0.9852648) + rVals <- as.numeric(model.matrix(Sepal.Width ~ Sepal.Length + Species, + data = iris) %*% rCoef) + expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals) + # Test stats::predict is working x <- rnorm(15) y <- x + rnorm(15) From 9be9c5105f4c6d235e9ef880c8b5b465d3523326 Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Tue, 31 Jan 2017 23:02:40 -0800 Subject: [PATCH 15/28] set default tweedie link to avoid passing functions to scala --- R/pkg/R/mllib_regression.R | 1 + 1 file changed, 1 insertion(+) diff --git a/R/pkg/R/mllib_regression.R b/R/pkg/R/mllib_regression.R index 621f77bb64c5b..8386b0cb7e59f 100644 --- a/R/pkg/R/mllib_regression.R +++ b/R/pkg/R/mllib_regression.R @@ -130,6 +130,7 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"), if (tolower(family$family) == "tweedie") { variancePower <- log(family$variance(exp(1))) linkPower <- log(family$linkfun(exp(1))) + family$link <- "identity" } else { # these default values are not used variancePower <- 0.0 From 201939bc9a88d850551b2f88059e196fea73b666 Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Tue, 31 Jan 2017 23:26:44 -0800 Subject: [PATCH 16/28] add tweedie in vignettes --- R/pkg/vignettes/sparkr-vignettes.Rmd | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/R/pkg/vignettes/sparkr-vignettes.Rmd b/R/pkg/vignettes/sparkr-vignettes.Rmd index 6f11c5c51676f..5430efa99851e 100644 --- a/R/pkg/vignettes/sparkr-vignettes.Rmd +++ b/R/pkg/vignettes/sparkr-vignettes.Rmd @@ -627,6 +627,7 @@ gaussian | identity, log, inverse binomial | logit, probit, cloglog (complementary log-log) poisson | log, identity, sqrt gamma | inverse, identity, log +tweedie | power link function There are three ways to specify the `family` argument. @@ -634,7 +635,9 @@ There are three ways to specify the `family` argument. * Family function, e.g. `family = binomial`. -* Result returned by a family function, e.g. `family = poisson(link = log)` +* Result returned by a family function, e.g. `family = poisson(link = log)`. + +* Note that the tweedie family is specified as `tweedie(var.power, link.power)`, where `var.power` is the power index of the variance function and `link.power` is the index of the the power link function (the default value is `link.power = 1.0 - var.power`). This is consistent with the `tweedie` family defined in the `statmod` package, but the package is not needed in order to use the `tweedie` family. Some examples: `family = tweedie(0.0)` is gaussian with identity link, `family = tweedie(1.0)` poisson with log link, `family = tweedie(2.0)` Gamma with inverse link, and `family = tweedie(1.5, 0.0)` compound Poisson with log link. For more information regarding the families and their link functions, see the Wikipedia page [Generalized Linear Model](https://en.wikipedia.org/wiki/Generalized_linear_model). @@ -650,6 +653,17 @@ gaussianFitted <- predict(gaussianGLM, carsDF) head(select(gaussianFitted, "model", "prediction", "mpg", "wt", "hp")) ``` +The following is the same fit using the tweedie family: +```{r} +tweedieGLM1 <- spark.glm(carsDF, mpg ~ wt + hp, family = tweedie(0.0)) +summary(tweedieGLM1) +``` +We can try other distributions in the tweedie family, for example, a compound Poisson distribution with a log link: +```{r} +tweedieGLM2 <- spark.glm(carsDF, mpg ~ wt + hp, family = tweedie(1.2, 0.0)) +summary(tweedieGLM2) +``` + #### Isotonic Regression `spark.isoreg` fits an [Isotonic Regression](https://en.wikipedia.org/wiki/Isotonic_regression) model against a `SparkDataFrame`. It solves a weighted univariate a regression problem under a complete order constraint. Specifically, given a set of real observed responses $y_1, \ldots, y_n$, corresponding real features $x_1, \ldots, x_n$, and optionally positive weights $w_1, \ldots, w_n$, we want to find a monotone (piecewise linear) function $f$ to minimize From 6737122763eef58c54a9b7a32994de6fe102c1d0 Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Wed, 1 Feb 2017 12:00:34 -0800 Subject: [PATCH 17/28] add internal tweedie family --- R/pkg/NAMESPACE | 2 + R/pkg/R/mllib_regression.R | 43 +++++++++---------- .../tests/testthat/test_mllib_regression.R | 4 +- 3 files changed, 24 insertions(+), 25 deletions(-) diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE index 7ff6e9a9d3c73..0fb70246b1a1e 100644 --- a/R/pkg/NAMESPACE +++ b/R/pkg/NAMESPACE @@ -380,6 +380,8 @@ export("partitionBy", export("windowPartitionBy", "windowOrderBy") +export("tweedie") + S3method(print, jobj) S3method(print, structField) S3method(print, structType) diff --git a/R/pkg/R/mllib_regression.R b/R/pkg/R/mllib_regression.R index 8386b0cb7e59f..93f6f93eda0da 100644 --- a/R/pkg/R/mllib_regression.R +++ b/R/pkg/R/mllib_regression.R @@ -39,6 +39,19 @@ setClass("GeneralizedLinearRegressionModel", representation(jobj = "jobj")) #' @note IsotonicRegressionModel since 2.1.0 setClass("IsotonicRegressionModel", representation(jobj = "jobj")) + +#' Family object for tweedie. +#' Needs to be outside spark.glm because it is also used in R-compatible glm +#' @param var.power the power index in the variance function. +#' @param linkPower the power index in the link function. +#' @return returns a list for internal use in spark.glm +tweedie <- function(var.power = 0.0, link.power = 1.0 - var.power) { + list(family = "tweedie", + var.power = var.power, + link.power = link.power, + link = paste("mu^", as.character(link.power), sep = "")) +} + #' Generalized Linear Models #' #' Fits generalized linear model against a Spark DataFrame. @@ -97,21 +110,8 @@ setClass("IsotonicRegressionModel", representation(jobj = "jobj")) setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"), function(data, formula, family = gaussian, tol = 1e-6, maxIter = 25, weightCol = NULL, regParam = 0.0) { - - # create pseudo tweedie family to avoid dependence on statmod - tweedie <- function(var.power = 0.0, link.power = 1.0 - var.power) { - list(family = "tweedie", - variance = function(mu) mu^var.power, - linkfun = function(mu) mu^link.power) - } - family <- eval(substitute(family)) - if (is.character(family)) { - if (family == "tweedie") { - family <- tweedie - } else { - family <- get(family, mode = "function", envir = parent.frame()) - } + family <- get(family, mode = "function", envir = parent.frame()) } if (is.function(family)) { family <- family() @@ -126,16 +126,13 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"), weightCol <- "" } - # recover variancePower and linkPower from the specified tweedie family + # set variancePower and linkPower + variancePower <- 0.0 + linkPower <- 1.0 if (tolower(family$family) == "tweedie") { - variancePower <- log(family$variance(exp(1))) - linkPower <- log(family$linkfun(exp(1))) - family$link <- "identity" - } else { - # these default values are not used - variancePower <- 0.0 - linkPower <- 1.0 - } + variancePower <- family$var.power + linkPower <- family$link.power + } # For known families, Gamma is upper-cased jobj <- callJStatic("org.apache.spark.ml.r.GeneralizedLinearRegressionWrapper", diff --git a/R/pkg/inst/tests/testthat/test_mllib_regression.R b/R/pkg/inst/tests/testthat/test_mllib_regression.R index dc1f510e57715..5b1740894598d 100644 --- a/R/pkg/inst/tests/testthat/test_mllib_regression.R +++ b/R/pkg/inst/tests/testthat/test_mllib_regression.R @@ -248,7 +248,7 @@ test_that("glm and predict", { training <- suppressWarnings(createDataFrame(iris)) # gaussian family model <- glm(Sepal_Width ~ Sepal_Length + Species, data = training) - prediction <- predict(model, training) + prediction <- predict(model, training) expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double") vals <- collect(select(prediction, "prediction")) rVals <- predict(glm(Sepal.Width ~ Sepal.Length + Species, data = iris), iris) @@ -265,7 +265,7 @@ test_that("glm and predict", { expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals) # tweedie family - model <- glm(training, Sepal_Width ~ Sepal_Length + Species, + model <- glm(Sepal_Width ~ Sepal_Length + Species, data = training, family = tweedie(var.power = 1.2, link.power = 1.0)) prediction <- predict(model, training) expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double") From 0b5ed434cc5ba68452f5ef556d20465fec4f8ccb Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Wed, 1 Feb 2017 13:05:52 -0800 Subject: [PATCH 18/28] fix style --- R/pkg/R/mllib_regression.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/pkg/R/mllib_regression.R b/R/pkg/R/mllib_regression.R index 93f6f93eda0da..59ab628c8eba0 100644 --- a/R/pkg/R/mllib_regression.R +++ b/R/pkg/R/mllib_regression.R @@ -126,13 +126,13 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"), weightCol <- "" } - # set variancePower and linkPower + # set variancePower and linkPower variancePower <- 0.0 linkPower <- 1.0 if (tolower(family$family) == "tweedie") { variancePower <- family$var.power linkPower <- family$link.power - } + } # For known families, Gamma is upper-cased jobj <- callJStatic("org.apache.spark.ml.r.GeneralizedLinearRegressionWrapper", From b10777eb08621141df7190daa6157ff064c9d1af Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Wed, 1 Feb 2017 16:21:11 -0800 Subject: [PATCH 19/28] fix doc --- R/pkg/R/mllib_regression.R | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/R/pkg/R/mllib_regression.R b/R/pkg/R/mllib_regression.R index 59ab628c8eba0..ff35368643fe4 100644 --- a/R/pkg/R/mllib_regression.R +++ b/R/pkg/R/mllib_regression.R @@ -41,9 +41,8 @@ setClass("IsotonicRegressionModel", representation(jobj = "jobj")) #' Family object for tweedie. -#' Needs to be outside spark.glm because it is also used in R-compatible glm #' @param var.power the power index in the variance function. -#' @param linkPower the power index in the link function. +#' @param link.Power the power index in the link function. #' @return returns a list for internal use in spark.glm tweedie <- function(var.power = 0.0, link.power = 1.0 - var.power) { list(family = "tweedie", From 7d5bd602ed087c3608838e5ff151ad8234e8546b Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Wed, 1 Feb 2017 18:01:02 -0800 Subject: [PATCH 20/28] fix doc --- R/pkg/R/mllib_regression.R | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/R/pkg/R/mllib_regression.R b/R/pkg/R/mllib_regression.R index ff35368643fe4..f90237b35a6d0 100644 --- a/R/pkg/R/mllib_regression.R +++ b/R/pkg/R/mllib_regression.R @@ -39,11 +39,16 @@ setClass("GeneralizedLinearRegressionModel", representation(jobj = "jobj")) #' @note IsotonicRegressionModel since 2.1.0 setClass("IsotonicRegressionModel", representation(jobj = "jobj")) - #' Family object for tweedie. +#' #' @param var.power the power index in the variance function. #' @param link.Power the power index in the link function. #' @return returns a list for internal use in spark.glm +#' @rdname tweedie +#' @aliases tweedie +#' @name tweedie +#' @export +#' @seealso \link{spark.glm} tweedie <- function(var.power = 0.0, link.power = 1.0 - var.power) { list(family = "tweedie", var.power = var.power, From a9ac439d0e5d249f09cfe98d3aa25c75c22a820e Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Wed, 1 Feb 2017 20:14:23 -0800 Subject: [PATCH 21/28] fix doc issue --- R/pkg/R/mllib_regression.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/pkg/R/mllib_regression.R b/R/pkg/R/mllib_regression.R index f90237b35a6d0..af4424e5181f8 100644 --- a/R/pkg/R/mllib_regression.R +++ b/R/pkg/R/mllib_regression.R @@ -42,9 +42,9 @@ setClass("IsotonicRegressionModel", representation(jobj = "jobj")) #' Family object for tweedie. #' #' @param var.power the power index in the variance function. -#' @param link.Power the power index in the link function. -#' @return returns a list for internal use in spark.glm +#' @param link.power the power index in the link function. #' @rdname tweedie +#' @return a list for internal use in spark.glm #' @aliases tweedie #' @name tweedie #' @export From 6cbc62fa8c8be37048193a6e3fc08195f1d497b5 Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Sun, 5 Mar 2017 20:11:11 -0800 Subject: [PATCH 22/28] make twedie internal (no export) --- R/pkg/R/mllib_regression.R | 25 +++++++++---------- .../tests/testthat/test_mllib_regression.R | 4 +-- R/pkg/vignettes/sparkr-vignettes.Rmd | 6 ++--- 3 files changed, 17 insertions(+), 18 deletions(-) diff --git a/R/pkg/R/mllib_regression.R b/R/pkg/R/mllib_regression.R index aea266fb6b30b..3a5eb5c284cd6 100644 --- a/R/pkg/R/mllib_regression.R +++ b/R/pkg/R/mllib_regression.R @@ -39,20 +39,16 @@ setClass("GeneralizedLinearRegressionModel", representation(jobj = "jobj")) #' @note IsotonicRegressionModel since 2.1.0 setClass("IsotonicRegressionModel", representation(jobj = "jobj")) -#' Family object for tweedie. +#' Family object for tweedie . #' #' @param var.power the power index in the variance function. #' @param link.power the power index in the link function. -#' @rdname tweedie +#' @noRd #' @return a list for internal use in spark.glm -#' @aliases tweedie -#' @name tweedie -#' @export -#' @seealso \link{spark.glm} tweedie <- function(var.power = 0.0, link.power = 1.0 - var.power) { list(family = "tweedie", - var.power = var.power, - link.power = link.power, + variance = function (mu) mu^var.power, + linkfun = function (mu) mu^link.power, link = paste("mu^", as.character(link.power), sep = "")) } @@ -71,9 +67,12 @@ tweedie <- function(var.power = 0.0, link.power = 1.0 - var.power) { #' \url{https://stat.ethz.ch/R-manual/R-devel/library/stats/html/family.html}. #' Currently these families are supported: \code{binomial}, \code{gaussian}, #' \code{Gamma}, \code{poisson} and \code{tweedie}. -#' The tweedie family is specified in a similar way as in the \code{statmod} package, -#' e.g., \code{tweedie(var.power = 0, link.power = 1)} is gaussian with idenity link and +#' Note that when package \code{statmod} is loaded, the tweedie family is specified using the +#' family definition therein, i.e., \code{tweedie(var.power, link.power)}. +#' For example, \code{tweedie(var.power = 0, link.power = 1)} is gaussian with idenity link and #' \code{tweedie(1, 0)} is poisson with log link. +#' When package \code{statmod} is not available, one can use the SparkR internal definition +#' \code{SparkR:::tweedie(var.power, link.power)}. #' @param tol positive convergence tolerance of iterations. #' @param maxIter integer giving the maximal number of IRLS iterations. #' @param weightCol the weight column name. If this is not set or \code{NULL}, we treat all instance @@ -106,7 +105,7 @@ tweedie <- function(var.power = 0.0, link.power = 1.0 - var.power) { #' summary(savedModel) #' #' # fit tweedie model -#' model <- spark.glm(df, Sepal_Length ~ Sepal_Width, family = tweedie(1.2, 0.0)) +#' model <- spark.glm(df, Freq ~ Sex + Age, family = SparkR:::tweedie(1.2, 0.0)) #' summary(model) #' } #' @note spark.glm since 2.0.0 @@ -136,8 +135,8 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"), variancePower <- 0.0 linkPower <- 1.0 if (tolower(family$family) == "tweedie") { - variancePower <- family$var.power - linkPower <- family$link.power + variancePower <- log(family$variance(exp(1))) + linkPower <- log(family$linkfun(exp(1))) } # For known families, Gamma is upper-cased diff --git a/R/pkg/inst/tests/testthat/test_mllib_regression.R b/R/pkg/inst/tests/testthat/test_mllib_regression.R index 884ca95547ca8..7ef5cea8e2c71 100644 --- a/R/pkg/inst/tests/testthat/test_mllib_regression.R +++ b/R/pkg/inst/tests/testthat/test_mllib_regression.R @@ -79,7 +79,7 @@ test_that("spark.glm and predict", { # tweedie family model <- spark.glm(training, Sepal_Width ~ Sepal_Length + Species, - family = tweedie(var.power = 1.2, link.power = 1.0)) + family = SparkR:::tweedie(var.power = 1.2, link.power = 1.0)) prediction <- predict(model, training) expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double") vals <- collect(select(prediction, "prediction")) @@ -269,7 +269,7 @@ test_that("glm and predict", { # tweedie family model <- glm(Sepal_Width ~ Sepal_Length + Species, data = training, - family = tweedie(var.power = 1.2, link.power = 1.0)) + family = SparkR:::tweedie(var.power = 1.2, link.power = 1.0)) prediction <- predict(model, training) expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double") vals <- collect(select(prediction, "prediction")) diff --git a/R/pkg/vignettes/sparkr-vignettes.Rmd b/R/pkg/vignettes/sparkr-vignettes.Rmd index 0125dae796d19..02006b7ddd30a 100644 --- a/R/pkg/vignettes/sparkr-vignettes.Rmd +++ b/R/pkg/vignettes/sparkr-vignettes.Rmd @@ -682,7 +682,7 @@ There are three ways to specify the `family` argument. * Result returned by a family function, e.g. `family = poisson(link = log)`. -* Note that the tweedie family is specified as `tweedie(var.power, link.power)`, where `var.power` is the power index of the variance function and `link.power` is the index of the the power link function (the default value is `link.power = 1.0 - var.power`). This is consistent with the `tweedie` family defined in the `statmod` package, but the package is not needed in order to use the `tweedie` family. Some examples: `family = tweedie(0.0)` is gaussian with identity link, `family = tweedie(1.0)` poisson with log link, `family = tweedie(2.0)` Gamma with inverse link, and `family = tweedie(1.5, 0.0)` compound Poisson with log link. +* Note that when package `statmod` is loaded, the tweedie family is specified as `tweedie(var.power, link.power)`. Otherwise, one can use the SparkR internal definition `SparkR:::tweedie(var.power, link.power)`. In the above, `var.power` is the power index of the variance function and `link.power` is the index of the the power link function (the default value is `link.power = 1.0 - var.power`). This is consistent with the `tweedie` family defined in the `statmod` package. Some examples: `family = tweedie(0.0)` is gaussian with identity link, `family = tweedie(1.0)` poisson with log link, `family = tweedie(2.0)` Gamma with inverse link, and `family = tweedie(1.5, 0.0)` compound Poisson with log link. For more information regarding the families and their link functions, see the Wikipedia page [Generalized Linear Model](https://en.wikipedia.org/wiki/Generalized_linear_model). @@ -700,12 +700,12 @@ head(select(gaussianFitted, "model", "prediction", "mpg", "wt", "hp")) The following is the same fit using the tweedie family: ```{r} -tweedieGLM1 <- spark.glm(carsDF, mpg ~ wt + hp, family = tweedie(0.0)) +tweedieGLM1 <- spark.glm(carsDF, mpg ~ wt + hp, family = SparkR:::tweedie(0.0)) summary(tweedieGLM1) ``` We can try other distributions in the tweedie family, for example, a compound Poisson distribution with a log link: ```{r} -tweedieGLM2 <- spark.glm(carsDF, mpg ~ wt + hp, family = tweedie(1.2, 0.0)) +tweedieGLM2 <- spark.glm(carsDF, mpg ~ wt + hp, family = SparkR:::tweedie(1.2, 0.0)) summary(tweedieGLM2) ``` From ef65adc69b710366266abf5df6f458af3a66a3bf Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Sun, 5 Mar 2017 20:27:08 -0800 Subject: [PATCH 23/28] fix test issue --- R/pkg/NAMESPACE | 2 -- R/pkg/R/mllib_regression.R | 4 ++-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE index 43dbc0be3c23b..871f8e41a0f23 100644 --- a/R/pkg/NAMESPACE +++ b/R/pkg/NAMESPACE @@ -402,8 +402,6 @@ export("partitionBy", export("windowPartitionBy", "windowOrderBy") -export("tweedie") - S3method(print, jobj) S3method(print, structField) S3method(print, structType) diff --git a/R/pkg/R/mllib_regression.R b/R/pkg/R/mllib_regression.R index 3a5eb5c284cd6..fa4b790234041 100644 --- a/R/pkg/R/mllib_regression.R +++ b/R/pkg/R/mllib_regression.R @@ -47,8 +47,8 @@ setClass("IsotonicRegressionModel", representation(jobj = "jobj")) #' @return a list for internal use in spark.glm tweedie <- function(var.power = 0.0, link.power = 1.0 - var.power) { list(family = "tweedie", - variance = function (mu) mu^var.power, - linkfun = function (mu) mu^link.power, + variance = function(mu) mu ^ var.power, + linkfun = function(mu) mu ^ link.power, link = paste("mu^", as.character(link.power), sep = "")) } From c11e57c6146757e5d1201884eaa606eab09f9763 Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Wed, 8 Mar 2017 21:59:16 -0800 Subject: [PATCH 24/28] add back variancePower and linkPower params --- R/pkg/R/mllib_regression.R | 66 +++++++++---------- .../tests/testthat/test_mllib_regression.R | 12 ++-- 2 files changed, 39 insertions(+), 39 deletions(-) diff --git a/R/pkg/R/mllib_regression.R b/R/pkg/R/mllib_regression.R index fa4b790234041..9d187fa3f4b64 100644 --- a/R/pkg/R/mllib_regression.R +++ b/R/pkg/R/mllib_regression.R @@ -39,19 +39,6 @@ setClass("GeneralizedLinearRegressionModel", representation(jobj = "jobj")) #' @note IsotonicRegressionModel since 2.1.0 setClass("IsotonicRegressionModel", representation(jobj = "jobj")) -#' Family object for tweedie . -#' -#' @param var.power the power index in the variance function. -#' @param link.power the power index in the link function. -#' @noRd -#' @return a list for internal use in spark.glm -tweedie <- function(var.power = 0.0, link.power = 1.0 - var.power) { - list(family = "tweedie", - variance = function(mu) mu ^ var.power, - linkfun = function(mu) mu ^ link.power, - link = paste("mu^", as.character(link.power), sep = "")) -} - #' Generalized Linear Models #' #' Fits generalized linear model against a SparkDataFrame. @@ -67,17 +54,20 @@ tweedie <- function(var.power = 0.0, link.power = 1.0 - var.power) { #' \url{https://stat.ethz.ch/R-manual/R-devel/library/stats/html/family.html}. #' Currently these families are supported: \code{binomial}, \code{gaussian}, #' \code{Gamma}, \code{poisson} and \code{tweedie}. -#' Note that when package \code{statmod} is loaded, the tweedie family is specified using the +#' +#' Note that there are two ways to specify the tweedie family. +#' a) Set \code{family = "tweedie"} and specify the variancePower and linkPower +#' b) When package \code{statmod} is loaded, the tweedie family is specified using the #' family definition therein, i.e., \code{tweedie(var.power, link.power)}. -#' For example, \code{tweedie(var.power = 0, link.power = 1)} is gaussian with idenity link and -#' \code{tweedie(1, 0)} is poisson with log link. -#' When package \code{statmod} is not available, one can use the SparkR internal definition -#' \code{SparkR:::tweedie(var.power, link.power)}. #' @param tol positive convergence tolerance of iterations. #' @param maxIter integer giving the maximal number of IRLS iterations. #' @param weightCol the weight column name. If this is not set or \code{NULL}, we treat all instance #' weights as 1.0. #' @param regParam regularization parameter for L2 regularization. +#' @param variancePower the power in the variance function of the Tweedie distribution which provides +#' the relationship between the variance and mean of the distribution. Only +#' applicable to the Tweedie family. +#' @param linkPower the index in the power link function. Only applicable to the Tweedie family. #' @param ... additional arguments passed to the method. #' @aliases spark.glm,SparkDataFrame,formula-method #' @return \code{spark.glm} returns a fitted generalized linear model. @@ -105,19 +95,33 @@ tweedie <- function(var.power = 0.0, link.power = 1.0 - var.power) { #' summary(savedModel) #' #' # fit tweedie model -#' model <- spark.glm(df, Freq ~ Sex + Age, family = SparkR:::tweedie(1.2, 0.0)) +#' model <- spark.glm(df, Freq ~ Sex + Age, family = "tweedie", +#' variancePower = 1.2, linkPower = 0) #' summary(model) #' } #' @note spark.glm since 2.0.0 #' @seealso \link{glm}, \link{read.ml} setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"), function(data, formula, family = gaussian, tol = 1e-6, maxIter = 25, weightCol = NULL, - regParam = 0.0) { + regParam = 0.0, variancePower = 0.0, linkPower = 1.0 - variancePower) { + if (is.character(family)) { - family <- get(family, mode = "function", envir = parent.frame()) + # family = "tweedie" + if (tolower(family) == "tweedie") { + family <- list(family = "tweedie", link = "linkNotUsed") + } else { + family <- get(family, mode = "function", envir = parent.frame()) + } } if (is.function(family)) { - family <- family() + # family = statmod::tweedie() + if (tolower(family$family) == "tweedie") { + family <- list(family = "tweedie", link = "linkNotUsed") + variancePower <- log(family$variance(exp(1))) + linkPower <- log(family$linkfun(exp(1))) + } else { + family <- family() + } } if (is.null(family$family)) { print(family) @@ -131,19 +135,11 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"), weightCol <- as.character(weightCol) } - # set variancePower and linkPower - variancePower <- 0.0 - linkPower <- 1.0 - if (tolower(family$family) == "tweedie") { - variancePower <- log(family$variance(exp(1))) - linkPower <- log(family$linkfun(exp(1))) - } - # For known families, Gamma is upper-cased jobj <- callJStatic("org.apache.spark.ml.r.GeneralizedLinearRegressionWrapper", "fit", formula, data@sdf, tolower(family$family), family$link, tol, as.integer(maxIter), weightCol, regParam, - variancePower, linkPower) + as.double(variancePower), as.double(linkPower)) new("GeneralizedLinearRegressionModel", jobj = jobj) }) @@ -163,6 +159,8 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"), #' weights as 1.0. #' @param epsilon positive convergence tolerance of iterations. #' @param maxit integer giving the maximal number of IRLS iterations. +#' @param variancePower the index of the power variance function in the Tweedie family. +#' @param linkPower the index of the power link function in the Tweedie family. #' @return \code{glm} returns a fitted generalized linear model. #' @rdname glm #' @export @@ -177,8 +175,10 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"), #' @note glm since 1.5.0 #' @seealso \link{spark.glm} setMethod("glm", signature(formula = "formula", family = "ANY", data = "SparkDataFrame"), - function(formula, family = gaussian, data, epsilon = 1e-6, maxit = 25, weightCol = NULL) { - spark.glm(data, formula, family, tol = epsilon, maxIter = maxit, weightCol = weightCol) + function(formula, family = gaussian, data, epsilon = 1e-6, maxit = 25, weightCol = NULL, + variancePower = 0.0, linkPower = 1.0 - variancePower) { + spark.glm(data, formula, family, tol = epsilon, maxIter = maxit, weightCol = weightCol, + variancePower = variancePower, linkPower = linkPower) }) # Returns the summary of a model produced by glm() or spark.glm(), similarly to R's summary(). diff --git a/R/pkg/inst/tests/testthat/test_mllib_regression.R b/R/pkg/inst/tests/testthat/test_mllib_regression.R index 7ef5cea8e2c71..3f96179c60a22 100644 --- a/R/pkg/inst/tests/testthat/test_mllib_regression.R +++ b/R/pkg/inst/tests/testthat/test_mllib_regression.R @@ -79,7 +79,7 @@ test_that("spark.glm and predict", { # tweedie family model <- spark.glm(training, Sepal_Width ~ Sepal_Length + Species, - family = SparkR:::tweedie(var.power = 1.2, link.power = 1.0)) + family = "tweedie", variancePower = 1.2, linkPower = 0.0) prediction <- predict(model, training) expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double") vals <- collect(select(prediction, "prediction")) @@ -87,10 +87,10 @@ test_that("spark.glm and predict", { # manual calculation of the R predicted values to avoid dependence on statmod #' library(statmod) #' rModel <- glm(Sepal.Width ~ Sepal.Length + Species, data = iris, - #' family = tweedie(var.power = 1.2, link.power = 1.0)) + #' family = tweedie(var.power = 1.2, link.power = 0.0)) #' print(coef(rModel)) - rCoef <- c(1.7009682, 0.3436700, -0.9703189, -0.9852648) + rCoef <- c(0.6455409, 0.1169143, -0.3224752, -0.3282174) rVals <- as.numeric(model.matrix(Sepal.Width ~ Sepal.Length + Species, data = iris) %*% rCoef) expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals) @@ -269,7 +269,7 @@ test_that("glm and predict", { # tweedie family model <- glm(Sepal_Width ~ Sepal_Length + Species, data = training, - family = SparkR:::tweedie(var.power = 1.2, link.power = 1.0)) + family = "tweedie", variancePower = 1.2, linkPower = 0.0) prediction <- predict(model, training) expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double") vals <- collect(select(prediction, "prediction")) @@ -277,10 +277,10 @@ test_that("glm and predict", { # manual calculation of the R predicted values to avoid dependence on statmod #' library(statmod) #' rModel <- glm(Sepal.Width ~ Sepal.Length + Species, data = iris, - #' family = tweedie(var.power = 1.2, link.power = 1.0)) + #' family = tweedie(var.power = 1.2, link.power = 0.0)) #' print(coef(rModel)) - rCoef <- c(1.7009682, 0.3436700, -0.9703189, -0.9852648) + rCoef <- c(0.6455409, 0.1169143, -0.3224752, -0.3282174) rVals <- as.numeric(model.matrix(Sepal.Width ~ Sepal.Length + Species, data = iris) %*% rCoef) expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals) From 5ce4c8414fabdaaa970931ac61fa53515d240503 Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Wed, 8 Mar 2017 22:29:50 -0800 Subject: [PATCH 25/28] update vignettes --- R/pkg/R/mllib_regression.R | 15 +++++++-------- R/pkg/inst/tests/testthat/test_mllib_regression.R | 12 ++++++------ R/pkg/vignettes/sparkr-vignettes.Rmd | 9 ++++++--- 3 files changed, 19 insertions(+), 17 deletions(-) diff --git a/R/pkg/R/mllib_regression.R b/R/pkg/R/mllib_regression.R index 9d187fa3f4b64..3f509f08ffd1b 100644 --- a/R/pkg/R/mllib_regression.R +++ b/R/pkg/R/mllib_regression.R @@ -114,19 +114,18 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"), } } if (is.function(family)) { - # family = statmod::tweedie() - if (tolower(family$family) == "tweedie") { - family <- list(family = "tweedie", link = "linkNotUsed") - variancePower <- log(family$variance(exp(1))) - linkPower <- log(family$linkfun(exp(1))) - } else { - family <- family() - } + family <- family() } if (is.null(family$family)) { print(family) stop("'family' not recognized") } + # family = statmod::tweedie() + if (tolower(family$family) == "tweedie" && !is.null(family$variance)) { + variancePower <- log(family$variance(exp(1))) + linkPower <- log(family$linkfun(exp(1))) + family <- list(family = "tweedie", link = "linkNotUsed") + } formula <- paste(deparse(formula), collapse = "") if (!is.null(weightCol) && weightCol == "") { diff --git a/R/pkg/inst/tests/testthat/test_mllib_regression.R b/R/pkg/inst/tests/testthat/test_mllib_regression.R index 3f96179c60a22..7500abc1343ad 100644 --- a/R/pkg/inst/tests/testthat/test_mllib_regression.R +++ b/R/pkg/inst/tests/testthat/test_mllib_regression.R @@ -91,9 +91,9 @@ test_that("spark.glm and predict", { #' print(coef(rModel)) rCoef <- c(0.6455409, 0.1169143, -0.3224752, -0.3282174) - rVals <- as.numeric(model.matrix(Sepal.Width ~ Sepal.Length + Species, - data = iris) %*% rCoef) - expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals) + rVals <- exp(as.numeric(model.matrix(Sepal.Width ~ Sepal.Length + Species, + data = iris) %*% rCoef)) + expect_true(all(abs(rVals - vals) < 1e-5), rVals - vals) # Test stats::predict is working x <- rnorm(15) @@ -281,9 +281,9 @@ test_that("glm and predict", { #' print(coef(rModel)) rCoef <- c(0.6455409, 0.1169143, -0.3224752, -0.3282174) - rVals <- as.numeric(model.matrix(Sepal.Width ~ Sepal.Length + Species, - data = iris) %*% rCoef) - expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals) + rVals <- exp(as.numeric(model.matrix(Sepal.Width ~ Sepal.Length + Species, + data = iris) %*% rCoef)) + expect_true(all(abs(rVals - vals) < 1e-5), rVals - vals) # Test stats::predict is working x <- rnorm(15) diff --git a/R/pkg/vignettes/sparkr-vignettes.Rmd b/R/pkg/vignettes/sparkr-vignettes.Rmd index 02006b7ddd30a..2cd31cf7b4de7 100644 --- a/R/pkg/vignettes/sparkr-vignettes.Rmd +++ b/R/pkg/vignettes/sparkr-vignettes.Rmd @@ -682,7 +682,9 @@ There are three ways to specify the `family` argument. * Result returned by a family function, e.g. `family = poisson(link = log)`. -* Note that when package `statmod` is loaded, the tweedie family is specified as `tweedie(var.power, link.power)`. Otherwise, one can use the SparkR internal definition `SparkR:::tweedie(var.power, link.power)`. In the above, `var.power` is the power index of the variance function and `link.power` is the index of the the power link function (the default value is `link.power = 1.0 - var.power`). This is consistent with the `tweedie` family defined in the `statmod` package. Some examples: `family = tweedie(0.0)` is gaussian with identity link, `family = tweedie(1.0)` poisson with log link, `family = tweedie(2.0)` Gamma with inverse link, and `family = tweedie(1.5, 0.0)` compound Poisson with log link. +* Note that there are two ways to specify the tweedie family: + a) Set `family = "tweedie"` and specify the `variancePower` and `linkPower` + b) When package `statmod` is loaded, the tweedie family is specified using the family definition therein, i.e., `tweedie()`. For more information regarding the families and their link functions, see the Wikipedia page [Generalized Linear Model](https://en.wikipedia.org/wiki/Generalized_linear_model). @@ -700,12 +702,13 @@ head(select(gaussianFitted, "model", "prediction", "mpg", "wt", "hp")) The following is the same fit using the tweedie family: ```{r} -tweedieGLM1 <- spark.glm(carsDF, mpg ~ wt + hp, family = SparkR:::tweedie(0.0)) +tweedieGLM1 <- spark.glm(carsDF, mpg ~ wt + hp, family = "tweedie", variancePower = 0.0) summary(tweedieGLM1) ``` We can try other distributions in the tweedie family, for example, a compound Poisson distribution with a log link: ```{r} -tweedieGLM2 <- spark.glm(carsDF, mpg ~ wt + hp, family = SparkR:::tweedie(1.2, 0.0)) +tweedieGLM2 <- spark.glm(carsDF, mpg ~ wt + hp, family = "tweedie", + variancePower = 1.2, linkPower = 0.0) summary(tweedieGLM2) ``` From aeeb3f780be846641a3f38eb127fcba3e3387cb8 Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Wed, 8 Mar 2017 22:45:06 -0800 Subject: [PATCH 26/28] fix style --- R/pkg/R/mllib_regression.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/pkg/R/mllib_regression.R b/R/pkg/R/mllib_regression.R index 3f509f08ffd1b..885ae4ee82253 100644 --- a/R/pkg/R/mllib_regression.R +++ b/R/pkg/R/mllib_regression.R @@ -106,7 +106,7 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"), regParam = 0.0, variancePower = 0.0, linkPower = 1.0 - variancePower) { if (is.character(family)) { - # family = "tweedie" + # Handle when family = "tweedie" if (tolower(family) == "tweedie") { family <- list(family = "tweedie", link = "linkNotUsed") } else { @@ -120,7 +120,7 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"), print(family) stop("'family' not recognized") } - # family = statmod::tweedie() + # Handle when family = statmod::tweedie() if (tolower(family$family) == "tweedie" && !is.null(family$variance)) { variancePower <- log(family$variance(exp(1))) linkPower <- log(family$linkfun(exp(1))) From 4cffc40f008641f59a17235ae67e4db0e975f03c Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Sun, 12 Mar 2017 18:11:18 -0700 Subject: [PATCH 27/28] change names of tweedie parameters to be consistent with R --- R/pkg/R/mllib_regression.R | 33 +++++++++++-------- .../tests/testthat/test_mllib_regression.R | 4 +-- R/pkg/vignettes/sparkr-vignettes.Rmd | 6 ++-- 3 files changed, 24 insertions(+), 19 deletions(-) diff --git a/R/pkg/R/mllib_regression.R b/R/pkg/R/mllib_regression.R index 885ae4ee82253..a526cf6f255d1 100644 --- a/R/pkg/R/mllib_regression.R +++ b/R/pkg/R/mllib_regression.R @@ -56,7 +56,7 @@ setClass("IsotonicRegressionModel", representation(jobj = "jobj")) #' \code{Gamma}, \code{poisson} and \code{tweedie}. #' #' Note that there are two ways to specify the tweedie family. -#' a) Set \code{family = "tweedie"} and specify the variancePower and linkPower +#' a) Set \code{family = "tweedie"} and specify the var.power and link.power #' b) When package \code{statmod} is loaded, the tweedie family is specified using the #' family definition therein, i.e., \code{tweedie(var.power, link.power)}. #' @param tol positive convergence tolerance of iterations. @@ -64,10 +64,10 @@ setClass("IsotonicRegressionModel", representation(jobj = "jobj")) #' @param weightCol the weight column name. If this is not set or \code{NULL}, we treat all instance #' weights as 1.0. #' @param regParam regularization parameter for L2 regularization. -#' @param variancePower the power in the variance function of the Tweedie distribution which provides +#' @param var.power the power in the variance function of the Tweedie distribution which provides #' the relationship between the variance and mean of the distribution. Only #' applicable to the Tweedie family. -#' @param linkPower the index in the power link function. Only applicable to the Tweedie family. +#' @param link.power the index in the power link function. Only applicable to the Tweedie family. #' @param ... additional arguments passed to the method. #' @aliases spark.glm,SparkDataFrame,formula-method #' @return \code{spark.glm} returns a fitted generalized linear model. @@ -96,19 +96,24 @@ setClass("IsotonicRegressionModel", representation(jobj = "jobj")) #' #' # fit tweedie model #' model <- spark.glm(df, Freq ~ Sex + Age, family = "tweedie", -#' variancePower = 1.2, linkPower = 0) +#' var.power = 1.2, link.power = 0) +#' summary(model) +#' +#' # use the tweedie family from statmod +#' library(statmod) +#' model <- spark.glm(df, Freq ~ Sex + Age, family = tweedie(1.2, 0)) #' summary(model) #' } #' @note spark.glm since 2.0.0 #' @seealso \link{glm}, \link{read.ml} setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"), function(data, formula, family = gaussian, tol = 1e-6, maxIter = 25, weightCol = NULL, - regParam = 0.0, variancePower = 0.0, linkPower = 1.0 - variancePower) { + regParam = 0.0, var.power = 0.0, link.power = 1.0 - var.power) { if (is.character(family)) { # Handle when family = "tweedie" if (tolower(family) == "tweedie") { - family <- list(family = "tweedie", link = "linkNotUsed") + family <- list(family = "tweedie", link = NULL) } else { family <- get(family, mode = "function", envir = parent.frame()) } @@ -122,9 +127,9 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"), } # Handle when family = statmod::tweedie() if (tolower(family$family) == "tweedie" && !is.null(family$variance)) { - variancePower <- log(family$variance(exp(1))) - linkPower <- log(family$linkfun(exp(1))) - family <- list(family = "tweedie", link = "linkNotUsed") + var.power <- log(family$variance(exp(1))) + link.power <- log(family$linkfun(exp(1))) + family <- list(family = "tweedie", link = NULL) } formula <- paste(deparse(formula), collapse = "") @@ -138,7 +143,7 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"), jobj <- callJStatic("org.apache.spark.ml.r.GeneralizedLinearRegressionWrapper", "fit", formula, data@sdf, tolower(family$family), family$link, tol, as.integer(maxIter), weightCol, regParam, - as.double(variancePower), as.double(linkPower)) + as.double(var.power), as.double(link.power)) new("GeneralizedLinearRegressionModel", jobj = jobj) }) @@ -158,8 +163,8 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"), #' weights as 1.0. #' @param epsilon positive convergence tolerance of iterations. #' @param maxit integer giving the maximal number of IRLS iterations. -#' @param variancePower the index of the power variance function in the Tweedie family. -#' @param linkPower the index of the power link function in the Tweedie family. +#' @param var.power the index of the power variance function in the Tweedie family. +#' @param link.power the index of the power link function in the Tweedie family. #' @return \code{glm} returns a fitted generalized linear model. #' @rdname glm #' @export @@ -175,9 +180,9 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"), #' @seealso \link{spark.glm} setMethod("glm", signature(formula = "formula", family = "ANY", data = "SparkDataFrame"), function(formula, family = gaussian, data, epsilon = 1e-6, maxit = 25, weightCol = NULL, - variancePower = 0.0, linkPower = 1.0 - variancePower) { + var.power = 0.0, link.power = 1.0 - var.power) { spark.glm(data, formula, family, tol = epsilon, maxIter = maxit, weightCol = weightCol, - variancePower = variancePower, linkPower = linkPower) + var.power = var.power, link.power = link.power) }) # Returns the summary of a model produced by glm() or spark.glm(), similarly to R's summary(). diff --git a/R/pkg/inst/tests/testthat/test_mllib_regression.R b/R/pkg/inst/tests/testthat/test_mllib_regression.R index 7500abc1343ad..3e9ad77198073 100644 --- a/R/pkg/inst/tests/testthat/test_mllib_regression.R +++ b/R/pkg/inst/tests/testthat/test_mllib_regression.R @@ -79,7 +79,7 @@ test_that("spark.glm and predict", { # tweedie family model <- spark.glm(training, Sepal_Width ~ Sepal_Length + Species, - family = "tweedie", variancePower = 1.2, linkPower = 0.0) + family = "tweedie", var.power = 1.2, link.power = 0.0) prediction <- predict(model, training) expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double") vals <- collect(select(prediction, "prediction")) @@ -269,7 +269,7 @@ test_that("glm and predict", { # tweedie family model <- glm(Sepal_Width ~ Sepal_Length + Species, data = training, - family = "tweedie", variancePower = 1.2, linkPower = 0.0) + family = "tweedie", var.power = 1.2, link.power = 0.0) prediction <- predict(model, training) expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double") vals <- collect(select(prediction, "prediction")) diff --git a/R/pkg/vignettes/sparkr-vignettes.Rmd b/R/pkg/vignettes/sparkr-vignettes.Rmd index 2cd31cf7b4de7..a6ff650c33fea 100644 --- a/R/pkg/vignettes/sparkr-vignettes.Rmd +++ b/R/pkg/vignettes/sparkr-vignettes.Rmd @@ -683,7 +683,7 @@ There are three ways to specify the `family` argument. * Result returned by a family function, e.g. `family = poisson(link = log)`. * Note that there are two ways to specify the tweedie family: - a) Set `family = "tweedie"` and specify the `variancePower` and `linkPower` + a) Set `family = "tweedie"` and specify the `var.power` and `link.power` b) When package `statmod` is loaded, the tweedie family is specified using the family definition therein, i.e., `tweedie()`. For more information regarding the families and their link functions, see the Wikipedia page [Generalized Linear Model](https://en.wikipedia.org/wiki/Generalized_linear_model). @@ -702,13 +702,13 @@ head(select(gaussianFitted, "model", "prediction", "mpg", "wt", "hp")) The following is the same fit using the tweedie family: ```{r} -tweedieGLM1 <- spark.glm(carsDF, mpg ~ wt + hp, family = "tweedie", variancePower = 0.0) +tweedieGLM1 <- spark.glm(carsDF, mpg ~ wt + hp, family = "tweedie", var.power = 0.0) summary(tweedieGLM1) ``` We can try other distributions in the tweedie family, for example, a compound Poisson distribution with a log link: ```{r} tweedieGLM2 <- spark.glm(carsDF, mpg ~ wt + hp, family = "tweedie", - variancePower = 1.2, linkPower = 0.0) + var.power = 1.2, link.power = 0.0) summary(tweedieGLM2) ``` From 0b496a6a8880dfa81a0c8c3b00577b7313f93fbc Mon Sep 17 00:00:00 2001 From: actuaryzhang Date: Sun, 12 Mar 2017 22:24:31 -0700 Subject: [PATCH 28/28] update doc --- R/pkg/R/mllib_regression.R | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/R/pkg/R/mllib_regression.R b/R/pkg/R/mllib_regression.R index a526cf6f255d1..d59c890f3e5fd 100644 --- a/R/pkg/R/mllib_regression.R +++ b/R/pkg/R/mllib_regression.R @@ -56,9 +56,11 @@ setClass("IsotonicRegressionModel", representation(jobj = "jobj")) #' \code{Gamma}, \code{poisson} and \code{tweedie}. #' #' Note that there are two ways to specify the tweedie family. -#' a) Set \code{family = "tweedie"} and specify the var.power and link.power -#' b) When package \code{statmod} is loaded, the tweedie family is specified using the -#' family definition therein, i.e., \code{tweedie(var.power, link.power)}. +#' \itemize{ +#' \item Set \code{family = "tweedie"} and specify the var.power and link.power; +#' \item When package \code{statmod} is loaded, the tweedie family is specified using the +#' family definition therein, i.e., \code{tweedie(var.power, link.power)}. +#' } #' @param tol positive convergence tolerance of iterations. #' @param maxIter integer giving the maximal number of IRLS iterations. #' @param weightCol the weight column name. If this is not set or \code{NULL}, we treat all instance