From 67364abee7aae34a093828e843de5ba9315b45a9 Mon Sep 17 00:00:00 2001
From: actuaryzhang <actuaryzhang10@gmail.com>
Date: Fri, 27 Jan 2017 11:56:49 -0800
Subject: [PATCH 01/28] start working on SparkR tweedie API

---
 R/pkg/R/mllib_regression.R                    | 20 +++++++++++++++++--
 .../GeneralizedLinearRegressionWrapper.scala  | 12 +++++++++--
 2 files changed, 28 insertions(+), 4 deletions(-)

diff --git a/R/pkg/R/mllib_regression.R b/R/pkg/R/mllib_regression.R
index 0e07d3bfd899c..2af0c81cc54ae 100644
--- a/R/pkg/R/mllib_regression.R
+++ b/R/pkg/R/mllib_regression.R
@@ -84,6 +84,12 @@ setClass("IsotonicRegressionModel", representation(jobj = "jobj"))
 #' # can also read back the saved model and print
 #' savedModel <- read.ml(path)
 #' summary(savedModel)
+#'
+#' # fit tweedie model
+#' library(statmod)
+#' model <- spark.glm(df, Sepal_Length ~ Sepal_Width, 
+#'   family = tweedie(var.power = 1.5, link.power = 0))
+#' summary(model)
 #' }
 #' @note spark.glm since 2.0.0
 #' @seealso \link{glm}, \link{read.ml}
@@ -100,6 +106,15 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"),
               print(family)
               stop("'family' not recognized")
             }
+            # recover variancePower and linkPower from specified tweedie family
+            if (tolower(family$family) == "tweedie") {
+              variancePower <- log(family$variance(exp(1)))
+              linkPower <- log(family$linkfun(exp(1)))
+            } else {
+              # these values won't be used if family is not tweedie
+              variancePower <- 0.0
+              linkPower <- 0.0
+            }
 
             formula <- paste(deparse(formula), collapse = "")
             if (is.null(weightCol)) {
@@ -109,7 +124,8 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"),
             # For known families, Gamma is upper-cased
             jobj <- callJStatic("org.apache.spark.ml.r.GeneralizedLinearRegressionWrapper",
                                 "fit", formula, data@sdf, tolower(family$family), family$link,
-                                tol, as.integer(maxIter), as.character(weightCol), regParam)
+                                tol, as.integer(maxIter), as.character(weightCol), regParam,
+                                as.double(variancePower), as.double(linkPower))
             new("GeneralizedLinearRegressionModel", jobj = jobj)
           })
 
@@ -124,7 +140,7 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"),
 #'               the result of a call to a family function. Refer R family at
 #'               \url{https://stat.ethz.ch/R-manual/R-devel/library/stats/html/family.html}.
 #'               Currently these families are supported: \code{binomial}, \code{gaussian},
-#'               \code{Gamma}, and \code{poisson}.
+#'               \code{poisson}, \code{Gamma}, and \code{tweedie} (\code{statmod} package).
 #' @param weightCol the weight column name. If this is not set or \code{NULL}, we treat all instance
 #'                  weights as 1.0.
 #' @param epsilon positive convergence tolerance of iterations.
diff --git a/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala b/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala
index 78f401f29b004..9586f2d1ffdb4 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala
@@ -71,7 +71,9 @@ private[r] object GeneralizedLinearRegressionWrapper
       tol: Double,
       maxIter: Int,
       weightCol: String,
-      regParam: Double): GeneralizedLinearRegressionWrapper = {
+      regParam: Double,
+      variancePower: Double,
+      linkPower: Double): GeneralizedLinearRegressionWrapper = {
     val rFormula = new RFormula().setFormula(formula)
     checkDataColumns(rFormula, data)
     val rFormulaModel = rFormula.fit(data)
@@ -81,7 +83,7 @@ private[r] object GeneralizedLinearRegressionWrapper
       .attributes.get
     val features = featureAttrs.map(_.name.get)
     // assemble and fit the pipeline
-    val glr = new GeneralizedLinearRegression()
+    var glr = new GeneralizedLinearRegression()
       .setFamily(family)
       .setLink(link)
       .setFitIntercept(rFormula.hasIntercept)
@@ -90,6 +92,12 @@ private[r] object GeneralizedLinearRegressionWrapper
       .setWeightCol(weightCol)
       .setRegParam(regParam)
       .setFeaturesCol(rFormula.getFeaturesCol)
+    // set variancePower and linkPower in tweedie family and clear link
+    if (family.toLowerCase == "tweedie") {
+      glr = glr.clear(glr.link)
+        .setVariancePower(variancePower)
+        .setLinkPower(linkPower)
+    }
     val pipeline = new Pipeline()
       .setStages(Array(rFormulaModel, glr))
       .fit(data)

From 654551b207315b33fce00f98706360934f09c55a Mon Sep 17 00:00:00 2001
From: actuaryzhang <actuaryzhang10@gmail.com>
Date: Fri, 27 Jan 2017 16:10:54 -0800
Subject: [PATCH 02/28] set link only for non-tweedie; fix issue on aic

---
 R/pkg/R/mllib_regression.R                       | 10 ++++++----
 .../r/GeneralizedLinearRegressionWrapper.scala   | 16 ++++++++++------
 2 files changed, 16 insertions(+), 10 deletions(-)

diff --git a/R/pkg/R/mllib_regression.R b/R/pkg/R/mllib_regression.R
index 2af0c81cc54ae..554afc2d610e1 100644
--- a/R/pkg/R/mllib_regression.R
+++ b/R/pkg/R/mllib_regression.R
@@ -106,14 +106,15 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"),
               print(family)
               stop("'family' not recognized")
             }
-            # recover variancePower and linkPower from specified tweedie family
+            
+            # recover variancePower and linkPower from the specified tweedie family
             if (tolower(family$family) == "tweedie") {
               variancePower <- log(family$variance(exp(1)))
               linkPower <- log(family$linkfun(exp(1)))
             } else {
-              # these values won't be used if family is not tweedie
+              # these default values are not used
               variancePower <- 0.0
-              linkPower <- 0.0
+              linkPower <- 1.0
             }
 
             formula <- paste(deparse(formula), collapse = "")
@@ -186,9 +187,10 @@ setMethod("summary", signature(object = "GeneralizedLinearRegressionModel"),
             deviance <- callJMethod(jobj, "rDeviance")
             df.null <- callJMethod(jobj, "rResidualDegreeOfFreedomNull")
             df.residual <- callJMethod(jobj, "rResidualDegreeOfFreedom")
-            aic <- callJMethod(jobj, "rAic")
             iter <- callJMethod(jobj, "rNumIterations")
             family <- callJMethod(jobj, "rFamily")
+            aic <- callJMethod(jobj, "rAic")
+            if (family == "tweedie" && aic == 0) aic <- NA
             deviance.resid <- if (is.loaded) {
               NULL
             } else {
diff --git a/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala b/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala
index 9586f2d1ffdb4..6b5867e488d9f 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala
@@ -85,18 +85,17 @@ private[r] object GeneralizedLinearRegressionWrapper
     // assemble and fit the pipeline
     var glr = new GeneralizedLinearRegression()
       .setFamily(family)
-      .setLink(link)
       .setFitIntercept(rFormula.hasIntercept)
       .setTol(tol)
       .setMaxIter(maxIter)
       .setWeightCol(weightCol)
       .setRegParam(regParam)
       .setFeaturesCol(rFormula.getFeaturesCol)
-    // set variancePower and linkPower in tweedie family and clear link
+    // set variancePower and linkPower if family is tweedie; otherwise, set link function
     if (family.toLowerCase == "tweedie") {
-      glr = glr.clear(glr.link)
-        .setVariancePower(variancePower)
-        .setLinkPower(linkPower)
+      glr = glr.setVariancePower(variancePower).setLinkPower(linkPower)
+    } else {
+      glr = glr.setLink(link)
     }
     val pipeline = new Pipeline()
       .setStages(Array(rFormulaModel, glr))
@@ -151,7 +150,12 @@ private[r] object GeneralizedLinearRegressionWrapper
     val rDeviance: Double = summary.deviance
     val rResidualDegreeOfFreedomNull: Long = summary.residualDegreeOfFreedomNull
     val rResidualDegreeOfFreedom: Long = summary.residualDegreeOfFreedom
-    val rAic: Double = summary.aic
+    val rAic: Double = if (family.toLowerCase == "tweedie" &&
+      !Array(0.0, 1.0, 2.0).contains(variancePower)) {
+      0.0
+    } else {
+      summary.aic
+    }
     val rNumIterations: Int = summary.numIterations
 
     new GeneralizedLinearRegressionWrapper(pipeline, rFeatures, rCoefficients, rDispersion,

From 852dd6eec9ee960487f878dc332ea36acfd6102e Mon Sep 17 00:00:00 2001
From: actuaryzhang <actuaryzhang10@gmail.com>
Date: Sat, 28 Jan 2017 12:05:57 -0800
Subject: [PATCH 03/28] add test for tweedie

---
 R/pkg/DESCRIPTION                                 |  3 ++-
 R/pkg/R/mllib_regression.R                        |  4 ++--
 R/pkg/inst/tests/testthat/test_mllib_regression.R | 12 ++++++++++++
 3 files changed, 16 insertions(+), 3 deletions(-)

diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION
index cc471edc376b3..af4bd5c3977d9 100644
--- a/R/pkg/DESCRIPTION
+++ b/R/pkg/DESCRIPTION
@@ -21,7 +21,8 @@ Suggests:
     rmarkdown,
     testthat,
     e1071,
-    survival
+    survival,
+    statmod
 Collate:
     'schema.R'
     'generics.R'
diff --git a/R/pkg/R/mllib_regression.R b/R/pkg/R/mllib_regression.R
index 554afc2d610e1..6d9c0f4d45616 100644
--- a/R/pkg/R/mllib_regression.R
+++ b/R/pkg/R/mllib_regression.R
@@ -86,9 +86,9 @@ setClass("IsotonicRegressionModel", representation(jobj = "jobj"))
 #' summary(savedModel)
 #'
 #' # fit tweedie model
-#' library(statmod)
+#' require(statmod)
 #' model <- spark.glm(df, Sepal_Length ~ Sepal_Width, 
-#'   family = tweedie(var.power = 1.5, link.power = 0))
+#'   family = tweedie(var.power = 1.2, link.power = 0))
 #' summary(model)
 #' }
 #' @note spark.glm since 2.0.0
diff --git a/R/pkg/inst/tests/testthat/test_mllib_regression.R b/R/pkg/inst/tests/testthat/test_mllib_regression.R
index c450a151713ff..32ae0f0368b87 100644
--- a/R/pkg/inst/tests/testthat/test_mllib_regression.R
+++ b/R/pkg/inst/tests/testthat/test_mllib_regression.R
@@ -77,6 +77,18 @@ test_that("spark.glm and predict", {
   out <- capture.output(print(summary(model)))
   expect_true(any(grepl("Dispersion parameter for gamma family", out)))
 
+  # tweedie family
+  require(statmod)
+  model <- spark.glm(training, Sepal_Width ~ Sepal_Length + Species,
+                     family = tweedie(var.power = 1.2, link.power = 1.0))
+  prediction <- predict(model, training)
+  expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double")
+  vals <- collect(select(prediction, "prediction"))
+  rVals <- suppressWarnings(predict(
+    glm(Sepal.Width ~ Sepal.Length + Species, data = iris, 
+        family = tweedie(var.power = 1.2, link.power = 1.0)), iris))
+  expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals) 
+  
   # Test stats::predict is working
   x <- rnorm(15)
   y <- x + rnorm(15)

From 5aa4ae79b62cd2039f8024f440c41f209a751090 Mon Sep 17 00:00:00 2001
From: actuaryzhang <actuaryzhang10@gmail.com>
Date: Sat, 28 Jan 2017 12:18:06 -0800
Subject: [PATCH 04/28] fix style

---
 R/pkg/inst/tests/testthat/test_mllib_regression.R | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/R/pkg/inst/tests/testthat/test_mllib_regression.R b/R/pkg/inst/tests/testthat/test_mllib_regression.R
index 32ae0f0368b87..3a4bff02c2574 100644
--- a/R/pkg/inst/tests/testthat/test_mllib_regression.R
+++ b/R/pkg/inst/tests/testthat/test_mllib_regression.R
@@ -85,9 +85,9 @@ test_that("spark.glm and predict", {
   expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double")
   vals <- collect(select(prediction, "prediction"))
   rVals <- suppressWarnings(predict(
-    glm(Sepal.Width ~ Sepal.Length + Species, data = iris, 
+    glm(Sepal.Width ~ Sepal.Length + Species, data = iris,
         family = tweedie(var.power = 1.2, link.power = 1.0)), iris))
-  expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals) 
+  expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
   
   # Test stats::predict is working
   x <- rnorm(15)

From 3682692752c1c332e29d2a92ca7fca23d8fbe2bc Mon Sep 17 00:00:00 2001
From: actuaryzhang <actuaryzhang10@gmail.com>
Date: Sat, 28 Jan 2017 13:26:04 -0800
Subject: [PATCH 05/28] fix style issue

---
 R/pkg/R/mllib_regression.R                        | 2 +-
 R/pkg/inst/tests/testthat/test_mllib_regression.R | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/R/pkg/R/mllib_regression.R b/R/pkg/R/mllib_regression.R
index 6d9c0f4d45616..e901088d3b5ea 100644
--- a/R/pkg/R/mllib_regression.R
+++ b/R/pkg/R/mllib_regression.R
@@ -106,7 +106,7 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"),
               print(family)
               stop("'family' not recognized")
             }
-            
+
             # recover variancePower and linkPower from the specified tweedie family
             if (tolower(family$family) == "tweedie") {
               variancePower <- log(family$variance(exp(1)))
diff --git a/R/pkg/inst/tests/testthat/test_mllib_regression.R b/R/pkg/inst/tests/testthat/test_mllib_regression.R
index 3a4bff02c2574..79ada06a4da56 100644
--- a/R/pkg/inst/tests/testthat/test_mllib_regression.R
+++ b/R/pkg/inst/tests/testthat/test_mllib_regression.R
@@ -88,7 +88,7 @@ test_that("spark.glm and predict", {
     glm(Sepal.Width ~ Sepal.Length + Species, data = iris,
         family = tweedie(var.power = 1.2, link.power = 1.0)), iris))
   expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
-  
+
   # Test stats::predict is working
   x <- rnorm(15)
   y <- x + rnorm(15)

From 3555afb370356f92e825f8dd5cbdb2f55baaac77 Mon Sep 17 00:00:00 2001
From: actuaryzhang <actuaryzhang10@gmail.com>
Date: Sat, 28 Jan 2017 15:29:23 -0800
Subject: [PATCH 06/28] remove dependency on statmod

---
 R/pkg/R/mllib_regression.R                    | 33 ++++++++++---------
 .../tests/testthat/test_mllib_regression.R    | 14 +++++---
 2 files changed, 26 insertions(+), 21 deletions(-)

diff --git a/R/pkg/R/mllib_regression.R b/R/pkg/R/mllib_regression.R
index e901088d3b5ea..5c1db4c290076 100644
--- a/R/pkg/R/mllib_regression.R
+++ b/R/pkg/R/mllib_regression.R
@@ -53,12 +53,19 @@ setClass("IsotonicRegressionModel", representation(jobj = "jobj"))
 #'               the result of a call to a family function. Refer R family at
 #'               \url{https://stat.ethz.ch/R-manual/R-devel/library/stats/html/family.html}.
 #'               Currently these families are supported: \code{binomial}, \code{gaussian},
-#'               \code{Gamma}, and \code{poisson}.
+#'               \code{Gamma}, \code{poisson} and code{tweedie}.
+#'               The tweedie family must be specified using character string \code{"tweedie"}. The family
+#'               function \code{tweedie} in the \code{statmod} package is not supported.
 #' @param tol positive convergence tolerance of iterations.
 #' @param maxIter integer giving the maximal number of IRLS iterations.
 #' @param weightCol the weight column name. If this is not set or \code{NULL}, we treat all instance
 #'                  weights as 1.0.
 #' @param regParam regularization parameter for L2 regularization.
+#' @param variancePower the power in the variance function of the Tweedie distribution which provides
+#'                      the relationship between the variance and mean of the distribution. Refer to
+#'                      \code{\link[statmod]{tweedie}} for details.
+#' @param linkPower the index in the power link function. Only applicable for the Tweedie family.
+#'                  Refer to \code{\link[statmod]{tweedie}} for details.
 #' @param ... additional arguments passed to the method.
 #' @aliases spark.glm,SparkDataFrame,formula-method
 #' @return \code{spark.glm} returns a fitted generalized linear model.
@@ -86,18 +93,22 @@ setClass("IsotonicRegressionModel", representation(jobj = "jobj"))
 #' summary(savedModel)
 #'
 #' # fit tweedie model
-#' require(statmod)
-#' model <- spark.glm(df, Sepal_Length ~ Sepal_Width, 
-#'   family = tweedie(var.power = 1.2, link.power = 0))
+#' model <- spark.glm(df, Sepal_Length ~ Sepal_Width, family = "tweedie",
+#'   variancePower = 1.2, linkPower = 0)
 #' summary(model)
 #' }
 #' @note spark.glm since 2.0.0
 #' @seealso \link{glm}, \link{read.ml}
 setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"),
           function(data, formula, family = gaussian, tol = 1e-6, maxIter = 25, weightCol = NULL,
-                   regParam = 0.0) {
+                   regParam = 0.0, variancePower = 0.0, linkPower = 1.0) {
             if (is.character(family)) {
-              family <- get(family, mode = "function", envir = parent.frame())
+              # recover variancePower and linkPower from the specified tweedie family
+              if (tolower(family) == "tweedie") {
+                family <- list(family = "tweedie", link = "identity")
+              } else {
+                family <- get(family, mode = "function", envir = parent.frame())
+              }
             }
             if (is.function(family)) {
               family <- family()
@@ -107,16 +118,6 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"),
               stop("'family' not recognized")
             }
 
-            # recover variancePower and linkPower from the specified tweedie family
-            if (tolower(family$family) == "tweedie") {
-              variancePower <- log(family$variance(exp(1)))
-              linkPower <- log(family$linkfun(exp(1)))
-            } else {
-              # these default values are not used
-              variancePower <- 0.0
-              linkPower <- 1.0
-            }
-
             formula <- paste(deparse(formula), collapse = "")
             if (is.null(weightCol)) {
               weightCol <- ""
diff --git a/R/pkg/inst/tests/testthat/test_mllib_regression.R b/R/pkg/inst/tests/testthat/test_mllib_regression.R
index 79ada06a4da56..5225cad523ce6 100644
--- a/R/pkg/inst/tests/testthat/test_mllib_regression.R
+++ b/R/pkg/inst/tests/testthat/test_mllib_regression.R
@@ -78,15 +78,19 @@ test_that("spark.glm and predict", {
   expect_true(any(grepl("Dispersion parameter for gamma family", out)))
 
   # tweedie family
-  require(statmod)
   model <- spark.glm(training, Sepal_Width ~ Sepal_Length + Species,
-                     family = tweedie(var.power = 1.2, link.power = 1.0))
+                     family = "tweedie", variancePower = 1.2, linkPower = 1.0)
   prediction <- predict(model, training)
   expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double")
   vals <- collect(select(prediction, "prediction"))
-  rVals <- suppressWarnings(predict(
-    glm(Sepal.Width ~ Sepal.Length + Species, data = iris,
-        family = tweedie(var.power = 1.2, link.power = 1.0)), iris))
+  # manual calculation of the R predicted values to avoid dependence on statmod
+  # library(statmod)
+  # rModel <- glm(Sepal.Width ~ Sepal.Length + Species, data = iris,
+  #             family = tweedie(var.power = 1.2, link.power = 1.0))
+  # print(coef(rModel))
+  rCoef <- c(1.7009682, 0.3436700, -0.9703189, -0.9852648)
+  rVals <- as.numeric(model.matrix(lm(Sepal.Width ~ Sepal.Length + Species,
+                                      data = iris)) %*% rCoef)
   expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
 
   # Test stats::predict is working

From 56f6da0207b37da46e5fe3ae4b0b65951daba49f Mon Sep 17 00:00:00 2001
From: actuaryzhang <actuaryzhang10@gmail.com>
Date: Sat, 28 Jan 2017 16:31:23 -0800
Subject: [PATCH 07/28] create model matix directly from formula

---
 R/pkg/inst/tests/testthat/test_mllib_regression.R | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/R/pkg/inst/tests/testthat/test_mllib_regression.R b/R/pkg/inst/tests/testthat/test_mllib_regression.R
index 5225cad523ce6..4bbf181ede381 100644
--- a/R/pkg/inst/tests/testthat/test_mllib_regression.R
+++ b/R/pkg/inst/tests/testthat/test_mllib_regression.R
@@ -89,8 +89,8 @@ test_that("spark.glm and predict", {
   #             family = tweedie(var.power = 1.2, link.power = 1.0))
   # print(coef(rModel))
   rCoef <- c(1.7009682, 0.3436700, -0.9703189, -0.9852648)
-  rVals <- as.numeric(model.matrix(lm(Sepal.Width ~ Sepal.Length + Species,
-                                      data = iris)) %*% rCoef)
+  rVals <- as.numeric(model.matrix(Sepal.Width ~ Sepal.Length + Species,
+                                   data = iris) %*% rCoef)
   expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
 
   # Test stats::predict is working

From 083849c4bcfbe36a08968208b9b6c451bdbd03f5 Mon Sep 17 00:00:00 2001
From: actuaryzhang <actuaryzhang10@gmail.com>
Date: Sun, 29 Jan 2017 12:10:07 -0800
Subject: [PATCH 08/28] update glmWrapper

---
 .../spark/ml/r/GeneralizedLinearRegressionWrapper.scala   | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala b/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala
index 6b5867e488d9f..7b66d8046f3ea 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala
@@ -83,7 +83,7 @@ private[r] object GeneralizedLinearRegressionWrapper
       .attributes.get
     val features = featureAttrs.map(_.name.get)
     // assemble and fit the pipeline
-    var glr = new GeneralizedLinearRegression()
+    val glr = new GeneralizedLinearRegression()
       .setFamily(family)
       .setFitIntercept(rFormula.hasIntercept)
       .setTol(tol)
@@ -93,9 +93,9 @@ private[r] object GeneralizedLinearRegressionWrapper
       .setFeaturesCol(rFormula.getFeaturesCol)
     // set variancePower and linkPower if family is tweedie; otherwise, set link function
     if (family.toLowerCase == "tweedie") {
-      glr = glr.setVariancePower(variancePower).setLinkPower(linkPower)
+      glr.setVariancePower(variancePower).setLinkPower(linkPower)
     } else {
-      glr = glr.setLink(link)
+      glr.setLink(link)
     }
     val pipeline = new Pipeline()
       .setStages(Array(rFormulaModel, glr))
@@ -151,7 +151,7 @@ private[r] object GeneralizedLinearRegressionWrapper
     val rResidualDegreeOfFreedomNull: Long = summary.residualDegreeOfFreedomNull
     val rResidualDegreeOfFreedom: Long = summary.residualDegreeOfFreedom
     val rAic: Double = if (family.toLowerCase == "tweedie" &&
-      !Array(0.0, 1.0, 2.0).contains(variancePower)) {
+      !Array(0.0, 1.0, 2.0).exists(x => math.abs(x - variancePower) < 1e-8)) {
       0.0
     } else {
       summary.aic

From fb66ce0bf2e4711a2679f0fef13eccdb27bb313c Mon Sep 17 00:00:00 2001
From: actuaryzhang <actuaryzhang10@gmail.com>
Date: Sun, 29 Jan 2017 12:53:57 -0800
Subject: [PATCH 09/28] add comments

---
 R/pkg/R/mllib_regression.R | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/R/pkg/R/mllib_regression.R b/R/pkg/R/mllib_regression.R
index 5c1db4c290076..5638df1742ef3 100644
--- a/R/pkg/R/mllib_regression.R
+++ b/R/pkg/R/mllib_regression.R
@@ -101,9 +101,10 @@ setClass("IsotonicRegressionModel", representation(jobj = "jobj"))
 #' @seealso \link{glm}, \link{read.ml}
 setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"),
           function(data, formula, family = gaussian, tol = 1e-6, maxIter = 25, weightCol = NULL,
-                   regParam = 0.0, variancePower = 0.0, linkPower = 1.0) {
+                   regParam = 0.0, variancePower = 0.0, linkPower = 1.0 - variancePower) {
             if (is.character(family)) {
-              # recover variancePower and linkPower from the specified tweedie family
+              # create a pseudo family for tweedie to avoid family validation
+              # only family name, variancePower and linkPower are used for the tweedie family
               if (tolower(family) == "tweedie") {
                 family <- list(family = "tweedie", link = "identity")
               } else {

From 0d722fd6fe608fc6362aed0c0610b93e9afb7811 Mon Sep 17 00:00:00 2001
From: actuaryzhang <actuaryzhang10@gmail.com>
Date: Sun, 29 Jan 2017 13:12:38 -0800
Subject: [PATCH 10/28] fix style issue

---
 R/pkg/inst/tests/testthat/test_mllib_regression.R | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/R/pkg/inst/tests/testthat/test_mllib_regression.R b/R/pkg/inst/tests/testthat/test_mllib_regression.R
index 4bbf181ede381..fd90de6ad611c 100644
--- a/R/pkg/inst/tests/testthat/test_mllib_regression.R
+++ b/R/pkg/inst/tests/testthat/test_mllib_regression.R
@@ -83,11 +83,13 @@ test_that("spark.glm and predict", {
   prediction <- predict(model, training)
   expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double")
   vals <- collect(select(prediction, "prediction"))
+
   # manual calculation of the R predicted values to avoid dependence on statmod
-  # library(statmod)
-  # rModel <- glm(Sepal.Width ~ Sepal.Length + Species, data = iris,
-  #             family = tweedie(var.power = 1.2, link.power = 1.0))
-  # print(coef(rModel))
+  #' library(statmod)
+  #' rModel <- glm(Sepal.Width ~ Sepal.Length + Species, data = iris,
+  #'             family = tweedie(var.power = 1.2, link.power = 1.0))
+  #' print(coef(rModel))
+
   rCoef <- c(1.7009682, 0.3436700, -0.9703189, -0.9852648)
   rVals <- as.numeric(model.matrix(Sepal.Width ~ Sepal.Length + Species,
                                    data = iris) %*% rCoef)

From d11fc4b90ffabb2c1cb2a72a9cb536430c8e2bd5 Mon Sep 17 00:00:00 2001
From: actuaryzhang <actuaryzhang10@gmail.com>
Date: Sun, 29 Jan 2017 15:15:31 -0800
Subject: [PATCH 11/28] remove statmod from suggest; update glm

---
 R/pkg/DESCRIPTION          | 3 +--
 R/pkg/R/mllib_regression.R | 8 +++++---
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION
index af4bd5c3977d9..cc471edc376b3 100644
--- a/R/pkg/DESCRIPTION
+++ b/R/pkg/DESCRIPTION
@@ -21,8 +21,7 @@ Suggests:
     rmarkdown,
     testthat,
     e1071,
-    survival,
-    statmod
+    survival
 Collate:
     'schema.R'
     'generics.R'
diff --git a/R/pkg/R/mllib_regression.R b/R/pkg/R/mllib_regression.R
index 5638df1742ef3..618dfeb9f6a56 100644
--- a/R/pkg/R/mllib_regression.R
+++ b/R/pkg/R/mllib_regression.R
@@ -143,7 +143,7 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"),
 #'               the result of a call to a family function. Refer R family at
 #'               \url{https://stat.ethz.ch/R-manual/R-devel/library/stats/html/family.html}.
 #'               Currently these families are supported: \code{binomial}, \code{gaussian},
-#'               \code{poisson}, \code{Gamma}, and \code{tweedie} (\code{statmod} package).
+#'               \code{poisson}, \code{Gamma}, and \code{"tweedie"}.
 #' @param weightCol the weight column name. If this is not set or \code{NULL}, we treat all instance
 #'                  weights as 1.0.
 #' @param epsilon positive convergence tolerance of iterations.
@@ -162,8 +162,10 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"),
 #' @note glm since 1.5.0
 #' @seealso \link{spark.glm}
 setMethod("glm", signature(formula = "formula", family = "ANY", data = "SparkDataFrame"),
-          function(formula, family = gaussian, data, epsilon = 1e-6, maxit = 25, weightCol = NULL) {
-            spark.glm(data, formula, family, tol = epsilon, maxIter = maxit, weightCol = weightCol)
+          function(formula, family = gaussian, data, epsilon = 1e-6, maxit = 25, weightCol = NULL,
+                   variancePower = 0.0, linkPower = 1.0 - variancePower) {
+            spark.glm(data, formula, family, tol = epsilon, maxIter = maxit, weightCol = weightCol,
+                      variancePower = 0.0, linkPower = linkPower)
           })
 
 #  Returns the summary of a model produced by glm() or spark.glm(), similarly to R's summary().

From 4c24158c442ab54f5acb777612cbadf22b5e7dcd Mon Sep 17 00:00:00 2001
From: actuaryzhang <actuaryzhang10@gmail.com>
Date: Sun, 29 Jan 2017 15:31:08 -0800
Subject: [PATCH 12/28] clean up doc

---
 R/pkg/R/mllib_regression.R | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/R/pkg/R/mllib_regression.R b/R/pkg/R/mllib_regression.R
index 618dfeb9f6a56..c9c38bc75a617 100644
--- a/R/pkg/R/mllib_regression.R
+++ b/R/pkg/R/mllib_regression.R
@@ -53,7 +53,7 @@ setClass("IsotonicRegressionModel", representation(jobj = "jobj"))
 #'               the result of a call to a family function. Refer R family at
 #'               \url{https://stat.ethz.ch/R-manual/R-devel/library/stats/html/family.html}.
 #'               Currently these families are supported: \code{binomial}, \code{gaussian},
-#'               \code{Gamma}, \code{poisson} and code{tweedie}.
+#'               \code{Gamma}, \code{poisson} and \code{"tweedie"}.
 #'               The tweedie family must be specified using character string \code{"tweedie"}. The family
 #'               function \code{tweedie} in the \code{statmod} package is not supported.
 #' @param tol positive convergence tolerance of iterations.
@@ -148,6 +148,8 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"),
 #'                  weights as 1.0.
 #' @param epsilon positive convergence tolerance of iterations.
 #' @param maxit integer giving the maximal number of IRLS iterations.
+#' @param variancePower the index of the power variance function in the Tweedie family.
+#' @param linkPower the index of the power link function in the Tweedie family.
 #' @return \code{glm} returns a fitted generalized linear model.
 #' @rdname glm
 #' @export

From 295711d313dd4e6a7a728a21261a5109b7e6e4a9 Mon Sep 17 00:00:00 2001
From: actuaryzhang <actuaryzhang10@gmail.com>
Date: Sun, 29 Jan 2017 18:55:40 -0800
Subject: [PATCH 13/28] remove link to statmod

---
 R/pkg/R/mllib_regression.R | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/R/pkg/R/mllib_regression.R b/R/pkg/R/mllib_regression.R
index c9c38bc75a617..fd3d59bd7bde6 100644
--- a/R/pkg/R/mllib_regression.R
+++ b/R/pkg/R/mllib_regression.R
@@ -63,9 +63,9 @@ setClass("IsotonicRegressionModel", representation(jobj = "jobj"))
 #' @param regParam regularization parameter for L2 regularization.
 #' @param variancePower the power in the variance function of the Tweedie distribution which provides
 #'                      the relationship between the variance and mean of the distribution. Refer to
-#'                      \code{\link[statmod]{tweedie}} for details.
+#'                      \code{tweedie} in package \code{statmod} for details.
 #' @param linkPower the index in the power link function. Only applicable for the Tweedie family.
-#'                  Refer to \code{\link[statmod]{tweedie}} for details.
+#'                  Refer to \code{tweedie} in package \code{statmod} for details.
 #' @param ... additional arguments passed to the method.
 #' @aliases spark.glm,SparkDataFrame,formula-method
 #' @return \code{spark.glm} returns a fitted generalized linear model.

From c315fb1da788f1e8479b730e22cf3d3bbf28fba7 Mon Sep 17 00:00:00 2001
From: actuaryzhang <actuaryzhang10@gmail.com>
Date: Tue, 31 Jan 2017 21:35:06 -0800
Subject: [PATCH 14/28] allow R-like tweedie specification in family

---
 R/pkg/R/mllib_regression.R                    | 54 +++++++++++--------
 .../tests/testthat/test_mllib_regression.R    | 20 ++++++-
 2 files changed, 50 insertions(+), 24 deletions(-)

diff --git a/R/pkg/R/mllib_regression.R b/R/pkg/R/mllib_regression.R
index fd3d59bd7bde6..621f77bb64c5b 100644
--- a/R/pkg/R/mllib_regression.R
+++ b/R/pkg/R/mllib_regression.R
@@ -53,19 +53,15 @@ setClass("IsotonicRegressionModel", representation(jobj = "jobj"))
 #'               the result of a call to a family function. Refer R family at
 #'               \url{https://stat.ethz.ch/R-manual/R-devel/library/stats/html/family.html}.
 #'               Currently these families are supported: \code{binomial}, \code{gaussian},
-#'               \code{Gamma}, \code{poisson} and \code{"tweedie"}.
-#'               The tweedie family must be specified using character string \code{"tweedie"}. The family
-#'               function \code{tweedie} in the \code{statmod} package is not supported.
+#'               \code{Gamma}, \code{poisson} and \code{tweedie}.
+#'               The tweedie family is specified in a similar way as in the \code{statmod} package,
+#'               e.g., \code{tweedie(var.power = 0, link.power = 1)} is gaussian with idenity link and
+#'               \code{tweedie(1, 0)} is poisson with log link.
 #' @param tol positive convergence tolerance of iterations.
 #' @param maxIter integer giving the maximal number of IRLS iterations.
 #' @param weightCol the weight column name. If this is not set or \code{NULL}, we treat all instance
 #'                  weights as 1.0.
 #' @param regParam regularization parameter for L2 regularization.
-#' @param variancePower the power in the variance function of the Tweedie distribution which provides
-#'                      the relationship between the variance and mean of the distribution. Refer to
-#'                      \code{tweedie} in package \code{statmod} for details.
-#' @param linkPower the index in the power link function. Only applicable for the Tweedie family.
-#'                  Refer to \code{tweedie} in package \code{statmod} for details.
 #' @param ... additional arguments passed to the method.
 #' @aliases spark.glm,SparkDataFrame,formula-method
 #' @return \code{spark.glm} returns a fitted generalized linear model.
@@ -93,20 +89,26 @@ setClass("IsotonicRegressionModel", representation(jobj = "jobj"))
 #' summary(savedModel)
 #'
 #' # fit tweedie model
-#' model <- spark.glm(df, Sepal_Length ~ Sepal_Width, family = "tweedie",
-#'   variancePower = 1.2, linkPower = 0)
+#' model <- spark.glm(df, Sepal_Length ~ Sepal_Width, family = tweedie(1.2, 0.0))
 #' summary(model)
 #' }
 #' @note spark.glm since 2.0.0
 #' @seealso \link{glm}, \link{read.ml}
 setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"),
           function(data, formula, family = gaussian, tol = 1e-6, maxIter = 25, weightCol = NULL,
-                   regParam = 0.0, variancePower = 0.0, linkPower = 1.0 - variancePower) {
+                   regParam = 0.0) {
+
+            # create pseudo tweedie family to avoid dependence on statmod
+            tweedie <- function(var.power = 0.0, link.power = 1.0 - var.power) {
+              list(family = "tweedie",
+                   variance = function(mu) mu^var.power,
+                   linkfun = function(mu) mu^link.power)
+            }
+            family <- eval(substitute(family))
+
             if (is.character(family)) {
-              # create a pseudo family for tweedie to avoid family validation
-              # only family name, variancePower and linkPower are used for the tweedie family
-              if (tolower(family) == "tweedie") {
-                family <- list(family = "tweedie", link = "identity")
+              if (family == "tweedie") {
+                family <- tweedie
               } else {
                 family <- get(family, mode = "function", envir = parent.frame())
               }
@@ -124,11 +126,21 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"),
               weightCol <- ""
             }
 
+            # recover variancePower and linkPower from the specified tweedie family
+            if (tolower(family$family) == "tweedie") {
+              variancePower <- log(family$variance(exp(1)))
+              linkPower <- log(family$linkfun(exp(1)))
+            } else {
+              # these default values are not used
+              variancePower <- 0.0
+              linkPower <- 1.0
+            }
+
             # For known families, Gamma is upper-cased
             jobj <- callJStatic("org.apache.spark.ml.r.GeneralizedLinearRegressionWrapper",
                                 "fit", formula, data@sdf, tolower(family$family), family$link,
                                 tol, as.integer(maxIter), as.character(weightCol), regParam,
-                                as.double(variancePower), as.double(linkPower))
+                                variancePower, linkPower)
             new("GeneralizedLinearRegressionModel", jobj = jobj)
           })
 
@@ -143,13 +155,11 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"),
 #'               the result of a call to a family function. Refer R family at
 #'               \url{https://stat.ethz.ch/R-manual/R-devel/library/stats/html/family.html}.
 #'               Currently these families are supported: \code{binomial}, \code{gaussian},
-#'               \code{poisson}, \code{Gamma}, and \code{"tweedie"}.
+#'               \code{poisson}, \code{Gamma}, and \code{tweedie}.
 #' @param weightCol the weight column name. If this is not set or \code{NULL}, we treat all instance
 #'                  weights as 1.0.
 #' @param epsilon positive convergence tolerance of iterations.
 #' @param maxit integer giving the maximal number of IRLS iterations.
-#' @param variancePower the index of the power variance function in the Tweedie family.
-#' @param linkPower the index of the power link function in the Tweedie family.
 #' @return \code{glm} returns a fitted generalized linear model.
 #' @rdname glm
 #' @export
@@ -164,10 +174,8 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"),
 #' @note glm since 1.5.0
 #' @seealso \link{spark.glm}
 setMethod("glm", signature(formula = "formula", family = "ANY", data = "SparkDataFrame"),
-          function(formula, family = gaussian, data, epsilon = 1e-6, maxit = 25, weightCol = NULL,
-                   variancePower = 0.0, linkPower = 1.0 - variancePower) {
-            spark.glm(data, formula, family, tol = epsilon, maxIter = maxit, weightCol = weightCol,
-                      variancePower = 0.0, linkPower = linkPower)
+          function(formula, family = gaussian, data, epsilon = 1e-6, maxit = 25, weightCol = NULL) {
+            spark.glm(data, formula, family, tol = epsilon, maxIter = maxit, weightCol = weightCol)
           })
 
 #  Returns the summary of a model produced by glm() or spark.glm(), similarly to R's summary().
diff --git a/R/pkg/inst/tests/testthat/test_mllib_regression.R b/R/pkg/inst/tests/testthat/test_mllib_regression.R
index fd90de6ad611c..dc1f510e57715 100644
--- a/R/pkg/inst/tests/testthat/test_mllib_regression.R
+++ b/R/pkg/inst/tests/testthat/test_mllib_regression.R
@@ -79,7 +79,7 @@ test_that("spark.glm and predict", {
 
   # tweedie family
   model <- spark.glm(training, Sepal_Width ~ Sepal_Length + Species,
-                     family = "tweedie", variancePower = 1.2, linkPower = 1.0)
+                     family = tweedie(var.power = 1.2, link.power = 1.0))
   prediction <- predict(model, training)
   expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double")
   vals <- collect(select(prediction, "prediction"))
@@ -264,6 +264,24 @@ test_that("glm and predict", {
                                         data = iris, family = poisson(link = identity)), iris))
   expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
 
+  # tweedie family
+  model <- glm(training, Sepal_Width ~ Sepal_Length + Species,
+                     family = tweedie(var.power = 1.2, link.power = 1.0))
+  prediction <- predict(model, training)
+  expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double")
+  vals <- collect(select(prediction, "prediction"))
+
+  # manual calculation of the R predicted values to avoid dependence on statmod
+  #' library(statmod)
+  #' rModel <- glm(Sepal.Width ~ Sepal.Length + Species, data = iris,
+  #'             family = tweedie(var.power = 1.2, link.power = 1.0))
+  #' print(coef(rModel))
+
+  rCoef <- c(1.7009682, 0.3436700, -0.9703189, -0.9852648)
+  rVals <- as.numeric(model.matrix(Sepal.Width ~ Sepal.Length + Species,
+                                   data = iris) %*% rCoef)
+  expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
+
   # Test stats::predict is working
   x <- rnorm(15)
   y <- x + rnorm(15)

From 9be9c5105f4c6d235e9ef880c8b5b465d3523326 Mon Sep 17 00:00:00 2001
From: actuaryzhang <actuaryzhang10@gmail.com>
Date: Tue, 31 Jan 2017 23:02:40 -0800
Subject: [PATCH 15/28] set default tweedie link to avoid passing functions to
 scala

---
 R/pkg/R/mllib_regression.R | 1 +
 1 file changed, 1 insertion(+)

diff --git a/R/pkg/R/mllib_regression.R b/R/pkg/R/mllib_regression.R
index 621f77bb64c5b..8386b0cb7e59f 100644
--- a/R/pkg/R/mllib_regression.R
+++ b/R/pkg/R/mllib_regression.R
@@ -130,6 +130,7 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"),
             if (tolower(family$family) == "tweedie") {
               variancePower <- log(family$variance(exp(1)))
               linkPower <- log(family$linkfun(exp(1)))
+              family$link <- "identity"
             } else {
               # these default values are not used
               variancePower <- 0.0

From 201939bc9a88d850551b2f88059e196fea73b666 Mon Sep 17 00:00:00 2001
From: actuaryzhang <actuaryzhang10@gmail.com>
Date: Tue, 31 Jan 2017 23:26:44 -0800
Subject: [PATCH 16/28] add tweedie in vignettes

---
 R/pkg/vignettes/sparkr-vignettes.Rmd | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/R/pkg/vignettes/sparkr-vignettes.Rmd b/R/pkg/vignettes/sparkr-vignettes.Rmd
index 6f11c5c51676f..5430efa99851e 100644
--- a/R/pkg/vignettes/sparkr-vignettes.Rmd
+++ b/R/pkg/vignettes/sparkr-vignettes.Rmd
@@ -627,6 +627,7 @@ gaussian | identity, log, inverse
 binomial | logit, probit, cloglog (complementary log-log)
 poisson | log, identity, sqrt
 gamma | inverse, identity, log
+tweedie | power link function
 
 There are three ways to specify the `family` argument.
 
@@ -634,7 +635,9 @@ There are three ways to specify the `family` argument.
 
 * Family function, e.g. `family = binomial`.
 
-* Result returned by a family function, e.g. `family = poisson(link = log)`
+* Result returned by a family function, e.g. `family = poisson(link = log)`.
+
+* Note that the tweedie family is specified as `tweedie(var.power, link.power)`, where `var.power` is the power index of the variance function and `link.power` is the index of the the power link function (the default value is `link.power = 1.0 - var.power`). This is consistent with the `tweedie` family defined in the `statmod` package, but the package is not needed in order to use the `tweedie` family. Some examples: `family = tweedie(0.0)` is  gaussian with identity link, `family = tweedie(1.0)` poisson with log link, `family = tweedie(2.0)` Gamma with inverse link, and `family = tweedie(1.5, 0.0)` compound Poisson with log link.
 
 For more information regarding the families and their link functions, see the Wikipedia page [Generalized Linear Model](https://en.wikipedia.org/wiki/Generalized_linear_model).
 
@@ -650,6 +653,17 @@ gaussianFitted <- predict(gaussianGLM, carsDF)
 head(select(gaussianFitted, "model", "prediction", "mpg", "wt", "hp"))
 ```
 
+The following is the same fit using the tweedie family:
+```{r}
+tweedieGLM1 <- spark.glm(carsDF, mpg ~ wt + hp, family = tweedie(0.0))
+summary(tweedieGLM1)
+```
+We can try other distributions in the tweedie family, for example, a compound Poisson distribution with a log link:
+```{r}
+tweedieGLM2 <- spark.glm(carsDF, mpg ~ wt + hp, family = tweedie(1.2, 0.0))
+summary(tweedieGLM2)
+```
+
 #### Isotonic Regression
 
 `spark.isoreg` fits an [Isotonic Regression](https://en.wikipedia.org/wiki/Isotonic_regression) model against a `SparkDataFrame`. It solves a weighted univariate a regression problem under a complete order constraint. Specifically, given a set of real observed responses $y_1, \ldots, y_n$, corresponding real features $x_1, \ldots, x_n$, and optionally positive weights $w_1, \ldots, w_n$, we want to find a monotone (piecewise linear) function $f$ to  minimize

From 6737122763eef58c54a9b7a32994de6fe102c1d0 Mon Sep 17 00:00:00 2001
From: actuaryzhang <actuaryzhang10@gmail.com>
Date: Wed, 1 Feb 2017 12:00:34 -0800
Subject: [PATCH 17/28] add internal tweedie family

---
 R/pkg/NAMESPACE                               |  2 +
 R/pkg/R/mllib_regression.R                    | 43 +++++++++----------
 .../tests/testthat/test_mllib_regression.R    |  4 +-
 3 files changed, 24 insertions(+), 25 deletions(-)

diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE
index 7ff6e9a9d3c73..0fb70246b1a1e 100644
--- a/R/pkg/NAMESPACE
+++ b/R/pkg/NAMESPACE
@@ -380,6 +380,8 @@ export("partitionBy",
 export("windowPartitionBy",
        "windowOrderBy")
 
+export("tweedie")
+
 S3method(print, jobj)
 S3method(print, structField)
 S3method(print, structType)
diff --git a/R/pkg/R/mllib_regression.R b/R/pkg/R/mllib_regression.R
index 8386b0cb7e59f..93f6f93eda0da 100644
--- a/R/pkg/R/mllib_regression.R
+++ b/R/pkg/R/mllib_regression.R
@@ -39,6 +39,19 @@ setClass("GeneralizedLinearRegressionModel", representation(jobj = "jobj"))
 #' @note IsotonicRegressionModel since 2.1.0
 setClass("IsotonicRegressionModel", representation(jobj = "jobj"))
 
+
+#' Family object for tweedie.
+#' Needs to be outside spark.glm because it is also used in R-compatible glm
+#' @param var.power the power index in the variance function.
+#' @param linkPower the power index in the link function.
+#' @return returns a list for internal use in spark.glm
+tweedie <- function(var.power = 0.0, link.power = 1.0 - var.power) {
+  list(family = "tweedie",
+       var.power = var.power,
+       link.power = link.power,
+       link = paste("mu^", as.character(link.power), sep = ""))
+}
+
 #' Generalized Linear Models
 #'
 #' Fits generalized linear model against a Spark DataFrame.
@@ -97,21 +110,8 @@ setClass("IsotonicRegressionModel", representation(jobj = "jobj"))
 setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"),
           function(data, formula, family = gaussian, tol = 1e-6, maxIter = 25, weightCol = NULL,
                    regParam = 0.0) {
-
-            # create pseudo tweedie family to avoid dependence on statmod
-            tweedie <- function(var.power = 0.0, link.power = 1.0 - var.power) {
-              list(family = "tweedie",
-                   variance = function(mu) mu^var.power,
-                   linkfun = function(mu) mu^link.power)
-            }
-            family <- eval(substitute(family))
-
             if (is.character(family)) {
-              if (family == "tweedie") {
-                family <- tweedie
-              } else {
-                family <- get(family, mode = "function", envir = parent.frame())
-              }
+              family <- get(family, mode = "function", envir = parent.frame())
             }
             if (is.function(family)) {
               family <- family()
@@ -126,16 +126,13 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"),
               weightCol <- ""
             }
 
-            # recover variancePower and linkPower from the specified tweedie family
+            # set variancePower and linkPower 
+            variancePower <- 0.0
+            linkPower <- 1.0
             if (tolower(family$family) == "tweedie") {
-              variancePower <- log(family$variance(exp(1)))
-              linkPower <- log(family$linkfun(exp(1)))
-              family$link <- "identity"
-            } else {
-              # these default values are not used
-              variancePower <- 0.0
-              linkPower <- 1.0
-            }
+              variancePower <- family$var.power
+              linkPower <- family$link.power
+            } 
 
             # For known families, Gamma is upper-cased
             jobj <- callJStatic("org.apache.spark.ml.r.GeneralizedLinearRegressionWrapper",
diff --git a/R/pkg/inst/tests/testthat/test_mllib_regression.R b/R/pkg/inst/tests/testthat/test_mllib_regression.R
index dc1f510e57715..5b1740894598d 100644
--- a/R/pkg/inst/tests/testthat/test_mllib_regression.R
+++ b/R/pkg/inst/tests/testthat/test_mllib_regression.R
@@ -248,7 +248,7 @@ test_that("glm and predict", {
   training <- suppressWarnings(createDataFrame(iris))
   # gaussian family
   model <- glm(Sepal_Width ~ Sepal_Length + Species, data = training)
-               prediction <- predict(model, training)
+  prediction <- predict(model, training)
   expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double")
   vals <- collect(select(prediction, "prediction"))
   rVals <- predict(glm(Sepal.Width ~ Sepal.Length + Species, data = iris), iris)
@@ -265,7 +265,7 @@ test_that("glm and predict", {
   expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
 
   # tweedie family
-  model <- glm(training, Sepal_Width ~ Sepal_Length + Species,
+  model <- glm(Sepal_Width ~ Sepal_Length + Species, data = training,
                      family = tweedie(var.power = 1.2, link.power = 1.0))
   prediction <- predict(model, training)
   expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double")

From 0b5ed434cc5ba68452f5ef556d20465fec4f8ccb Mon Sep 17 00:00:00 2001
From: actuaryzhang <actuaryzhang10@gmail.com>
Date: Wed, 1 Feb 2017 13:05:52 -0800
Subject: [PATCH 18/28] fix style

---
 R/pkg/R/mllib_regression.R | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/R/pkg/R/mllib_regression.R b/R/pkg/R/mllib_regression.R
index 93f6f93eda0da..59ab628c8eba0 100644
--- a/R/pkg/R/mllib_regression.R
+++ b/R/pkg/R/mllib_regression.R
@@ -126,13 +126,13 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"),
               weightCol <- ""
             }
 
-            # set variancePower and linkPower 
+            # set variancePower and linkPower
             variancePower <- 0.0
             linkPower <- 1.0
             if (tolower(family$family) == "tweedie") {
               variancePower <- family$var.power
               linkPower <- family$link.power
-            } 
+            }
 
             # For known families, Gamma is upper-cased
             jobj <- callJStatic("org.apache.spark.ml.r.GeneralizedLinearRegressionWrapper",

From b10777eb08621141df7190daa6157ff064c9d1af Mon Sep 17 00:00:00 2001
From: actuaryzhang <actuaryzhang10@gmail.com>
Date: Wed, 1 Feb 2017 16:21:11 -0800
Subject: [PATCH 19/28] fix doc

---
 R/pkg/R/mllib_regression.R | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/R/pkg/R/mllib_regression.R b/R/pkg/R/mllib_regression.R
index 59ab628c8eba0..ff35368643fe4 100644
--- a/R/pkg/R/mllib_regression.R
+++ b/R/pkg/R/mllib_regression.R
@@ -41,9 +41,8 @@ setClass("IsotonicRegressionModel", representation(jobj = "jobj"))
 
 
 #' Family object for tweedie.
-#' Needs to be outside spark.glm because it is also used in R-compatible glm
 #' @param var.power the power index in the variance function.
-#' @param linkPower the power index in the link function.
+#' @param link.Power the power index in the link function.
 #' @return returns a list for internal use in spark.glm
 tweedie <- function(var.power = 0.0, link.power = 1.0 - var.power) {
   list(family = "tweedie",

From 7d5bd602ed087c3608838e5ff151ad8234e8546b Mon Sep 17 00:00:00 2001
From: actuaryzhang <actuaryzhang10@gmail.com>
Date: Wed, 1 Feb 2017 18:01:02 -0800
Subject: [PATCH 20/28] fix doc

---
 R/pkg/R/mllib_regression.R | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/R/pkg/R/mllib_regression.R b/R/pkg/R/mllib_regression.R
index ff35368643fe4..f90237b35a6d0 100644
--- a/R/pkg/R/mllib_regression.R
+++ b/R/pkg/R/mllib_regression.R
@@ -39,11 +39,16 @@ setClass("GeneralizedLinearRegressionModel", representation(jobj = "jobj"))
 #' @note IsotonicRegressionModel since 2.1.0
 setClass("IsotonicRegressionModel", representation(jobj = "jobj"))
 
-
 #' Family object for tweedie.
+#'
 #' @param var.power the power index in the variance function.
 #' @param link.Power the power index in the link function.
 #' @return returns a list for internal use in spark.glm
+#' @rdname tweedie
+#' @aliases tweedie
+#' @name tweedie
+#' @export
+#' @seealso \link{spark.glm}
 tweedie <- function(var.power = 0.0, link.power = 1.0 - var.power) {
   list(family = "tweedie",
        var.power = var.power,

From a9ac439d0e5d249f09cfe98d3aa25c75c22a820e Mon Sep 17 00:00:00 2001
From: actuaryzhang <actuaryzhang10@gmail.com>
Date: Wed, 1 Feb 2017 20:14:23 -0800
Subject: [PATCH 21/28] fix doc issue

---
 R/pkg/R/mllib_regression.R | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/R/pkg/R/mllib_regression.R b/R/pkg/R/mllib_regression.R
index f90237b35a6d0..af4424e5181f8 100644
--- a/R/pkg/R/mllib_regression.R
+++ b/R/pkg/R/mllib_regression.R
@@ -42,9 +42,9 @@ setClass("IsotonicRegressionModel", representation(jobj = "jobj"))
 #' Family object for tweedie.
 #'
 #' @param var.power the power index in the variance function.
-#' @param link.Power the power index in the link function.
-#' @return returns a list for internal use in spark.glm
+#' @param link.power the power index in the link function.
 #' @rdname tweedie
+#' @return a list for internal use in spark.glm
 #' @aliases tweedie
 #' @name tweedie
 #' @export

From 6cbc62fa8c8be37048193a6e3fc08195f1d497b5 Mon Sep 17 00:00:00 2001
From: actuaryzhang <actuaryzhang10@gmail.com>
Date: Sun, 5 Mar 2017 20:11:11 -0800
Subject: [PATCH 22/28] make twedie internal (no export)

---
 R/pkg/R/mllib_regression.R                    | 25 +++++++++----------
 .../tests/testthat/test_mllib_regression.R    |  4 +--
 R/pkg/vignettes/sparkr-vignettes.Rmd          |  6 ++---
 3 files changed, 17 insertions(+), 18 deletions(-)

diff --git a/R/pkg/R/mllib_regression.R b/R/pkg/R/mllib_regression.R
index aea266fb6b30b..3a5eb5c284cd6 100644
--- a/R/pkg/R/mllib_regression.R
+++ b/R/pkg/R/mllib_regression.R
@@ -39,20 +39,16 @@ setClass("GeneralizedLinearRegressionModel", representation(jobj = "jobj"))
 #' @note IsotonicRegressionModel since 2.1.0
 setClass("IsotonicRegressionModel", representation(jobj = "jobj"))
 
-#' Family object for tweedie.
+#' Family object for tweedie .
 #'
 #' @param var.power the power index in the variance function.
 #' @param link.power the power index in the link function.
-#' @rdname tweedie
+#' @noRd
 #' @return a list for internal use in spark.glm
-#' @aliases tweedie
-#' @name tweedie
-#' @export
-#' @seealso \link{spark.glm}
 tweedie <- function(var.power = 0.0, link.power = 1.0 - var.power) {
   list(family = "tweedie",
-       var.power = var.power,
-       link.power = link.power,
+       variance = function (mu) mu^var.power,
+       linkfun = function (mu) mu^link.power,
        link = paste("mu^", as.character(link.power), sep = ""))
 }
 
@@ -71,9 +67,12 @@ tweedie <- function(var.power = 0.0, link.power = 1.0 - var.power) {
 #'               \url{https://stat.ethz.ch/R-manual/R-devel/library/stats/html/family.html}.
 #'               Currently these families are supported: \code{binomial}, \code{gaussian},
 #'               \code{Gamma}, \code{poisson} and \code{tweedie}.
-#'               The tweedie family is specified in a similar way as in the \code{statmod} package,
-#'               e.g., \code{tweedie(var.power = 0, link.power = 1)} is gaussian with idenity link and
+#'               Note that when package \code{statmod} is loaded, the tweedie family is specified using the
+#'               family definition therein, i.e., \code{tweedie(var.power, link.power)}.
+#'               For example, \code{tweedie(var.power = 0, link.power = 1)} is gaussian with idenity link and
 #'               \code{tweedie(1, 0)} is poisson with log link.
+#'               When package \code{statmod} is not available,  one can use the SparkR internal definition
+#'               \code{SparkR:::tweedie(var.power, link.power)}.
 #' @param tol positive convergence tolerance of iterations.
 #' @param maxIter integer giving the maximal number of IRLS iterations.
 #' @param weightCol the weight column name. If this is not set or \code{NULL}, we treat all instance
@@ -106,7 +105,7 @@ tweedie <- function(var.power = 0.0, link.power = 1.0 - var.power) {
 #' summary(savedModel)
 #'
 #' # fit tweedie model
-#' model <- spark.glm(df, Sepal_Length ~ Sepal_Width, family = tweedie(1.2, 0.0))
+#' model <- spark.glm(df, Freq ~ Sex + Age, family = SparkR:::tweedie(1.2, 0.0))
 #' summary(model)
 #' }
 #' @note spark.glm since 2.0.0
@@ -136,8 +135,8 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"),
             variancePower <- 0.0
             linkPower <- 1.0
             if (tolower(family$family) == "tweedie") {
-              variancePower <- family$var.power
-              linkPower <- family$link.power
+              variancePower <- log(family$variance(exp(1)))
+              linkPower <- log(family$linkfun(exp(1)))
             }
 
             # For known families, Gamma is upper-cased
diff --git a/R/pkg/inst/tests/testthat/test_mllib_regression.R b/R/pkg/inst/tests/testthat/test_mllib_regression.R
index 884ca95547ca8..7ef5cea8e2c71 100644
--- a/R/pkg/inst/tests/testthat/test_mllib_regression.R
+++ b/R/pkg/inst/tests/testthat/test_mllib_regression.R
@@ -79,7 +79,7 @@ test_that("spark.glm and predict", {
 
   # tweedie family
   model <- spark.glm(training, Sepal_Width ~ Sepal_Length + Species,
-                     family = tweedie(var.power = 1.2, link.power = 1.0))
+                     family = SparkR:::tweedie(var.power = 1.2, link.power = 1.0))
   prediction <- predict(model, training)
   expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double")
   vals <- collect(select(prediction, "prediction"))
@@ -269,7 +269,7 @@ test_that("glm and predict", {
 
   # tweedie family
   model <- glm(Sepal_Width ~ Sepal_Length + Species, data = training,
-                     family = tweedie(var.power = 1.2, link.power = 1.0))
+                     family = SparkR:::tweedie(var.power = 1.2, link.power = 1.0))
   prediction <- predict(model, training)
   expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double")
   vals <- collect(select(prediction, "prediction"))
diff --git a/R/pkg/vignettes/sparkr-vignettes.Rmd b/R/pkg/vignettes/sparkr-vignettes.Rmd
index 0125dae796d19..02006b7ddd30a 100644
--- a/R/pkg/vignettes/sparkr-vignettes.Rmd
+++ b/R/pkg/vignettes/sparkr-vignettes.Rmd
@@ -682,7 +682,7 @@ There are three ways to specify the `family` argument.
 
 * Result returned by a family function, e.g. `family = poisson(link = log)`.
 
-* Note that the tweedie family is specified as `tweedie(var.power, link.power)`, where `var.power` is the power index of the variance function and `link.power` is the index of the the power link function (the default value is `link.power = 1.0 - var.power`). This is consistent with the `tweedie` family defined in the `statmod` package, but the package is not needed in order to use the `tweedie` family. Some examples: `family = tweedie(0.0)` is  gaussian with identity link, `family = tweedie(1.0)` poisson with log link, `family = tweedie(2.0)` Gamma with inverse link, and `family = tweedie(1.5, 0.0)` compound Poisson with log link.
+* Note that when package `statmod` is loaded, the tweedie family is specified as `tweedie(var.power, link.power)`. Otherwise, one can use the SparkR internal definition `SparkR:::tweedie(var.power, link.power)`. In the above, `var.power` is the power index of the variance function and `link.power` is the index of the the power link function (the default value is `link.power = 1.0 - var.power`). This is consistent with the `tweedie` family defined in the `statmod` package.  Some examples: `family = tweedie(0.0)` is  gaussian with identity link, `family = tweedie(1.0)` poisson with log link, `family = tweedie(2.0)` Gamma with inverse link, and `family = tweedie(1.5, 0.0)` compound Poisson with log link.
 
 For more information regarding the families and their link functions, see the Wikipedia page [Generalized Linear Model](https://en.wikipedia.org/wiki/Generalized_linear_model).
 
@@ -700,12 +700,12 @@ head(select(gaussianFitted, "model", "prediction", "mpg", "wt", "hp"))
 
 The following is the same fit using the tweedie family:
 ```{r}
-tweedieGLM1 <- spark.glm(carsDF, mpg ~ wt + hp, family = tweedie(0.0))
+tweedieGLM1 <- spark.glm(carsDF, mpg ~ wt + hp, family = SparkR:::tweedie(0.0))
 summary(tweedieGLM1)
 ```
 We can try other distributions in the tweedie family, for example, a compound Poisson distribution with a log link:
 ```{r}
-tweedieGLM2 <- spark.glm(carsDF, mpg ~ wt + hp, family = tweedie(1.2, 0.0))
+tweedieGLM2 <- spark.glm(carsDF, mpg ~ wt + hp, family = SparkR:::tweedie(1.2, 0.0))
 summary(tweedieGLM2)
 ```
 

From ef65adc69b710366266abf5df6f458af3a66a3bf Mon Sep 17 00:00:00 2001
From: actuaryzhang <actuaryzhang10@gmail.com>
Date: Sun, 5 Mar 2017 20:27:08 -0800
Subject: [PATCH 23/28] fix test issue

---
 R/pkg/NAMESPACE            | 2 --
 R/pkg/R/mllib_regression.R | 4 ++--
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE
index 43dbc0be3c23b..871f8e41a0f23 100644
--- a/R/pkg/NAMESPACE
+++ b/R/pkg/NAMESPACE
@@ -402,8 +402,6 @@ export("partitionBy",
 export("windowPartitionBy",
        "windowOrderBy")
 
-export("tweedie")
-
 S3method(print, jobj)
 S3method(print, structField)
 S3method(print, structType)
diff --git a/R/pkg/R/mllib_regression.R b/R/pkg/R/mllib_regression.R
index 3a5eb5c284cd6..fa4b790234041 100644
--- a/R/pkg/R/mllib_regression.R
+++ b/R/pkg/R/mllib_regression.R
@@ -47,8 +47,8 @@ setClass("IsotonicRegressionModel", representation(jobj = "jobj"))
 #' @return a list for internal use in spark.glm
 tweedie <- function(var.power = 0.0, link.power = 1.0 - var.power) {
   list(family = "tweedie",
-       variance = function (mu) mu^var.power,
-       linkfun = function (mu) mu^link.power,
+       variance = function(mu) mu ^ var.power,
+       linkfun = function(mu) mu ^ link.power,
        link = paste("mu^", as.character(link.power), sep = ""))
 }
 

From c11e57c6146757e5d1201884eaa606eab09f9763 Mon Sep 17 00:00:00 2001
From: actuaryzhang <actuaryzhang10@gmail.com>
Date: Wed, 8 Mar 2017 21:59:16 -0800
Subject: [PATCH 24/28] add back variancePower and linkPower params

---
 R/pkg/R/mllib_regression.R                    | 66 +++++++++----------
 .../tests/testthat/test_mllib_regression.R    | 12 ++--
 2 files changed, 39 insertions(+), 39 deletions(-)

diff --git a/R/pkg/R/mllib_regression.R b/R/pkg/R/mllib_regression.R
index fa4b790234041..9d187fa3f4b64 100644
--- a/R/pkg/R/mllib_regression.R
+++ b/R/pkg/R/mllib_regression.R
@@ -39,19 +39,6 @@ setClass("GeneralizedLinearRegressionModel", representation(jobj = "jobj"))
 #' @note IsotonicRegressionModel since 2.1.0
 setClass("IsotonicRegressionModel", representation(jobj = "jobj"))
 
-#' Family object for tweedie .
-#'
-#' @param var.power the power index in the variance function.
-#' @param link.power the power index in the link function.
-#' @noRd
-#' @return a list for internal use in spark.glm
-tweedie <- function(var.power = 0.0, link.power = 1.0 - var.power) {
-  list(family = "tweedie",
-       variance = function(mu) mu ^ var.power,
-       linkfun = function(mu) mu ^ link.power,
-       link = paste("mu^", as.character(link.power), sep = ""))
-}
-
 #' Generalized Linear Models
 #'
 #' Fits generalized linear model against a SparkDataFrame.
@@ -67,17 +54,20 @@ tweedie <- function(var.power = 0.0, link.power = 1.0 - var.power) {
 #'               \url{https://stat.ethz.ch/R-manual/R-devel/library/stats/html/family.html}.
 #'               Currently these families are supported: \code{binomial}, \code{gaussian},
 #'               \code{Gamma}, \code{poisson} and \code{tweedie}.
-#'               Note that when package \code{statmod} is loaded, the tweedie family is specified using the
+#'
+#'               Note that there are two ways to specify the tweedie family.
+#'               a) Set \code{family = "tweedie"} and specify the variancePower and linkPower
+#'               b) When package \code{statmod} is loaded, the tweedie family is specified using the
 #'               family definition therein, i.e., \code{tweedie(var.power, link.power)}.
-#'               For example, \code{tweedie(var.power = 0, link.power = 1)} is gaussian with idenity link and
-#'               \code{tweedie(1, 0)} is poisson with log link.
-#'               When package \code{statmod} is not available,  one can use the SparkR internal definition
-#'               \code{SparkR:::tweedie(var.power, link.power)}.
 #' @param tol positive convergence tolerance of iterations.
 #' @param maxIter integer giving the maximal number of IRLS iterations.
 #' @param weightCol the weight column name. If this is not set or \code{NULL}, we treat all instance
 #'                  weights as 1.0.
 #' @param regParam regularization parameter for L2 regularization.
+#' @param variancePower the power in the variance function of the Tweedie distribution which provides
+#'                      the relationship between the variance and mean of the distribution. Only
+#'                      applicable to the Tweedie family.
+#' @param linkPower the index in the power link function. Only applicable to the Tweedie family.
 #' @param ... additional arguments passed to the method.
 #' @aliases spark.glm,SparkDataFrame,formula-method
 #' @return \code{spark.glm} returns a fitted generalized linear model.
@@ -105,19 +95,33 @@ tweedie <- function(var.power = 0.0, link.power = 1.0 - var.power) {
 #' summary(savedModel)
 #'
 #' # fit tweedie model
-#' model <- spark.glm(df, Freq ~ Sex + Age, family = SparkR:::tweedie(1.2, 0.0))
+#' model <- spark.glm(df, Freq ~ Sex + Age, family = "tweedie",
+#'                    variancePower = 1.2, linkPower = 0)
 #' summary(model)
 #' }
 #' @note spark.glm since 2.0.0
 #' @seealso \link{glm}, \link{read.ml}
 setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"),
           function(data, formula, family = gaussian, tol = 1e-6, maxIter = 25, weightCol = NULL,
-                   regParam = 0.0) {
+                   regParam = 0.0, variancePower = 0.0, linkPower = 1.0 - variancePower) {
+
             if (is.character(family)) {
-              family <- get(family, mode = "function", envir = parent.frame())
+              # family = "tweedie"
+              if (tolower(family) == "tweedie") {
+                family <- list(family = "tweedie", link = "linkNotUsed")
+              } else {
+                family <- get(family, mode = "function", envir = parent.frame())
+              }
             }
             if (is.function(family)) {
-              family <- family()
+              # family = statmod::tweedie()
+              if (tolower(family$family) == "tweedie") {
+                family <- list(family = "tweedie", link = "linkNotUsed")
+                variancePower <- log(family$variance(exp(1)))
+                linkPower <- log(family$linkfun(exp(1)))
+              } else {
+                family <- family()
+              }
             }
             if (is.null(family$family)) {
               print(family)
@@ -131,19 +135,11 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"),
               weightCol <- as.character(weightCol)
             }
 
-            # set variancePower and linkPower
-            variancePower <- 0.0
-            linkPower <- 1.0
-            if (tolower(family$family) == "tweedie") {
-              variancePower <- log(family$variance(exp(1)))
-              linkPower <- log(family$linkfun(exp(1)))
-            }
-
             # For known families, Gamma is upper-cased
             jobj <- callJStatic("org.apache.spark.ml.r.GeneralizedLinearRegressionWrapper",
                                 "fit", formula, data@sdf, tolower(family$family), family$link,
                                 tol, as.integer(maxIter), weightCol, regParam,
-                                variancePower, linkPower)
+                                as.double(variancePower), as.double(linkPower))
             new("GeneralizedLinearRegressionModel", jobj = jobj)
           })
 
@@ -163,6 +159,8 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"),
 #'                  weights as 1.0.
 #' @param epsilon positive convergence tolerance of iterations.
 #' @param maxit integer giving the maximal number of IRLS iterations.
+#' @param variancePower the index of the power variance function in the Tweedie family.
+#' @param linkPower the index of the power link function in the Tweedie family.
 #' @return \code{glm} returns a fitted generalized linear model.
 #' @rdname glm
 #' @export
@@ -177,8 +175,10 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"),
 #' @note glm since 1.5.0
 #' @seealso \link{spark.glm}
 setMethod("glm", signature(formula = "formula", family = "ANY", data = "SparkDataFrame"),
-          function(formula, family = gaussian, data, epsilon = 1e-6, maxit = 25, weightCol = NULL) {
-            spark.glm(data, formula, family, tol = epsilon, maxIter = maxit, weightCol = weightCol)
+          function(formula, family = gaussian, data, epsilon = 1e-6, maxit = 25, weightCol = NULL,
+                   variancePower = 0.0, linkPower = 1.0 - variancePower) {
+            spark.glm(data, formula, family, tol = epsilon, maxIter = maxit, weightCol = weightCol,
+                      variancePower = variancePower, linkPower = linkPower)
           })
 
 #  Returns the summary of a model produced by glm() or spark.glm(), similarly to R's summary().
diff --git a/R/pkg/inst/tests/testthat/test_mllib_regression.R b/R/pkg/inst/tests/testthat/test_mllib_regression.R
index 7ef5cea8e2c71..3f96179c60a22 100644
--- a/R/pkg/inst/tests/testthat/test_mllib_regression.R
+++ b/R/pkg/inst/tests/testthat/test_mllib_regression.R
@@ -79,7 +79,7 @@ test_that("spark.glm and predict", {
 
   # tweedie family
   model <- spark.glm(training, Sepal_Width ~ Sepal_Length + Species,
-                     family = SparkR:::tweedie(var.power = 1.2, link.power = 1.0))
+                     family = "tweedie", variancePower = 1.2, linkPower = 0.0)
   prediction <- predict(model, training)
   expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double")
   vals <- collect(select(prediction, "prediction"))
@@ -87,10 +87,10 @@ test_that("spark.glm and predict", {
   # manual calculation of the R predicted values to avoid dependence on statmod
   #' library(statmod)
   #' rModel <- glm(Sepal.Width ~ Sepal.Length + Species, data = iris,
-  #'             family = tweedie(var.power = 1.2, link.power = 1.0))
+  #'             family = tweedie(var.power = 1.2, link.power = 0.0))
   #' print(coef(rModel))
 
-  rCoef <- c(1.7009682, 0.3436700, -0.9703189, -0.9852648)
+  rCoef <- c(0.6455409, 0.1169143, -0.3224752, -0.3282174)
   rVals <- as.numeric(model.matrix(Sepal.Width ~ Sepal.Length + Species,
                                    data = iris) %*% rCoef)
   expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
@@ -269,7 +269,7 @@ test_that("glm and predict", {
 
   # tweedie family
   model <- glm(Sepal_Width ~ Sepal_Length + Species, data = training,
-                     family = SparkR:::tweedie(var.power = 1.2, link.power = 1.0))
+               family = "tweedie", variancePower = 1.2, linkPower = 0.0)
   prediction <- predict(model, training)
   expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double")
   vals <- collect(select(prediction, "prediction"))
@@ -277,10 +277,10 @@ test_that("glm and predict", {
   # manual calculation of the R predicted values to avoid dependence on statmod
   #' library(statmod)
   #' rModel <- glm(Sepal.Width ~ Sepal.Length + Species, data = iris,
-  #'             family = tweedie(var.power = 1.2, link.power = 1.0))
+  #'             family = tweedie(var.power = 1.2, link.power = 0.0))
   #' print(coef(rModel))
 
-  rCoef <- c(1.7009682, 0.3436700, -0.9703189, -0.9852648)
+  rCoef <- c(0.6455409, 0.1169143, -0.3224752, -0.3282174)
   rVals <- as.numeric(model.matrix(Sepal.Width ~ Sepal.Length + Species,
                                    data = iris) %*% rCoef)
   expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)

From 5ce4c8414fabdaaa970931ac61fa53515d240503 Mon Sep 17 00:00:00 2001
From: actuaryzhang <actuaryzhang10@gmail.com>
Date: Wed, 8 Mar 2017 22:29:50 -0800
Subject: [PATCH 25/28] update vignettes

---
 R/pkg/R/mllib_regression.R                        | 15 +++++++--------
 R/pkg/inst/tests/testthat/test_mllib_regression.R | 12 ++++++------
 R/pkg/vignettes/sparkr-vignettes.Rmd              |  9 ++++++---
 3 files changed, 19 insertions(+), 17 deletions(-)

diff --git a/R/pkg/R/mllib_regression.R b/R/pkg/R/mllib_regression.R
index 9d187fa3f4b64..3f509f08ffd1b 100644
--- a/R/pkg/R/mllib_regression.R
+++ b/R/pkg/R/mllib_regression.R
@@ -114,19 +114,18 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"),
               }
             }
             if (is.function(family)) {
-              # family = statmod::tweedie()
-              if (tolower(family$family) == "tweedie") {
-                family <- list(family = "tweedie", link = "linkNotUsed")
-                variancePower <- log(family$variance(exp(1)))
-                linkPower <- log(family$linkfun(exp(1)))
-              } else {
-                family <- family()
-              }
+              family <- family()
             }
             if (is.null(family$family)) {
               print(family)
               stop("'family' not recognized")
             }
+            # family = statmod::tweedie()
+            if (tolower(family$family) == "tweedie" && !is.null(family$variance)) {
+              variancePower <- log(family$variance(exp(1)))
+              linkPower <- log(family$linkfun(exp(1)))
+              family <- list(family = "tweedie", link = "linkNotUsed")
+            }
 
             formula <- paste(deparse(formula), collapse = "")
             if (!is.null(weightCol) && weightCol == "") {
diff --git a/R/pkg/inst/tests/testthat/test_mllib_regression.R b/R/pkg/inst/tests/testthat/test_mllib_regression.R
index 3f96179c60a22..7500abc1343ad 100644
--- a/R/pkg/inst/tests/testthat/test_mllib_regression.R
+++ b/R/pkg/inst/tests/testthat/test_mllib_regression.R
@@ -91,9 +91,9 @@ test_that("spark.glm and predict", {
   #' print(coef(rModel))
 
   rCoef <- c(0.6455409, 0.1169143, -0.3224752, -0.3282174)
-  rVals <- as.numeric(model.matrix(Sepal.Width ~ Sepal.Length + Species,
-                                   data = iris) %*% rCoef)
-  expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
+  rVals <- exp(as.numeric(model.matrix(Sepal.Width ~ Sepal.Length + Species,
+                                       data = iris) %*% rCoef))
+  expect_true(all(abs(rVals - vals) < 1e-5), rVals - vals)
 
   # Test stats::predict is working
   x <- rnorm(15)
@@ -281,9 +281,9 @@ test_that("glm and predict", {
   #' print(coef(rModel))
 
   rCoef <- c(0.6455409, 0.1169143, -0.3224752, -0.3282174)
-  rVals <- as.numeric(model.matrix(Sepal.Width ~ Sepal.Length + Species,
-                                   data = iris) %*% rCoef)
-  expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
+  rVals <- exp(as.numeric(model.matrix(Sepal.Width ~ Sepal.Length + Species,
+                                   data = iris) %*% rCoef))
+  expect_true(all(abs(rVals - vals) < 1e-5), rVals - vals)
 
   # Test stats::predict is working
   x <- rnorm(15)
diff --git a/R/pkg/vignettes/sparkr-vignettes.Rmd b/R/pkg/vignettes/sparkr-vignettes.Rmd
index 02006b7ddd30a..2cd31cf7b4de7 100644
--- a/R/pkg/vignettes/sparkr-vignettes.Rmd
+++ b/R/pkg/vignettes/sparkr-vignettes.Rmd
@@ -682,7 +682,9 @@ There are three ways to specify the `family` argument.
 
 * Result returned by a family function, e.g. `family = poisson(link = log)`.
 
-* Note that when package `statmod` is loaded, the tweedie family is specified as `tweedie(var.power, link.power)`. Otherwise, one can use the SparkR internal definition `SparkR:::tweedie(var.power, link.power)`. In the above, `var.power` is the power index of the variance function and `link.power` is the index of the the power link function (the default value is `link.power = 1.0 - var.power`). This is consistent with the `tweedie` family defined in the `statmod` package.  Some examples: `family = tweedie(0.0)` is  gaussian with identity link, `family = tweedie(1.0)` poisson with log link, `family = tweedie(2.0)` Gamma with inverse link, and `family = tweedie(1.5, 0.0)` compound Poisson with log link.
+* Note that there are two ways to specify the tweedie family:
+  a) Set `family = "tweedie"` and specify the `variancePower` and `linkPower`
+  b) When package `statmod` is loaded, the tweedie family is specified using the family definition therein, i.e., `tweedie()`.
 
 For more information regarding the families and their link functions, see the Wikipedia page [Generalized Linear Model](https://en.wikipedia.org/wiki/Generalized_linear_model).
 
@@ -700,12 +702,13 @@ head(select(gaussianFitted, "model", "prediction", "mpg", "wt", "hp"))
 
 The following is the same fit using the tweedie family:
 ```{r}
-tweedieGLM1 <- spark.glm(carsDF, mpg ~ wt + hp, family = SparkR:::tweedie(0.0))
+tweedieGLM1 <- spark.glm(carsDF, mpg ~ wt + hp, family = "tweedie", variancePower = 0.0)
 summary(tweedieGLM1)
 ```
 We can try other distributions in the tweedie family, for example, a compound Poisson distribution with a log link:
 ```{r}
-tweedieGLM2 <- spark.glm(carsDF, mpg ~ wt + hp, family = SparkR:::tweedie(1.2, 0.0))
+tweedieGLM2 <- spark.glm(carsDF, mpg ~ wt + hp, family = "tweedie", 
+                         variancePower = 1.2, linkPower = 0.0)
 summary(tweedieGLM2)
 ```
 

From aeeb3f780be846641a3f38eb127fcba3e3387cb8 Mon Sep 17 00:00:00 2001
From: actuaryzhang <actuaryzhang10@gmail.com>
Date: Wed, 8 Mar 2017 22:45:06 -0800
Subject: [PATCH 26/28] fix style

---
 R/pkg/R/mllib_regression.R | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/R/pkg/R/mllib_regression.R b/R/pkg/R/mllib_regression.R
index 3f509f08ffd1b..885ae4ee82253 100644
--- a/R/pkg/R/mllib_regression.R
+++ b/R/pkg/R/mllib_regression.R
@@ -106,7 +106,7 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"),
                    regParam = 0.0, variancePower = 0.0, linkPower = 1.0 - variancePower) {
 
             if (is.character(family)) {
-              # family = "tweedie"
+              # Handle when family = "tweedie"
               if (tolower(family) == "tweedie") {
                 family <- list(family = "tweedie", link = "linkNotUsed")
               } else {
@@ -120,7 +120,7 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"),
               print(family)
               stop("'family' not recognized")
             }
-            # family = statmod::tweedie()
+            # Handle when family = statmod::tweedie()
             if (tolower(family$family) == "tweedie" && !is.null(family$variance)) {
               variancePower <- log(family$variance(exp(1)))
               linkPower <- log(family$linkfun(exp(1)))

From 4cffc40f008641f59a17235ae67e4db0e975f03c Mon Sep 17 00:00:00 2001
From: actuaryzhang <actuaryzhang10@gmail.com>
Date: Sun, 12 Mar 2017 18:11:18 -0700
Subject: [PATCH 27/28] change names of tweedie parameters to be consistent
 with R

---
 R/pkg/R/mllib_regression.R                    | 33 +++++++++++--------
 .../tests/testthat/test_mllib_regression.R    |  4 +--
 R/pkg/vignettes/sparkr-vignettes.Rmd          |  6 ++--
 3 files changed, 24 insertions(+), 19 deletions(-)

diff --git a/R/pkg/R/mllib_regression.R b/R/pkg/R/mllib_regression.R
index 885ae4ee82253..a526cf6f255d1 100644
--- a/R/pkg/R/mllib_regression.R
+++ b/R/pkg/R/mllib_regression.R
@@ -56,7 +56,7 @@ setClass("IsotonicRegressionModel", representation(jobj = "jobj"))
 #'               \code{Gamma}, \code{poisson} and \code{tweedie}.
 #'
 #'               Note that there are two ways to specify the tweedie family.
-#'               a) Set \code{family = "tweedie"} and specify the variancePower and linkPower
+#'               a) Set \code{family = "tweedie"} and specify the var.power and link.power
 #'               b) When package \code{statmod} is loaded, the tweedie family is specified using the
 #'               family definition therein, i.e., \code{tweedie(var.power, link.power)}.
 #' @param tol positive convergence tolerance of iterations.
@@ -64,10 +64,10 @@ setClass("IsotonicRegressionModel", representation(jobj = "jobj"))
 #' @param weightCol the weight column name. If this is not set or \code{NULL}, we treat all instance
 #'                  weights as 1.0.
 #' @param regParam regularization parameter for L2 regularization.
-#' @param variancePower the power in the variance function of the Tweedie distribution which provides
+#' @param var.power the power in the variance function of the Tweedie distribution which provides
 #'                      the relationship between the variance and mean of the distribution. Only
 #'                      applicable to the Tweedie family.
-#' @param linkPower the index in the power link function. Only applicable to the Tweedie family.
+#' @param link.power the index in the power link function. Only applicable to the Tweedie family.
 #' @param ... additional arguments passed to the method.
 #' @aliases spark.glm,SparkDataFrame,formula-method
 #' @return \code{spark.glm} returns a fitted generalized linear model.
@@ -96,19 +96,24 @@ setClass("IsotonicRegressionModel", representation(jobj = "jobj"))
 #'
 #' # fit tweedie model
 #' model <- spark.glm(df, Freq ~ Sex + Age, family = "tweedie",
-#'                    variancePower = 1.2, linkPower = 0)
+#'                    var.power = 1.2, link.power = 0)
+#' summary(model)
+#'
+#' # use the tweedie family from statmod
+#' library(statmod)
+#' model <- spark.glm(df, Freq ~ Sex + Age, family = tweedie(1.2, 0))
 #' summary(model)
 #' }
 #' @note spark.glm since 2.0.0
 #' @seealso \link{glm}, \link{read.ml}
 setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"),
           function(data, formula, family = gaussian, tol = 1e-6, maxIter = 25, weightCol = NULL,
-                   regParam = 0.0, variancePower = 0.0, linkPower = 1.0 - variancePower) {
+                   regParam = 0.0, var.power = 0.0, link.power = 1.0 - var.power) {
 
             if (is.character(family)) {
               # Handle when family = "tweedie"
               if (tolower(family) == "tweedie") {
-                family <- list(family = "tweedie", link = "linkNotUsed")
+                family <- list(family = "tweedie", link = NULL)
               } else {
                 family <- get(family, mode = "function", envir = parent.frame())
               }
@@ -122,9 +127,9 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"),
             }
             # Handle when family = statmod::tweedie()
             if (tolower(family$family) == "tweedie" && !is.null(family$variance)) {
-              variancePower <- log(family$variance(exp(1)))
-              linkPower <- log(family$linkfun(exp(1)))
-              family <- list(family = "tweedie", link = "linkNotUsed")
+              var.power <- log(family$variance(exp(1)))
+              link.power <- log(family$linkfun(exp(1)))
+              family <- list(family = "tweedie", link = NULL)
             }
 
             formula <- paste(deparse(formula), collapse = "")
@@ -138,7 +143,7 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"),
             jobj <- callJStatic("org.apache.spark.ml.r.GeneralizedLinearRegressionWrapper",
                                 "fit", formula, data@sdf, tolower(family$family), family$link,
                                 tol, as.integer(maxIter), weightCol, regParam,
-                                as.double(variancePower), as.double(linkPower))
+                                as.double(var.power), as.double(link.power))
             new("GeneralizedLinearRegressionModel", jobj = jobj)
           })
 
@@ -158,8 +163,8 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"),
 #'                  weights as 1.0.
 #' @param epsilon positive convergence tolerance of iterations.
 #' @param maxit integer giving the maximal number of IRLS iterations.
-#' @param variancePower the index of the power variance function in the Tweedie family.
-#' @param linkPower the index of the power link function in the Tweedie family.
+#' @param var.power the index of the power variance function in the Tweedie family.
+#' @param link.power the index of the power link function in the Tweedie family.
 #' @return \code{glm} returns a fitted generalized linear model.
 #' @rdname glm
 #' @export
@@ -175,9 +180,9 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"),
 #' @seealso \link{spark.glm}
 setMethod("glm", signature(formula = "formula", family = "ANY", data = "SparkDataFrame"),
           function(formula, family = gaussian, data, epsilon = 1e-6, maxit = 25, weightCol = NULL,
-                   variancePower = 0.0, linkPower = 1.0 - variancePower) {
+                   var.power = 0.0, link.power = 1.0 - var.power) {
             spark.glm(data, formula, family, tol = epsilon, maxIter = maxit, weightCol = weightCol,
-                      variancePower = variancePower, linkPower = linkPower)
+                      var.power = var.power, link.power = link.power)
           })
 
 #  Returns the summary of a model produced by glm() or spark.glm(), similarly to R's summary().
diff --git a/R/pkg/inst/tests/testthat/test_mllib_regression.R b/R/pkg/inst/tests/testthat/test_mllib_regression.R
index 7500abc1343ad..3e9ad77198073 100644
--- a/R/pkg/inst/tests/testthat/test_mllib_regression.R
+++ b/R/pkg/inst/tests/testthat/test_mllib_regression.R
@@ -79,7 +79,7 @@ test_that("spark.glm and predict", {
 
   # tweedie family
   model <- spark.glm(training, Sepal_Width ~ Sepal_Length + Species,
-                     family = "tweedie", variancePower = 1.2, linkPower = 0.0)
+                     family = "tweedie", var.power = 1.2, link.power = 0.0)
   prediction <- predict(model, training)
   expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double")
   vals <- collect(select(prediction, "prediction"))
@@ -269,7 +269,7 @@ test_that("glm and predict", {
 
   # tweedie family
   model <- glm(Sepal_Width ~ Sepal_Length + Species, data = training,
-               family = "tweedie", variancePower = 1.2, linkPower = 0.0)
+               family = "tweedie", var.power = 1.2, link.power = 0.0)
   prediction <- predict(model, training)
   expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double")
   vals <- collect(select(prediction, "prediction"))
diff --git a/R/pkg/vignettes/sparkr-vignettes.Rmd b/R/pkg/vignettes/sparkr-vignettes.Rmd
index 2cd31cf7b4de7..a6ff650c33fea 100644
--- a/R/pkg/vignettes/sparkr-vignettes.Rmd
+++ b/R/pkg/vignettes/sparkr-vignettes.Rmd
@@ -683,7 +683,7 @@ There are three ways to specify the `family` argument.
 * Result returned by a family function, e.g. `family = poisson(link = log)`.
 
 * Note that there are two ways to specify the tweedie family:
-  a) Set `family = "tweedie"` and specify the `variancePower` and `linkPower`
+  a) Set `family = "tweedie"` and specify the `var.power` and `link.power`
   b) When package `statmod` is loaded, the tweedie family is specified using the family definition therein, i.e., `tweedie()`.
 
 For more information regarding the families and their link functions, see the Wikipedia page [Generalized Linear Model](https://en.wikipedia.org/wiki/Generalized_linear_model).
@@ -702,13 +702,13 @@ head(select(gaussianFitted, "model", "prediction", "mpg", "wt", "hp"))
 
 The following is the same fit using the tweedie family:
 ```{r}
-tweedieGLM1 <- spark.glm(carsDF, mpg ~ wt + hp, family = "tweedie", variancePower = 0.0)
+tweedieGLM1 <- spark.glm(carsDF, mpg ~ wt + hp, family = "tweedie", var.power = 0.0)
 summary(tweedieGLM1)
 ```
 We can try other distributions in the tweedie family, for example, a compound Poisson distribution with a log link:
 ```{r}
 tweedieGLM2 <- spark.glm(carsDF, mpg ~ wt + hp, family = "tweedie", 
-                         variancePower = 1.2, linkPower = 0.0)
+                         var.power = 1.2, link.power = 0.0)
 summary(tweedieGLM2)
 ```
 

From 0b496a6a8880dfa81a0c8c3b00577b7313f93fbc Mon Sep 17 00:00:00 2001
From: actuaryzhang <actuaryzhang10@gmail.com>
Date: Sun, 12 Mar 2017 22:24:31 -0700
Subject: [PATCH 28/28] update doc

---
 R/pkg/R/mllib_regression.R | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/R/pkg/R/mllib_regression.R b/R/pkg/R/mllib_regression.R
index a526cf6f255d1..d59c890f3e5fd 100644
--- a/R/pkg/R/mllib_regression.R
+++ b/R/pkg/R/mllib_regression.R
@@ -56,9 +56,11 @@ setClass("IsotonicRegressionModel", representation(jobj = "jobj"))
 #'               \code{Gamma}, \code{poisson} and \code{tweedie}.
 #'
 #'               Note that there are two ways to specify the tweedie family.
-#'               a) Set \code{family = "tweedie"} and specify the var.power and link.power
-#'               b) When package \code{statmod} is loaded, the tweedie family is specified using the
-#'               family definition therein, i.e., \code{tweedie(var.power, link.power)}.
+#'               \itemize{
+#'                \item Set \code{family = "tweedie"} and specify the var.power and link.power;
+#'                \item When package \code{statmod} is loaded, the tweedie family is specified using the
+#'                family definition therein, i.e., \code{tweedie(var.power, link.power)}.
+#'               }
 #' @param tol positive convergence tolerance of iterations.
 #' @param maxIter integer giving the maximal number of IRLS iterations.
 #' @param weightCol the weight column name. If this is not set or \code{NULL}, we treat all instance