From 8a87b8696f68cc9d11b4b46a3eeef2986f6b9a0a Mon Sep 17 00:00:00 2001 From: WeichenXu Date: Sat, 10 Sep 2016 12:47:10 -0700 Subject: [PATCH 1/9] update. --- R/pkg/R/mllib.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R index f321fd19b39b4..0cbe481899fd7 100644 --- a/R/pkg/R/mllib.R +++ b/R/pkg/R/mllib.R @@ -694,8 +694,8 @@ setMethod("predict", signature(object = "KMeansModel"), #' } #' @note spark.mlp since 2.1.0 setMethod("spark.mlp", signature(data = "SparkDataFrame"), - function(data, blockSize = 128, layers = c(3, 5, 2), solver = "l-bfgs", maxIter = 100, - tol = 0.5, stepSize = 1, seed = 1) { + function(data, blockSize = 128, layers, solver = "l-bfgs", maxIter = 100, + tol = 1E-6, stepSize = 0.03, seed = -763139545) { jobj <- callJStatic("org.apache.spark.ml.r.MultilayerPerceptronClassifierWrapper", "fit", data@sdf, as.integer(blockSize), as.array(layers), as.character(solver), as.integer(maxIter), as.numeric(tol), From 50bf5e8a558bb5f574b257d8f578c4b6926148d5 Mon Sep 17 00:00:00 2001 From: WeichenXu Date: Tue, 13 Sep 2016 21:21:21 +0800 Subject: [PATCH 2/9] update. --- R/pkg/R/mllib.R | 9 +++++++-- .../ml/r/MultilayerPerceptronClassifierWrapper.scala | 2 +- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R index 0cbe481899fd7..7bbbf2440ef30 100644 --- a/R/pkg/R/mllib.R +++ b/R/pkg/R/mllib.R @@ -694,8 +694,13 @@ setMethod("predict", signature(object = "KMeansModel"), #' } #' @note spark.mlp since 2.1.0 setMethod("spark.mlp", signature(data = "SparkDataFrame"), - function(data, blockSize = 128, layers, solver = "l-bfgs", maxIter = 100, - tol = 1E-6, stepSize = 0.03, seed = -763139545) { + function(data, layers, blockSize = 128, solver = "l-bfgs", maxIter = 100, + tol = 1E-6, stepSize = 0.03, seed = 0x7FFFFFFF) { + if (length(layers) <= 1) stop("layers vector require length > 0.") + for (i in 1 : length(layers)) { + if (!is.numeric(layers[i])) + stop("layers must be an integer vector.") + } jobj <- callJStatic("org.apache.spark.ml.r.MultilayerPerceptronClassifierWrapper", "fit", data@sdf, as.integer(blockSize), as.array(layers), as.character(solver), as.integer(maxIter), as.numeric(tol), diff --git a/mllib/src/main/scala/org/apache/spark/ml/r/MultilayerPerceptronClassifierWrapper.scala b/mllib/src/main/scala/org/apache/spark/ml/r/MultilayerPerceptronClassifierWrapper.scala index be51e74187faa..fe462c0bbcfc1 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/r/MultilayerPerceptronClassifierWrapper.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/r/MultilayerPerceptronClassifierWrapper.scala @@ -71,8 +71,8 @@ private[r] object MultilayerPerceptronClassifierWrapper .setMaxIter(maxIter) .setTol(tol) .setStepSize(stepSize) - .setSeed(seed) .setPredictionCol(PREDICTED_LABEL_COL) + if (seed != 0x7FFFFFFF) mlp.setSeed(seed) val pipeline = new Pipeline() .setStages(Array(mlp)) .fit(data) From c3e9bf3bf3b0f5115f2e46355256ab1ddad4f129 Mon Sep 17 00:00:00 2001 From: WeichenXu Date: Wed, 14 Sep 2016 22:08:57 +0800 Subject: [PATCH 3/9] update_py_mlp_default --- R/pkg/R/mllib.R | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R index 7bbbf2440ef30..2b5adb589e1ff 100644 --- a/R/pkg/R/mllib.R +++ b/R/pkg/R/mllib.R @@ -697,10 +697,8 @@ setMethod("spark.mlp", signature(data = "SparkDataFrame"), function(data, layers, blockSize = 128, solver = "l-bfgs", maxIter = 100, tol = 1E-6, stepSize = 0.03, seed = 0x7FFFFFFF) { if (length(layers) <= 1) stop("layers vector require length > 0.") - for (i in 1 : length(layers)) { - if (!is.numeric(layers[i])) - stop("layers must be an integer vector.") - } + if (any(sapply(layers,function(e) !is.numeric(e)))) + stop ("layers must be a numeric vector.") jobj <- callJStatic("org.apache.spark.ml.r.MultilayerPerceptronClassifierWrapper", "fit", data@sdf, as.integer(blockSize), as.array(layers), as.character(solver), as.integer(maxIter), as.numeric(tol), From ce2c2f743e912225416a1f28b0e90d5d88ddaf49 Mon Sep 17 00:00:00 2001 From: WeichenXu Date: Wed, 14 Sep 2016 22:32:51 +0800 Subject: [PATCH 4/9] update. --- R/pkg/R/mllib.R | 7 +++++-- R/pkg/inst/tests/testthat/test_mllib.R | 4 ++++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R index 2b5adb589e1ff..45b52beb85834 100644 --- a/R/pkg/R/mllib.R +++ b/R/pkg/R/mllib.R @@ -696,9 +696,12 @@ setMethod("predict", signature(object = "KMeansModel"), setMethod("spark.mlp", signature(data = "SparkDataFrame"), function(data, layers, blockSize = 128, solver = "l-bfgs", maxIter = 100, tol = 1E-6, stepSize = 0.03, seed = 0x7FFFFFFF) { - if (length(layers) <= 1) stop("layers vector require length > 0.") - if (any(sapply(layers,function(e) !is.numeric(e)))) + if (length(layers) <= 1) { + stop("layers vector require length > 0.") + } + if (any(sapply(layers, function(e) !is.numeric(e)))) { stop ("layers must be a numeric vector.") + } jobj <- callJStatic("org.apache.spark.ml.r.MultilayerPerceptronClassifierWrapper", "fit", data@sdf, as.integer(blockSize), as.array(layers), as.character(solver), as.integer(maxIter), as.numeric(tol), diff --git a/R/pkg/inst/tests/testthat/test_mllib.R b/R/pkg/inst/tests/testthat/test_mllib.R index ca25f2c7e8263..1fc1f026b543c 100644 --- a/R/pkg/inst/tests/testthat/test_mllib.R +++ b/R/pkg/inst/tests/testthat/test_mllib.R @@ -383,6 +383,10 @@ test_that("spark.mlp", { unlink(modelPath) + # Test default parameter + model <- spark.mlp(df, layers = c(4, 5, 4, 3)) + mlpPredictions <- collect(select(predict(model, mlpTestDF), "prediction")) + expect_equal(head(mlpPredictions$prediction, 10), c(1, 1, 1, 1, 0, 1, 2, 2, 1, 0)) }) test_that("spark.naiveBayes", { From 160caf196faa6dd90f00b882c4167462034f9295 Mon Sep 17 00:00:00 2001 From: WeichenXu Date: Sat, 17 Sep 2016 21:26:25 +0800 Subject: [PATCH 5/9] improve integer check --- R/pkg/R/mllib.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R index 45b52beb85834..16f1e094ddfdd 100644 --- a/R/pkg/R/mllib.R +++ b/R/pkg/R/mllib.R @@ -699,8 +699,8 @@ setMethod("spark.mlp", signature(data = "SparkDataFrame"), if (length(layers) <= 1) { stop("layers vector require length > 0.") } - if (any(sapply(layers, function(e) !is.numeric(e)))) { - stop ("layers must be a numeric vector.") + if (any(sapply(layers, function(e) as.integer(e) != e))) { + stop ("layers must be a integer vector.") } jobj <- callJStatic("org.apache.spark.ml.r.MultilayerPerceptronClassifierWrapper", "fit", data@sdf, as.integer(blockSize), as.array(layers), From 8952a36685c93d0f84c22097efca8b803b3d7ea7 Mon Sep 17 00:00:00 2001 From: WeichenXu Date: Sun, 18 Sep 2016 21:19:19 +0800 Subject: [PATCH 6/9] update scala-wrapper arg type & R-side code --- R/pkg/R/mllib.R | 10 ++++------ R/pkg/inst/tests/testthat/test_mllib.R | 2 +- .../ml/r/MultilayerPerceptronClassifierWrapper.scala | 8 ++++---- 3 files changed, 9 insertions(+), 11 deletions(-) diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R index 16f1e094ddfdd..f70d323d418c0 100644 --- a/R/pkg/R/mllib.R +++ b/R/pkg/R/mllib.R @@ -695,17 +695,15 @@ setMethod("predict", signature(object = "KMeansModel"), #' @note spark.mlp since 2.1.0 setMethod("spark.mlp", signature(data = "SparkDataFrame"), function(data, layers, blockSize = 128, solver = "l-bfgs", maxIter = 100, - tol = 1E-6, stepSize = 0.03, seed = 0x7FFFFFFF) { + tol = 1E-6, stepSize = 0.03, seed = "") { + layers <- as.integer(na.omit(layers)) if (length(layers) <= 1) { - stop("layers vector require length > 0.") - } - if (any(sapply(layers, function(e) as.integer(e) != e))) { - stop ("layers must be a integer vector.") + stop ("layers must be a integer vector with length > 1.") } jobj <- callJStatic("org.apache.spark.ml.r.MultilayerPerceptronClassifierWrapper", "fit", data@sdf, as.integer(blockSize), as.array(layers), as.character(solver), as.integer(maxIter), as.numeric(tol), - as.numeric(stepSize), as.integer(seed)) + as.numeric(stepSize), as.character(seed)) new("MultilayerPerceptronClassificationModel", jobj = jobj) }) diff --git a/R/pkg/inst/tests/testthat/test_mllib.R b/R/pkg/inst/tests/testthat/test_mllib.R index 1fc1f026b543c..931f1d3a6cae8 100644 --- a/R/pkg/inst/tests/testthat/test_mllib.R +++ b/R/pkg/inst/tests/testthat/test_mllib.R @@ -356,7 +356,7 @@ test_that("spark.kmeans", { test_that("spark.mlp", { df <- read.df("data/mllib/sample_multiclass_classification_data.txt", source = "libsvm") model <- spark.mlp(df, blockSize = 128, layers = c(4, 5, 4, 3), solver = "l-bfgs", maxIter = 100, - tol = 0.5, stepSize = 1, seed = 1) + tol = 0.5, stepSize = 1, seed = "1") # Test summary method summary <- summary(model) diff --git a/mllib/src/main/scala/org/apache/spark/ml/r/MultilayerPerceptronClassifierWrapper.scala b/mllib/src/main/scala/org/apache/spark/ml/r/MultilayerPerceptronClassifierWrapper.scala index fe462c0bbcfc1..10673003534e6 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/r/MultilayerPerceptronClassifierWrapper.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/r/MultilayerPerceptronClassifierWrapper.scala @@ -53,26 +53,26 @@ private[r] object MultilayerPerceptronClassifierWrapper def fit( data: DataFrame, blockSize: Int, - layers: Array[Double], + layers: Array[Int], solver: String, maxIter: Int, tol: Double, stepSize: Double, - seed: Int + seed: String ): MultilayerPerceptronClassifierWrapper = { // get labels and feature names from output schema val schema = data.schema // assemble and fit the pipeline val mlp = new MultilayerPerceptronClassifier() - .setLayers(layers.map(_.toInt)) + .setLayers(layers) .setBlockSize(blockSize) .setSolver(solver) .setMaxIter(maxIter) .setTol(tol) .setStepSize(stepSize) .setPredictionCol(PREDICTED_LABEL_COL) - if (seed != 0x7FFFFFFF) mlp.setSeed(seed) + if (seed != null && seed.length > 0) mlp.setSeed(seed.toInt) val pipeline = new Pipeline() .setStages(Array(mlp)) .fit(data) From 36cccb6b9e3aa586af75af1282e1b5436f57ca9d Mon Sep 17 00:00:00 2001 From: WeichenXu Date: Sun, 18 Sep 2016 22:23:43 +0800 Subject: [PATCH 7/9] add negative test --- R/pkg/inst/tests/testthat/test_mllib.R | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/R/pkg/inst/tests/testthat/test_mllib.R b/R/pkg/inst/tests/testthat/test_mllib.R index 931f1d3a6cae8..6beb7cc73c9d3 100644 --- a/R/pkg/inst/tests/testthat/test_mllib.R +++ b/R/pkg/inst/tests/testthat/test_mllib.R @@ -387,6 +387,11 @@ test_that("spark.mlp", { model <- spark.mlp(df, layers = c(4, 5, 4, 3)) mlpPredictions <- collect(select(predict(model, mlpTestDF), "prediction")) expect_equal(head(mlpPredictions$prediction, 10), c(1, 1, 1, 1, 0, 1, 2, 2, 1, 0)) + + # Test illegal parameter + expect_error(spark.mlp(df, layers = NULL)) + expect_error(spark.mlp(df, layers = c())) + expect_error(spark.mlp(df, layers = c(3))) }) test_that("spark.naiveBayes", { From c9d3dc4d54a9aabfb9598c7ff6f7364012c1ed81 Mon Sep 17 00:00:00 2001 From: WeichenXu Date: Mon, 19 Sep 2016 10:35:51 +0800 Subject: [PATCH 8/9] update. --- R/pkg/R/mllib.R | 7 +++++-- R/pkg/inst/tests/testthat/test_mllib.R | 8 ++++---- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R index f70d323d418c0..e55e30b4aa3c2 100644 --- a/R/pkg/R/mllib.R +++ b/R/pkg/R/mllib.R @@ -695,15 +695,18 @@ setMethod("predict", signature(object = "KMeansModel"), #' @note spark.mlp since 2.1.0 setMethod("spark.mlp", signature(data = "SparkDataFrame"), function(data, layers, blockSize = 128, solver = "l-bfgs", maxIter = 100, - tol = 1E-6, stepSize = 0.03, seed = "") { + tol = 1E-6, stepSize = 0.03, seed = NULL) { layers <- as.integer(na.omit(layers)) if (length(layers) <= 1) { stop ("layers must be a integer vector with length > 1.") } + if (!is.null(seed)) { + seed <- as.character(as.integer(seed)) + } jobj <- callJStatic("org.apache.spark.ml.r.MultilayerPerceptronClassifierWrapper", "fit", data@sdf, as.integer(blockSize), as.array(layers), as.character(solver), as.integer(maxIter), as.numeric(tol), - as.numeric(stepSize), as.character(seed)) + as.numeric(stepSize), seed) new("MultilayerPerceptronClassificationModel", jobj = jobj) }) diff --git a/R/pkg/inst/tests/testthat/test_mllib.R b/R/pkg/inst/tests/testthat/test_mllib.R index 6beb7cc73c9d3..962b65416ecca 100644 --- a/R/pkg/inst/tests/testthat/test_mllib.R +++ b/R/pkg/inst/tests/testthat/test_mllib.R @@ -356,7 +356,7 @@ test_that("spark.kmeans", { test_that("spark.mlp", { df <- read.df("data/mllib/sample_multiclass_classification_data.txt", source = "libsvm") model <- spark.mlp(df, blockSize = 128, layers = c(4, 5, 4, 3), solver = "l-bfgs", maxIter = 100, - tol = 0.5, stepSize = 1, seed = "1") + tol = 0.5, stepSize = 1, seed = 1) # Test summary method summary <- summary(model) @@ -389,9 +389,9 @@ test_that("spark.mlp", { expect_equal(head(mlpPredictions$prediction, 10), c(1, 1, 1, 1, 0, 1, 2, 2, 1, 0)) # Test illegal parameter - expect_error(spark.mlp(df, layers = NULL)) - expect_error(spark.mlp(df, layers = c())) - expect_error(spark.mlp(df, layers = c(3))) + expect_error(spark.mlp(df, layers = NULL), "layers must be a integer vector with length > 1.") + expect_error(spark.mlp(df, layers = c()), "layers must be a integer vector with length > 1.") + expect_error(spark.mlp(df, layers = c(3)), "layers must be a integer vector with length > 1.") }) test_that("spark.naiveBayes", { From decbf6cdcb37e6a09fa537fa58fef40ecd316875 Mon Sep 17 00:00:00 2001 From: WeichenXu Date: Wed, 21 Sep 2016 22:09:02 +0800 Subject: [PATCH 9/9] add test for rand seed --- R/pkg/inst/tests/testthat/test_mllib.R | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/R/pkg/inst/tests/testthat/test_mllib.R b/R/pkg/inst/tests/testthat/test_mllib.R index 962b65416ecca..c1aa174a162b2 100644 --- a/R/pkg/inst/tests/testthat/test_mllib.R +++ b/R/pkg/inst/tests/testthat/test_mllib.R @@ -392,6 +392,16 @@ test_that("spark.mlp", { expect_error(spark.mlp(df, layers = NULL), "layers must be a integer vector with length > 1.") expect_error(spark.mlp(df, layers = c()), "layers must be a integer vector with length > 1.") expect_error(spark.mlp(df, layers = c(3)), "layers must be a integer vector with length > 1.") + + # Test random seed + # default seed + model <- spark.mlp(df, layers = c(4, 5, 4, 3), maxIter = 10) + mlpPredictions <- collect(select(predict(model, mlpTestDF), "prediction")) + expect_equal(head(mlpPredictions$prediction, 12), c(1, 1, 1, 1, 0, 1, 2, 2, 1, 2, 0, 1)) + # seed equals 10 + model <- spark.mlp(df, layers = c(4, 5, 4, 3), maxIter = 10, seed = 10) + mlpPredictions <- collect(select(predict(model, mlpTestDF), "prediction")) + expect_equal(head(mlpPredictions$prediction, 12), c(1, 1, 1, 1, 2, 1, 2, 2, 1, 0, 0, 1)) }) test_that("spark.naiveBayes", {