From d13ef8d5f2e9a938802940b8f0ea736a6b0b56b3 Mon Sep 17 00:00:00 2001
From: Zheng RuiFeng <ruifengz@foxmail.com>
Date: Tue, 23 May 2017 11:56:20 +0800
Subject: [PATCH 1/8] Add SparkR decision tree example and docs

---
 docs/ml-classification-regression.md  |  7 +++
 examples/src/main/r/ml/decisionTree.R | 65 +++++++++++++++++++++++++++
 2 files changed, 72 insertions(+)
 create mode 100644 examples/src/main/r/ml/decisionTree.R
diff --git a/docs/ml-classification-regression.md b/docs/ml-classification-regression.md
index ab6f587e09ef2..083df2e405d62 100644
--- a/docs/ml-classification-regression.md
+++ b/docs/ml-classification-regression.md
@@ -708,6 +708,13 @@ More details on parameters can be found in the [Python API documentation](api/py
 {% include_example python/ml/decision_tree_regression_example.py %}
 </div>
 
+<div data-lang="r" markdown="1">
+
+Refer to the [R API docs](api/R/spark.decisionTree.html) for more details.
+
+{% include_example regression r/ml/decisionTree.R %}
+</div>
+
 </div>
 
 
diff --git a/examples/src/main/r/ml/decisionTree.R b/examples/src/main/r/ml/decisionTree.R
new file mode 100644
index 0000000000000..9e10ae5519cd3
--- /dev/null
+++ b/examples/src/main/r/ml/decisionTree.R
@@ -0,0 +1,65 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# To run this example use
+# ./bin/spark-submit examples/src/main/r/ml/decisionTree.R
+
+# Load SparkR library into your R session
+library(SparkR)
+
+# Initialize the SparkSession, naming the application after this example
+sparkR.session(appName = "SparkR-ML-decisionTree-example")
+
+# DecisionTree classification model: fit, summarize and predict
+
+# $example on:classification$
+# Load LIBSVM-format sample data; the same DataFrame is used for training and testing
+df <- read.df("data/mllib/sample_libsvm_data.txt", source = "libsvm")
+training <- df
+test <- df
+
+# Fit a DecisionTree classification model with spark.decisionTree
+model <- spark.decisionTree(training, label ~ features, "classification")
+
+# Summarize the fitted model
+summary(model)
+
+# Predict on the test data and show the first rows of the result
+predictions <- predict(model, test)
+head(predictions)
+# $example off:classification$
+
+# DecisionTree regression model: fit, summarize and predict
+
+# $example on:regression$
+# Load LIBSVM-format sample data; the same DataFrame is used for training and testing
+df <- read.df("data/mllib/sample_linear_regression_data.txt", source = "libsvm")
+training <- df
+test <- df
+
+# Fit a DecisionTree regression model with spark.decisionTree
+model <- spark.decisionTree(training, label ~ features, "regression")
+
+# Summarize the fitted model
+summary(model)
+
+# Predict on the test data and show the first rows of the result
+predictions <- predict(model, test)
+head(predictions)
+# $example off:regression$
+
+sparkR.session.stop()

From c988ec44a444509910260a7750cc224ef5d35460 Mon Sep 17 00:00:00 2001
From: Zheng RuiFeng <ruifengz@foxmail.com>
Date: Tue, 23 May 2017 12:33:33 +0800
Subject: [PATCH 2/8] update vignettes

---
 R/pkg/vignettes/sparkr-vignettes.Rmd | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/R/pkg/vignettes/sparkr-vignettes.Rmd b/R/pkg/vignettes/sparkr-vignettes.Rmd
index 13a399165c8b4..78feffbb88cb6 100644
--- a/R/pkg/vignettes/sparkr-vignettes.Rmd
+++ b/R/pkg/vignettes/sparkr-vignettes.Rmd
@@ -503,6 +503,8 @@ SparkR supports the following machine learning models and algorithms.
 
 #### Tree - Classification and Regression
 
+* Decision Tree
+
 * Gradient-Boosted Trees (GBT)
 
 * Random Forest
@@ -776,6 +778,19 @@ newDF <- createDataFrame(data.frame(x = c(1.5, 3.2)))
 head(predict(isoregModel, newDF))
 ```
 
+#### Decision Tree
+
+`spark.decisionTree` fits a [decision tree](https://en.wikipedia.org/wiki/Decision_tree_learning) classification or regression model on a `SparkDataFrame`.
+Users can call `summary` to get a summary of the fitted model, `predict` to make predictions, and `write.ml`/`read.ml` to save/load fitted models.
+
+We use the `longley` dataset to train a decision tree and make predictions:
+
+```{r, warning=FALSE}
+df <- createDataFrame(longley)
+dtModel <- spark.decisionTree(df, Employed ~ ., type = "regression", maxDepth = 2)
+summary(dtModel)
+predictions <- predict(dtModel, df)
+
 #### Gradient-Boosted Trees
 
 `spark.gbt` fits a [gradient-boosted tree](https://en.wikipedia.org/wiki/Gradient_boosting) classification or regression model on a `SparkDataFrame`.

From 1003b491b2dde3930cb97861d25e90c708ce03ac Mon Sep 17 00:00:00 2001
From: Zheng RuiFeng <ruifengz@foxmail.com>
Date: Tue, 23 May 2017 13:44:44 +0800
Subject: [PATCH 3/8] update sparkr.md

---
 docs/sparkr.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/sparkr.md b/docs/sparkr.md
index 569b85e72c3cf..a3254e7654134 100644
--- a/docs/sparkr.md
+++ b/docs/sparkr.md
@@ -492,6 +492,7 @@ SparkR supports the following machine learning algorithms currently:
 
 #### Tree
 
+* [`spark.decisionTree`](api/R/spark.decisionTree.html): `Decision Tree for` [`Regression`](ml-classification-regression.html#decision-tree-regression) `and` [`Classification`](ml-classification-regression.html#decision-tree-classifier)
 * [`spark.gbt`](api/R/spark.gbt.html): `Gradient Boosted Trees for` [`Regression`](ml-classification-regression.html#gradient-boosted-tree-regression) `and` [`Classification`](ml-classification-regression.html#gradient-boosted-tree-classifier)
 * [`spark.randomForest`](api/R/spark.randomForest.html): `Random Forest for` [`Regression`](ml-classification-regression.html#random-forest-regression) `and` [`Classification`](ml-classification-regression.html#random-forest-classifier)
 

From 377588ebdcc581f32659bcb4d179ae8aaf8378e4 Mon Sep 17 00:00:00 2001
From: Zheng RuiFeng <ruifengz@foxmail.com>
Date: Tue, 23 May 2017 14:38:36 +0800
Subject: [PATCH 4/8] close decision tree code fence & use explicit R formulas

---
 R/pkg/vignettes/sparkr-vignettes.Rmd | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/R/pkg/vignettes/sparkr-vignettes.Rmd b/R/pkg/vignettes/sparkr-vignettes.Rmd
index 78feffbb88cb6..fa1989d2903ee 100644
--- a/R/pkg/vignettes/sparkr-vignettes.Rmd
+++ b/R/pkg/vignettes/sparkr-vignettes.Rmd
@@ -430,7 +430,7 @@ We use `svm` in package `e1071` as an example. We use all default settings excep
 costs <- exp(seq(from = log(1), to = log(1000), length.out = 5))
 train <- function(cost) {
   stopifnot(requireNamespace("e1071", quietly = TRUE))
-  model <- e1071::svm(Species ~ ., data = iris, cost = cost)
+  model <- e1071::svm(Species ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width, data = iris, cost = cost)
   summary(model)
 }
 ```
@@ -567,7 +567,7 @@ for binary classification.
 t <- as.data.frame(Titanic)
 training <- createDataFrame(t)
 # fit a Linear SVM classifier model
-model <- spark.svmLinear(training,  Survived ~ ., regParam = 0.01, maxIter = 10)
+model <- spark.svmLinear(training,  Survived ~ Class + Sex + Age + Freq, regParam = 0.01, maxIter = 10)
 summary(model)
 ```
 
@@ -590,7 +590,7 @@ Binomial logistic regression
 ```{r}
 t <- as.data.frame(Titanic)
 training <- createDataFrame(t)
-model <- spark.logit(training, Survived ~ ., regParam = 0.04741301)
+model <- spark.logit(training, Survived ~ Class + Sex + Age + Freq, regParam = 0.04741301)
 summary(model)
 ```
 
@@ -604,7 +604,7 @@ Multinomial logistic regression against three classes
 t <- as.data.frame(Titanic)
 training <- createDataFrame(t)
 # Note in this case, Spark infers it is multinomial logistic regression, so family = "multinomial" is optional.
-model <- spark.logit(training, Class ~ ., regParam = 0.07815179)
+model <- spark.logit(training, Class ~ Sex + Age + Freq + Survived, regParam = 0.07815179)
 summary(model)
 ```
 
@@ -787,9 +787,10 @@ We use the `longley` dataset to train a decision tree and make predictions:
 
 ```{r, warning=FALSE}
 df <- createDataFrame(longley)
-dtModel <- spark.decisionTree(df, Employed ~ ., type = "regression", maxDepth = 2)
+dtModel <- spark.decisionTree(df, Employed ~ Unemployed + Population, type = "regression", maxDepth = 2)
 summary(dtModel)
 predictions <- predict(dtModel, df)
+```
 
 #### Gradient-Boosted Trees
 
@@ -800,7 +801,7 @@ We use the `longley` dataset to train a gradient-boosted tree and make predictio
 
 ```{r, warning=FALSE}
 df <- createDataFrame(longley)
-gbtModel <- spark.gbt(df, Employed ~ ., type = "regression", maxDepth = 2, maxIter = 2)
+gbtModel <- spark.gbt(df, Employed ~ Unemployed + Population, type = "regression", maxDepth = 2, maxIter = 2)
 summary(gbtModel)
 predictions <- predict(gbtModel, df)
 ```
@@ -814,7 +815,7 @@ In the following example, we use the `longley` dataset to train a random forest
 
 ```{r, warning=FALSE}
 df <- createDataFrame(longley)
-rfModel <- spark.randomForest(df, Employed ~ ., type = "regression", maxDepth = 2, numTrees = 2)
+rfModel <- spark.randomForest(df, Employed ~ Unemployed + Population, type = "regression", maxDepth = 2, numTrees = 2)
 summary(rfModel)
 predictions <- predict(rfModel, df)
 ```

From 6c07cff75821f2b4ad2460032dc62b220f895aca Mon Sep 17 00:00:00 2001
From: Zheng RuiFeng <ruifengz@foxmail.com>
Date: Thu, 25 May 2017 09:46:45 +0800
Subject: [PATCH 5/8] update test with warning

---
 R/pkg/vignettes/sparkr-vignettes.Rmd | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/R/pkg/vignettes/sparkr-vignettes.Rmd b/R/pkg/vignettes/sparkr-vignettes.Rmd
index fa1989d2903ee..cdcc2602b44a3 100644
--- a/R/pkg/vignettes/sparkr-vignettes.Rmd
+++ b/R/pkg/vignettes/sparkr-vignettes.Rmd
@@ -567,7 +567,7 @@ for binary classification.
 t <- as.data.frame(Titanic)
 training <- createDataFrame(t)
 # fit a Linear SVM classifier model
-model <- spark.svmLinear(training,  Survived ~ Class + Sex + Age + Freq, regParam = 0.01, maxIter = 10)
+model <- spark.svmLinear(training,  Survived ~ ., regParam = 0.01, maxIter = 10)
 summary(model)
 ```
 
@@ -590,7 +590,7 @@ Binomial logistic regression
 ```{r}
 t <- as.data.frame(Titanic)
 training <- createDataFrame(t)
-model <- spark.logit(training, Survived ~ Class + Sex + Age + Freq, regParam = 0.04741301)
+model <- spark.logit(training, Survived ~ ., regParam = 0.04741301)
 summary(model)
 ```
 
@@ -604,7 +604,7 @@ Multinomial logistic regression against three classes
 t <- as.data.frame(Titanic)
 training <- createDataFrame(t)
 # Note in this case, Spark infers it is multinomial logistic regression, so family = "multinomial" is optional.
-model <- spark.logit(training, Class ~ Sex + Age + Freq + Survived, regParam = 0.07815179)
+model <- spark.logit(training, Class ~ ., regParam = 0.07815179)
 summary(model)
 ```
 
@@ -785,9 +785,9 @@ Users can call `summary` to get a summary of the fitted model, `predict` to make
 
 We use the `longley` dataset to train a decision tree and make predictions:
 
-```{r, warning=FALSE}
+```{r}
 df <- createDataFrame(longley)
-dtModel <- spark.decisionTree(df, Employed ~ Unemployed + Population, type = "regression", maxDepth = 2)
+dtModel <- spark.decisionTree(df, Employed ~ ., type = "regression", maxDepth = 2)
 summary(dtModel)
 predictions <- predict(dtModel, df)
 ```
@@ -801,7 +801,7 @@ We use the `longley` dataset to train a gradient-boosted tree and make predictio
 
 ```{r, warning=FALSE}
 df <- createDataFrame(longley)
-gbtModel <- spark.gbt(df, Employed ~ Unemployed + Population, type = "regression", maxDepth = 2, maxIter = 2)
+gbtModel <- spark.gbt(df, Employed ~ ., type = "regression", maxDepth = 2, maxIter = 2)
 summary(gbtModel)
 predictions <- predict(gbtModel, df)
 ```
@@ -815,7 +815,7 @@ In the following example, we use the `longley` dataset to train a random forest
 
 ```{r, warning=FALSE}
 df <- createDataFrame(longley)
-rfModel <- spark.randomForest(df, Employed ~ Unemployed + Population, type = "regression", maxDepth = 2, numTrees = 2)
+rfModel <- spark.randomForest(df, Employed ~ ., type = "regression", maxDepth = 2, numTrees = 2)
 summary(rfModel)
 predictions <- predict(rfModel, df)
 ```

From 40cc6be3eaa83e334fe541119e5023a71f14d5f7 Mon Sep 17 00:00:00 2001
From: Zheng RuiFeng <ruifengz@foxmail.com>
Date: Thu, 25 May 2017 09:47:56 +0800
Subject: [PATCH 6/8] revert one nit

---
 R/pkg/vignettes/sparkr-vignettes.Rmd | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/pkg/vignettes/sparkr-vignettes.Rmd b/R/pkg/vignettes/sparkr-vignettes.Rmd
index cdcc2602b44a3..9bb918f825bb6 100644
--- a/R/pkg/vignettes/sparkr-vignettes.Rmd
+++ b/R/pkg/vignettes/sparkr-vignettes.Rmd
@@ -430,7 +430,7 @@ We use `svm` in package `e1071` as an example. We use all default settings excep
 costs <- exp(seq(from = log(1), to = log(1000), length.out = 5))
 train <- function(cost) {
   stopifnot(requireNamespace("e1071", quietly = TRUE))
-  model <- e1071::svm(Species ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width, data = iris, cost = cost)
+  model <- e1071::svm(Species ~ ., data = iris, cost = cost)
   summary(model)
 }
 ```

From 48a968667cdce59e3d8713220160a3d96b20afcd Mon Sep 17 00:00:00 2001
From: Zheng RuiFeng <ruifengz@foxmail.com>
Date: Thu, 25 May 2017 15:37:49 +0800
Subject: [PATCH 7/8] use Titanic instead of longley

---
 R/pkg/vignettes/sparkr-vignettes.Rmd | 40 +++++++++++++++-------------
 1 file changed, 22 insertions(+), 18 deletions(-)

diff --git a/R/pkg/vignettes/sparkr-vignettes.Rmd b/R/pkg/vignettes/sparkr-vignettes.Rmd
index 9bb918f825bb6..4ea538cb51ef6 100644
--- a/R/pkg/vignettes/sparkr-vignettes.Rmd
+++ b/R/pkg/vignettes/sparkr-vignettes.Rmd
@@ -783,11 +783,12 @@ head(predict(isoregModel, newDF))
 `spark.decisionTree` fits a [decision tree](https://en.wikipedia.org/wiki/Decision_tree_learning) classification or regression model on a `SparkDataFrame`.
 Users can call `summary` to get a summary of the fitted model, `predict` to make predictions, and `write.ml`/`read.ml` to save/load fitted models.
 
-We use the `longley` dataset to train a decision tree and make predictions:
+We use the `Titanic` dataset to train a decision tree and make predictions:
 
 ```{r}
-df <- createDataFrame(longley)
-dtModel <- spark.decisionTree(df, Employed ~ ., type = "regression", maxDepth = 2)
+t <- as.data.frame(Titanic)
+df <- createDataFrame(t)
+dtModel <- spark.decisionTree(df, Age ~ ., type = "regression", maxDepth = 2)
 summary(dtModel)
 predictions <- predict(dtModel, df)
 ```
@@ -797,11 +798,12 @@ predictions <- predict(dtModel, df)
 `spark.gbt` fits a [gradient-boosted tree](https://en.wikipedia.org/wiki/Gradient_boosting) classification or regression model on a `SparkDataFrame`.
 Users can call `summary` to get a summary of the fitted model, `predict` to make predictions, and `write.ml`/`read.ml` to save/load fitted models.
 
-We use the `longley` dataset to train a gradient-boosted tree and make predictions:
+We use the `Titanic` dataset to train a gradient-boosted tree and make predictions:
 
-```{r, warning=FALSE}
-df <- createDataFrame(longley)
-gbtModel <- spark.gbt(df, Employed ~ ., type = "regression", maxDepth = 2, maxIter = 2)
+```{r}
+t <- as.data.frame(Titanic)
+df <- createDataFrame(t)
+gbtModel <- spark.gbt(df, Age ~ ., type = "regression", maxDepth = 2, maxIter = 2)
 summary(gbtModel)
 predictions <- predict(gbtModel, df)
 ```
@@ -811,11 +813,12 @@ predictions <- predict(gbtModel, df)
 `spark.randomForest` fits a [random forest](https://en.wikipedia.org/wiki/Random_forest) classification or regression model on a `SparkDataFrame`.
 Users can call `summary` to get a summary of the fitted model, `predict` to make predictions, and `write.ml`/`read.ml` to save/load fitted models.
 
-In the following example, we use the `longley` dataset to train a random forest and make predictions:
+In the following example, we use the `Titanic` dataset to train a random forest and make predictions:
 
-```{r, warning=FALSE}
-df <- createDataFrame(longley)
-rfModel <- spark.randomForest(df, Employed ~ ., type = "regression", maxDepth = 2, numTrees = 2)
+```{r}
+t <- as.data.frame(Titanic)
+df <- createDataFrame(t)
+rfModel <- spark.randomForest(df, Age ~ ., type = "regression", maxDepth = 2, numTrees = 2)
 summary(rfModel)
 predictions <- predict(rfModel, df)
 ```
@@ -981,17 +984,18 @@ Given a `SparkDataFrame`, the test compares continuous data in a given column `t
 specified by parameter `nullHypothesis`.
 Users can call `summary` to get a summary of the test results.
 
-In the following example, we test whether the `longley` dataset's `Armed_Forces` column
+In the following example, we test whether the `Titanic` dataset's `Freq` column
 follows a normal distribution.  We set the parameters of the normal distribution using
 the mean and standard deviation of the sample.
 
-```{r, warning=FALSE}
-df <- createDataFrame(longley)
-afStats <- head(select(df, mean(df$Armed_Forces), sd(df$Armed_Forces)))
-afMean <- afStats[1]
-afStd <- afStats[2]
+```{r}
+t <- as.data.frame(Titanic)
+df <- createDataFrame(t)
+freqStats <- head(select(df, mean(df$Freq), sd(df$Freq)))
+freqMean <- freqStats[1]
+freqStd <- freqStats[2]
 
-test <- spark.kstest(df, "Armed_Forces", "norm", c(afMean, afStd))
+test <- spark.kstest(df, "Freq", "norm", c(freqMean, freqStd))
 testSummary <- summary(test)
 testSummary
 ```

From 1a97e42ea9305a043eccead7013ad35e9aa89f91 Mon Sep 17 00:00:00 2001
From: Zheng RuiFeng <ruifengz@foxmail.com>
Date: Thu, 25 May 2017 15:44:47 +0800
Subject: [PATCH 8/8] use classification instead of regression

---
 R/pkg/vignettes/sparkr-vignettes.Rmd | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/R/pkg/vignettes/sparkr-vignettes.Rmd b/R/pkg/vignettes/sparkr-vignettes.Rmd
index 4ea538cb51ef6..2301a64576d0e 100644
--- a/R/pkg/vignettes/sparkr-vignettes.Rmd
+++ b/R/pkg/vignettes/sparkr-vignettes.Rmd
@@ -788,7 +788,7 @@ We use the `Titanic` dataset to train a decision tree and make predictions:
 ```{r}
 t <- as.data.frame(Titanic)
 df <- createDataFrame(t)
-dtModel <- spark.decisionTree(df, Age ~ ., type = "regression", maxDepth = 2)
+dtModel <- spark.decisionTree(df, Survived ~ ., type = "classification", maxDepth = 2)
 summary(dtModel)
 predictions <- predict(dtModel, df)
 ```
@@ -803,7 +803,7 @@ We use the `Titanic` dataset to train a gradient-boosted tree and make predictio
 ```{r}
 t <- as.data.frame(Titanic)
 df <- createDataFrame(t)
-gbtModel <- spark.gbt(df, Age ~ ., type = "regression", maxDepth = 2, maxIter = 2)
+gbtModel <- spark.gbt(df, Survived ~ ., type = "classification", maxDepth = 2, maxIter = 2)
 summary(gbtModel)
 predictions <- predict(gbtModel, df)
 ```
@@ -818,7 +818,7 @@ In the following example, we use the `Titanic` dataset to train a random forest
 ```{r}
 t <- as.data.frame(Titanic)
 df <- createDataFrame(t)
-rfModel <- spark.randomForest(df, Age ~ ., type = "regression", maxDepth = 2, numTrees = 2)
+rfModel <- spark.randomForest(df, Survived ~ ., type = "classification", maxDepth = 2, numTrees = 2)
 summary(rfModel)
 predictions <- predict(rfModel, df)
 ```