From d13ef8d5f2e9a938802940b8f0ea736a6b0b56b3 Mon Sep 17 00:00:00 2001 From: Zheng RuiFeng Date: Tue, 23 May 2017 11:56:20 +0800 Subject: [PATCH 1/8] create pr --- docs/ml-classification-regression.md | 7 +++ examples/src/main/r/ml/decisionTree.R | 65 +++++++++++++++++++++++++++ 2 files changed, 72 insertions(+) create mode 100644 examples/src/main/r/ml/decisionTree.R diff --git a/docs/ml-classification-regression.md b/docs/ml-classification-regression.md index ab6f587e09ef2..083df2e405d62 100644 --- a/docs/ml-classification-regression.md +++ b/docs/ml-classification-regression.md @@ -708,6 +708,13 @@ More details on parameters can be found in the [Python API documentation](api/py {% include_example python/ml/decision_tree_regression_example.py %} +<div data-lang="r" markdown="1">
+ +Refer to the [R API docs](api/R/spark.decisionTree.html) for more details. + +{% include_example regression r/ml/decisionTree.R %} +</div>
+ diff --git a/examples/src/main/r/ml/decisionTree.R b/examples/src/main/r/ml/decisionTree.R new file mode 100644 index 0000000000000..9e10ae5519cd3 --- /dev/null +++ b/examples/src/main/r/ml/decisionTree.R @@ -0,0 +1,65 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +# To run this example use +# ./bin/spark-submit examples/src/main/r/ml/decisionTree.R + +# Load SparkR library into your R session +library(SparkR) + +# Initialize SparkSession +sparkR.session(appName = "SparkR-ML-decisionTree-example") + +# DecisionTree classification model + +# $example on:classification$ +# Load training data +df <- read.df("data/mllib/sample_libsvm_data.txt", source = "libsvm") +training <- df +test <- df + +# Fit a DecisionTree classification model with spark.decisionTree +model <- spark.decisionTree(training, label ~ features, "classification") + +# Model summary +summary(model) + +# Prediction +predictions <- predict(model, test) +head(predictions) +# $example off:classification$ + +# DecisionTree regression model + +# $example on:regression$ +# Load training data +df <- read.df("data/mllib/sample_linear_regression_data.txt", source = "libsvm") +training <- df +test <- df + +# Fit a DecisionTree regression model with spark.decisionTree +model <- spark.decisionTree(training, label ~ features, "regression") + +# Model summary +summary(model) + +# Prediction +predictions <- predict(model, test) +head(predictions) +# $example off:regression$ + +sparkR.session.stop() From c988ec44a444509910260a7750cc224ef5d35460 Mon Sep 17 00:00:00 2001 From: Zheng RuiFeng Date: Tue, 23 May 2017 12:33:33 +0800 Subject: [PATCH 2/8] update vignettes --- R/pkg/vignettes/sparkr-vignettes.Rmd | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/R/pkg/vignettes/sparkr-vignettes.Rmd b/R/pkg/vignettes/sparkr-vignettes.Rmd index 13a399165c8b4..78feffbb88cb6 100644 --- a/R/pkg/vignettes/sparkr-vignettes.Rmd +++ b/R/pkg/vignettes/sparkr-vignettes.Rmd @@ -503,6 +503,8 @@ SparkR supports the following machine learning models and algorithms. 
#### Tree - Classification and Regression +* Decision Tree + * Gradient-Boosted Trees (GBT) * Random Forest @@ -776,6 +778,19 @@ newDF <- createDataFrame(data.frame(x = c(1.5, 3.2))) head(predict(isoregModel, newDF)) ``` +#### Decision Tree + +`spark.decisionTree` fits a [decision tree](https://en.wikipedia.org/wiki/Decision_tree_learning) classification or regression model on a `SparkDataFrame`. +Users can call `summary` to get a summary of the fitted model, `predict` to make predictions, and `write.ml`/`read.ml` to save/load fitted models. + +We use the `longley` dataset to train a decision tree and make predictions: + +```{r, warning=FALSE} +df <- createDataFrame(longley) +dtModel <- spark.decisionTree(df, Employed ~ ., type = "regression", maxDepth = 2) +summary(dtModel) +predictions <- predict(dtModel, df) + #### Gradient-Boosted Trees `spark.gbt` fits a [gradient-boosted tree](https://en.wikipedia.org/wiki/Gradient_boosting) classification or regression model on a `SparkDataFrame`. 
From 1003b491b2dde3930cb97861d25e90c708ce03ac Mon Sep 17 00:00:00 2001 From: Zheng RuiFeng Date: Tue, 23 May 2017 13:44:44 +0800 Subject: [PATCH 3/8] update sparkr.md --- docs/sparkr.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/sparkr.md b/docs/sparkr.md index 569b85e72c3cf..a3254e7654134 100644 --- a/docs/sparkr.md +++ b/docs/sparkr.md @@ -492,6 +492,7 @@ SparkR supports the following machine learning algorithms currently: #### Tree +* [`spark.decisionTree`](api/R/spark.decisionTree.html): `Decision Tree for` [`Regression`](ml-classification-regression.html#decision-tree-regression) `and` [`Classification`](ml-classification-regression.html#decision-tree-classifier) * [`spark.gbt`](api/R/spark.gbt.html): `Gradient Boosted Trees for` [`Regression`](ml-classification-regression.html#gradient-boosted-tree-regression) `and` [`Classification`](ml-classification-regression.html#gradient-boosted-tree-classifier) * [`spark.randomForest`](api/R/spark.randomForest.html): `Random Forest for` [`Regression`](ml-classification-regression.html#random-forest-regression) `and` [`Classification`](ml-classification-regression.html#random-forest-classifier) From 377588ebdcc581f32659bcb4d179ae8aaf8378e4 Mon Sep 17 00:00:00 2001 From: Zheng RuiFeng Date: Tue, 23 May 2017 14:38:36 +0800 Subject: [PATCH 4/8] fix bug & update r formula --- R/pkg/vignettes/sparkr-vignettes.Rmd | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/R/pkg/vignettes/sparkr-vignettes.Rmd b/R/pkg/vignettes/sparkr-vignettes.Rmd index 78feffbb88cb6..fa1989d2903ee 100644 --- a/R/pkg/vignettes/sparkr-vignettes.Rmd +++ b/R/pkg/vignettes/sparkr-vignettes.Rmd @@ -430,7 +430,7 @@ We use `svm` in package `e1071` as an example. 
We use all default settings excep costs <- exp(seq(from = log(1), to = log(1000), length.out = 5)) train <- function(cost) { stopifnot(requireNamespace("e1071", quietly = TRUE)) - model <- e1071::svm(Species ~ ., data = iris, cost = cost) + model <- e1071::svm(Species ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width, data = iris, cost = cost) summary(model) } ``` @@ -567,7 +567,7 @@ for binary classification. t <- as.data.frame(Titanic) training <- createDataFrame(t) # fit a Linear SVM classifier model -model <- spark.svmLinear(training, Survived ~ ., regParam = 0.01, maxIter = 10) +model <- spark.svmLinear(training, Survived ~ Class + Sex + Age + Freq, regParam = 0.01, maxIter = 10) summary(model) ``` @@ -590,7 +590,7 @@ Binomial logistic regression ```{r} t <- as.data.frame(Titanic) training <- createDataFrame(t) -model <- spark.logit(training, Survived ~ ., regParam = 0.04741301) +model <- spark.logit(training, Survived ~ Class + Sex + Age + Freq, regParam = 0.04741301) summary(model) ``` @@ -604,7 +604,7 @@ Multinomial logistic regression against three classes t <- as.data.frame(Titanic) training <- createDataFrame(t) # Note in this case, Spark infers it is multinomial logistic regression, so family = "multinomial" is optional. 
-model <- spark.logit(training, Class ~ ., regParam = 0.07815179) +model <- spark.logit(training, Class ~ Sex + Age + Freq + Survived, regParam = 0.07815179) summary(model) ``` @@ -787,9 +787,10 @@ We use the `longley` dataset to train a decision tree and make predictions: ```{r, warning=FALSE} df <- createDataFrame(longley) -dtModel <- spark.decisionTree(df, Employed ~ ., type = "regression", maxDepth = 2) +dtModel <- spark.decisionTree(df, Employed ~ Unemployed + Population, type = "regression", maxDepth = 2) summary(dtModel) predictions <- predict(dtModel, df) +``` #### Gradient-Boosted Trees @@ -800,7 +801,7 @@ We use the `longley` dataset to train a gradient-boosted tree and make predictio ```{r, warning=FALSE} df <- createDataFrame(longley) -gbtModel <- spark.gbt(df, Employed ~ ., type = "regression", maxDepth = 2, maxIter = 2) +gbtModel <- spark.gbt(df, Employed ~ Unemployed + Population, type = "regression", maxDepth = 2, maxIter = 2) summary(gbtModel) predictions <- predict(gbtModel, df) ``` @@ -814,7 +815,7 @@ In the following example, we use the `longley` dataset to train a random forest ```{r, warning=FALSE} df <- createDataFrame(longley) -rfModel <- spark.randomForest(df, Employed ~ ., type = "regression", maxDepth = 2, numTrees = 2) +rfModel <- spark.randomForest(df, Employed ~ Unemployed + Population, type = "regression", maxDepth = 2, numTrees = 2) summary(rfModel) predictions <- predict(rfModel, df) ``` From 6c07cff75821f2b4ad2460032dc62b220f895aca Mon Sep 17 00:00:00 2001 From: Zheng RuiFeng Date: Thu, 25 May 2017 09:46:45 +0800 Subject: [PATCH 5/8] update test with warning --- R/pkg/vignettes/sparkr-vignettes.Rmd | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/R/pkg/vignettes/sparkr-vignettes.Rmd b/R/pkg/vignettes/sparkr-vignettes.Rmd index fa1989d2903ee..cdcc2602b44a3 100644 --- a/R/pkg/vignettes/sparkr-vignettes.Rmd +++ b/R/pkg/vignettes/sparkr-vignettes.Rmd @@ -567,7 +567,7 @@ for binary classification. 
t <- as.data.frame(Titanic) training <- createDataFrame(t) # fit a Linear SVM classifier model -model <- spark.svmLinear(training, Survived ~ Class + Sex + Age + Freq, regParam = 0.01, maxIter = 10) +model <- spark.svmLinear(training, Survived ~ ., regParam = 0.01, maxIter = 10) summary(model) ``` @@ -590,7 +590,7 @@ Binomial logistic regression ```{r} t <- as.data.frame(Titanic) training <- createDataFrame(t) -model <- spark.logit(training, Survived ~ Class + Sex + Age + Freq, regParam = 0.04741301) +model <- spark.logit(training, Survived ~ ., regParam = 0.04741301) summary(model) ``` @@ -604,7 +604,7 @@ Multinomial logistic regression against three classes t <- as.data.frame(Titanic) training <- createDataFrame(t) # Note in this case, Spark infers it is multinomial logistic regression, so family = "multinomial" is optional. -model <- spark.logit(training, Class ~ Sex + Age + Freq + Survived, regParam = 0.07815179) +model <- spark.logit(training, Class ~ ., regParam = 0.07815179) summary(model) ``` @@ -785,9 +785,9 @@ Users can call `summary` to get a summary of the fitted model, `predict` to make We use the `longley` dataset to train a decision tree and make predictions: -```{r, warning=FALSE} +```{r} df <- createDataFrame(longley) -dtModel <- spark.decisionTree(df, Employed ~ Unemployed + Population, type = "regression", maxDepth = 2) +dtModel <- spark.decisionTree(df, Employed ~ ., type = "regression", maxDepth = 2) summary(dtModel) predictions <- predict(dtModel, df) ``` @@ -801,7 +801,7 @@ We use the `longley` dataset to train a gradient-boosted tree and make predictio ```{r, warning=FALSE} df <- createDataFrame(longley) -gbtModel <- spark.gbt(df, Employed ~ Unemployed + Population, type = "regression", maxDepth = 2, maxIter = 2) +gbtModel <- spark.gbt(df, Employed ~ ., type = "regression", maxDepth = 2, maxIter = 2) summary(gbtModel) predictions <- predict(gbtModel, df) ``` @@ -815,7 +815,7 @@ In the following example, we use the `longley` dataset to train 
a random forest ```{r, warning=FALSE} df <- createDataFrame(longley) -rfModel <- spark.randomForest(df, Employed ~ Unemployed + Population, type = "regression", maxDepth = 2, numTrees = 2) +rfModel <- spark.randomForest(df, Employed ~ ., type = "regression", maxDepth = 2, numTrees = 2) summary(rfModel) predictions <- predict(rfModel, df) ``` From 40cc6be3eaa83e334fe541119e5023a71f14d5f7 Mon Sep 17 00:00:00 2001 From: Zheng RuiFeng Date: Thu, 25 May 2017 09:47:56 +0800 Subject: [PATCH 6/8] revert one nit --- R/pkg/vignettes/sparkr-vignettes.Rmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/pkg/vignettes/sparkr-vignettes.Rmd b/R/pkg/vignettes/sparkr-vignettes.Rmd index cdcc2602b44a3..9bb918f825bb6 100644 --- a/R/pkg/vignettes/sparkr-vignettes.Rmd +++ b/R/pkg/vignettes/sparkr-vignettes.Rmd @@ -430,7 +430,7 @@ We use `svm` in package `e1071` as an example. We use all default settings excep costs <- exp(seq(from = log(1), to = log(1000), length.out = 5)) train <- function(cost) { stopifnot(requireNamespace("e1071", quietly = TRUE)) - model <- e1071::svm(Species ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width, data = iris, cost = cost) + model <- e1071::svm(Species ~ ., data = iris, cost = cost) summary(model) } ``` From 48a968667cdce59e3d8713220160a3d96b20afcd Mon Sep 17 00:00:00 2001 From: Zheng RuiFeng Date: Thu, 25 May 2017 15:37:49 +0800 Subject: [PATCH 7/8] use Titanic instead of longley --- R/pkg/vignettes/sparkr-vignettes.Rmd | 40 +++++++++++++++------------- 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/R/pkg/vignettes/sparkr-vignettes.Rmd b/R/pkg/vignettes/sparkr-vignettes.Rmd index 9bb918f825bb6..4ea538cb51ef6 100644 --- a/R/pkg/vignettes/sparkr-vignettes.Rmd +++ b/R/pkg/vignettes/sparkr-vignettes.Rmd @@ -783,11 +783,12 @@ head(predict(isoregModel, newDF)) `spark.decisionTree` fits a [decision tree](https://en.wikipedia.org/wiki/Decision_tree_learning) classification or regression model on a `SparkDataFrame`. 
Users can call `summary` to get a summary of the fitted model, `predict` to make predictions, and `write.ml`/`read.ml` to save/load fitted models. -We use the `longley` dataset to train a decision tree and make predictions: +We use the `Titanic` dataset to train a decision tree and make predictions: ```{r} -df <- createDataFrame(longley) -dtModel <- spark.decisionTree(df, Employed ~ ., type = "regression", maxDepth = 2) +t <- as.data.frame(Titanic) +df <- createDataFrame(t) +dtModel <- spark.decisionTree(df, Age ~ ., type = "regression", maxDepth = 2) summary(dtModel) predictions <- predict(dtModel, df) ``` @@ -797,11 +798,12 @@ predictions <- predict(dtModel, df) `spark.gbt` fits a [gradient-boosted tree](https://en.wikipedia.org/wiki/Gradient_boosting) classification or regression model on a `SparkDataFrame`. Users can call `summary` to get a summary of the fitted model, `predict` to make predictions, and `write.ml`/`read.ml` to save/load fitted models. -We use the `longley` dataset to train a gradient-boosted tree and make predictions: +We use the `Titanic` dataset to train a gradient-boosted tree and make predictions: -```{r, warning=FALSE} -df <- createDataFrame(longley) -gbtModel <- spark.gbt(df, Employed ~ ., type = "regression", maxDepth = 2, maxIter = 2) +```{r} +t <- as.data.frame(Titanic) +df <- createDataFrame(t) +gbtModel <- spark.gbt(df, Age ~ ., type = "regression", maxDepth = 2, maxIter = 2) summary(gbtModel) predictions <- predict(gbtModel, df) ``` @@ -811,11 +813,12 @@ predictions <- predict(gbtModel, df) `spark.randomForest` fits a [random forest](https://en.wikipedia.org/wiki/Random_forest) classification or regression model on a `SparkDataFrame`. Users can call `summary` to get a summary of the fitted model, `predict` to make predictions, and `write.ml`/`read.ml` to save/load fitted models. 
-In the following example, we use the `longley` dataset to train a random forest and make predictions: +In the following example, we use the `Titanic` dataset to train a random forest and make predictions: -```{r, warning=FALSE} -df <- createDataFrame(longley) -rfModel <- spark.randomForest(df, Employed ~ ., type = "regression", maxDepth = 2, numTrees = 2) +```{r} +t <- as.data.frame(Titanic) +df <- createDataFrame(t) +rfModel <- spark.randomForest(df, Age ~ ., type = "regression", maxDepth = 2, numTrees = 2) summary(rfModel) predictions <- predict(rfModel, df) ``` @@ -981,17 +984,18 @@ Given a `SparkDataFrame`, the test compares continuous data in a given column `t specified by parameter `nullHypothesis`. Users can call `summary` to get a summary of the test results. -In the following example, we test whether the `longley` dataset's `Armed_Forces` column +In the following example, we test whether the `Titanic` dataset's `Freq` column follows a normal distribution. We set the parameters of the normal distribution using the mean and standard deviation of the sample. 
-```{r, warning=FALSE} -df <- createDataFrame(longley) -afStats <- head(select(df, mean(df$Armed_Forces), sd(df$Armed_Forces))) -afMean <- afStats[1] -afStd <- afStats[2] +```{r} +t <- as.data.frame(Titanic) +df <- createDataFrame(t) +freqStats <- head(select(df, mean(df$Freq), sd(df$Freq))) +freqMean <- freqStats[1] +freqStd <- freqStats[2] -test <- spark.kstest(df, "Armed_Forces", "norm", c(afMean, afStd)) +test <- spark.kstest(df, "Freq", "norm", c(freqMean, freqStd)) testSummary <- summary(test) testSummary ``` From 1a97e42ea9305a043eccead7013ad35e9aa89f91 Mon Sep 17 00:00:00 2001 From: Zheng RuiFeng Date: Thu, 25 May 2017 15:44:47 +0800 Subject: [PATCH 8/8] use cla instead of reg --- R/pkg/vignettes/sparkr-vignettes.Rmd | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/R/pkg/vignettes/sparkr-vignettes.Rmd b/R/pkg/vignettes/sparkr-vignettes.Rmd index 4ea538cb51ef6..2301a64576d0e 100644 --- a/R/pkg/vignettes/sparkr-vignettes.Rmd +++ b/R/pkg/vignettes/sparkr-vignettes.Rmd @@ -788,7 +788,7 @@ We use the `Titanic` dataset to train a decision tree and make predictions: ```{r} t <- as.data.frame(Titanic) df <- createDataFrame(t) -dtModel <- spark.decisionTree(df, Age ~ ., type = "regression", maxDepth = 2) +dtModel <- spark.decisionTree(df, Survived ~ ., type = "classification", maxDepth = 2) summary(dtModel) predictions <- predict(dtModel, df) ``` @@ -803,7 +803,7 @@ We use the `Titanic` dataset to train a gradient-boosted tree and make predictio ```{r} t <- as.data.frame(Titanic) df <- createDataFrame(t) -gbtModel <- spark.gbt(df, Age ~ ., type = "regression", maxDepth = 2, maxIter = 2) +gbtModel <- spark.gbt(df, Survived ~ ., type = "classification", maxDepth = 2, maxIter = 2) summary(gbtModel) predictions <- predict(gbtModel, df) ``` @@ -818,7 +818,7 @@ In the following example, we use the `Titanic` dataset to train a random forest ```{r} t <- as.data.frame(Titanic) df <- createDataFrame(t) -rfModel <- spark.randomForest(df, Age ~ ., type = 
"regression", maxDepth = 2, numTrees = 2) +rfModel <- spark.randomForest(df, Survived ~ ., type = "classification", maxDepth = 2, numTrees = 2) summary(rfModel) predictions <- predict(rfModel, df) ```