From c9555e83409568dc1ef8dfb350538023216b08c5 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Mon, 10 Aug 2015 16:01:07 -0700 Subject: [PATCH 1/2] first pass --- R/pkg/R/generics.R | 4 ++-- R/pkg/R/mllib.R | 8 ++++---- docs/sparkr.md | 37 +++++++++++++++++++++++++++++++++++++ 3 files changed, 43 insertions(+), 6 deletions(-) diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index c43b947129e87..379a78b1d833e 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -535,8 +535,8 @@ setGeneric("showDF", function(x,...) { standardGeneric("showDF") }) #' @export setGeneric("summarize", function(x,...) { standardGeneric("summarize") }) -##' rdname summary -##' @export +#' @rdname summary +#' @export setGeneric("summary", function(x, ...) { standardGeneric("summary") }) # @rdname tojson diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R index b524d1fd87496..cea3d760d05fe 100644 --- a/R/pkg/R/mllib.R +++ b/R/pkg/R/mllib.R @@ -56,10 +56,10 @@ setMethod("glm", signature(formula = "formula", family = "ANY", data = "DataFram #' #' Makes predictions from a model produced by glm(), similarly to R's predict(). #' -#' @param model A fitted MLlib model +#' @param object A fitted MLlib model #' @param newData DataFrame for testing #' @return DataFrame containing predicted values -#' @rdname glm +#' @rdname predict #' @export #' @examples #'\dontrun{ @@ -76,10 +76,10 @@ setMethod("predict", signature(object = "PipelineModel"), #' #' Returns the summary of a model produced by glm(), similarly to R's summary(). #' -#' @param model A fitted MLlib model +#' @param x A fitted MLlib model #' @return a list with a 'coefficient' component, which is the matrix of coefficients. See #' summary.glm for more information. 
-#' @rdname glm +#' @rdname summary #' @export #' @examples #'\dontrun{ diff --git a/docs/sparkr.md b/docs/sparkr.md index 4385a4eeacd5c..e8cdf979bd0f4 100644 --- a/docs/sparkr.md +++ b/docs/sparkr.md @@ -210,6 +210,43 @@ head(df) {% endhighlight %} +### Model Formulae + +SparkR allows the fitting of generalized linear models over DataFrames using the glm() function. Under the hood, SparkR uses MLlib to train a model of the specified family. In Spark 1.5, we support a subset of the available R formula operators for model fitting, including '~', '.', '+', and '-'. The example below shows the use of R formulae with SparkR. + +
+{% highlight r %} +# Create the DataFrame +df <- createDataFrame(sqlContext, iris) + +# Fit a linear model over the dataset. +model <- glm(Sepal_Length ~ Sepal_Width + Species, data = df, family = "gaussian") + +# Model coefficients are returned in a similar format to R's native glm(). +summary(model) +##$coefficients +## Estimate +##(Intercept) 2.2513930 +##Sepal_Width 0.8035609 +##Species_versicolor 1.4587432 +##Species_virginica 1.9468169 + +# Make predictions based on the model. +predictions <- predict(model, newData = df) +predictions +##DataFrame[Sepal_Length:double, Sepal_Width:double, Petal_Length:double, Petal_Width:double, Species:string, features:vector, label:double, prediction:double] + +head(select(predictions, "Sepal_Length", "prediction")) +## Sepal_Length prediction +##1 5.1 5.063856 +##2 4.9 4.662076 +##3 4.7 4.822788 +##4 4.6 4.742432 +##5 5.0 5.144212 +##6 5.4 5.385281 +{% endhighlight %} +
+ ## Running SQL Queries from SparkR A SparkR DataFrame can also be registered as a temporary table in Spark SQL and registering a DataFrame as a table allows you to run SQL queries over its data. The `sql` function enables applications to run SQL queries programmatically and returns the result as a `DataFrame`. From f7514111643b9c6545c44737bb36bca14f50160b Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Tue, 11 Aug 2015 16:14:52 -0700 Subject: [PATCH 2/2] Tue Aug 11 16:14:52 PDT 2015 --- docs/sparkr.md | 52 ++++++++++++++++++++++++-------------------------- 1 file changed, 25 insertions(+), 27 deletions(-) diff --git a/docs/sparkr.md b/docs/sparkr.md index e8cdf979bd0f4..7139d16b4a068 100644 --- a/docs/sparkr.md +++ b/docs/sparkr.md @@ -11,7 +11,8 @@ title: SparkR (R on Spark) SparkR is an R package that provides a light-weight frontend to use Apache Spark from R. In Spark {{site.SPARK_VERSION}}, SparkR provides a distributed data frame implementation that supports operations like selection, filtering, aggregation etc. (similar to R data frames, -[dplyr](https://github.com/hadley/dplyr)) but on large datasets. +[dplyr](https://github.com/hadley/dplyr)) but on large datasets. SparkR also supports distributed +machine learning using MLlib. # SparkR DataFrames @@ -210,9 +211,30 @@ head(df) {% endhighlight %} -### Model Formulae +## Running SQL Queries from SparkR +A SparkR DataFrame can also be registered as a temporary table in Spark SQL and registering a DataFrame as a table allows you to run SQL queries over its data. +The `sql` function enables applications to run SQL queries programmatically and returns the result as a `DataFrame`. + +
+{% highlight r %} +# Load a JSON file +people <- read.df(sqlContext, "./examples/src/main/resources/people.json", "json") + +# Register this DataFrame as a table. +registerTempTable(people, "people") -SparkR allows the fitting of generalized linear models over DataFrames using the glm() function. Under the hood, SparkR uses MLlib to train a model of the specified family. In Spark 1.5, we support a subset of the available R formula operators for model fitting, including '~', '.', '+', and '-'. The example below shows the use of R formulae with SparkR. +# SQL statements can be run by using the sql method +teenagers <- sql(sqlContext, "SELECT name FROM people WHERE age >= 13 AND age <= 19") +head(teenagers) +## name +##1 Justin + +{% endhighlight %} +
+ +# Machine Learning + +SparkR allows the fitting of generalized linear models over DataFrames using the [glm()](api/R/glm.html) function. Under the hood, SparkR uses MLlib to train a model of the specified family. Currently the gaussian and binomial families are supported. We support a subset of the available R formula operators for model fitting, including '~', '.', '+', and '-'. The example below shows how to build a gaussian GLM model using SparkR.
{% highlight r %} @@ -233,9 +255,6 @@ summary(model) # Make predictions based on the model. predictions <- predict(model, newData = df) -predictions -##DataFrame[Sepal_Length:double, Sepal_Width:double, Petal_Length:double, Petal_Width:double, Species:string, features:vector, label:double, prediction:double] - head(select(predictions, "Sepal_Length", "prediction")) ## Sepal_Length prediction ##1 5.1 5.063856 @@ -246,24 +265,3 @@ head(select(predictions, "Sepal_Length", "prediction")) ##6 5.4 5.385281 {% endhighlight %}
- -## Running SQL Queries from SparkR -A SparkR DataFrame can also be registered as a temporary table in Spark SQL and registering a DataFrame as a table allows you to run SQL queries over its data. -The `sql` function enables applications to run SQL queries programmatically and returns the result as a `DataFrame`. - -
-{% highlight r %} -# Load a JSON file -people <- read.df(sqlContext, "./examples/src/main/resources/people.json", "json") - -# Register this DataFrame as a table. -registerTempTable(people, "people") - -# SQL statements can be run by using the sql method -teenagers <- sql(sqlContext, "SELECT name FROM people WHERE age >= 13 AND age <= 19") -head(teenagers) -## name -##1 Justin - -{% endhighlight %} -