From c9555e83409568dc1ef8dfb350538023216b08c5 Mon Sep 17 00:00:00 2001
From: Eric Liang <ekl@databricks.com>
Date: Mon, 10 Aug 2015 16:01:07 -0700
Subject: [PATCH 1/2] first pass

---
 R/pkg/R/generics.R |  4 ++--
 R/pkg/R/mllib.R    |  8 ++++----
 docs/sparkr.md     | 37 +++++++++++++++++++++++++++++++++++++
 3 files changed, 43 insertions(+), 6 deletions(-)
diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R
index c43b947129e87..379a78b1d833e 100644
--- a/R/pkg/R/generics.R
+++ b/R/pkg/R/generics.R
@@ -535,8 +535,8 @@ setGeneric("showDF", function(x,...) { standardGeneric("showDF") })
 #' @export
 setGeneric("summarize", function(x,...) { standardGeneric("summarize") })
 
-##' rdname summary
-##' @export
+#' @rdname summary
+#' @export
 setGeneric("summary", function(x, ...) { standardGeneric("summary") })
 
 # @rdname tojson
diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R
index b524d1fd87496..cea3d760d05fe 100644
--- a/R/pkg/R/mllib.R
+++ b/R/pkg/R/mllib.R
@@ -56,10 +56,10 @@ setMethod("glm", signature(formula = "formula", family = "ANY", data = "DataFram
 #'
 #' Makes predictions from a model produced by glm(), similarly to R's predict().
 #'
-#' @param model A fitted MLlib model
+#' @param object A fitted MLlib model
 #' @param newData DataFrame for testing
 #' @return DataFrame containing predicted values
-#' @rdname glm
+#' @rdname predict
 #' @export
 #' @examples
 #'\dontrun{
@@ -76,10 +76,10 @@ setMethod("predict", signature(object = "PipelineModel"),
 #'
 #' Returns the summary of a model produced by glm(), similarly to R's summary().
 #'
-#' @param model A fitted MLlib model
+#' @param x A fitted MLlib model
 #' @return a list with a 'coefficient' component, which is the matrix of coefficients. See
 #'         summary.glm for more information.
-#' @rdname glm
+#' @rdname summary
 #' @export
 #' @examples
 #'\dontrun{
diff --git a/docs/sparkr.md b/docs/sparkr.md
index 4385a4eeacd5c..e8cdf979bd0f4 100644
--- a/docs/sparkr.md
+++ b/docs/sparkr.md
@@ -210,6 +210,43 @@ head(df)
 {% endhighlight %}
 </div>
 
+### Model Formulae
+
+SparkR allows the fitting of generalized linear models over DataFrames using the glm() function. Under the hood, SparkR uses MLlib to train a model of the specified family. In Spark 1.5, we support a subset of the available R formula operators for model fitting, including '~', '.', '+', and '-'. The example below shows the use of R formulae with SparkR.
+
+<div data-lang="r"  markdown="1">
+{% highlight r %}
+# Create the DataFrame
+df <- createDataFrame(sqlContext, iris)
+
+# Fit a linear model over the dataset.
+model <- glm(Sepal_Length ~ Sepal_Width + Species, data = df, family = "gaussian")
+
+# Model coefficients are returned in a similar format to R's native glm().
+summary(model)
+##$coefficients
+##                    Estimate
+##(Intercept)        2.2513930
+##Sepal_Width        0.8035609
+##Species_versicolor 1.4587432
+##Species_virginica  1.9468169
+
+# Make predictions based on the model.
+predictions <- predict(model, newData = df)
+predictions
+##DataFrame[Sepal_Length:double, Sepal_Width:double, Petal_Length:double, Petal_Width:double, Species:string, features:vector, label:double, prediction:double]
+
+head(select(predictions, "Sepal_Length", "prediction"))
+##  Sepal_Length prediction
+##1          5.1   5.063856
+##2          4.9   4.662076
+##3          4.7   4.822788
+##4          4.6   4.742432
+##5          5.0   5.144212
+##6          5.4   5.385281
+{% endhighlight %}
+</div>
+
 ## Running SQL Queries from SparkR
 A SparkR DataFrame can also be registered as a temporary table in Spark SQL and registering a DataFrame as a table allows you to run SQL queries over its data.
 The `sql` function enables applications to run SQL queries programmatically and returns the result as a `DataFrame`.

From f7514111643b9c6545c44737bb36bca14f50160b Mon Sep 17 00:00:00 2001
From: Eric Liang <ekl@databricks.com>
Date: Tue, 11 Aug 2015 16:14:52 -0700
Subject: [PATCH 2/2] Tue Aug 11 16:14:52 PDT 2015

---
 docs/sparkr.md | 52 ++++++++++++++++++++++++--------------------------
 1 file changed, 25 insertions(+), 27 deletions(-)

diff --git a/docs/sparkr.md b/docs/sparkr.md
index e8cdf979bd0f4..7139d16b4a068 100644
--- a/docs/sparkr.md
+++ b/docs/sparkr.md
@@ -11,7 +11,8 @@ title: SparkR (R on Spark)
 SparkR is an R package that provides a light-weight frontend to use Apache Spark from R.
 In Spark {{site.SPARK_VERSION}}, SparkR provides a distributed data frame implementation that
 supports operations like selection, filtering, aggregation etc. (similar to R data frames,
-[dplyr](https://github.com/hadley/dplyr)) but on large datasets.
+[dplyr](https://github.com/hadley/dplyr)) but on large datasets. SparkR also supports distributed
+machine learning using MLlib.
 
 # SparkR DataFrames
 
@@ -210,9 +211,30 @@ head(df)
 {% endhighlight %}
 </div>
 
-### Model Formulae
+## Running SQL Queries from SparkR
+A SparkR DataFrame can also be registered as a temporary table in Spark SQL; registering a DataFrame as a table allows you to run SQL queries over its data.
+The `sql` function enables applications to run SQL queries programmatically and returns the result as a `DataFrame`.
+
+<div data-lang="r"  markdown="1">
+{% highlight r %}
+# Load a JSON file
+people <- read.df(sqlContext, "./examples/src/main/resources/people.json", "json")
+
+# Register this DataFrame as a table.
+registerTempTable(people, "people")
 
-SparkR allows the fitting of generalized linear models over DataFrames using the glm() function. Under the hood, SparkR uses MLlib to train a model of the specified family. In Spark 1.5, we support a subset of the available R formula operators for model fitting, including '~', '.', '+', and '-'. The example below shows the use of R formulae with SparkR.
+# SQL statements can be run by using the sql method
+teenagers <- sql(sqlContext, "SELECT name FROM people WHERE age >= 13 AND age <= 19")
+head(teenagers)
+##    name
+##1 Justin
+
+{% endhighlight %}
+</div>
+
+# Machine Learning
+
+SparkR allows the fitting of generalized linear models over DataFrames using the [glm()](api/R/glm.html) function. Under the hood, SparkR uses MLlib to train a model of the specified family. Currently the gaussian and binomial families are supported. We support a subset of the available R formula operators for model fitting, including '~', '.', '+', and '-'. The example below shows how to build a gaussian GLM using SparkR.
 
 <div data-lang="r"  markdown="1">
 {% highlight r %}
@@ -233,9 +255,6 @@ summary(model)
 
 # Make predictions based on the model.
 predictions <- predict(model, newData = df)
-predictions
-##DataFrame[Sepal_Length:double, Sepal_Width:double, Petal_Length:double, Petal_Width:double, Species:string, features:vector, label:double, prediction:double]
-
 head(select(predictions, "Sepal_Length", "prediction"))
 ##  Sepal_Length prediction
 ##1          5.1   5.063856
@@ -246,24 +265,3 @@ head(select(predictions, "Sepal_Length", "prediction"))
 ##6          5.4   5.385281
 {% endhighlight %}
 </div>
-
-## Running SQL Queries from SparkR
-A SparkR DataFrame can also be registered as a temporary table in Spark SQL and registering a DataFrame as a table allows you to run SQL queries over its data.
-The `sql` function enables applications to run SQL queries programmatically and returns the result as a `DataFrame`.
-
-<div data-lang="r"  markdown="1">
-{% highlight r %}
-# Load a JSON file
-people <- read.df(sqlContext, "./examples/src/main/resources/people.json", "json")
-
-# Register this DataFrame as a table.
-registerTempTable(people, "people")
-
-# SQL statements can be run by using the sql method
-teenagers <- sql(sqlContext, "SELECT name FROM people WHERE age >= 13 AND age <= 19")
-head(teenagers)
-##    name
-##1 Justin
-
-{% endhighlight %}
-</div>