From afc1bc72d9e673b1f00a5d22a7cc32000a6026c6 Mon Sep 17 00:00:00 2001
From: Eric Liang <ekl@databricks.com>
Date: Fri, 14 Aug 2015 12:15:20 -0700
Subject: [PATCH 1/6] Add RFormula section to ml-features docs and update scaladoc

---
 docs/ml-features.md                           | 116 ++++++++++++++++++
 .../apache/spark/ml/feature/RFormula.scala    |   4 +-
 2 files changed, 118 insertions(+), 2 deletions(-)
diff --git a/docs/ml-features.md b/docs/ml-features.md
index cec2cbe673407..2d1a45c357d6c 100644
--- a/docs/ml-features.md
+++ b/docs/ml-features.md
@@ -1387,5 +1387,121 @@ print(output.select("features", "clicked").first())
 </div>
 </div>
 
+## RFormula
+
+`RFormula` encodes a string column of labels to a column of label indices.
+The indices are in `[0, numLabels)`, ordered by label frequencies.
+So the most frequent label gets index `0`.
+If the input column is numeric, we cast it to string and index the string values.
+
+**Examples**
+
+Assume that we have the following DataFrame with columns `id` and `category`:
+
+~~~~
+ id | category
+----|----------
+ 0  | a
+ 1  | b
+ 2  | c
+ 3  | a
+ 4  | a
+ 5  | c
+~~~~
+
+`category` is a string column with three labels: "a", "b", and "c".
+Applying `RFormula` with `category` as the input column and `categoryIndex` as the output
+column, we should get the following:
+
+~~~~
+ id | category | categoryIndex
+----|----------|---------------
+ 0  | a        | 0.0
+ 1  | b        | 2.0
+ 2  | c        | 1.0
+ 3  | a        | 0.0
+ 4  | a        | 0.0
+ 5  | c        | 1.0
+~~~~
+
+"a" gets index `0` because it is the most frequent, followed by "c" with index `1` and "b" with
+index `2`.
+
+<div class="codetabs">
+
+<div data-lang="scala" markdown="1">
+
+[`RFormula`](api/scala/index.html#org.apache.spark.ml.feature.RFormula) takes an input
+column name and an output column name.
+
+{% highlight scala %}
+import org.apache.spark.ml.feature.RFormula
+
+val df = sqlContext.createDataFrame(
+  Seq((0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c"))
+).toDF("id", "category")
+val indexer = new RFormula()
+  .setInputCol("category")
+  .setOutputCol("categoryIndex")
+val indexed = indexer.fit(df).transform(df)
+indexed.show()
+{% endhighlight %}
+</div>
+
+<div data-lang="java" markdown="1">
+[`RFormula`](api/java/org/apache/spark/ml/feature/RFormula.html) takes an input column
+name and an output column name.
+
+{% highlight java %}
+import java.util.Arrays;
+
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.ml.feature.RFormula;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
+import static org.apache.spark.sql.types.DataTypes.*;
+
+JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList(
+  RowFactory.create(0, "a"),
+  RowFactory.create(1, "b"),
+  RowFactory.create(2, "c"),
+  RowFactory.create(3, "a"),
+  RowFactory.create(4, "a"),
+  RowFactory.create(5, "c")
+));
+StructType schema = new StructType(new StructField[] {
+  createStructField("id", DoubleType, false),
+  createStructField("category", StringType, false)
+});
+DataFrame df = sqlContext.createDataFrame(jrdd, schema);
+RFormula indexer = new RFormula()
+  .setInputCol("category")
+  .setOutputCol("categoryIndex");
+DataFrame indexed = indexer.fit(df).transform(df);
+indexed.show();
+{% endhighlight %}
+</div>
+
+<div data-lang="python" markdown="1">
+
+[`RFormula`](api/python/pyspark.ml.html#pyspark.ml.feature.RFormula) takes an input
+column name and an output column name.
+
+{% highlight python %}
+from pyspark.ml.feature import RFormula
+
+df = sqlContext.createDataFrame(
+    [(0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")],
+    ["id", "category"])
+indexer = RFormula(inputCol="category", outputCol="categoryIndex")
+indexed = indexer.fit(df).transform(df)
+indexed.show()
+{% endhighlight %}
+</div>
+</div>
+
 # Feature Selectors
 
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala
index a752dacd72d95..a7fa50444209b 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala
@@ -42,8 +42,8 @@ private[feature] trait RFormulaBase extends HasFeaturesCol with HasLabelCol {
 /**
  * :: Experimental ::
  * Implements the transforms required for fitting a dataset against an R model formula. Currently
- * we support a limited subset of the R operators, including '~' and '+'. Also see the R formula
- * docs here: http://stat.ethz.ch/R-manual/R-patched/library/stats/html/formula.html
+ * we support a limited subset of the R operators, including '.', '~', '+', and '-'. Also see the
+ * R formula docs here: http://stat.ethz.ch/R-manual/R-patched/library/stats/html/formula.html
  */
 @Experimental
 class RFormula(override val uid: String) extends Estimator[RFormulaModel] with RFormulaBase {

From 32ed60e80daff875b05bce161f4ddcb9260a506a Mon Sep 17 00:00:00 2001
From: Eric Liang <ekl@databricks.com>
Date: Tue, 18 Aug 2015 18:00:49 -0700
Subject: [PATCH 2/6] formula doc

---
 docs/ml-features.md | 126 +++++++++++++++++++-------------------------
 1 file changed, 53 insertions(+), 73 deletions(-)

diff --git a/docs/ml-features.md b/docs/ml-features.md
index 2d1a45c357d6c..078ab314ea731 100644
--- a/docs/ml-features.md
+++ b/docs/ml-features.md
@@ -1386,119 +1386,99 @@ print(output.select("features", "clicked").first())
 {% endhighlight %}
 </div>
 </div>
-
 ## RFormula
 
-`RFormula` encodes a string column of labels to a column of label indices.
-The indices are in `[0, numLabels)`, ordered by label frequencies.
-So the most frequent label gets index `0`.
-If the input column is numeric, we cast it to string and index the string values.
+`RFormula` selects columns specified by an [R model formula](https://stat.ethz.ch/R-manual/R-devel/library/stats/html/formula.html). It produces a vector column of features and a numeric column of labels. Like when formulas are used in R for linear regression, string input columns will be one-hot encoded, and numeric columns will be cast to doubles. If not already present in the DataFrame, the output label column will be created from the specified response variable in the formula.
 
 **Examples**
 
-Assume that we have the following DataFrame with columns `id` and `category`:
-
-~~~~
- id | category
-----|----------
- 0  | a
- 1  | b
- 2  | c
- 3  | a
- 4  | a
- 5  | c
-~~~~
+Assume that we have a DataFrame with the columns `id`, `country`, `hour`, and `clicked`:
 
-`category` is a string column with three labels: "a", "b", and "c".
-Applying `RFormula` with `category` as the input column and `categoryIndex` as the output
-column, we should get the following:
+~~~
+id | country | hour | clicked
+---|---------|------|---------
+ 7 | "US"    | 18   | 1.0
+~~~
 
-~~~~
- id | category | categoryIndex
-----|----------|---------------
- 0  | a        | 0.0
- 1  | b        | 2.0
- 2  | c        | 1.0
- 3  | a        | 0.0
- 4  | a        | 0.0
- 5  | c        | 1.0
-~~~~
+If we use `RFormula` with a formula string of `clicked ~ country + hour`, which indicates that we want to
+predict `clicked` based on `country` and `hour`, after transformation we should get the following DataFrame:
 
-"a" gets index `0` because it is the most frequent, followed by "c" with index `1` and "b" with
-index `2`.
+~~~
+id | country | hour | clicked | label | features
+---|---------|------|---------|-------|-----------------------------
+ 7 | "US"    | 18   | 1.0     | 1.0   | [0.0, 1.0, 18.0]
+~~~
 
 <div class="codetabs">
-
 <div data-lang="scala" markdown="1">
 
-[`RFormula`](api/scala/index.html#org.apache.spark.ml.feature.RFormula) takes an input
-column name and an output column name.
+[`RFormula`](api/scala/index.html#org.apache.spark.ml.feature.RFormula) takes an R formula string, and optional parameters for the names of its output columns.
 
 {% highlight scala %}
 import org.apache.spark.ml.feature.RFormula
 
-val df = sqlContext.createDataFrame(
-  Seq((0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c"))
-).toDF("id", "category")
-val indexer = new RFormula()
-  .setInputCol("category")
-  .setOutputCol("categoryIndex")
-val indexed = indexer.fit(df).transform(df)
-indexed.show()
+val dataset = sqlContext.createDataFrame(
+  Seq((7, "US", 18, 1.0))
+).toDF("id", "country", "hour", "clicked")
+val formula = new RFormula()
+  .setFormula("clicked ~ country + hour")
+  .setFeaturesCol("features")
+  .setLabelCol("label")
+val output = formula.fit(dataset).transform(dataset)
+println(output.select("features", "label").first())
 {% endhighlight %}
 </div>
 
 <div data-lang="java" markdown="1">
-[`RFormula`](api/java/org/apache/spark/ml/feature/RFormula.html) takes an input column
-name and an output column name.
+
+[`RFormula`](api/java/org/apache/spark/ml/feature/RFormula.html) takes an R formula string, and optional parameters for the names of its output columns.
 
 {% highlight java %}
 import java.util.Arrays;
 
 import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.ml.feature.RFormula;
 import org.apache.spark.sql.DataFrame;
 import org.apache.spark.sql.Row;
 import org.apache.spark.sql.RowFactory;
-import org.apache.spark.sql.types.StructField;
-import org.apache.spark.sql.types.StructType;
+import org.apache.spark.sql.types.*;
 import static org.apache.spark.sql.types.DataTypes.*;
 
-JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList(
-  RowFactory.create(0, "a"),
-  RowFactory.create(1, "b"),
-  RowFactory.create(2, "c"),
-  RowFactory.create(3, "a"),
-  RowFactory.create(4, "a"),
-  RowFactory.create(5, "c")
-));
-StructType schema = new StructType(new StructField[] {
-  createStructField("id", DoubleType, false),
-  createStructField("category", StringType, false)
+StructType schema = createStructType(new StructField[] {
+  createStructField("id", IntegerType, false),
+  createStructField("country", StringType, false),
+  createStructField("hour", IntegerType, false),
+  createStructField("clicked", DoubleType, false)
 });
-DataFrame df = sqlContext.createDataFrame(jrdd, schema);
-RFormula indexer = new RFormula()
-  .setInputCol("category")
-  .setOutputCol("categoryIndex");
-DataFrame indexed = indexer.fit(df).transform(df);
-indexed.show();
+Row row = RowFactory.create(7, "US", 18, 1.0);
+JavaRDD<Row> rdd = jsc.parallelize(Arrays.asList(row));
+DataFrame dataset = sqlContext.createDataFrame(rdd, schema);
+
+RFormula formula = new RFormula()
+  .setFormula("clicked ~ country + hour")
+  .setFeaturesCol("features")
+  .setLabelCol("label")
+
+DataFrame output = formula.fit(dataset).transform(dataset);
+System.out.println(output.select("features", "label").first());
 {% endhighlight %}
 </div>
 
 <div data-lang="python" markdown="1">
 
-[`RFormula`](api/python/pyspark.ml.html#pyspark.ml.feature.RFormula) takes an input
-column name and an output column name.
+[`RFormula`](api/python/pyspark.ml.html#pyspark.ml.feature.RFormula) takes an R formula string, and optional parameters for the names of its output columns.
 
 {% highlight python %}
 from pyspark.ml.feature import RFormula
 
-df = sqlContext.createDataFrame(
-    [(0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")],
-    ["id", "category"])
-indexer = RFormula(inputCol="category", outputCol="categoryIndex")
-indexed = indexer.fit(df).transform(df)
-indexed.show()
+dataset = sqlContext.createDataFrame(
+    [(7, "US", 18, 1.0)],
+    ["id", "country", "hour", "clicked"])
+formula = RFormula(
+    formula="clicked ~ country + hour",
+    featuresCol="features",
+    labelCol="label")
+output = formula.fit(dataset).transform(dataset)
+print(output.select("features", "label").first())
 {% endhighlight %}
 </div>
 </div>

From 83961ad53d512f9a9b26d9565e00864fd40b0e0d Mon Sep 17 00:00:00 2001
From: Eric Liang <ekl@databricks.com>
Date: Tue, 18 Aug 2015 18:01:19 -0700
Subject: [PATCH 3/6] Describe RFormula label column as double in docs

---
 docs/ml-features.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/ml-features.md b/docs/ml-features.md
index 078ab314ea731..44c2dfa52e6ab 100644
--- a/docs/ml-features.md
+++ b/docs/ml-features.md
@@ -1388,7 +1388,7 @@ print(output.select("features", "clicked").first())
 </div>
 ## RFormula
 
-`RFormula` selects columns specified by an [R model formula](https://stat.ethz.ch/R-manual/R-devel/library/stats/html/formula.html). It produces a vector column of features and a numeric column of labels. Like when formulas are used in R for linear regression, string input columns will be one-hot encoded, and numeric columns will be cast to doubles. If not already present in the DataFrame, the output label column will be created from the specified response variable in the formula.
+`RFormula` selects columns specified by an [R model formula](https://stat.ethz.ch/R-manual/R-devel/library/stats/html/formula.html). It produces a vector column of features and a double column of labels. Like when formulas are used in R for linear regression, string input columns will be one-hot encoded, and numeric columns will be cast to doubles. If not already present in the DataFrame, the output label column will be created from the specified response variable in the formula.
 
 **Examples**
 

From 6999d764d8930a8ab100bca75b34faa8e1c98afa Mon Sep 17 00:00:00 2001
From: Eric Liang <ekl@databricks.com>
Date: Wed, 19 Aug 2015 11:59:07 -0700
Subject: [PATCH 4/6] comments

---
 docs/ml-features.md | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/docs/ml-features.md b/docs/ml-features.md
index 44c2dfa52e6ab..97e8063e990ad 100644
--- a/docs/ml-features.md
+++ b/docs/ml-features.md
@@ -1386,6 +1386,7 @@ print(output.select("features", "clicked").first())
 {% endhighlight %}
 </div>
 </div>
+
 ## RFormula
 
 `RFormula` selects columns specified by an [R model formula](https://stat.ethz.ch/R-manual/R-devel/library/stats/html/formula.html). It produces a vector column of features and a double column of labels. Like when formulas are used in R for linear regression, string input columns will be one-hot encoded, and numeric columns will be cast to doubles. If not already present in the DataFrame, the output label column will be created from the specified response variable in the formula.
@@ -1398,6 +1399,7 @@ Assume that we have a DataFrame with the columns `id`, `country`, `hour`, and `c
 id | country | hour | clicked
 ---|---------|------|---------
  7 | "US"    | 18   | 1.0
+ 8 | "CA"    | 12   | 0.0
 ~~~
 
 If we use `RFormula` with a formula string of `clicked ~ country + hour`, which indicates that we want to
@@ -1407,6 +1409,7 @@ predict `clicked` based on `country` and `hour`, after transformation we should
 id | country | hour | clicked | label | features
 ---|---------|------|---------|-------|-----------------------------
  7 | "US"    | 18   | 1.0     | 1.0   | [0.0, 1.0, 18.0]
+ 8 | "CA"    | 12   | 0.0     | 0.0   | [1.0, 0.0, 12.0]
 ~~~
 
 <div class="codetabs">
@@ -1418,7 +1421,8 @@ id | country | hour | clicked | label | features
 import org.apache.spark.ml.feature.RFormula
 
 val dataset = sqlContext.createDataFrame(
-  Seq((7, "US", 18, 1.0))
+  Seq((7, "US", 18, 1.0)),
+  Seq((8, "CA", 12, 0.0))
 ).toDF("id", "country", "hour", "clicked")
 val formula = new RFormula()
   .setFormula("clicked ~ country + hour")
@@ -1449,14 +1453,15 @@ StructType schema = createStructType(new StructField[] {
   createStructField("hour", IntegerType, false),
   createStructField("clicked", DoubleType, false)
 });
-Row row = RowFactory.create(7, "US", 18, 1.0);
-JavaRDD<Row> rdd = jsc.parallelize(Arrays.asList(row));
+Row row1 = RowFactory.create(7, "US", 18, 1.0);
+Row row2 = RowFactory.create(8, "CA", 12, 0.0);
+JavaRDD<Row> rdd = jsc.parallelize(Arrays.asList(row1, row2));
 DataFrame dataset = sqlContext.createDataFrame(rdd, schema);
 
 RFormula formula = new RFormula()
   .setFormula("clicked ~ country + hour")
   .setFeaturesCol("features")
-  .setLabelCol("label")
+  .setLabelCol("label");
 
 DataFrame output = formula.fit(dataset).transform(dataset);
 System.out.println(output.select("features", "label").first());
@@ -1472,6 +1477,7 @@ from pyspark.ml.feature import RFormula
 
 dataset = sqlContext.createDataFrame(
     [(7, "US", 18, 1.0)],
+    [(8, "CA", 12, 0.0)],
     ["id", "country", "hour", "clicked"])
 formula = RFormula(
     formula="clicked ~ country + hour",

From b6f699e116336acc47d8fc9d3cd039da76ddf534 Mon Sep 17 00:00:00 2001
From: Eric Liang <ekl@databricks.com>
Date: Wed, 19 Aug 2015 14:15:29 -0700
Subject: [PATCH 5/6] fix bugs

---
 docs/ml-features.md | 39 +++++++++++++++++++++++----------------
 1 file changed, 23 insertions(+), 16 deletions(-)

diff --git a/docs/ml-features.md b/docs/ml-features.md
index 97e8063e990ad..740846b728ac2 100644
--- a/docs/ml-features.md
+++ b/docs/ml-features.md
@@ -1400,16 +1400,18 @@ id | country | hour | clicked
 ---|---------|------|---------
  7 | "US"    | 18   | 1.0
  8 | "CA"    | 12   | 0.0
+ 9 | "NZ"    | 15   | 0.0
 ~~~
 
 If we use `RFormula` with a formula string of `clicked ~ country + hour`, which indicates that we want to
 predict `clicked` based on `country` and `hour`, after transformation we should get the following DataFrame:
 
 ~~~
-id | country | hour | clicked | label | features
----|---------|------|---------|-------|-----------------------------
- 7 | "US"    | 18   | 1.0     | 1.0   | [0.0, 1.0, 18.0]
- 8 | "CA"    | 12   | 0.0     | 0.0   | [1.0, 0.0, 12.0]
+id | country | hour | clicked | features         | label
+---|---------|------|---------|------------------|-------
+ 7 | "US"    | 18   | 1.0     | [0.0, 0.0, 18.0] | 1.0
+ 8 | "CA"    | 12   | 0.0     | [0.0, 1.0, 12.0] | 0.0
+ 9 | "NZ"    | 15   | 0.0     | [1.0, 0.0, 15.0] | 0.0
 ~~~
 
 <div class="codetabs">
@@ -1420,16 +1422,17 @@ id | country | hour | clicked | label | features
 {% highlight scala %}
 import org.apache.spark.ml.feature.RFormula
 
-val dataset = sqlContext.createDataFrame(
-  Seq((7, "US", 18, 1.0)),
-  Seq((8, "CA", 12, 0.0))
-).toDF("id", "country", "hour", "clicked")
+val dataset = sqlContext.createDataFrame(Seq(
+  (7, "US", 18, 1.0),
+  (8, "CA", 12, 0.0),
+  (9, "NZ", 15, 0.0)
+)).toDF("id", "country", "hour", "clicked")
 val formula = new RFormula()
   .setFormula("clicked ~ country + hour")
   .setFeaturesCol("features")
   .setLabelCol("label")
 val output = formula.fit(dataset).transform(dataset)
-println(output.select("features", "label").first())
+output.select("features", "label").show()
 {% endhighlight %}
 </div>
 
@@ -1441,6 +1444,7 @@ println(output.select("features", "label").first())
 import java.util.Arrays;
 
 import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.ml.feature.RFormula;
 import org.apache.spark.sql.DataFrame;
 import org.apache.spark.sql.Row;
 import org.apache.spark.sql.RowFactory;
@@ -1453,9 +1457,11 @@ StructType schema = createStructType(new StructField[] {
   createStructField("hour", IntegerType, false),
   createStructField("clicked", DoubleType, false)
 });
-Row row1 = RowFactory.create(7, "US", 18, 1.0);
-Row row2 = RowFactory.create(8, "CA", 12, 0.0);
-JavaRDD<Row> rdd = jsc.parallelize(Arrays.asList(row1, row2));
+JavaRDD<Row> rdd = jsc.parallelize(Arrays.asList(
+  RowFactory.create(7, "US", 18, 1.0),
+  RowFactory.create(8, "CA", 12, 0.0),
+  RowFactory.create(9, "NZ", 15, 0.0)
+));
 DataFrame dataset = sqlContext.createDataFrame(rdd, schema);
 
 RFormula formula = new RFormula()
@@ -1464,7 +1470,7 @@ RFormula formula = new RFormula()
   .setLabelCol("label");
 
 DataFrame output = formula.fit(dataset).transform(dataset);
-System.out.println(output.select("features", "label").first());
+output.select("features", "label").show();
 {% endhighlight %}
 </div>
 
@@ -1476,15 +1482,16 @@ System.out.println(output.select("features", "label").first());
 from pyspark.ml.feature import RFormula
 
 dataset = sqlContext.createDataFrame(
-    [(7, "US", 18, 1.0)],
-    [(8, "CA", 12, 0.0)],
+    [(7, "US", 18, 1.0),
+     (8, "CA", 12, 0.0),
+     (9, "NZ", 15, 0.0)],
     ["id", "country", "hour", "clicked"])
 formula = RFormula(
     formula="clicked ~ country + hour",
     featuresCol="features",
     labelCol="label")
 output = formula.fit(dataset).transform(dataset)
-print(output.select("features", "label").first())
+output.select("features", "label").show()
 {% endhighlight %}
 </div>
 </div>

From 562b074ad7373ae9e329e85bc05f2f567ca04c25 Mon Sep 17 00:00:00 2001
From: Eric Liang <ekl@databricks.com>
Date: Wed, 19 Aug 2015 14:16:46 -0700
Subject: [PATCH 6/6] fix merge

---
 docs/ml-features.md | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/docs/ml-features.md b/docs/ml-features.md
index 4e93a9fdc7b15..6309db97be4d0 100644
--- a/docs/ml-features.md
+++ b/docs/ml-features.md
@@ -1585,6 +1585,3 @@ output.select("features", "label").show()
 {% endhighlight %}
 </div>
 </div>
-
-# Feature Selectors
-