From 520b75cc9dbd73121fc9476153774b97c52e7963 Mon Sep 17 00:00:00 2001 From: Yanbo Liang Date: Thu, 26 Nov 2015 20:49:12 +0800 Subject: [PATCH 1/6] SQLTransformer user guide and example code --- R/pkg/DESCRIPTION | 5 +- docs/ml-features.md | 59 +++++++++++++++++++ .../ml/JavaSQLTransformerExample.java | 59 +++++++++++++++++++ .../src/main/python/ml/sql_transformer.py | 40 +++++++++++++ .../examples/ml/SQLTransformerExample.scala | 45 ++++++++++++++ .../spark/ml/feature/SQLTransformer.scala | 9 ++- 6 files changed, 214 insertions(+), 3 deletions(-) create mode 100644 examples/src/main/java/org/apache/spark/examples/ml/JavaSQLTransformerExample.java create mode 100644 examples/src/main/python/ml/sql_transformer.py create mode 100644 examples/src/main/scala/org/apache/spark/examples/ml/SQLTransformerExample.scala diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION index 369714f7b99c2..48fb9095573f5 100644 --- a/R/pkg/DESCRIPTION +++ b/R/pkg/DESCRIPTION @@ -18,10 +18,10 @@ Collate: 'schema.R' 'generics.R' 'jobj.R' - 'RDD.R' - 'pairRDD.R' 'column.R' 'group.R' + 'RDD.R' + 'pairRDD.R' 'DataFrame.R' 'SQLContext.R' 'backend.R' @@ -36,3 +36,4 @@ Collate: 'stats.R' 'types.R' 'utils.R' +RoxygenNote: 5.0.0 diff --git a/docs/ml-features.md b/docs/ml-features.md index cd1838d6d2882..08a86cc73c6d5 100644 --- a/docs/ml-features.md +++ b/docs/ml-features.md @@ -1591,6 +1591,65 @@ transformer.transform(df).show() +## SQLTransformer + +`SQLTransformer` implements the transforms which are defined by SQL statement. +Currently we only support SQL syntax like `"SELECT ... FROM __THIS__ ..."` +where `"__THIS__"` represents the underlying table of the input dataset. +The select clause specifies the fields, constants, and expressions to display in +the output, it can be any select clause that Spark SQL supported. Users can also +use Spark SQL build-in function and UDFs to operate on these selected columns. +For example, `SQLTransformer` support statements like: + +* `SELECT a, a + b AS a_b FROM __THIS__` +* `SELECT a, SQRT(b) AS b_sqrt FROM __THIS__ where a > 5` +* `SELECT a, b, SUM(c) AS c_sum FROM __THIS__ GROUP BY a, b` + +**Examples** + +Assume that we have the following DataFrame with columns `id`, `v1` and `v2`: + +~~~~ + id | v1 | v2 +----|-----|----- + 0 | 1.0 | 3.0 + 2 | 2.0 | 5.0 +~~~~ + +This is the output of the `SQLTransformer` with statement `"SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__"`: + +~~~~ + id | v1 | v2 | v3 | v4 +----|-----|-----|-----|----- + 0 | 1.0 | 3.0 | 4.0 | 3.0 + 2 | 2.0 | 5.0 | 7.0 |10.0 +~~~~ + +
+<div class="codetabs">
+<div data-lang="scala" markdown="1">
+
+Refer to the [SQLTransformer Scala docs](api/scala/index.html#org.apache.spark.ml.feature.SQLTransformer)
+for more details on the API.
+
+{% include_example scala/org/apache/spark/examples/ml/SQLTransformerExample.scala %}
+</div>
+
+<div data-lang="java" markdown="1">
+
+Refer to the [SQLTransformer Java docs](api/java/org/apache/spark/ml/feature/SQLTransformer.html)
+for more details on the API.
+
+{% include_example java/org/apache/spark/examples/ml/JavaSQLTransformerExample.java %}
+</div>
+
+<div data-lang="python" markdown="1">
+
+Refer to the [SQLTransformer Python docs](api/python/pyspark.ml.html#pyspark.ml.feature.SQLTransformer) for more details on the API.
+
+{% include_example python/ml/sql_transformer.py %}
+</div>
+</div>
+
 ## VectorAssembler
 
 `VectorAssembler` is a transformer that combines a given list of columns into a single vector
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaSQLTransformerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaSQLTransformerExample.java
new file mode 100644
index 0000000000000..d55c70796a967
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaSQLTransformerExample.java
@@ -0,0 +1,59 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml;
+
+// $example on$
+import java.util.Arrays;
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.ml.feature.SQLTransformer;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.types.*;
+// $example off$
+
+public class JavaSQLTransformerExample {
+  public static void main(String[] args) {
+
+    SparkConf conf = new SparkConf().setAppName("JavaSQLTransformerExample");
+    JavaSparkContext jsc = new JavaSparkContext(conf);
+    SQLContext sqlContext = new SQLContext(jsc);
+
+    // $example on$
+    JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList(
+      RowFactory.create(0, 1.0, 3.0),
+      RowFactory.create(2, 2.0, 5.0)
+    ));
+    StructType schema = new StructType(new StructField [] {
+      new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
+      new StructField("v1", DataTypes.DoubleType, false, Metadata.empty()),
+      new StructField("v2", DataTypes.DoubleType, false, Metadata.empty())
+    });
+    DataFrame df = sqlContext.createDataFrame(jrdd, schema);
+
+    SQLTransformer sqlTrans = new SQLTransformer().setStatement(
+      "SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__");
+
+    sqlTrans.transform(df).show();
+    // $example off$
+  }
+}
diff --git a/examples/src/main/python/ml/sql_transformer.py b/examples/src/main/python/ml/sql_transformer.py
new file mode 100644
index 0000000000000..9575d728d8159
--- /dev/null
+++ b/examples/src/main/python/ml/sql_transformer.py
@@ -0,0 +1,40 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import print_function
+
+from pyspark import SparkContext
+# $example on$
+from pyspark.ml.feature import SQLTransformer
+# $example off$
+from pyspark.sql import SQLContext
+
+if __name__ == "__main__":
+    sc = SparkContext(appName="SQLTransformerExample")
+    sqlContext = SQLContext(sc)
+
+    # $example on$
+    df = sqlContext.createDataFrame([
+        (0, 1.0, 3.0),
+        (2, 2.0, 5.0)
+    ], ["id", "v1", "v2"])
+    sqlTrans = SQLTransformer(
+        statement="SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__")
+    sqlTrans.transform(df).show()
+    # $example off$
+
+    sc.stop()
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/SQLTransformerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/SQLTransformerExample.scala
new file mode 100644
index 0000000000000..014abd1fdbc63
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/SQLTransformerExample.scala
@@ -0,0 +1,45 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.ml
+
+// $example on$
+import org.apache.spark.ml.feature.SQLTransformer
+// $example off$
+import org.apache.spark.sql.SQLContext
+import org.apache.spark.{SparkConf, SparkContext}
+
+
+object SQLTransformerExample {
+  def main(args: Array[String]) {
+    val conf = new SparkConf().setAppName("SQLTransformerExample")
+    val sc = new SparkContext(conf)
+    val sqlContext = new SQLContext(sc)
+
+    // $example on$
+    val df = sqlContext.createDataFrame(
+      Seq((0, 1.0, 3.0), (2, 2.0, 5.0))).toDF("id", "v1", "v2")
+
+    val sqlTrans = new SQLTransformer().setStatement(
+      "SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__")
+
+    sqlTrans.transform(df).show()
+    // $example off$
+  }
+}
+// scalastyle:on println
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/SQLTransformer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/SQLTransformer.scala
index 3a735017ba836..1872c0f842943 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/SQLTransformer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/SQLTransformer.scala
@@ -28,8 +28,15 @@ import org.apache.spark.sql.types.StructType
 /**
  * :: Experimental ::
 * Implements the transforms which are defined by SQL statement.
- * Currently we only support SQL syntax like 'SELECT ... FROM __THIS__'
+ * Currently we only support SQL syntax like 'SELECT ... FROM __THIS__ ...'
  * where '__THIS__' represents the underlying table of the input dataset.
+ * The select clause specifies the fields, constants, and expressions to display in
+ * the output, it can be any select clause that Spark SQL supported. Users can also
+ * use Spark SQL build-in function and UDFs to operate on these selected columns.
+ * For example, [[SQLTransformer]] support statements like:
+ * - SELECT a, a + b AS a_b FROM __THIS__
+ * - SELECT a, SQRT(b) AS b_sqrt FROM __THIS__ where a > 5
+ * - SELECT a, b, SUM(c) AS c_sum FROM __THIS__ GROUP BY a, b
  */
 @Experimental
 @Since("1.6.0")
From 92d6df28e199c9cbbeed0712c014ee184ba7f9a1 Mon Sep 17 00:00:00 2001
From: Yanbo Liang
Date: Thu, 26 Nov 2015 22:24:09 +0800
Subject: [PATCH 2/6] fix typo

---
 R/pkg/DESCRIPTION | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION
index 48fb9095573f5..c2c8b59a6aeb0 100644
--- a/R/pkg/DESCRIPTION
+++ b/R/pkg/DESCRIPTION
@@ -18,10 +18,10 @@ Collate:
 'schema.R'
 'generics.R'
 'jobj.R'
- 'column.R'
- 'group.R'
 'RDD.R'
 'pairRDD.R'
+ 'column.R'
+ 'group.R'
 'DataFrame.R'
 'SQLContext.R'
 'backend.R'
From 4c4049d7379b040f3f2761f9a32ca90c4d3e152a Mon Sep 17 00:00:00 2001
From: Yanbo Liang
Date: Thu, 26 Nov 2015 22:25:11 +0800
Subject: [PATCH 3/6] fix typo

---
 R/pkg/DESCRIPTION | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION
index c2c8b59a6aeb0..b40e898abd1bf 100644
--- a/R/pkg/DESCRIPTION
+++ b/R/pkg/DESCRIPTION
@@ -35,5 +35,4 @@ Collate:
 'sparkR.R'
 'stats.R'
 'types.R'
- 'utils.R'
-RoxygenNote: 5.0.0
+ 'utils.R'
\ No newline at end of file
From 4ad27f9c71d37594755db5e6ab8ab8f05f4558dc Mon Sep 17 00:00:00 2001
From: Yanbo Liang
Date: Tue, 1 Dec 2015 10:18:58 +0800
Subject: [PATCH 4/6] fix typo

---
 R/pkg/DESCRIPTION | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION
index b40e898abd1bf..369714f7b99c2 100644
--- a/R/pkg/DESCRIPTION
+++ b/R/pkg/DESCRIPTION
@@ -35,4 +35,4 @@ Collate:
 'sparkR.R'
 'stats.R'
 'types.R'
- 'utils.R'
\ No newline at end of file
+ 'utils.R'
From ab44e9aa6dc3663757b2cbc0f39508b739dac4e8 Mon Sep 17 00:00:00 2001
From: Yanbo Liang
Date: Thu, 3 Dec 2015 10:52:44 +0800
Subject: [PATCH 5/6] fix typos

---
 docs/ml-features.md                                        | 6 +++---
 .../scala/org/apache/spark/ml/feature/SQLTransformer.scala | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/docs/ml-features.md b/docs/ml-features.md
index 08a86cc73c6d5..b0680310c88d2 100644
--- a/docs/ml-features.md
+++ b/docs/ml-features.md
@@ -1597,9 +1597,9 @@ transformer.transform(df).show()
 Currently we only support SQL syntax like `"SELECT ... FROM __THIS__ ..."`
 where `"__THIS__"` represents the underlying table of the input dataset.
 The select clause specifies the fields, constants, and expressions to display in
-the output, it can be any select clause that Spark SQL supported. Users can also
-use Spark SQL build-in function and UDFs to operate on these selected columns.
-For example, `SQLTransformer` support statements like:
+the output, it can be any select clause that Spark SQL supports. Users can also
+use Spark SQL built-in function and UDFs to operate on these selected columns.
+For example, `SQLTransformer` supports statements like:
 
 * `SELECT a, a + b AS a_b FROM __THIS__`
 * `SELECT a, SQRT(b) AS b_sqrt FROM __THIS__ where a > 5`
 * `SELECT a, b, SUM(c) AS c_sum FROM __THIS__ GROUP BY a, b`
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/SQLTransformer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/SQLTransformer.scala
index 1872c0f842943..0daa96fb67ac3 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/SQLTransformer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/SQLTransformer.scala
@@ -31,9 +31,9 @@ import org.apache.spark.sql.types.StructType
 * Currently we only support SQL syntax like 'SELECT ... FROM __THIS__ ...'
 * where '__THIS__' represents the underlying table of the input dataset.
 * The select clause specifies the fields, constants, and expressions to display in
- * the output, it can be any select clause that Spark SQL supported. Users can also
- * use Spark SQL build-in function and UDFs to operate on these selected columns.
- * For example, [[SQLTransformer]] support statements like:
+ * the output, it can be any select clause that Spark SQL supports. Users can also
+ * use Spark SQL built-in function and UDFs to operate on these selected columns.
+ * For example, [[SQLTransformer]] supports statements like:
 * - SELECT a, a + b AS a_b FROM __THIS__
 * - SELECT a, SQRT(b) AS b_sqrt FROM __THIS__ where a > 5
 * - SELECT a, b, SUM(c) AS c_sum FROM __THIS__ GROUP BY a, b
From f59a5e34e8360865174ba37edb3f53cc3eff602b Mon Sep 17 00:00:00 2001
From: Yanbo Liang
Date: Fri, 4 Dec 2015 10:58:06 +0800
Subject: [PATCH 6/6] update doc

---
 docs/ml-features.md                                             | 2 +-
 .../main/scala/org/apache/spark/ml/feature/SQLTransformer.scala | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/ml-features.md b/docs/ml-features.md
index b0680310c88d2..29dcb5183ccdc 100644
--- a/docs/ml-features.md
+++ b/docs/ml-features.md
@@ -1593,7 +1593,7 @@ transformer.transform(df).show()
 
 ## SQLTransformer
 
-`SQLTransformer` implements the transforms which are defined by SQL statement.
+`SQLTransformer` implements the transformations which are defined by SQL statement.
 Currently we only support SQL syntax like `"SELECT ... FROM __THIS__ ..."`
 where `"__THIS__"` represents the underlying table of the input dataset.
 The select clause specifies the fields, constants, and expressions to display in
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/SQLTransformer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/SQLTransformer.scala
index 0daa96fb67ac3..c09f4d076c964 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/SQLTransformer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/SQLTransformer.scala
@@ -27,7 +27,7 @@ import org.apache.spark.sql.types.StructType
 /**
  * :: Experimental ::
- * Implements the transforms which are defined by SQL statement.
+ * Implements the transformations which are defined by SQL statement.
 * Currently we only support SQL syntax like 'SELECT ... FROM __THIS__ ...'
 * where '__THIS__' represents the underlying table of the input dataset.
 * The select clause specifies the fields, constants, and expressions to display in
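Usage note (not part of the patch series): since `SQLTransformer` is an ordinary `Transformer`, it composes with other `spark.ml` stages. The following is a minimal sketch, written against the same Spark 1.6 API as the examples above, that feeds the SQL-derived columns `v3` and `v4` into a `VectorAssembler` inside a `Pipeline`. The object name `SQLTransformerPipelineSketch` and the `features` output column are illustrative assumptions, not taken from the patch.

~~~~
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.feature.{SQLTransformer, VectorAssembler}
import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}

// Hypothetical follow-up example, not part of the patch series.
object SQLTransformerPipelineSketch {
  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("SQLTransformerPipelineSketch")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)

    val df = sqlContext.createDataFrame(
      Seq((0, 1.0, 3.0), (2, 2.0, 5.0))).toDF("id", "v1", "v2")

    // Derive v3 and v4 with SQL, exactly as in the guide's example.
    val sqlTrans = new SQLTransformer().setStatement(
      "SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__")

    // Assemble the derived columns into a single feature vector.
    val assembler = new VectorAssembler()
      .setInputCols(Array("v3", "v4"))
      .setOutputCol("features")

    // SQLTransformer is a regular PipelineStage, so it chains with other
    // stages; fit() is trivial here because neither stage learns anything.
    val pipeline = new Pipeline().setStages(Array(sqlTrans, assembler))
    pipeline.fit(df).transform(df).show()

    sc.stop()
  }
}
~~~~

If this runs as expected, the printed frame should contain the `v3`/`v4` columns from the guide's output table plus a `features` vector column, e.g. `[4.0,3.0]` for the first row.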
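A second hedged sketch: the guide lists the aggregating form `SELECT a, b, SUM(c) AS c_sum FROM __THIS__ GROUP BY a, b` without demonstrating its effect. Unlike the row-wise statements, it can change the number of output rows. Continuing in the same session as the sketch above (the `sqlContext` and the `SQLTransformer` import are reused); the columns `a`, `b`, `c` and the sample rows are illustrative assumptions:

~~~~
// a, b, c are illustrative column names, not taken from the patch.
val grouped = sqlContext.createDataFrame(
  Seq((1, 1, 2.0), (1, 1, 3.0), (2, 1, 4.0))).toDF("a", "b", "c")

val aggTrans = new SQLTransformer().setStatement(
  "SELECT a, b, SUM(c) AS c_sum FROM __THIS__ GROUP BY a, b")

// Three input rows collapse to two output rows:
// (1, 1, 5.0) and (2, 1, 4.0).
aggTrans.transform(grouped).show()
~~~~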