From b95be04614618c5971cfb5f08bf3af76d9b94d39 Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Fri, 21 Jul 2017 01:14:43 +0900 Subject: [PATCH 1/3] SQL documentation generation for built-in functions --- .gitignore | 2 + docs/README.md | 6 +- docs/_layouts/global.html | 1 + docs/_plugins/copy_api_dirs.rb | 27 ++++++ docs/api.md | 1 + docs/index.md | 1 + sql/README.md | 2 + .../spark/sql/api/python/PythonSQLUtils.scala | 7 ++ sql/create-docs.sh | 56 +++++++++++ sql/gen-sql-markdown.py | 96 +++++++++++++++++++ sql/log4j.properties | 24 +++++ sql/mkdocs.yml | 19 ++++ 12 files changed, 239 insertions(+), 3 deletions(-) create mode 100755 sql/create-docs.sh create mode 100644 sql/gen-sql-markdown.py create mode 100644 sql/log4j.properties create mode 100644 sql/mkdocs.yml diff --git a/.gitignore b/.gitignore index cf9780db37ad7..903297db96901 100644 --- a/.gitignore +++ b/.gitignore @@ -47,6 +47,8 @@ dev/pr-deps/ dist/ docs/_site docs/api +sql/docs +sql/site lib_managed/ lint-r-report.log log/ diff --git a/docs/README.md b/docs/README.md index 90e10a104b517..0090dd071e15f 100644 --- a/docs/README.md +++ b/docs/README.md @@ -68,6 +68,6 @@ jekyll plugin to run `build/sbt unidoc` before building the site so if you haven may take some time as it generates all of the scaladoc. The jekyll plugin also generates the PySpark docs using [Sphinx](http://sphinx-doc.org/). -NOTE: To skip the step of building and copying over the Scala, Python, R API docs, run `SKIP_API=1 -jekyll`. In addition, `SKIP_SCALADOC=1`, `SKIP_PYTHONDOC=1`, and `SKIP_RDOC=1` can be used to skip a single -step of the corresponding language. +NOTE: To skip the step of building and copying over the Scala, Python, R and SQL API docs, run `SKIP_API=1 +jekyll`. In addition, `SKIP_SCALADOC=1`, `SKIP_PYTHONDOC=1`, `SKIP_RDOC=1` and `SKIP_SQLDOC=1` can be used +to skip a single step of the corresponding language. diff --git a/docs/_layouts/global.html b/docs/_layouts/global.html index 570483c0b04ea..67b05ecf7a858 100755 --- a/docs/_layouts/global.html +++ b/docs/_layouts/global.html @@ -86,6 +86,7 @@
                                 <li><a href="api/java/index.html">Java</a></li>
                                 <li><a href="api/python/index.html">Python</a></li>
                                 <li><a href="api/R/index.html">R</a></li>
+                                <li><a href="api/sql/index.html">SQL, Built-in Functions</a></li>
                             </ul>
                         </li>
  • diff --git a/docs/_plugins/copy_api_dirs.rb b/docs/_plugins/copy_api_dirs.rb index 95e3ba35e9027..00366f803c2ad 100644 --- a/docs/_plugins/copy_api_dirs.rb +++ b/docs/_plugins/copy_api_dirs.rb @@ -150,4 +150,31 @@ cp("../R/pkg/DESCRIPTION", "api") end + if not (ENV['SKIP_SQLDOC'] == '1') + # Build SQL API docs + + puts "Moving to project root and building API docs." + curr_dir = pwd + cd("..") + + puts "Running 'build/sbt clean package' from " + pwd + "; this may take a few minutes..." + system("build/sbt clean package") || raise("SQL doc generation failed") + + puts "Moving back into docs dir." + cd("docs") + + puts "Moving to SQL directory and building docs." + cd("../sql") + system("./create-docs.sh") || raise("SQL doc generation failed") + + puts "Moving back into docs dir." + cd("../docs") + + puts "Making directory api/sql" + mkdir_p "api/sql" + + puts "cp -r ../sql/site/. api/sql" + cp_r("../sql/site/.", "api/sql") + end + end diff --git a/docs/api.md b/docs/api.md index ae7d51c2aefbf..70484f02de78d 100644 --- a/docs/api.md +++ b/docs/api.md @@ -9,3 +9,4 @@ Here you can read API docs for Spark and its submodules. - [Spark Java API (Javadoc)](api/java/index.html) - [Spark Python API (Sphinx)](api/python/index.html) - [Spark R API (Roxygen2)](api/R/index.html) +- [Spark SQL, Built-in Functions (MkDocs)](api/sql/index.html) diff --git a/docs/index.md b/docs/index.md index 2d4607b3119bd..b867c972b4b48 100644 --- a/docs/index.md +++ b/docs/index.md @@ -100,6 +100,7 @@ options for deployment: * [Spark Java API (Javadoc)](api/java/index.html) * [Spark Python API (Sphinx)](api/python/index.html) * [Spark R API (Roxygen2)](api/R/index.html) +* [Spark SQL, Built-in Functions (MkDocs)](api/sql/index.html) **Deployment Guides:** diff --git a/sql/README.md b/sql/README.md index 58e9097ed4db1..fe1d352050c09 100644 --- a/sql/README.md +++ b/sql/README.md @@ -8,3 +8,5 @@ Spark SQL is broken up into four subprojects: - Execution (sql/core) - A query planner / execution engine for translating Catalyst's logical query plans into Spark RDDs. This component also includes a new public interface, SQLContext, that allows users to execute SQL or LINQ statements against existing RDDs and Parquet files. - Hive Support (sql/hive) - Includes an extension of SQLContext called HiveContext that allows users to write queries using a subset of HiveQL and access data from a Hive Metastore using Hive SerDes. There are also wrappers that allows users to run queries that include Hive UDFs, UDAFs, and UDTFs. - HiveServer and CLI support (sql/hive-thriftserver) - Includes support for the SQL CLI (bin/spark-sql) and a HiveServer2 (for JDBC/ODBC) compatible server. + +Running `sql/create-docs.sh` generates SQL documentation for built-in functions under `sql/site`. 
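As the sql/README.md addition above notes, the docs can also be generated standalone. A minimal sketch of a local run, assuming mkdocs is already installed (for example via pip) and mirroring the steps that docs/_plugins/copy_api_dirs.rb performs:

```sh
# Minimal sketch of a standalone run (assumes mkdocs is on the PATH, e.g. via
# `pip install mkdocs`; create-docs.sh itself skips generation if python or
# mkdocs is missing).
cd "$SPARK_HOME"
build/sbt clean package   # the generator goes through spark-submit, so Spark must be built first
cd sql
./create-docs.sh          # writes markdown into a temporary docs/ dir and the HTML site into sql/site
```
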
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/api/python/PythonSQLUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/api/python/PythonSQLUtils.scala index 731feb914d251..4d5ce0bb60c0b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/api/python/PythonSQLUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/api/python/PythonSQLUtils.scala @@ -17,9 +17,16 @@ package org.apache.spark.sql.api.python +import org.apache.spark.sql.catalyst.analysis.FunctionRegistry +import org.apache.spark.sql.catalyst.expressions.ExpressionInfo import org.apache.spark.sql.catalyst.parser.CatalystSqlParser import org.apache.spark.sql.types.DataType private[sql] object PythonSQLUtils { def parseDataType(typeText: String): DataType = CatalystSqlParser.parseDataType(typeText) + + // This is needed when generating SQL documentation for built-in functions. + def listBuiltinFunctionInfos(): Array[ExpressionInfo] = { + FunctionRegistry.functionSet.flatMap(f => FunctionRegistry.builtin.lookupFunction(f)).toArray + } } diff --git a/sql/create-docs.sh b/sql/create-docs.sh new file mode 100755 index 0000000000000..e44f2f6a0540f --- /dev/null +++ b/sql/create-docs.sh @@ -0,0 +1,56 @@ +#!/bin/bash + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Script to create SQL API docs. This requires `mkdocs`. +# Also, it needs to build Spark first. + +# After running this script the html docs can be found in +# $SPARK_HOME/sql/docs/site + +set -o pipefail +set -e + +FWDIR="$(cd "`dirname "${BASH_SOURCE[0]}"`"; pwd)" +SPARK_HOME="$(cd "`dirname "${BASH_SOURCE[0]}"`"/..; pwd)" +WAREHOUSE_DIR="$FWDIR/_spark-warehouse" + +if ! hash python 2>/dev/null; then + echo "Missing python in the path, skipping SQL documentation generation." + exit 0 +fi + +if ! hash mkdocs 2>/dev/null; then + echo "Missing mkdocs in the path, skipping SQL documentation generation." + exit 0 +fi + +# Now create markdown file +rm -fr docs +rm -rf "$WAREHOUSE_DIR" +mkdir docs +echo "Generating markdown files for SQL documentation." +"$SPARK_HOME/bin/spark-submit" \ + --driver-java-options "-Dlog4j.configuration=file:$FWDIR/log4j.properties" \ + --conf spark.sql.warehouse.dir="$WAREHOUSE_DIR" \ + gen-sql-markdown.py +rm -rf "$WAREHOUSE_DIR" + +# Now create HTML files +echo "Generating HTML files for SQL documentation." +mkdocs build --clean diff --git a/sql/gen-sql-markdown.py b/sql/gen-sql-markdown.py new file mode 100644 index 0000000000000..8c6090ac0f10d --- /dev/null +++ b/sql/gen-sql-markdown.py @@ -0,0 +1,96 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. 
+# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import sys +import os +from collections import namedtuple + +from pyspark.sql import SparkSession + +ExpressionInfo = namedtuple("ExpressionInfo", "className usage name extended") + + +def _list_function_infos(spark): + """ + Returns a list of function information via JVM. Sorts wrapped expression infos by name + and returns them. + """ + + jinfos = spark.sparkContext._jvm \ + .org.apache.spark.sql.api.python.PythonSQLUtils.listBuiltinFunctionInfos() + infos = [] + for jinfo in jinfos: + name = jinfo.getName() + usage = jinfo.getUsage() + usage = usage.replace("_FUNC_", name) if usage is not None else usage + extended = jinfo.getExtended() + extended = extended.replace("_FUNC_", name) if extended is not None else extended + infos.append(ExpressionInfo( + className=jinfo.getClassName(), + usage=usage, + name=name, + extended=extended)) + return sorted(infos, key=lambda i: i.name) + + +def _make_pretty_usage(usage): + """ + Makes the usage description pretty and returns a formatted string. + Otherwise, returns None. + """ + + if usage is not None and usage.strip() != "": + usage = "\n".join(map(lambda u: u.strip(), usage.split("\n"))) + return "%s\n\n" % usage + + +def _make_pretty_extended(extended): + """ + Makes the extended description pretty and returns a formatted string. + Otherwise, returns None. + """ + + if extended is not None and extended.strip() != "": + extended = "\n".join(map(lambda u: u.strip(), extended.split("\n"))) + return "```%s```\n\n" % extended + + +def generate_sql_markdown(spark, path): + """ + Generates a markdown file after listing the function information. The output file + is created in `path`. + """ + + with open(path, 'w') as mdfile: + for info in _list_function_infos(spark): + mdfile.write("### %s\n\n" % info.name) + usage = _make_pretty_usage(info.usage) + extended = _make_pretty_extended(info.extended) + if usage is not None: + mdfile.write(usage) + if extended is not None: + mdfile.write(extended) + + +if __name__ == "__main__": + spark = SparkSession \ + .builder \ + .appName("GenSQLDocs") \ + .getOrCreate() + markdown_file_path = "%s/docs/index.md" % os.path.dirname(sys.argv[0]) + generate_sql_markdown(spark, markdown_file_path) + spark.stop() diff --git a/sql/log4j.properties b/sql/log4j.properties new file mode 100644 index 0000000000000..001285116ce3b --- /dev/null +++ b/sql/log4j.properties @@ -0,0 +1,24 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# SQL documentation generation simply accesses to JVM and gets the list of functions. +# Just suppresses info level logs. +log4j.rootCategory=ERROR, console +log4j.appender.console=org.apache.log4j.ConsoleAppender +log4j.appender.console.target=System.err +log4j.appender.console.layout=org.apache.log4j.PatternLayout +log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n diff --git a/sql/mkdocs.yml b/sql/mkdocs.yml new file mode 100644 index 0000000000000..c34c891bb9e42 --- /dev/null +++ b/sql/mkdocs.yml @@ -0,0 +1,19 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +site_name: Spark SQL, Built-in Functions +theme: readthedocs +pages: + - 'Functions': 'index.md' From c92533b3e36eb5ff7437aeb1f55fb2ddaf5cc407 Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Tue, 25 Jul 2017 18:47:14 +0900 Subject: [PATCH 2/3] Avoid initialize SparkSession but only JVM --- sql/create-docs.sh | 8 ++------ sql/gen-sql-markdown.py | 21 ++++++++------------- sql/log4j.properties | 24 ------------------------ 3 files changed, 10 insertions(+), 43 deletions(-) delete mode 100644 sql/log4j.properties diff --git a/sql/create-docs.sh b/sql/create-docs.sh index e44f2f6a0540f..b0f65e9f04707 100755 --- a/sql/create-docs.sh +++ b/sql/create-docs.sh @@ -42,15 +42,11 @@ fi # Now create markdown file rm -fr docs -rm -rf "$WAREHOUSE_DIR" mkdir docs echo "Generating markdown files for SQL documentation." -"$SPARK_HOME/bin/spark-submit" \ - --driver-java-options "-Dlog4j.configuration=file:$FWDIR/log4j.properties" \ - --conf spark.sql.warehouse.dir="$WAREHOUSE_DIR" \ - gen-sql-markdown.py -rm -rf "$WAREHOUSE_DIR" +"$SPARK_HOME/bin/spark-submit" gen-sql-markdown.py # Now create HTML files echo "Generating HTML files for SQL documentation." mkdocs build --clean +rm -fr docs diff --git a/sql/gen-sql-markdown.py b/sql/gen-sql-markdown.py index 8c6090ac0f10d..8132af2708aea 100644 --- a/sql/gen-sql-markdown.py +++ b/sql/gen-sql-markdown.py @@ -19,19 +19,16 @@ import os from collections import namedtuple -from pyspark.sql import SparkSession - ExpressionInfo = namedtuple("ExpressionInfo", "className usage name extended") -def _list_function_infos(spark): +def _list_function_infos(jvm): """ Returns a list of function information via JVM. Sorts wrapped expression infos by name and returns them. 
""" - jinfos = spark.sparkContext._jvm \ - .org.apache.spark.sql.api.python.PythonSQLUtils.listBuiltinFunctionInfos() + jinfos = jvm.org.apache.spark.sql.api.python.PythonSQLUtils.listBuiltinFunctionInfos() infos = [] for jinfo in jinfos: name = jinfo.getName() @@ -69,14 +66,14 @@ def _make_pretty_extended(extended): return "```%s```\n\n" % extended -def generate_sql_markdown(spark, path): +def generate_sql_markdown(jvm, path): """ Generates a markdown file after listing the function information. The output file is created in `path`. """ with open(path, 'w') as mdfile: - for info in _list_function_infos(spark): + for info in _list_function_infos(jvm): mdfile.write("### %s\n\n" % info.name) usage = _make_pretty_usage(info.usage) extended = _make_pretty_extended(info.extended) @@ -87,10 +84,8 @@ def generate_sql_markdown(spark, path): if __name__ == "__main__": - spark = SparkSession \ - .builder \ - .appName("GenSQLDocs") \ - .getOrCreate() + from pyspark.java_gateway import launch_gateway + + jvm = launch_gateway().jvm markdown_file_path = "%s/docs/index.md" % os.path.dirname(sys.argv[0]) - generate_sql_markdown(spark, markdown_file_path) - spark.stop() + generate_sql_markdown(jvm, markdown_file_path) diff --git a/sql/log4j.properties b/sql/log4j.properties deleted file mode 100644 index 001285116ce3b..0000000000000 --- a/sql/log4j.properties +++ /dev/null @@ -1,24 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# SQL documentation generation simply accesses to JVM and gets the list of functions. -# Just suppresses info level logs. -log4j.rootCategory=ERROR, console -log4j.appender.console=org.apache.log4j.ConsoleAppender -log4j.appender.console.target=System.err -log4j.appender.console.layout=org.apache.log4j.PatternLayout -log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n From c711ff5363a0608c11b97c915bfc8b8cdbf2ba95 Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Tue, 25 Jul 2017 20:02:54 +0900 Subject: [PATCH 3/3] Minor cleanup --- sql/create-docs.sh | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/sql/create-docs.sh b/sql/create-docs.sh index b0f65e9f04707..275e4c391a388 100755 --- a/sql/create-docs.sh +++ b/sql/create-docs.sh @@ -17,36 +17,33 @@ # limitations under the License. # -# Script to create SQL API docs. This requires `mkdocs`. -# Also, it needs to build Spark first. - -# After running this script the html docs can be found in -# $SPARK_HOME/sql/docs/site +# Script to create SQL API docs. This requires `mkdocs` and to build +# Spark first. 
After running this script the html docs can be found in +# $SPARK_HOME/sql/site set -o pipefail set -e FWDIR="$(cd "`dirname "${BASH_SOURCE[0]}"`"; pwd)" SPARK_HOME="$(cd "`dirname "${BASH_SOURCE[0]}"`"/..; pwd)" -WAREHOUSE_DIR="$FWDIR/_spark-warehouse" if ! hash python 2>/dev/null; then - echo "Missing python in the path, skipping SQL documentation generation." + echo "Missing python in your path, skipping SQL documentation generation." exit 0 fi if ! hash mkdocs 2>/dev/null; then - echo "Missing mkdocs in the path, skipping SQL documentation generation." + echo "Missing mkdocs in your path, skipping SQL documentation generation." exit 0 fi -# Now create markdown file +# Now create the markdown file rm -fr docs mkdir docs echo "Generating markdown files for SQL documentation." "$SPARK_HOME/bin/spark-submit" gen-sql-markdown.py -# Now create HTML files +# Now create the HTML files echo "Generating HTML files for SQL documentation." mkdocs build --clean rm -fr docs
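
For context, a rough sketch of how the new step is exercised from the main documentation build once this series is applied; the jekyll invocations follow the conventions already described in docs/README.md and are illustrative rather than exact:

```sh
# Illustrative only: the SKIP_* handling lives in docs/_plugins/copy_api_dirs.rb.
cd "$SPARK_HOME"/docs
jekyll build                 # full build; the SQL function docs are copied under api/sql
SKIP_SQLDOC=1 jekyll build   # skip only the SQL step, like SKIP_SCALADOC/SKIP_PYTHONDOC/SKIP_RDOC
SKIP_API=1 jekyll build      # skip building and copying all API docs
```

The api/sql output is what docs/api.md and docs/index.md now link to as "Spark SQL, Built-in Functions (MkDocs)".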