
Commit

merge master
WangTaoTheTonic committed Jul 3, 2016
2 parents c46e5d0 + 54b27c1 commit 50977da
Showing 928 changed files with 30,423 additions and 9,960 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -77,3 +77,4 @@ spark-warehouse/
# For R session data
.RData
.RHistory
.Rhistory
1 change: 1 addition & 0 deletions LICENSE
@@ -296,3 +296,4 @@ The text of each license is also included at licenses/LICENSE-[project].txt.
(MIT License) blockUI (http://jquery.malsup.com/block/)
(MIT License) RowsGroup (http://datatables.net/license/mit)
(MIT License) jsonFormatter (http://www.jqueryscript.net/other/jQuery-Plugin-For-Pretty-JSON-Formatting-jsonFormatter.html)
(MIT License) modernizr (https://github.com/Modernizr/Modernizr/blob/master/LICENSE)
2 changes: 1 addition & 1 deletion NOTICE
@@ -1,5 +1,5 @@
Apache Spark
Copyright 2014 The Apache Software Foundation.
Copyright 2014 and onwards The Apache Software Foundation.

This product includes software developed at
The Apache Software Foundation (http://www.apache.org/).
12 changes: 6 additions & 6 deletions R/DOCUMENTATION.md
@@ -1,12 +1,12 @@
# SparkR Documentation

SparkR documentation is generated using in-source comments annotated using using
`roxygen2`. After making changes to the documentation, to generate man pages,
SparkR documentation is generated by using in-source comments and annotated by using
[`roxygen2`](https://cran.r-project.org/web/packages/roxygen2/index.html). After making changes to the documentation and generating man pages,
you can run the following from an R console in the SparkR home directory

library(devtools)
devtools::document(pkg="./pkg", roclets=c("rd"))

```R
library(devtools)
devtools::document(pkg="./pkg", roclets=c("rd"))
```
You can verify if your changes are good by running

R CMD check pkg/
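Taken together, the two commands in this hunk form a small workflow: regenerate the man pages, then check the package. A hedged sketch as a POSIX shell function (the `DRY_RUN` guard is an illustrative addition, not something in the repository; running it for real assumes R plus the `devtools` and `roxygen2` packages are installed):

```shell
# Sketch of the documentation workflow above. With DRY_RUN=1 the wrapper
# only echoes the commands instead of invoking R, which keeps the sketch
# self-contained on machines without R.
gen_sparkr_docs() {
  run() { if [ "${DRY_RUN:-0}" = "1" ]; then echo "$*"; else "$@"; fi; }
  # Regenerate the Rd man pages from the roxygen2 comments.
  run R -e 'devtools::document(pkg="./pkg", roclets=c("rd"))'
  # Verify the package, including the freshly generated docs.
  run R CMD check pkg/
}

DRY_RUN=1
gen_sparkr_docs
```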
30 changes: 14 additions & 16 deletions R/README.md
@@ -7,8 +7,7 @@ SparkR is an R package that provides a light-weight frontend to use Spark from R
Libraries of sparkR need to be created in `$SPARK_HOME/R/lib`. This can be done by running the script `$SPARK_HOME/R/install-dev.sh`.
By default the above script uses the system wide installation of R. However, this can be changed to any user installed location of R by setting the environment variable `R_HOME` to the full path of the base directory where R is installed, before running the install-dev.sh script.
Example:

```
```bash
# where /home/username/R is where R is installed and /home/username/R/bin contains the files R and RScript
export R_HOME=/home/username/R
./install-dev.sh
@@ -20,8 +19,8 @@ export R_HOME=/home/username/R

Build Spark with [Maven](http://spark.apache.org/docs/latest/building-spark.html#building-with-buildmvn) and include the `-Psparkr` profile to build the R package. For example to use the default Hadoop versions you can run

```
build/mvn -DskipTests -Psparkr package
```bash
build/mvn -DskipTests -Psparkr package
```

#### Running sparkR
@@ -40,9 +39,8 @@ To set other options like driver memory, executor memory etc. you can pass in th
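The truncated context line above mentions passing options such as driver memory and executor memory; the visible hunk carries no example. A hypothetical sketch — the `--driver-memory` and `--executor-memory` flags are standard spark-submit options and the `2g`/`4g` values are made up, none of this comes from the diff:

```shell
# Hypothetical sketch: resource options are passed to ./bin/sparkR the same
# way as to spark-submit. Flag names are standard Spark options; the values
# are illustrative only.
DRIVER_MEM="2g"
EXECUTOR_MEM="4g"
SPARKR_CMD="./bin/sparkR --driver-memory $DRIVER_MEM --executor-memory $EXECUTOR_MEM"
echo "$SPARKR_CMD"
```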

#### Using SparkR from RStudio

If you wish to use SparkR from RStudio or other R frontends you will need to set some environment variables which point SparkR to your Spark installation. For example

```
If you wish to use SparkR from RStudio or other R frontends you will need to set some environment variables which point SparkR to your Spark installation. For example
```R
# Set this to where Spark is installed
Sys.setenv(SPARK_HOME="/Users/username/spark")
# This line loads SparkR from the installed directory
@@ -59,25 +57,25 @@ Once you have made your changes, please include unit tests for them and run exis

#### Generating documentation

The SparkR documentation (Rd files and HTML files) are not a part of the source repository. To generate them you can run the script `R/create-docs.sh`. This script uses `devtools` and `knitr` to generate the docs and these packages need to be installed on the machine before using the script.
The SparkR documentation (Rd files and HTML files) is not a part of the source repository. To generate it you can run the script `R/create-docs.sh`. This script uses `devtools` and `knitr` to generate the docs, and these packages need to be installed on the machine before using the script. Also, you may need to install these [prerequisites](https://github.com/apache/spark/tree/master/docs#prerequisites). See also `R/DOCUMENTATION.md`.

### Examples, Unit tests

SparkR comes with several sample programs in the `examples/src/main/r` directory.
To run one of them, use `./bin/spark-submit <filename> <args>`. For example:

./bin/spark-submit examples/src/main/r/dataframe.R

```bash
./bin/spark-submit examples/src/main/r/dataframe.R
```
You can also run the unit tests for SparkR by running the commands below. You need to install the [testthat](http://cran.r-project.org/web/packages/testthat/index.html) package first:

R -e 'install.packages("testthat", repos="http://cran.us.r-project.org")'
./R/run-tests.sh
```bash
R -e 'install.packages("testthat", repos="http://cran.us.r-project.org")'
./R/run-tests.sh
```

### Running on YARN

The `./bin/spark-submit` can also be used to submit jobs to YARN clusters. You will need to set YARN conf dir before doing so. For example on CDH you can run

```
```bash
export YARN_CONF_DIR=/etc/hadoop/conf
./bin/spark-submit --master yarn examples/src/main/r/dataframe.R
```
23 changes: 20 additions & 3 deletions R/pkg/NAMESPACE
@@ -6,10 +6,16 @@ importFrom(methods, setGeneric, setMethod, setOldClass)
#useDynLib(SparkR, stringHashCode)

# S3 methods exported
export("sparkR.session")
export("sparkR.init")
export("sparkR.stop")
export("sparkR.session.stop")
export("sparkR.conf")
export("print.jobj")

export("sparkRSQL.init",
"sparkRHive.init")

# MLlib integration
exportMethods("glm",
"spark.glm",
@@ -45,6 +51,7 @@ exportMethods("arrange",
"corr",
"covar_samp",
"covar_pop",
"createOrReplaceTempView",
"crosstab",
"dapply",
"dapplyCollect",
@@ -61,6 +68,8 @@ exportMethods("arrange",
"filter",
"first",
"freqItems",
"gapply",
"gapplyCollect",
"group_by",
"groupBy",
"head",
@@ -79,6 +88,7 @@ exportMethods("arrange",
"orderBy",
"persist",
"printSchema",
"randomSplit",
"rbind",
"registerTempTable",
"rename",
@@ -99,6 +109,7 @@ exportMethods("arrange",
"summary",
"take",
"transform",
"union",
"unionAll",
"unique",
"unpersist",
@@ -109,6 +120,7 @@ exportMethods("arrange",
"write.df",
"write.jdbc",
"write.json",
"write.orc",
"write.parquet",
"write.text",
"write.ml")
@@ -185,6 +197,8 @@ exportMethods("%in%",
"isNaN",
"isNotNull",
"isNull",
"is.nan",
"isnan",
"kurtosis",
"lag",
"last",
@@ -208,6 +222,7 @@ exportMethods("%in%",
"mean",
"min",
"minute",
"monotonically_increasing_id",
"month",
"months_between",
"n",
@@ -220,6 +235,7 @@ exportMethods("%in%",
"over",
"percent_rank",
"pmod",
"posexplode",
"quarter",
"rand",
"randn",
@@ -248,6 +264,7 @@ exportMethods("%in%",
"skewness",
"sort_array",
"soundex",
"spark_partition_id",
"stddev",
"stddev_pop",
"stddev_samp",
@@ -281,22 +298,22 @@ exportMethods("%in%",

exportClasses("GroupedData")
exportMethods("agg")

export("sparkRSQL.init",
"sparkRHive.init")
exportMethods("pivot")

export("as.DataFrame",
"cacheTable",
"clearCache",
"createDataFrame",
"createExternalTable",
"dropTempTable",
"dropTempView",
"jsonFile",
"loadDF",
"parquetFile",
"read.df",
"read.jdbc",
"read.json",
"read.orc",
"read.parquet",
"read.text",
"spark.lapply",
