merge with master

apache · Oct 7, 2016 · 265a62f · 265a62f
2 parents c28fdb8 + 2badb58
commit 265a62f
Show file tree

Hide file tree

Showing 1,754 changed files with 65,274 additions and 28,218 deletions.
diff --git a/.gitignore b/.gitignore
@@ -17,11 +17,14 @@
 .idea/
 .idea_modules/
 .project
+.pydevproject
 .scala_dependencies
 .settings
 /lib/
 R-unit-tests.log
 R/unit-tests.out
+R/cran-check.out
+R/pkg/vignettes/sparkr-vignettes.html
 build/*.jar
 build/apache-maven*
 build/scala*
@@ -78,3 +81,7 @@ spark-warehouse/
 .RData
 .RHistory
 .Rhistory
+*.Rproj
+*.Rproj.*
+
+.Rproj.user
diff --git a/.travis.yml b/.travis.yml
@@ -44,7 +44,7 @@ notifications:
 # 5. Run maven install before running lint-java.
 install:
   - export MAVEN_SKIP_RC=1
-  - build/mvn -T 4 -q -DskipTests -Pyarn -Phadoop-2.3 -Pkinesis-asl -Phive -Phive-thriftserver install
+  - build/mvn -T 4 -q -DskipTests -Pmesos -Pyarn -Phadoop-2.3 -Pkinesis-asl -Phive -Phive-thriftserver install
 
 # 6. Run lint-java.
 script:

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -6,7 +6,7 @@ It lists steps that are required before creating a PR. In particular, consider:
 
 - Is the change important and ready enough to ask the community to spend time reviewing?
 - Have you searched for existing, related JIRAs and pull requests?
-- Is this a new feature that can stand alone as a package on http://spark-packages.org ?
+- Is this a new feature that can stand alone as a [third party project](https://cwiki.apache.org/confluence/display/SPARK/Third+Party+Projects) ?
 - Is the change being proposed clearly explained and motivated?
 
 When you contribute code, you affirm that the contribution is your original work and that you 

diff --git a/LICENSE b/LICENSE
@@ -263,7 +263,7 @@ The text of each license is also included at licenses/LICENSE-[project].txt.
      (New BSD license) Protocol Buffer Java API (org.spark-project.protobuf:protobuf-java:2.4.1-shaded - http://code.google.com/p/protobuf)
      (The BSD License) Fortran to Java ARPACK (net.sourceforge.f2j:arpack_combined_all:0.1 - http://f2j.sourceforge.net)
      (The BSD License) xmlenc Library (xmlenc:xmlenc:0.52 - http://xmlenc.sourceforge.net)
-     (The New BSD License) Py4J (net.sf.py4j:py4j:0.10.1 - http://py4j.sourceforge.net/)
+     (The New BSD License) Py4J (net.sf.py4j:py4j:0.10.3 - http://py4j.sourceforge.net/)
      (Two-clause BSD-style license) JUnit-Interface (com.novocode:junit-interface:0.10 - http://github.com/szeiger/junit-interface/)
      (BSD licence) sbt and sbt-launch-lib.bash
      (BSD 3 Clause) d3.min.js (https://github.com/mbostock/d3/blob/master/LICENSE)

diff --git a/R/.gitignore b/R/.gitignore
@@ -4,3 +4,5 @@
 lib
 pkg/man
 pkg/html
+SparkR.Rcheck/
+SparkR_*.tar.gz
diff --git a/R/WINDOWS.md b/R/WINDOWS.md
@@ -4,13 +4,23 @@ To build SparkR on Windows, the following steps are required
 
 1. Install R (>= 3.1) and [Rtools](http://cran.r-project.org/bin/windows/Rtools/). Make sure to
 include Rtools and R in `PATH`.
+
 2. Install
 [JDK7](http://www.oracle.com/technetwork/java/javase/downloads/jdk7-downloads-1880260.html) and set
 `JAVA_HOME` in the system environment variables.
+
 3. Download and install [Maven](http://maven.apache.org/download.html). Also include the `bin`
 directory in Maven in `PATH`.
+
 4. Set `MAVEN_OPTS` as described in [Building Spark](http://spark.apache.org/docs/latest/building-spark.html).
-5. Open a command shell (`cmd`) in the Spark directory and run `mvn -DskipTests -Psparkr package`
+
+5. Open a command shell (`cmd`) in the Spark directory and build Spark with [Maven](http://spark.apache.org/docs/latest/building-spark.html#building-with-buildmvn), including the `-Psparkr` profile to build the R package. For example, to use the default Hadoop versions you can run:
+
+    ```bash
+    mvn.cmd -DskipTests -Psparkr package
+    ```
+
+    `.\build\mvn` is a shell script so `mvn.cmd` should be used directly on Windows.
 
 ##  Unit tests
 

diff --git a/R/check-cran.sh b/R/check-cran.sh
@@ -0,0 +1,64 @@
+#!/bin/bash
+
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# Builds the SparkR source package and runs `R CMD check --as-cran` on it.
+# Set NO_TESTS and/or NO_MANUAL to any non-empty value to skip those checks.
+set -o pipefail
+set -e
+
+FWDIR="$(cd "`dirname "$0"`"; pwd)"
+pushd "$FWDIR" > /dev/null
+
+# Prefer the R installation named by R_HOME; otherwise fall back to R on PATH.
+if [ -n "$R_HOME" ]; then
+  R_SCRIPT_PATH="$R_HOME/bin"
+else
+  if ! command -v R > /dev/null 2>&1; then
+    echo "Cannot find 'R_HOME'. Please specify 'R_HOME' or make sure R is properly installed."
+    exit 1
+  fi
+  R_SCRIPT_PATH="$(dirname "$(command -v R)")"
+fi
+# Report the directory actually used; R_HOME is empty when R came from PATH.
+echo "Using R_SCRIPT_PATH = $R_SCRIPT_PATH"
+
+# Build the latest docs
+"$FWDIR/create-docs.sh"
+
+# Build the source package tarball (SparkR_<version>.tar.gz)
+"$R_SCRIPT_PATH/R" CMD build "$FWDIR/pkg"
+
+# Run check as-cran. Anchor the match so only the Version field is read.
+VERSION=`grep '^Version:' "$FWDIR/pkg/DESCRIPTION" | awk '{print $NF}'`
+
+CRAN_CHECK_OPTIONS="--as-cran"
+
+if [ -n "$NO_TESTS" ]; then
+  CRAN_CHECK_OPTIONS="$CRAN_CHECK_OPTIONS --no-tests"
+fi
+
+if [ -n "$NO_MANUAL" ]; then
+  CRAN_CHECK_OPTIONS="$CRAN_CHECK_OPTIONS --no-manual"
+fi
+
+echo "Running CRAN check with $CRAN_CHECK_OPTIONS options"
+
+# CRAN_CHECK_OPTIONS is left unquoted on purpose so each flag is a separate word.
+"$R_SCRIPT_PATH/R" CMD check $CRAN_CHECK_OPTIONS "SparkR_$VERSION.tar.gz"
+popd > /dev/null
diff --git a/R/create-docs.sh b/R/create-docs.sh
@@ -17,17 +17,26 @@
 # limitations under the License.
 #
 
-# Script to create API docs for SparkR
-# This requires `devtools` and `knitr` to be installed on the machine.
+# Script to create API docs and vignettes for SparkR
+# This requires `devtools`, `knitr` and `rmarkdown` to be installed on the machine.
 
 # After running this script the html docs can be found in 
 # $SPARK_HOME/R/pkg/html
+# The vignettes can be found in
+# $SPARK_HOME/R/pkg/vignettes/sparkr-vignettes.html
 
 set -o pipefail
 set -e
 
 # Figure out where the script is
 export FWDIR="$(cd "`dirname "$0"`"; pwd)"
+export SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)"
+
+# Required for setting SPARK_SCALA_VERSION
+. "${SPARK_HOME}"/bin/load-spark-env.sh
+
+echo "Using Scala $SPARK_SCALA_VERSION"
+
 pushd $FWDIR
 
 # Install the package (this will also generate the Rd files)
@@ -43,4 +52,21 @@ Rscript -e 'libDir <- "../../lib"; library(SparkR, lib.loc=libDir); library(knit
 
 popd
 
+# Find Spark jars.
+if [ -f "${SPARK_HOME}/RELEASE" ]; then
+  SPARK_JARS_DIR="${SPARK_HOME}/jars"
+else
+  SPARK_JARS_DIR="${SPARK_HOME}/assembly/target/scala-$SPARK_SCALA_VERSION/jars"
+fi
+
+# Only create vignettes if Spark JARs exist
+if [ -d "$SPARK_JARS_DIR" ]; then
+  # render creates SparkR vignettes
+  Rscript -e 'library(rmarkdown); paths <- .libPaths(); .libPaths(c("lib", paths)); Sys.setenv(SPARK_HOME=tools::file_path_as_absolute("..")); render("pkg/vignettes/sparkr-vignettes.Rmd"); .libPaths(paths)'
+
+  find pkg/vignettes/. -not -name '.' -not -name '*.Rmd' -not -name '*.md' -not -name '*.pdf' -not -name '*.html' -delete
+else
+  echo "Skipping R vignettes as Spark JARs not found in $SPARK_HOME"
+fi
+
 popd
diff --git a/R/pkg/.Rbuildignore b/R/pkg/.Rbuildignore
@@ -0,0 +1,5 @@
+^.*\.Rproj$
+^\.Rproj\.user$
+^\.lintr$
+^src-native$
+^html$
diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION
@@ -1,20 +1,25 @@
 Package: SparkR
 Type: Package
-Title: R frontend for Spark
+Title: R Frontend for Apache Spark
 Version: 2.0.0
-Date: 2013-09-09
-Author: The Apache Software Foundation
-Maintainer: Shivaram Venkataraman <shivaram@cs.berkeley.edu>
-Imports:
-    methods
+Date: 2016-08-27
+Authors@R: c(person("Shivaram", "Venkataraman", role = c("aut", "cre"),
+                    email = "shivaram@cs.berkeley.edu"),
+             person("Xiangrui", "Meng", role = "aut",
+                    email = "meng@databricks.com"),
+             person("Felix", "Cheung", role = "aut",
+                    email = "felixcheung@apache.org"),
+             person(family = "The Apache Software Foundation", role = c("aut", "cph")))
+URL: http://www.apache.org/ http://spark.apache.org/
+BugReports: https://cwiki.apache.org/confluence/display/SPARK/Contributing+to+Spark#ContributingtoSpark-ContributingBugReports
 Depends:
     R (>= 3.0),
-    methods,
+    methods
 Suggests:
     testthat,
     e1071,
     survival
-Description: R frontend for Spark
+Description: The SparkR package provides an R frontend for Apache Spark.
 License: Apache License (== 2.0)
 Collate:
     'schema.R'
@@ -33,6 +38,8 @@ Collate:
     'context.R'
     'deserialize.R'
     'functions.R'
+    'install.R'
+    'jvm.R'
     'mllib.R'
     'serialize.R'
     'sparkR.R'

diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE
@@ -1,5 +1,9 @@
 # Imports from base R
-importFrom(methods, setGeneric, setMethod, setOldClass)
+# Do not include stats:: "rpois", "runif" - causes error at runtime
+importFrom("methods", "setGeneric", "setMethod", "setOldClass")
+importFrom("methods", "is", "new", "signature", "show")
+importFrom("stats", "gaussian", "setNames")
+importFrom("utils", "download.file", "packageVersion", "untar")
 
 # Disable native libraries till we figure out how to package it
 # See SPARKR-7839
@@ -11,8 +15,15 @@ export("sparkR.init")
 export("sparkR.stop")
 export("sparkR.session.stop")
 export("sparkR.conf")
+export("sparkR.version")
 export("print.jobj")
 
+export("sparkR.newJObject")
+export("sparkR.callJMethod")
+export("sparkR.callJStatic")
+
+export("install.spark")
+
 export("sparkRSQL.init",
        "sparkRHive.init")
 
@@ -23,8 +34,16 @@ exportMethods("glm",
               "summary",
               "spark.kmeans",
               "fitted",
+              "spark.mlp",
               "spark.naiveBayes",
-              "spark.survreg")
+              "spark.survreg",
+              "spark.lda",
+              "spark.posterior",
+              "spark.perplexity",
+              "spark.isoreg",
+              "spark.gaussianMixture",
+              "spark.als",
+              "spark.kstest")
 
 # Job group lifecycle management methods
 export("setJobGroup",
@@ -317,14 +336,18 @@ export("as.DataFrame",
        "read.parquet",
        "read.text",
        "spark.lapply",
+       "spark.addFile",
+       "spark.getSparkFilesRootDirectory",
+       "spark.getSparkFiles",
        "sql",
        "str",
        "tableToDF",
        "tableNames",
        "tables",
        "uncacheTable",
        "print.summary.GeneralizedLinearRegressionModel",
-       "read.ml")
+       "read.ml",
+       "print.summary.KSTest")
 
 export("structField",
        "structField.jobj",
@@ -341,5 +364,15 @@ export("partitionBy",
        "rowsBetween",
        "rangeBetween")
 
-export("window.partitionBy",
-       "window.orderBy")
+export("windowPartitionBy",
+       "windowOrderBy")
+
+S3method(print, jobj)
+S3method(print, structField)
+S3method(print, structType)
+S3method(print, summary.GeneralizedLinearRegressionModel)
+S3method(print, summary.KSTest)
+S3method(structField, character)
+S3method(structField, jobj)
+S3method(structType, jobj)
+S3method(structType, structField)