Commit c4a2919

Merge in master

holdenk committed Jan 16, 2016
2 parents 7aecb59 + 86972fa commit c4a2919

Showing 329 changed files with 7,748 additions and 3,958 deletions.
2 changes: 2 additions & 0 deletions .rat-excludes
@@ -86,3 +86,5 @@ org.apache.spark.scheduler.SparkHistoryListenerFactory
.*parquet
LZ4BlockInputStream.java
spark-deps-.*
.*csv
.*tsv
2 changes: 1 addition & 1 deletion LICENSE
@@ -264,7 +264,7 @@ The text of each license is also included at licenses/LICENSE-[project].txt.
(New BSD license) Protocol Buffer Java API (org.spark-project.protobuf:protobuf-java:2.4.1-shaded - http://code.google.com/p/protobuf)
(The BSD License) Fortran to Java ARPACK (net.sourceforge.f2j:arpack_combined_all:0.1 - http://f2j.sourceforge.net)
(The BSD License) xmlenc Library (xmlenc:xmlenc:0.52 - http://xmlenc.sourceforge.net)
(The New BSD License) Py4J (net.sf.py4j:py4j:0.9 - http://py4j.sourceforge.net/)
(The New BSD License) Py4J (net.sf.py4j:py4j:0.9.1 - http://py4j.sourceforge.net/)
(Two-clause BSD-style license) JUnit-Interface (com.novocode:junit-interface:0.10 - http://github.com/szeiger/junit-interface/)
(BSD licence) sbt and sbt-launch-lib.bash
(BSD 3 Clause) d3.min.js (https://github.com/mbostock/d3/blob/master/LICENSE)
38 changes: 37 additions & 1 deletion NOTICE
@@ -610,7 +610,43 @@ Vis.js uses and redistributes the following third-party libraries:

===============================================================================

The CSS style for the navigation sidebar of the documentation was originally
submitted by Óscar Nájera for the scikit-learn project. The scikit-learn project
is distributed under the 3-Clause BSD license.
===============================================================================

For CSV functionality:

/*
 * Copyright 2014 Databricks
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
 * Copyright 2015 Ayasdi Inc
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */


1 change: 1 addition & 0 deletions R/pkg/NAMESPACE
@@ -278,6 +278,7 @@ export("as.DataFrame",
"read.parquet",
"read.text",
"sql",
"str",
"table",
"tableNames",
"tables",
73 changes: 73 additions & 0 deletions R/pkg/R/DataFrame.R
@@ -2299,3 +2299,76 @@ setMethod("with",
            newEnv <- assignNewEnv(data)
            eval(substitute(expr), envir = newEnv, enclos = newEnv)
          })

#' Display the structure of a DataFrame, including column names, column types, and a
#' small sample of rows.
#' @name str
#' @title Compactly display the structure of a dataset
#' @rdname str
#' @family DataFrame functions
#' @param object a DataFrame
#' @examples \dontrun{
#' # Create a DataFrame from the Iris dataset
#' irisDF <- createDataFrame(sqlContext, iris)
#'
#' # Show the structure of the DataFrame
#' str(irisDF)
#' }
setMethod("str",
signature(object = "DataFrame"),
function(object) {

# TODO: These could be made global parameters, though in R it's not the case
MAX_CHAR_PER_ROW <- 120
MAX_COLS <- 100

# Get the column names and types of the DataFrame
names <- names(object)
types <- coltypes(object)

# Get the first elements of the dataset. Limit number of columns accordingly
localDF <- if (ncol(object) > MAX_COLS) {
head(object[, c(1:MAX_COLS)])
} else {
head(object)
}

# The number of observations will not be displayed as computing the
# number of rows is a very expensive operation
cat(paste0("'", class(object), "': ", length(names), " variables:\n"))

if (nrow(localDF) > 0) {
for (i in 1 : ncol(localDF)) {
# Get the first elements for each column

firstElements <- if (types[i] == "character") {
paste(paste0("\"", localDF[,i], "\""), collapse = " ")
} else {
paste(localDF[,i], collapse = " ")
}

# Add the corresponding number of spaces for alignment
spaces <- paste(rep(" ", max(nchar(names) - nchar(names[i]))), collapse="")

# Get the short type. For 'character', it would be 'chr';
# 'for numeric', it's 'num', etc.
dataType <- SHORT_TYPES[[types[i]]]
if (is.null(dataType)) {
dataType <- substring(types[i], 1, 3)
}

# Concatenate the colnames, coltypes, and first
# elements of each column
line <- paste0(" $ ", names[i], spaces, ": ",
dataType, " ",firstElements)

# Chop off extra characters if this is too long
cat(substr(line, 1, MAX_CHAR_PER_ROW))
cat("\n")
}

if (ncol(localDF) < ncol(object)) {
cat(paste0("\nDisplaying first ", ncol(localDF), " columns only."))
}
}
})
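
A minimal usage sketch of the new str() method may help here (hypothetical SparkR 1.6-era session; assumes an initialized sqlContext as in the roxygen example above, with the printed output reconstructed from the new test_sparkSQL.R assertions in this commit):

# SparkR column names may not contain ".", so rename the iris columns first
iris2 <- iris
colnames(iris2) <- c("Sepal_Length", "Sepal_Width", "Petal_Length", "Petal_Width", "Species")
irisDF <- createDataFrame(sqlContext, iris2)  # sqlContext: assumed pre-existing SQLContext
str(irisDF)
# Expected output (per the tests added in this commit):
# 'DataFrame': 5 variables:
#  $ Sepal_Length: num 5.1 4.9 4.7 4.6 5 5.4
#  $ Sepal_Width : num 3.5 3 3.2 3.1 3.6 3.9
#  $ Petal_Length: num 1.4 1.4 1.3 1.5 1.4 1.7
#  $ Petal_Width : num 0.2 0.2 0.2 0.2 0.2 0.4
#  $ Species     : chr "setosa" "setosa" "setosa" "setosa" "setosa" "setosa"
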
36 changes: 18 additions & 18 deletions R/pkg/R/generics.R
@@ -378,7 +378,6 @@ setGeneric("subtractByKey",
setGeneric("value", function(bcast) { standardGeneric("value") })



#################### DataFrame Methods ########################

#' @rdname agg
@@ -389,6 +388,14 @@ setGeneric("agg", function (x, ...) { standardGeneric("agg") })
#' @export
setGeneric("arrange", function(x, col, ...) { standardGeneric("arrange") })

#' @rdname as.data.frame
#' @export
setGeneric("as.data.frame")

#' @rdname attach
#' @export
setGeneric("attach")

#' @rdname columns
#' @export
setGeneric("colnames", function(x, do.NULL = TRUE, prefix = "col") { standardGeneric("colnames") })
@@ -525,13 +532,12 @@ setGeneric("saveAsTable", function(df, tableName, source, mode, ...) {
standardGeneric("saveAsTable")
})

#' @rdname withColumn
#' @export
setGeneric("transform", function(`_data`, ...) {standardGeneric("transform") })
setGeneric("str")

#' @rdname write.df
#' @rdname mutate
#' @export
setGeneric("write.df", function(df, path, ...) { standardGeneric("write.df") })
setGeneric("transform", function(`_data`, ...) {standardGeneric("transform") })

#' @rdname write.df
#' @export
@@ -593,6 +599,10 @@ setGeneric("unionAll", function(x, y) { standardGeneric("unionAll") })
#' @export
setGeneric("where", function(x, condition) { standardGeneric("where") })

#' @rdname with
#' @export
setGeneric("with")

#' @rdname withColumn
#' @export
setGeneric("withColumn", function(x, colName, col) { standardGeneric("withColumn") })
@@ -602,6 +612,9 @@ setGeneric("withColumn", function(x, colName, col) { standardGeneric("withColumn
setGeneric("withColumnRenamed",
function(x, existingCol, newCol) { standardGeneric("withColumnRenamed") })

#' @rdname write.df
#' @export
setGeneric("write.df", function(df, path, ...) { standardGeneric("write.df") })

###################### Column Methods ##########################

@@ -1109,7 +1122,6 @@ setGeneric("weekofyear", function(x) { standardGeneric("weekofyear") })
#' @export
setGeneric("year", function(x) { standardGeneric("year") })


#' @rdname glm
#' @export
setGeneric("glm")
@@ -1121,15 +1133,3 @@ setGeneric("predict", function(object, ...) { standardGeneric("predict") })
#' @rdname rbind
#' @export
setGeneric("rbind", signature = "...")

#' @rdname as.data.frame
#' @export
setGeneric("as.data.frame")

#' @rdname attach
#' @export
setGeneric("attach")

#' @rdname with
#' @export
setGeneric("with")
21 changes: 17 additions & 4 deletions R/pkg/R/types.R
@@ -47,10 +47,23 @@ COMPLEX_TYPES <- list(
# The full list of data types.
DATA_TYPES <- as.environment(c(as.list(PRIMITIVE_TYPES), COMPLEX_TYPES))

SHORT_TYPES <- as.environment(list(
  "character" = "chr",
  "logical" = "logi",
  "POSIXct" = "POSIXct",
  "integer" = "int",
  "numeric" = "num",
  "raw" = "raw",
  "Date" = "Date",
  "map" = "map",
  "array" = "array",
  "struct" = "struct"
))

# An environment for mapping R to Scala, names are R types and values are Scala types.
rToSQLTypes <- as.environment(list(
"integer" = "integer", # in R, integer is 32bit
"numeric" = "double", # in R, numeric == double which is 64bit
"double" = "double",
"integer" = "integer", # in R, integer is 32bit
"numeric" = "double", # in R, numeric == double which is 64bit
"double" = "double",
"character" = "string",
"logical" = "boolean"))
"logical" = "boolean"))
33 changes: 32 additions & 1 deletion R/pkg/inst/tests/testthat/test_sparkSQL.R
@@ -1173,7 +1173,7 @@ test_that("group by, agg functions", {

  expect_equal(3, count(mean(gd)))
  expect_equal(3, count(max(gd)))
  expect_equal(30, collect(max(gd))[1, 2])
  expect_equal(30, collect(max(gd))[2, 2])
  expect_equal(1, collect(count(gd))[1, 2])

  mockLines2 <- c("{\"name\":\"ID1\", \"value\": \"10\"}",
@@ -1799,6 +1799,37 @@ test_that("Method coltypes() to get and set R's data types of a DataFrame", {
"Only atomic type is supported for column types")
})

test_that("Method str()", {
# Structure of Iris
iris2 <- iris
colnames(iris2) <- c("Sepal_Length", "Sepal_Width", "Petal_Length", "Petal_Width", "Species")
iris2$col <- TRUE
irisDF2 <- createDataFrame(sqlContext, iris2)

out <- capture.output(str(irisDF2))
expect_equal(length(out), 7)
expect_equal(out[1], "'DataFrame': 6 variables:")
expect_equal(out[2], " $ Sepal_Length: num 5.1 4.9 4.7 4.6 5 5.4")
expect_equal(out[3], " $ Sepal_Width : num 3.5 3 3.2 3.1 3.6 3.9")
expect_equal(out[4], " $ Petal_Length: num 1.4 1.4 1.3 1.5 1.4 1.7")
expect_equal(out[5], " $ Petal_Width : num 0.2 0.2 0.2 0.2 0.2 0.4")
expect_equal(out[6], paste0(" $ Species : chr \"setosa\" \"setosa\" \"",
"setosa\" \"setosa\" \"setosa\" \"setosa\""))
expect_equal(out[7], " $ col : logi TRUE TRUE TRUE TRUE TRUE TRUE")

# A random dataset with many columns. This test is to check str limits
# the number of columns. Therefore, it will suffice to check for the
# number of returned rows
x <- runif(200, 1, 10)
df <- data.frame(t(as.matrix(data.frame(x,x,x,x,x,x,x,x,x))))
DF <- createDataFrame(sqlContext, df)
out <- capture.output(str(DF))
expect_equal(length(out), 103)

# Test utils:::str
expect_equal(capture.output(utils:::str(iris)), capture.output(str(iris)))
})

unlink(parquetPath)
unlink(jsonPath)
unlink(jsonPathNa)
2 changes: 1 addition & 1 deletion bin/pyspark
@@ -67,7 +67,7 @@ export PYSPARK_PYTHON

# Add the PySpark classes to the Python path:
export PYTHONPATH="${SPARK_HOME}/python/:$PYTHONPATH"
export PYTHONPATH="${SPARK_HOME}/python/lib/py4j-0.9-src.zip:$PYTHONPATH"
export PYTHONPATH="${SPARK_HOME}/python/lib/py4j-0.9.1-src.zip:$PYTHONPATH"

# Load the PySpark shell.py script when ./pyspark is used interactively:
export OLD_PYTHONSTARTUP="$PYTHONSTARTUP"
2 changes: 1 addition & 1 deletion bin/pyspark2.cmd
@@ -30,7 +30,7 @@ if "x%PYSPARK_DRIVER_PYTHON%"=="x" (
)

set PYTHONPATH=%SPARK_HOME%\python;%PYTHONPATH%
set PYTHONPATH=%SPARK_HOME%\python\lib\py4j-0.9-src.zip;%PYTHONPATH%
set PYTHONPATH=%SPARK_HOME%\python\lib\py4j-0.9.1-src.zip;%PYTHONPATH%

set OLD_PYTHONSTARTUP=%PYTHONSTARTUP%
set PYTHONSTARTUP=%SPARK_HOME%\python\pyspark\shell.py
8 changes: 7 additions & 1 deletion checkstyle.xml
@@ -58,6 +58,12 @@
<property name="eachLine" value="true"/>
</module>

<module name="RegexpSingleline">
<!-- \s matches whitespace character, $ matches end of line. -->
<property name="format" value="\s+$"/>
<property name="message" value="No trailing whitespace allowed."/>
</module>

<module name="TreeWalker">
<module name="OuterTypeFilename"/>
<module name="IllegalTokenText">
@@ -84,7 +90,7 @@
        </module>
        <module name="NeedBraces">
            <property name="allowSingleLineStatement" value="true"/>
        </module>
        <module name="OneStatementPerLine"/>
        <module name="ArrayTypeStyle"/>
        <module name="FallThrough"/>
29 changes: 1 addition & 28 deletions core/pom.xml
@@ -267,33 +267,6 @@
      <artifactId>oro</artifactId>
      <version>${oro.version}</version>
    </dependency>
    <dependency>
      <groupId>org.tachyonproject</groupId>
      <artifactId>tachyon-client</artifactId>
      <version>0.8.2</version>
      <exclusions>
        <exclusion>
          <groupId>org.apache.hadoop</groupId>
          <artifactId>hadoop-client</artifactId>
        </exclusion>
        <exclusion>
          <groupId>org.apache.curator</groupId>
          <artifactId>curator-client</artifactId>
        </exclusion>
        <exclusion>
          <groupId>org.apache.curator</groupId>
          <artifactId>curator-framework</artifactId>
        </exclusion>
        <exclusion>
          <groupId>org.apache.curator</groupId>
          <artifactId>curator-recipes</artifactId>
        </exclusion>
        <exclusion>
          <groupId>org.tachyonproject</groupId>
          <artifactId>tachyon-underfs-glusterfs</artifactId>
        </exclusion>
      </exclusions>
    </dependency>
    <dependency>
      <groupId>org.seleniumhq.selenium</groupId>
      <artifactId>selenium-java</artifactId>
@@ -350,7 +323,7 @@
    <dependency>
      <groupId>net.sf.py4j</groupId>
      <artifactId>py4j</artifactId>
      <version>0.9</version>
      <version>0.9.1</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
