Commit c4a2919

Merge in master

holdenk committed Jan 16, 2016
2 parents 7aecb59 + 86972fa commit c4a2919

Showing 329 changed files with 7,748 additions and 3,958 deletions.
2 changes: 2 additions & 0 deletions .rat-excludes
@@ -86,3 +86,5 @@ org.apache.spark.scheduler.SparkHistoryListenerFactory
.*parquet
LZ4BlockInputStream.java
spark-deps-.*
.*csv
.*tsv
2 changes: 1 addition & 1 deletion LICENSE
@@ -264,7 +264,7 @@ The text of each license is also included at licenses/LICENSE-[project].txt.
(New BSD license) Protocol Buffer Java API (org.spark-project.protobuf:protobuf-java:2.4.1-shaded - http://code.google.com/p/protobuf)
(The BSD License) Fortran to Java ARPACK (net.sourceforge.f2j:arpack_combined_all:0.1 - http://f2j.sourceforge.net)
(The BSD License) xmlenc Library (xmlenc:xmlenc:0.52 - http://xmlenc.sourceforge.net)
(The New BSD License) Py4J (net.sf.py4j:py4j:0.9 - http://py4j.sourceforge.net/)
(The New BSD License) Py4J (net.sf.py4j:py4j:0.9.1 - http://py4j.sourceforge.net/)
(Two-clause BSD-style license) JUnit-Interface (com.novocode:junit-interface:0.10 - http://github.com/szeiger/junit-interface/)
(BSD licence) sbt and sbt-launch-lib.bash
(BSD 3 Clause) d3.min.js (https://github.com/mbostock/d3/blob/master/LICENSE)
38 changes: 37 additions & 1 deletion NOTICE
@@ -610,7 +610,43 @@ Vis.js uses and redistributes the following third-party libraries:

===============================================================================

The CSS style for the navigation sidebar of the documentation was originally
submitted by Óscar Nájera for the scikit-learn project. The scikit-learn project
is distributed under the 3-Clause BSD license.
===============================================================================

For CSV functionality:

/*
 * Copyright 2014 Databricks
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
 * Copyright 2015 Ayasdi Inc
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */


1 change: 1 addition & 0 deletions R/pkg/NAMESPACE
@@ -278,6 +278,7 @@ export("as.DataFrame",
"read.parquet",
"read.text",
"sql",
"str",
"table",
"tableNames",
"tables",
73 changes: 73 additions & 0 deletions R/pkg/R/DataFrame.R
@@ -2299,3 +2299,76 @@ setMethod("with",
            newEnv <- assignNewEnv(data)
            eval(substitute(expr), envir = newEnv, enclos = newEnv)
          })

#' Display the structure of a DataFrame, including column names, column types, and a
#' small sample of rows.
#' @name str
#' @title Compactly display the structure of a dataset
#' @rdname str
#' @family DataFrame functions
#' @param object a DataFrame
#' @examples \dontrun{
#' # Create a DataFrame from the Iris dataset
#' irisDF <- createDataFrame(sqlContext, iris)
#'
#' # Show the structure of the DataFrame
#' str(irisDF)
#' }
setMethod("str",
signature(object = "DataFrame"),
function(object) {

# TODO: These could be made global parameters, though in R it's not the case
MAX_CHAR_PER_ROW <- 120
MAX_COLS <- 100

# Get the column names and types of the DataFrame
names <- names(object)
types <- coltypes(object)

# Get the first elements of the dataset. Limit number of columns accordingly
localDF <- if (ncol(object) > MAX_COLS) {
head(object[, c(1:MAX_COLS)])
} else {
head(object)
}

# The number of observations will not be displayed as computing the
# number of rows is a very expensive operation
cat(paste0("'", class(object), "': ", length(names), " variables:\n"))

if (nrow(localDF) > 0) {
for (i in 1 : ncol(localDF)) {
# Get the first elements for each column

firstElements <- if (types[i] == "character") {
paste(paste0("\"", localDF[,i], "\""), collapse = " ")
} else {
paste(localDF[,i], collapse = " ")
}

# Add the corresponding number of spaces for alignment
spaces <- paste(rep(" ", max(nchar(names) - nchar(names[i]))), collapse="")

# Get the short type. For 'character', it would be 'chr';
# 'for numeric', it's 'num', etc.
dataType <- SHORT_TYPES[[types[i]]]
if (is.null(dataType)) {
dataType <- substring(types[i], 1, 3)
}

# Concatenate the colnames, coltypes, and first
# elements of each column
line <- paste0(" $ ", names[i], spaces, ": ",
dataType, " ",firstElements)

# Chop off extra characters if this is too long
cat(substr(line, 1, MAX_CHAR_PER_ROW))
cat("\n")
}

if (ncol(localDF) < ncol(object)) {
cat(paste0("\nDisplaying first ", ncol(localDF), " columns only."))
}
}
})
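
A minimal usage sketch of the new str() method may help here (hypothetical SparkR 1.6-era session; assumes an initialized sqlContext as in the roxygen example above, with the printed output reconstructed from the new test_sparkSQL.R assertions in this commit):

# SparkR column names may not contain ".", so rename the iris columns first
iris2 <- iris
colnames(iris2) <- c("Sepal_Length", "Sepal_Width", "Petal_Length", "Petal_Width", "Species")
irisDF <- createDataFrame(sqlContext, iris2)  # sqlContext: assumed pre-existing SQLContext
str(irisDF)
# Expected output (per the tests added in this commit):
# 'DataFrame': 5 variables:
#  $ Sepal_Length: num 5.1 4.9 4.7 4.6 5 5.4
#  $ Sepal_Width : num 3.5 3 3.2 3.1 3.6 3.9
#  $ Petal_Length: num 1.4 1.4 1.3 1.5 1.4 1.7
#  $ Petal_Width : num 0.2 0.2 0.2 0.2 0.2 0.4
#  $ Species     : chr "setosa" "setosa" "setosa" "setosa" "setosa" "setosa"
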
36 changes: 18 additions & 18 deletions R/pkg/R/generics.R
@@ -378,7 +378,6 @@ setGeneric("subtractByKey",
setGeneric("value", function(bcast) { standardGeneric("value") })



#################### DataFrame Methods ########################

#' @rdname agg
@@ -389,6 +388,14 @@ setGeneric("agg", function (x, ...) { standardGeneric("agg") })
#' @export
setGeneric("arrange", function(x, col, ...) { standardGeneric("arrange") })

#' @rdname as.data.frame
#' @export
setGeneric("as.data.frame")

#' @rdname attach
#' @export
setGeneric("attach")

#' @rdname columns
#' @export
setGeneric("colnames", function(x, do.NULL = TRUE, prefix = "col") { standardGeneric("colnames") })
@@ -525,13 +532,12 @@ setGeneric("saveAsTable", function(df, tableName, source, mode, ...) {
standardGeneric("saveAsTable")
})

#' @rdname withColumn
#' @export
setGeneric("transform", function(`_data`, ...) {standardGeneric("transform") })
setGeneric("str")

#' @rdname write.df
#' @rdname mutate
#' @export
setGeneric("write.df", function(df, path, ...) { standardGeneric("write.df") })
setGeneric("transform", function(`_data`, ...) {standardGeneric("transform") })

#' @rdname write.df
#' @export
@@ -593,6 +599,10 @@ setGeneric("unionAll", function(x, y) { standardGeneric("unionAll") })
#' @export
setGeneric("where", function(x, condition) { standardGeneric("where") })

#' @rdname with
#' @export
setGeneric("with")

#' @rdname withColumn
#' @export
setGeneric("withColumn", function(x, colName, col) { standardGeneric("withColumn") })
@@ -602,6 +612,9 @@ setGeneric("withColumn", function(x, colName, col) { standardGeneric("withColumn
setGeneric("withColumnRenamed",
function(x, existingCol, newCol) { standardGeneric("withColumnRenamed") })

#' @rdname write.df
#' @export
setGeneric("write.df", function(df, path, ...) { standardGeneric("write.df") })

###################### Column Methods ##########################

@@ -1109,7 +1122,6 @@ setGeneric("weekofyear", function(x) { standardGeneric("weekofyear") })
#' @export
setGeneric("year", function(x) { standardGeneric("year") })


#' @rdname glm
#' @export
setGeneric("glm")
@@ -1121,15 +1133,3 @@ setGeneric("predict", function(object, ...) { standardGeneric("predict") })
#' @rdname rbind
#' @export
setGeneric("rbind", signature = "...")

#' @rdname as.data.frame
#' @export
setGeneric("as.data.frame")

#' @rdname attach
#' @export
setGeneric("attach")

#' @rdname with
#' @export
setGeneric("with")
21 changes: 17 additions & 4 deletions R/pkg/R/types.R
@@ -47,10 +47,23 @@ COMPLEX_TYPES <- list(
# The full list of data types.
DATA_TYPES <- as.environment(c(as.list(PRIMITIVE_TYPES), COMPLEX_TYPES))

SHORT_TYPES <- as.environment(list(
  "character" = "chr",
  "logical" = "logi",
  "POSIXct" = "POSIXct",
  "integer" = "int",
  "numeric" = "num",
  "raw" = "raw",
  "Date" = "Date",
  "map" = "map",
  "array" = "array",
  "struct" = "struct"
))

# An environment for mapping R to Scala, names are R types and values are Scala types.
rToSQLTypes <- as.environment(list(
"integer" = "integer", # in R, integer is 32bit
"numeric" = "double", # in R, numeric == double which is 64bit
"double" = "double",
"integer" = "integer", # in R, integer is 32bit
"numeric" = "double", # in R, numeric == double which is 64bit
"double" = "double",
"character" = "string",
"logical" = "boolean"))
"logical" = "boolean"))
33 changes: 32 additions & 1 deletion R/pkg/inst/tests/testthat/test_sparkSQL.R
@@ -1173,7 +1173,7 @@ test_that("group by, agg functions", {

  expect_equal(3, count(mean(gd)))
  expect_equal(3, count(max(gd)))
  expect_equal(30, collect(max(gd))[1, 2])
  expect_equal(30, collect(max(gd))[2, 2])
  expect_equal(1, collect(count(gd))[1, 2])

  mockLines2 <- c("{\"name\":\"ID1\", \"value\": \"10\"}",
@@ -1799,6 +1799,37 @@ test_that("Method coltypes() to get and set R's data types of a DataFrame", {
"Only atomic type is supported for column types")
})

test_that("Method str()", {
# Structure of Iris
iris2 <- iris
colnames(iris2) <- c("Sepal_Length", "Sepal_Width", "Petal_Length", "Petal_Width", "Species")
iris2$col <- TRUE
irisDF2 <- createDataFrame(sqlContext, iris2)

out <- capture.output(str(irisDF2))
expect_equal(length(out), 7)
expect_equal(out[1], "'DataFrame': 6 variables:")
expect_equal(out[2], " $ Sepal_Length: num 5.1 4.9 4.7 4.6 5 5.4")
expect_equal(out[3], " $ Sepal_Width : num 3.5 3 3.2 3.1 3.6 3.9")
expect_equal(out[4], " $ Petal_Length: num 1.4 1.4 1.3 1.5 1.4 1.7")
expect_equal(out[5], " $ Petal_Width : num 0.2 0.2 0.2 0.2 0.2 0.4")
expect_equal(out[6], paste0(" $ Species : chr \"setosa\" \"setosa\" \"",
"setosa\" \"setosa\" \"setosa\" \"setosa\""))
expect_equal(out[7], " $ col : logi TRUE TRUE TRUE TRUE TRUE TRUE")

# A random dataset with many columns. This test is to check str limits
# the number of columns. Therefore, it will suffice to check for the
# number of returned rows
x <- runif(200, 1, 10)
df <- data.frame(t(as.matrix(data.frame(x,x,x,x,x,x,x,x,x))))
DF <- createDataFrame(sqlContext, df)
out <- capture.output(str(DF))
expect_equal(length(out), 103)

# Test utils:::str
expect_equal(capture.output(utils:::str(iris)), capture.output(str(iris)))
})

unlink(parquetPath)
unlink(jsonPath)
unlink(jsonPathNa)
2 changes: 1 addition & 1 deletion bin/pyspark
@@ -67,7 +67,7 @@ export PYSPARK_PYTHON

# Add the PySpark classes to the Python path:
export PYTHONPATH="${SPARK_HOME}/python/:$PYTHONPATH"
export PYTHONPATH="${SPARK_HOME}/python/lib/py4j-0.9-src.zip:$PYTHONPATH"
export PYTHONPATH="${SPARK_HOME}/python/lib/py4j-0.9.1-src.zip:$PYTHONPATH"

# Load the PySpark shell.py script when ./pyspark is used interactively:
export OLD_PYTHONSTARTUP="$PYTHONSTARTUP"
2 changes: 1 addition & 1 deletion bin/pyspark2.cmd
@@ -30,7 +30,7 @@ if "x%PYSPARK_DRIVER_PYTHON%"=="x" (
)

set PYTHONPATH=%SPARK_HOME%\python;%PYTHONPATH%
set PYTHONPATH=%SPARK_HOME%\python\lib\py4j-0.9-src.zip;%PYTHONPATH%
set PYTHONPATH=%SPARK_HOME%\python\lib\py4j-0.9.1-src.zip;%PYTHONPATH%

set OLD_PYTHONSTARTUP=%PYTHONSTARTUP%
set PYTHONSTARTUP=%SPARK_HOME%\python\pyspark\shell.py
8 changes: 7 additions & 1 deletion checkstyle.xml
@@ -58,6 +58,12 @@
<property name="eachLine" value="true"/>
</module>

<module name="RegexpSingleline">
<!-- \s matches whitespace character, $ matches end of line. -->
<property name="format" value="\s+$"/>
<property name="message" value="No trailing whitespace allowed."/>
</module>

<module name="TreeWalker">
<module name="OuterTypeFilename"/>
<module name="IllegalTokenText">
@@ -84,7 +90,7 @@
        </module>
        <module name="NeedBraces">
            <property name="allowSingleLineStatement" value="true"/>
        </module>
        <module name="OneStatementPerLine"/>
        <module name="ArrayTypeStyle"/>
        <module name="FallThrough"/>
29 changes: 1 addition & 28 deletions core/pom.xml
@@ -267,33 +267,6 @@
      <artifactId>oro</artifactId>
      <version>${oro.version}</version>
    </dependency>
    <dependency>
      <groupId>org.tachyonproject</groupId>
      <artifactId>tachyon-client</artifactId>
      <version>0.8.2</version>
      <exclusions>
        <exclusion>
          <groupId>org.apache.hadoop</groupId>
          <artifactId>hadoop-client</artifactId>
        </exclusion>
        <exclusion>
          <groupId>org.apache.curator</groupId>
          <artifactId>curator-client</artifactId>
        </exclusion>
        <exclusion>
          <groupId>org.apache.curator</groupId>
          <artifactId>curator-framework</artifactId>
        </exclusion>
        <exclusion>
          <groupId>org.apache.curator</groupId>
          <artifactId>curator-recipes</artifactId>
        </exclusion>
        <exclusion>
          <groupId>org.tachyonproject</groupId>
          <artifactId>tachyon-underfs-glusterfs</artifactId>
        </exclusion>
      </exclusions>
    </dependency>
    <dependency>
      <groupId>org.seleniumhq.selenium</groupId>
      <artifactId>selenium-java</artifactId>
@@ -350,7 +323,7 @@
    <dependency>
      <groupId>net.sf.py4j</groupId>
      <artifactId>py4j</artifactId>
      <version>0.9</version>
      <version>0.9.1</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
