Commit
Merge remote-tracking branch 'origin/master' into SPARK-10985
JoshRosen committed Jan 15, 2016
2 parents 848320f + ad1503f commit deef931
Showing 113 changed files with 2,271 additions and 1,530 deletions.
2 changes: 2 additions & 0 deletions .rat-excludes
@@ -86,3 +86,5 @@ org.apache.spark.scheduler.SparkHistoryListenerFactory
.*parquet
LZ4BlockInputStream.java
spark-deps-.*
.*csv
.*tsv
38 changes: 37 additions & 1 deletion NOTICE
@@ -610,7 +610,43 @@ Vis.js uses and redistributes the following third-party libraries:

===============================================================================

The CSS style for the navigation sidebar of the documentation was originally
submitted by Óscar Nájera for the scikit-learn project. The scikit-learn project
is distributed under the 3-Clause BSD license.
===============================================================================

For CSV functionality:

/*
* Copyright 2014 Databricks
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

/*
* Copyright 2015 Ayasdi Inc
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/


1 change: 1 addition & 0 deletions R/pkg/NAMESPACE
@@ -278,6 +278,7 @@ export("as.DataFrame",
"read.parquet",
"read.text",
"sql",
"str",
"table",
"tableNames",
"tables",
73 changes: 73 additions & 0 deletions R/pkg/R/DataFrame.R
@@ -2299,3 +2299,76 @@ setMethod("with",
newEnv <- assignNewEnv(data)
eval(substitute(expr), envir = newEnv, enclos = newEnv)
})

#' Display the structure of a DataFrame, including column names, column types, as well as
#' a small sample of rows.
#' @name str
#' @title Compactly display the structure of a dataset
#' @rdname str
#' @family DataFrame functions
#' @param object a DataFrame
#' @examples \dontrun{
#' # Create a DataFrame from the Iris dataset
#' irisDF <- createDataFrame(sqlContext, iris)
#'
#' # Show the structure of the DataFrame
#' str(irisDF)
#' }
setMethod("str",
signature(object = "DataFrame"),
function(object) {

# TODO: These could be made global parameters, though in R it's not the case
MAX_CHAR_PER_ROW <- 120
MAX_COLS <- 100

# Get the column names and types of the DataFrame
names <- names(object)
types <- coltypes(object)

# Get the first elements of the dataset. Limit number of columns accordingly
localDF <- if (ncol(object) > MAX_COLS) {
head(object[, c(1:MAX_COLS)])
} else {
head(object)
}

# The number of observations will not be displayed as computing the
# number of rows is a very expensive operation
cat(paste0("'", class(object), "': ", length(names), " variables:\n"))

if (nrow(localDF) > 0) {
for (i in 1 : ncol(localDF)) {
# Get the first elements for each column

firstElements <- if (types[i] == "character") {
paste(paste0("\"", localDF[,i], "\""), collapse = " ")
} else {
paste(localDF[,i], collapse = " ")
}

# Add the corresponding number of spaces for alignment
spaces <- paste(rep(" ", max(nchar(names) - nchar(names[i]))), collapse="")

# Get the short type. For 'character', it would be 'chr';
# for 'numeric', it's 'num', etc.
dataType <- SHORT_TYPES[[types[i]]]
if (is.null(dataType)) {
dataType <- substring(types[i], 1, 3)
}

# Concatenate the colnames, coltypes, and first
# elements of each column
line <- paste0(" $ ", names[i], spaces, ": ",
               dataType, " ", firstElements)

# Chop off extra characters if this is too long
cat(substr(line, 1, MAX_CHAR_PER_ROW))
cat("\n")
}

if (ncol(localDF) < ncol(object)) {
cat(paste0("\nDisplaying first ", ncol(localDF), " columns only."))
}
}
})
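
For context, a usage sketch of the new method, assuming a running SparkR shell with a sqlContext in scope; the expected output mirrors the new tests in test_sparkSQL.R further down (not part of the patch):

iris2 <- iris
colnames(iris2) <- c("Sepal_Length", "Sepal_Width", "Petal_Length", "Petal_Width", "Species")
irisDF <- createDataFrame(sqlContext, iris2)
str(irisDF)
# 'DataFrame': 5 variables:
#  $ Sepal_Length: num 5.1 4.9 4.7 4.6 5 5.4
#  $ Sepal_Width : num 3.5 3 3.2 3.1 3.6 3.9
#  $ Petal_Length: num 1.4 1.4 1.3 1.5 1.4 1.7
#  $ Petal_Width : num 0.2 0.2 0.2 0.2 0.2 0.4
#  $ Species     : chr "setosa" "setosa" "setosa" "setosa" "setosa" "setosa"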
36 changes: 18 additions & 18 deletions R/pkg/R/generics.R
@@ -378,7 +378,6 @@ setGeneric("subtractByKey",
setGeneric("value", function(bcast) { standardGeneric("value") })



#################### DataFrame Methods ########################

#' @rdname agg
@@ -389,6 +388,14 @@ setGeneric("agg", function (x, ...) { standardGeneric("agg") })
#' @export
setGeneric("arrange", function(x, col, ...) { standardGeneric("arrange") })

#' @rdname as.data.frame
#' @export
setGeneric("as.data.frame")

#' @rdname attach
#' @export
setGeneric("attach")

#' @rdname columns
#' @export
setGeneric("colnames", function(x, do.NULL = TRUE, prefix = "col") { standardGeneric("colnames") })
@@ -525,13 +532,12 @@ setGeneric("saveAsTable", function(df, tableName, source, mode, ...) {
standardGeneric("saveAsTable")
})

#' @rdname str
#' @export
setGeneric("str")

#' @rdname mutate
#' @export
setGeneric("transform", function(`_data`, ...) {standardGeneric("transform") })

#' @rdname write.df
#' @export
@@ -593,6 +599,10 @@ setGeneric("unionAll", function(x, y) { standardGeneric("unionAll") })
#' @export
setGeneric("where", function(x, condition) { standardGeneric("where") })

#' @rdname with
#' @export
setGeneric("with")

#' @rdname withColumn
#' @export
setGeneric("withColumn", function(x, colName, col) { standardGeneric("withColumn") })
@@ -602,6 +612,9 @@ setGeneric("withColumn", function(x, colName, col) { standardGeneric("withColumn") })
setGeneric("withColumnRenamed",
function(x, existingCol, newCol) { standardGeneric("withColumnRenamed") })

#' @rdname write.df
#' @export
setGeneric("write.df", function(df, path, ...) { standardGeneric("write.df") })

###################### Column Methods ##########################

@@ -1109,7 +1122,6 @@ setGeneric("weekofyear", function(x) { standardGeneric("weekofyear") })
#' @export
setGeneric("year", function(x) { standardGeneric("year") })


#' @rdname glm
#' @export
setGeneric("glm")
@@ -1121,15 +1133,3 @@ setGeneric("predict", function(object, ...) { standardGeneric("predict") })
#' @rdname rbind
#' @export
setGeneric("rbind", signature = "...")

21 changes: 17 additions & 4 deletions R/pkg/R/types.R
@@ -47,10 +47,23 @@ COMPLEX_TYPES <- list(
# The full list of data types.
DATA_TYPES <- as.environment(c(as.list(PRIMITIVE_TYPES), COMPLEX_TYPES))

SHORT_TYPES <- as.environment(list(
"character" = "chr",
"logical" = "logi",
"POSIXct" = "POSIXct",
"integer" = "int",
"numeric" = "num",
"raw" = "raw",
"Date" = "Date",
"map" = "map",
"array" = "array",
"struct" = "struct"
))

# An environment for mapping R to Scala, names are R types and values are Scala types.
rToSQLTypes <- as.environment(list(
"integer" = "integer", # in R, integer is 32bit
"numeric" = "double", # in R, numeric == double which is 64bit
"double" = "double",
"integer" = "integer", # in R, integer is 32bit
"numeric" = "double", # in R, numeric == double which is 64bit
"double" = "double",
"character" = "string",
"logical" = "boolean"))
"logical" = "boolean"))
31 changes: 31 additions & 0 deletions R/pkg/inst/tests/testthat/test_sparkSQL.R
@@ -1799,6 +1799,37 @@ test_that("Method coltypes() to get and set R's data types of a DataFrame", {
"Only atomic type is supported for column types")
})

test_that("Method str()", {
# Structure of Iris
iris2 <- iris
colnames(iris2) <- c("Sepal_Length", "Sepal_Width", "Petal_Length", "Petal_Width", "Species")
iris2$col <- TRUE
irisDF2 <- createDataFrame(sqlContext, iris2)

out <- capture.output(str(irisDF2))
expect_equal(length(out), 7)
expect_equal(out[1], "'DataFrame': 6 variables:")
expect_equal(out[2], " $ Sepal_Length: num 5.1 4.9 4.7 4.6 5 5.4")
expect_equal(out[3], " $ Sepal_Width : num 3.5 3 3.2 3.1 3.6 3.9")
expect_equal(out[4], " $ Petal_Length: num 1.4 1.4 1.3 1.5 1.4 1.7")
expect_equal(out[5], " $ Petal_Width : num 0.2 0.2 0.2 0.2 0.2 0.4")
expect_equal(out[6], paste0(" $ Species     : chr \"setosa\" \"setosa\" \"",
"setosa\" \"setosa\" \"setosa\" \"setosa\""))
expect_equal(out[7], " $ col         : logi TRUE TRUE TRUE TRUE TRUE TRUE")

# A random dataset with many columns. This test is to check str limits
# the number of columns. Therefore, it will suffice to check for the
# number of returned rows
x <- runif(200, 1, 10)
df <- data.frame(t(as.matrix(data.frame(x,x,x,x,x,x,x,x,x))))
DF <- createDataFrame(sqlContext, df)
out <- capture.output(str(DF))
expect_equal(length(out), 103)

# Test utils:::str
expect_equal(capture.output(utils:::str(iris)), capture.output(str(iris)))
})

unlink(parquetPath)
unlink(jsonPath)
unlink(jsonPathNa)
8 changes: 7 additions & 1 deletion checkstyle.xml
@@ -58,6 +58,12 @@
<property name="eachLine" value="true"/>
</module>

<module name="RegexpSingleline">
<!-- \s matches whitespace character, $ matches end of line. -->
<property name="format" value="\s+$"/>
<property name="message" value="No trailing whitespace allowed."/>
</module>

<module name="TreeWalker">
<module name="OuterTypeFilename"/>
<module name="IllegalTokenText">
@@ -84,7 +90,7 @@
</module>
<module name="NeedBraces">
<property name="allowSingleLineStatement" value="true"/>
</module>
<module name="OneStatementPerLine"/>
<module name="ArrayTypeStyle"/>
<module name="FallThrough"/>
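
For context, the new checkstyle pattern \s+$ matches any line that ends in whitespace; a quick check of the same regex from R (a hypothetical illustration, not part of the patch):

lines <- c("val x = 1", "val y = 2   ", "}\t")
grepl("\\s+$", lines, perl = TRUE)
# [1] FALSE  TRUE  TRUE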
27 changes: 0 additions & 27 deletions core/pom.xml
@@ -267,33 +267,6 @@
<artifactId>oro</artifactId>
<version>${oro.version}</version>
</dependency>
<dependency>
<groupId>org.tachyonproject</groupId>
<artifactId>tachyon-client</artifactId>
<version>0.8.2</version>
<exclusions>
<exclusion>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.curator</groupId>
<artifactId>curator-client</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.curator</groupId>
<artifactId>curator-framework</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.curator</groupId>
<artifactId>curator-recipes</artifactId>
</exclusion>
<exclusion>
<groupId>org.tachyonproject</groupId>
<artifactId>tachyon-underfs-glusterfs</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.seleniumhq.selenium</groupId>
<artifactId>selenium-java</artifactId>
6 changes: 0 additions & 6 deletions core/src/main/scala/org/apache/spark/SparkContext.scala
@@ -243,10 +243,6 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationClient
private[spark] def eventLogDir: Option[URI] = _eventLogDir
private[spark] def eventLogCodec: Option[String] = _eventLogCodec

// Generate the random name for a temp folder in external block store.
// Add a timestamp as the suffix here to make it more safe
val externalBlockStoreFolderName = "spark-" + randomUUID.toString()

def isLocal: Boolean = (master == "local" || master.startsWith("local["))

/**
@@ -423,8 +419,6 @@
}
}

_conf.set("spark.externalBlockStore.folderName", externalBlockStoreFolderName)

if (master == "yarn-client") System.setProperty("SPARK_YARN_MODE", "true")

// "_jobProgressListener" should be set up before creating SparkEnv because when creating
core/src/main/scala/org/apache/spark/rdd/LocalRDDCheckpointData.scala
@@ -19,7 +19,7 @@ package org.apache.spark.rdd

import scala.reflect.ClassTag

import org.apache.spark.{Logging, SparkEnv, TaskContext}
import org.apache.spark.storage.{RDDBlockId, StorageLevel}
import org.apache.spark.util.Utils

@@ -72,12 +72,6 @@ private[spark] object LocalRDDCheckpointData {
* This method is idempotent.
*/
def transformStorageLevel(level: StorageLevel): StorageLevel = {
// If this RDD is to be cached off-heap, fail fast since we cannot provide any
// correctness guarantees about subsequent computations after the first one
if (level.useOffHeap) {
throw new SparkException("Local checkpointing is not compatible with off-heap caching.")
}

StorageLevel(useDisk = true, level.useMemory, level.deserialized, level.replication)
}
}
2 changes: 0 additions & 2 deletions core/src/main/scala/org/apache/spark/status/api/v1/api.scala
@@ -85,8 +85,6 @@ class JobData private[spark](
val numSkippedStages: Int,
val numFailedStages: Int)

// Q: should Tachyon size go in here as well? currently the UI only shows it on the overall storage
// page ... does anybody pay attention to it?
class RDDStorageInfo private[spark](
val id: Int,
val name: String,