Merge branch 'master' into decimal-parsing-locale
MaxGekk committed Nov 27, 2018
2 parents 5236336 + 6a064ba commit 0d1a4f0
Showing 148 changed files with 4,738 additions and 2,168 deletions.
1 change: 1 addition & 0 deletions R/pkg/NAMESPACE
@@ -169,6 +169,7 @@ exportMethods("arrange",
"toJSON",
"transform",
"union",
"unionAll",
"unionByName",
"unique",
"unpersist",
14 changes: 14 additions & 0 deletions R/pkg/R/DataFrame.R
@@ -2732,6 +2732,20 @@ setMethod("union",
dataFrame(unioned)
})

#' Return a new SparkDataFrame containing the union of rows
#'
#' This is an alias for `union`.
#'
#' @rdname union
#' @name unionAll
#' @aliases unionAll,SparkDataFrame,SparkDataFrame-method
#' @note unionAll since 1.4.0
setMethod("unionAll",
signature(x = "SparkDataFrame", y = "SparkDataFrame"),
function(x, y) {
union(x, y)
})
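
For reference, the JVM-side Dataset API also exposes unionAll as an alias for union (deprecated in the 2.x line), which is the behaviour the SparkR method above mirrors. A minimal Java sketch of the equivalence, assuming a local SparkSession; this is illustrative and not part of the diff:

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class UnionAllExample {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder()
        .appName("UnionAllExample")
        .master("local[*]")
        .getOrCreate();

    Dataset<Row> df = spark.range(3).toDF("id");

    // Both calls resolve columns by position and keep duplicate rows.
    Dataset<Row> unioned = df.union(df);
    Dataset<Row> unionedAll = df.unionAll(df);  // alias of union()

    // Prints "6 == 6": duplicates are preserved by both methods.
    System.out.println(unioned.count() + " == " + unionedAll.count());

    spark.stop();
  }
}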

#' Return a new SparkDataFrame containing the union of rows, matched by column names
#'
#' Return a new SparkDataFrame containing the union of rows in this SparkDataFrame
3 changes: 3 additions & 0 deletions R/pkg/R/generics.R
@@ -631,6 +631,9 @@ setGeneric("toRDD", function(x) { standardGeneric("toRDD") })
#' @rdname union
setGeneric("union", function(x, y) { standardGeneric("union") })

#' @rdname union
setGeneric("unionAll", function(x, y) { standardGeneric("unionAll") })

#' @rdname unionByName
setGeneric("unionByName", function(x, y) { standardGeneric("unionByName") })

4 changes: 2 additions & 2 deletions R/pkg/R/stats.R
@@ -109,7 +109,7 @@ setMethod("corr",
#'
#' Finding frequent items for columns, possibly with false positives.
#' Using the frequent element count algorithm described in
#' \url{http://dx.doi.org/10.1145/762471.762473}, proposed by Karp, Schenker, and Papadimitriou.
#' \url{https://doi.org/10.1145/762471.762473}, proposed by Karp, Schenker, and Papadimitriou.
#'
#' @param x A SparkDataFrame.
#' @param cols A vector of column names to search frequent items in.
@@ -143,7 +143,7 @@ setMethod("freqItems", signature(x = "SparkDataFrame", cols = "character"),
#' *exact* rank of x is close to (p * N). More precisely,
#' floor((p - err) * N) <= rank(x) <= ceil((p + err) * N).
#' This method implements a variation of the Greenwald-Khanna algorithm (with some speed
#' optimizations). The algorithm was first present in [[http://dx.doi.org/10.1145/375663.375670
#' optimizations). The algorithm was first present in [[https://doi.org/10.1145/375663.375670
#' Space-efficient Online Computation of Quantile Summaries]] by Greenwald and Khanna.
#' Note that NA values will be ignored in numerical columns before calculation. For
#' columns only containing NA values, an empty list is returned.
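
The guarantee quoted above can be exercised directly: for probability p, relative error err, and N rows, the rank of the returned value lies in [floor((p - err) * N), ceil((p + err) * N)]. A small Java sketch against a local SparkSession (illustrative only, not part of this diff):

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class ApproxQuantileExample {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder()
        .appName("ApproxQuantileExample")
        .master("local[*]")
        .getOrCreate();

    // Values 0, 1, ..., 999 in a single numeric column "id" (N = 1000).
    Dataset<Row> df = spark.range(1000).toDF("id");

    // Median (p = 0.5) with relative error 0.01: the returned value's rank
    // lies between floor(0.49 * 1000) = 490 and ceil(0.51 * 1000) = 510.
    double[] quantiles = df.stat().approxQuantile("id", new double[] {0.5}, 0.01);
    System.out.println("approximate median: " + quantiles[0]);

    spark.stop();
  }
}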
1 change: 1 addition & 0 deletions R/pkg/tests/fulltests/test_sparkSQL.R
@@ -2458,6 +2458,7 @@ test_that("union(), unionByName(), rbind(), except(), and intersect() on a DataF
expect_equal(count(unioned), 6)
expect_equal(first(unioned)$name, "Michael")
expect_equal(count(arrange(suppressWarnings(union(df, df2)), df$age)), 6)
expect_equal(count(arrange(suppressWarnings(unionAll(df, df2)), df$age)), 6)

df1 <- select(df2, "age", "name")
unioned1 <- arrange(unionByName(df1, df), df1$age)
@@ -371,26 +371,30 @@ private void assertErrorsContain(Set<String> errors, Set<String> contains) {

private void assertErrorAndClosed(RpcResult result, String expectedError) {
assertTrue("unexpected success: " + result.successMessages, result.successMessages.isEmpty());
// we expect 1 additional error, which should contain one of the follow messages:
// - "closed"
// - "Connection reset"
// - "java.nio.channels.ClosedChannelException"
Set<String> errors = result.errorMessages;
assertEquals("Expected 2 errors, got " + errors.size() + "errors: " +
errors, 2, errors.size());

// We expect 1 additional error due to closed connection and here are possible keywords in the
// error message.
Set<String> possibleClosedErrors = Sets.newHashSet(
"closed",
"Connection reset",
"java.nio.channels.ClosedChannelException",
"java.io.IOException: Broken pipe"
);
Set<String> containsAndClosed = Sets.newHashSet(expectedError);
containsAndClosed.add("closed");
containsAndClosed.add("Connection reset");
containsAndClosed.add("java.nio.channels.ClosedChannelException");
containsAndClosed.addAll(possibleClosedErrors);

Pair<Set<String>, Set<String>> r = checkErrorsContain(errors, containsAndClosed);

assertTrue("Got a non-empty set " + r.getLeft(), r.getLeft().isEmpty());

Set<String> errorsNotFound = r.getRight();
assertEquals(
"The size of " + errorsNotFound.toString() + " was not 2", 2, errorsNotFound.size());
"The size of " + errorsNotFound + " was not " + (possibleClosedErrors.size() - 1),
possibleClosedErrors.size() - 1,
errorsNotFound.size());
for (String err: errorsNotFound) {
assertTrue("Found a wrong error " + err, containsAndClosed.contains(err));
}
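
The rewritten assertion derives the expected count from possibleClosedErrors instead of hard-coding it: the intent is that one error matches expectedError and the other matches exactly one closed-connection keyword, leaving possibleClosedErrors.size() - 1 keywords unmatched. checkErrorsContain itself is not shown in this hunk, so the sketch below reimplements its assumed behaviour with illustrative names:

import java.util.HashSet;
import java.util.Set;

public class ContainCheckSketch {

  // Assumed shape of the helper: unmatchedErrors are errors matching no keyword,
  // unmatchedKeywords are keywords matching no error.
  static final class Result {
    final Set<String> unmatchedErrors = new HashSet<>();
    final Set<String> unmatchedKeywords = new HashSet<>();
  }

  static Result checkErrorsContain(Set<String> errors, Set<String> keywords) {
    Result r = new Result();
    r.unmatchedKeywords.addAll(keywords);
    for (String error : errors) {
      boolean matched = false;
      for (String keyword : keywords) {
        if (error.contains(keyword)) {
          r.unmatchedKeywords.remove(keyword);
          matched = true;
        }
      }
      if (!matched) {
        r.unmatchedErrors.add(error);
      }
    }
    return r;
  }

  public static void main(String[] args) {
    Set<String> errors = new HashSet<>();
    errors.add("RuntimeException: thrown as expected");      // the expected RPC failure
    errors.add("java.nio.channels.ClosedChannelException");  // the closed-connection error

    Set<String> keywords = new HashSet<>();
    keywords.add("thrown as expected");                      // stands in for expectedError
    keywords.add("closed");
    keywords.add("Connection reset");
    keywords.add("java.nio.channels.ClosedChannelException");
    keywords.add("java.io.IOException: Broken pipe");

    Result r = checkErrorsContain(errors, keywords);
    // Prints "true, 3": every error is explained, and exactly 4 - 1 = 3 of the
    // closed-connection keywords stay unmatched, the count the test now asserts.
    System.out.println(r.unmatchedErrors.isEmpty() + ", " + r.unmatchedKeywords.size());
  }
}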
10 changes: 5 additions & 5 deletions core/src/main/java/org/apache/spark/memory/MemoryConsumer.java
@@ -83,10 +83,10 @@ public void spill() throws IOException {
public abstract long spill(long size, MemoryConsumer trigger) throws IOException;

/**
* Allocates a LongArray of `size`. Note that this method may throw `OutOfMemoryError` if Spark
* doesn't have enough memory for this allocation, or throw `TooLargePageException` if this
* `LongArray` is too large to fit in a single page. The caller side should take care of these
* two exceptions, or make sure the `size` is small enough that won't trigger exceptions.
* Allocates a LongArray of `size`. Note that this method may throw `SparkOutOfMemoryError`
* if Spark doesn't have enough memory for this allocation, or throw `TooLargePageException`
* if this `LongArray` is too large to fit in a single page. The caller side should take care of
* these two exceptions, or make sure the `size` is small enough that it won't trigger exceptions.
*
* @throws SparkOutOfMemoryError
* @throws TooLargePageException
@@ -111,7 +111,7 @@ public void freeArray(LongArray array) {
/**
* Allocate a memory block with at least `required` bytes.
*
* @throws OutOfMemoryError
* @throws SparkOutOfMemoryError
*/
protected MemoryBlock allocatePage(long required) {
MemoryBlock page = taskMemoryManager.allocatePage(Math.max(pageSize, required), this);
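
The corrected javadoc above describes a caller-side contract: either catch SparkOutOfMemoryError and TooLargePageException, or keep the requested size small. A hypothetical MemoryConsumer subclass following that contract might look like the sketch below (PointerArrayConsumer and tryReserve are illustrative names, not Spark APIs; the constructor and exception types are assumed to be those available at the time of this change):

import java.io.IOException;

import org.apache.spark.memory.MemoryConsumer;
import org.apache.spark.memory.MemoryMode;
import org.apache.spark.memory.SparkOutOfMemoryError;
import org.apache.spark.memory.TaskMemoryManager;
import org.apache.spark.memory.TooLargePageException;
import org.apache.spark.unsafe.array.LongArray;

class PointerArrayConsumer extends MemoryConsumer {
  private LongArray pointers;

  PointerArrayConsumer(TaskMemoryManager memoryManager) {
    super(memoryManager, memoryManager.pageSizeBytes(), MemoryMode.ON_HEAP);
  }

  boolean tryReserve(long numEntries) {
    try {
      pointers = allocateArray(numEntries);
      return true;
    } catch (SparkOutOfMemoryError e) {
      // Spark's managed execution memory is exhausted: degrade gracefully
      // (e.g. spill or use a smaller buffer) instead of failing the task here.
      return false;
    } catch (TooLargePageException e) {
      // The request does not fit in a single page: retry with a smaller size.
      return false;
    }
  }

  @Override
  public long spill(long size, MemoryConsumer trigger) throws IOException {
    if (pointers == null) {
      return 0L;
    }
    long freed = pointers.memoryBlock().size();
    freeArray(pointers);
    pointers = null;
    return freed;
  }
}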
@@ -37,12 +37,11 @@
import org.apache.spark.Partitioner;
import org.apache.spark.ShuffleDependency;
import org.apache.spark.SparkConf;
import org.apache.spark.TaskContext;
import org.apache.spark.executor.ShuffleWriteMetrics;
import org.apache.spark.scheduler.MapStatus;
import org.apache.spark.scheduler.MapStatus$;
import org.apache.spark.serializer.Serializer;
import org.apache.spark.serializer.SerializerInstance;
import org.apache.spark.shuffle.ShuffleWriteMetricsReporter;
import org.apache.spark.shuffle.IndexShuffleBlockResolver;
import org.apache.spark.shuffle.ShuffleWriter;
import org.apache.spark.storage.*;
@@ -79,7 +78,7 @@ final class BypassMergeSortShuffleWriter<K, V> extends ShuffleWriter<K, V> {
private final int numPartitions;
private final BlockManager blockManager;
private final Partitioner partitioner;
private final ShuffleWriteMetrics writeMetrics;
private final ShuffleWriteMetricsReporter writeMetrics;
private final int shuffleId;
private final int mapId;
private final Serializer serializer;
@@ -103,8 +102,8 @@ final class BypassMergeSortShuffleWriter<K, V> extends ShuffleWriter<K, V> {
IndexShuffleBlockResolver shuffleBlockResolver,
BypassMergeSortShuffleHandle<K, V> handle,
int mapId,
TaskContext taskContext,
SparkConf conf) {
SparkConf conf,
ShuffleWriteMetricsReporter writeMetrics) {
// Use getSizeAsKb (not bytes) to maintain backwards compatibility if no units are provided
this.fileBufferSize = (int) conf.getSizeAsKb("spark.shuffle.file.buffer", "32k") * 1024;
this.transferToEnabled = conf.getBoolean("spark.file.transferTo", true);
@@ -114,7 +113,7 @@ final class BypassMergeSortShuffleWriter<K, V> extends ShuffleWriter<K, V> {
this.shuffleId = dep.shuffleId();
this.partitioner = dep.partitioner();
this.numPartitions = partitioner.numPartitions();
this.writeMetrics = taskContext.taskMetrics().shuffleWriteMetrics();
this.writeMetrics = writeMetrics;
this.serializer = dep.serializer();
this.shuffleBlockResolver = shuffleBlockResolver;
}
@@ -38,6 +38,7 @@
import org.apache.spark.memory.TooLargePageException;
import org.apache.spark.serializer.DummySerializerInstance;
import org.apache.spark.serializer.SerializerInstance;
import org.apache.spark.shuffle.ShuffleWriteMetricsReporter;
import org.apache.spark.storage.BlockManager;
import org.apache.spark.storage.DiskBlockObjectWriter;
import org.apache.spark.storage.FileSegment;
@@ -75,7 +76,7 @@ final class ShuffleExternalSorter extends MemoryConsumer {
private final TaskMemoryManager taskMemoryManager;
private final BlockManager blockManager;
private final TaskContext taskContext;
private final ShuffleWriteMetrics writeMetrics;
private final ShuffleWriteMetricsReporter writeMetrics;

/**
* Force this sorter to spill when there are this many elements in memory.
@@ -113,7 +114,7 @@ final class ShuffleExternalSorter extends MemoryConsumer {
int initialSize,
int numPartitions,
SparkConf conf,
ShuffleWriteMetrics writeMetrics) {
ShuffleWriteMetricsReporter writeMetrics) {
super(memoryManager,
(int) Math.min(PackedRecordPointer.MAXIMUM_PAGE_SIZE_BYTES, memoryManager.pageSizeBytes()),
memoryManager.getTungstenMemoryMode());
@@ -144,7 +145,7 @@ final class ShuffleExternalSorter extends MemoryConsumer {
*/
private void writeSortedFile(boolean isLastFile) {

final ShuffleWriteMetrics writeMetricsToUse;
final ShuffleWriteMetricsReporter writeMetricsToUse;

if (isLastFile) {
// We're writing the final non-spill file, so we _do_ want to count this as shuffle bytes.
@@ -241,9 +242,14 @@ private void writeSortedFile(boolean isLastFile) {
//
// Note that we intentionally ignore the value of `writeMetricsToUse.shuffleWriteTime()`.
// Consistent with ExternalSorter, we do not count this IO towards shuffle write time.
// This means that this IO time is not accounted for anywhere; SPARK-3577 will fix this.
writeMetrics.incRecordsWritten(writeMetricsToUse.recordsWritten());
taskContext.taskMetrics().incDiskBytesSpilled(writeMetricsToUse.bytesWritten());
// SPARK-3577 tracks the spill time separately.

// This is guaranteed to be a ShuffleWriteMetrics based on the if check in the beginning
// of this method.
writeMetrics.incRecordsWritten(
((ShuffleWriteMetrics)writeMetricsToUse).recordsWritten());
taskContext.taskMetrics().incDiskBytesSpilled(
((ShuffleWriteMetrics)writeMetricsToUse).bytesWritten());
}
}
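
The new comment explains why the downcast is safe: when writeSortedFile() is writing a spill file, writeMetricsToUse is a freshly created, concrete ShuffleWriteMetrics, so its counters can be read back and folded into the writer's reporter (records) and into disk bytes spilled (bytes). A self-contained sketch of that pattern, with stand-in names rather than the Spark classes:

// Minimal stand-ins for ShuffleWriteMetricsReporter / ShuffleWriteMetrics.
interface Reporter {
  void incBytesWritten(long v);
  void incRecordsWritten(long v);
}

final class LocalMetrics implements Reporter {
  private long bytes;
  private long records;
  @Override public void incBytesWritten(long v) { bytes += v; }
  @Override public void incRecordsWritten(long v) { records += v; }
  long bytesWritten() { return bytes; }
  long recordsWritten() { return records; }
}

public class SpillMetricsSketch {
  private final Reporter writeMetrics;  // whatever reporter the writer was handed
  private long diskBytesSpilled;

  SpillMetricsSketch(Reporter writeMetrics) {
    this.writeMetrics = writeMetrics;
  }

  void writeSortedFile(boolean isLastFile, long bytes, long records) {
    // Final output file: report straight to the real reporter.
    // Spill file: collect into a private, concrete metrics object first.
    final Reporter writeMetricsToUse = isLastFile ? writeMetrics : new LocalMetrics();

    // ... pretend we wrote `records` records totalling `bytes` bytes ...
    writeMetricsToUse.incBytesWritten(bytes);
    writeMetricsToUse.incRecordsWritten(records);

    if (!isLastFile) {
      // Guaranteed to be LocalMetrics by the branch above, so the cast is safe.
      // Spill records still count as shuffle records written, while the bytes
      // count as disk bytes spilled rather than shuffle bytes written.
      LocalMetrics spillMetrics = (LocalMetrics) writeMetricsToUse;
      writeMetrics.incRecordsWritten(spillMetrics.recordsWritten());
      diskBytesSpilled += spillMetrics.bytesWritten();
    }
  }

  public static void main(String[] args) {
    LocalMetrics taskMetrics = new LocalMetrics();
    SpillMetricsSketch sorter = new SpillMetricsSketch(taskMetrics);
    sorter.writeSortedFile(false, 1024, 10);  // a spill file
    sorter.writeSortedFile(true, 2048, 20);   // the final output file
    // Prints "30 records, 2048 shuffle bytes, 1024 spilled bytes".
    System.out.println(taskMetrics.recordsWritten() + " records, "
        + taskMetrics.bytesWritten() + " shuffle bytes, "
        + sorter.diskBytesSpilled + " spilled bytes");
  }
}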

@@ -37,7 +37,6 @@

import org.apache.spark.*;
import org.apache.spark.annotation.Private;
import org.apache.spark.executor.ShuffleWriteMetrics;
import org.apache.spark.io.CompressionCodec;
import org.apache.spark.io.CompressionCodec$;
import org.apache.spark.io.NioBufferedFileInputStream;
@@ -47,6 +46,7 @@
import org.apache.spark.network.util.LimitedInputStream;
import org.apache.spark.scheduler.MapStatus;
import org.apache.spark.scheduler.MapStatus$;
import org.apache.spark.shuffle.ShuffleWriteMetricsReporter;
import org.apache.spark.serializer.SerializationStream;
import org.apache.spark.serializer.SerializerInstance;
import org.apache.spark.shuffle.IndexShuffleBlockResolver;
@@ -73,7 +73,7 @@ public class UnsafeShuffleWriter<K, V> extends ShuffleWriter<K, V> {
private final TaskMemoryManager memoryManager;
private final SerializerInstance serializer;
private final Partitioner partitioner;
private final ShuffleWriteMetrics writeMetrics;
private final ShuffleWriteMetricsReporter writeMetrics;
private final int shuffleId;
private final int mapId;
private final TaskContext taskContext;
@@ -122,7 +122,8 @@ public UnsafeShuffleWriter(
SerializedShuffleHandle<K, V> handle,
int mapId,
TaskContext taskContext,
SparkConf sparkConf) throws IOException {
SparkConf sparkConf,
ShuffleWriteMetricsReporter writeMetrics) throws IOException {
final int numPartitions = handle.dependency().partitioner().numPartitions();
if (numPartitions > SortShuffleManager.MAX_SHUFFLE_OUTPUT_PARTITIONS_FOR_SERIALIZED_MODE()) {
throw new IllegalArgumentException(
@@ -138,7 +139,7 @@ public UnsafeShuffleWriter(
this.shuffleId = dep.shuffleId();
this.serializer = dep.serializer().newInstance();
this.partitioner = dep.partitioner();
this.writeMetrics = taskContext.taskMetrics().shuffleWriteMetrics();
this.writeMetrics = writeMetrics;
this.taskContext = taskContext;
this.sparkConf = sparkConf;
this.transferToEnabled = sparkConf.getBoolean("spark.file.transferTo", true);
@@ -21,7 +21,7 @@
import java.io.OutputStream;

import org.apache.spark.annotation.Private;
import org.apache.spark.executor.ShuffleWriteMetrics;
import org.apache.spark.shuffle.ShuffleWriteMetricsReporter;

/**
* Intercepts write calls and tracks total time spent writing in order to update shuffle write
@@ -30,10 +30,11 @@
@Private
public final class TimeTrackingOutputStream extends OutputStream {

private final ShuffleWriteMetrics writeMetrics;
private final ShuffleWriteMetricsReporter writeMetrics;
private final OutputStream outputStream;

public TimeTrackingOutputStream(ShuffleWriteMetrics writeMetrics, OutputStream outputStream) {
public TimeTrackingOutputStream(
ShuffleWriteMetricsReporter writeMetrics, OutputStream outputStream) {
this.writeMetrics = writeMetrics;
this.outputStream = outputStream;
}
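
Widening the constructor to the ShuffleWriteMetricsReporter interface works because this stream only needs to report time spent writing. A self-contained sketch of the same interception pattern (WriteTimeReporter and TimeTrackingStreamSketch are illustrative stand-ins, not the Spark classes):

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.OutputStream;

// Stand-in for the single metric this decorator needs from the reporter.
interface WriteTimeReporter {
  void incWriteTime(long nanos);
}

public class TimeTrackingStreamSketch extends OutputStream {
  private final WriteTimeReporter reporter;
  private final OutputStream out;

  public TimeTrackingStreamSketch(WriteTimeReporter reporter, OutputStream out) {
    this.reporter = reporter;
    this.out = out;
  }

  @Override
  public void write(int b) throws IOException {
    long start = System.nanoTime();
    out.write(b);
    reporter.incWriteTime(System.nanoTime() - start);  // charge the time to shuffle write
  }

  @Override
  public void write(byte[] b, int off, int len) throws IOException {
    long start = System.nanoTime();
    out.write(b, off, len);
    reporter.incWriteTime(System.nanoTime() - start);
  }

  public static void main(String[] args) throws IOException {
    long[] totalNanos = new long[1];
    OutputStream stream = new TimeTrackingStreamSketch(
        nanos -> totalNanos[0] += nanos, new ByteArrayOutputStream());
    stream.write(new byte[] {1, 2, 3}, 0, 3);
    System.out.println("write time recorded: " + totalNanos[0] + " ns");
  }
}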
@@ -31,6 +31,7 @@
import org.apache.spark.SparkEnv;
import org.apache.spark.executor.ShuffleWriteMetrics;
import org.apache.spark.memory.MemoryConsumer;
import org.apache.spark.memory.SparkOutOfMemoryError;
import org.apache.spark.memory.TaskMemoryManager;
import org.apache.spark.serializer.SerializerManager;
import org.apache.spark.storage.BlockManager;
@@ -741,7 +742,7 @@ public boolean append(Object kbase, long koff, int klen, Object vbase, long voff
if (numKeys >= growthThreshold && longArray.size() < MAX_CAPACITY) {
try {
growAndRehash();
} catch (OutOfMemoryError oom) {
} catch (SparkOutOfMemoryError oom) {
canGrowArray = false;
}
}
@@ -757,7 +758,7 @@ private boolean acquireNewPage(long required) {
private boolean acquireNewPage(long required) {
try {
currentPage = allocatePage(required);
} catch (OutOfMemoryError e) {
} catch (SparkOutOfMemoryError e) {
return false;
}
dataPages.add(currentPage);
@@ -16,10 +16,10 @@
-->

<script id="executors-summary-template" type="text/html">
<h4 style="clear: left; display: inline-block;">Summary</h4>
<h4 class="title-table">Summary</h4>
<div class="container-fluid">
<div class="container-fluid">
<table id="summary-execs-table" class="table table-striped compact">
<table id="summary-execs-table" class="table table-striped compact cell-border">
<thead>
<th></th>
<th>RDD Blocks</th>
@@ -64,10 +64,10 @@ <h4 style="clear: left; display: inline-block;">Summary</h4>
</table>
</div>
</div>
<h4 style="clear: left; display: inline-block;">Executors</h4>
<h4 class="title-table">Executors</h4>
<div class="container-fluid">
<div class="container-fluid">
<table id="active-executors-table" class="table table-striped compact">
<table id="active-executors-table" class="table table-striped compact cell-border">
<thead>
<tr>
<th>