From ff92ba3ae3abaae0d8ce8bb8c08465f43c14906f Mon Sep 17 00:00:00 2001
From: xuanyuanking
Date: Mon, 17 Jul 2017 15:38:56 +0800
Subject: [PATCH 1/2] Empty files should be skipped while writing to file

---
 .../datasources/FileFormatWriter.scala       | 18 +++++++-
 .../datasources/FileFormatWriterSuite.scala  | 45 +++++++++++++++++++
 2 files changed, 62 insertions(+), 1 deletion(-)
 create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileFormatWriterSuite.scala

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormatWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormatWriter.scala
index 9eb9eae699e94..3a8505fb68384 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormatWriter.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormatWriter.scala
@@ -236,7 +236,9 @@ object FileFormatWriter extends Logging {
     committer.setupTask(taskAttemptContext)
 
     val writeTask =
-      if (description.partitionColumns.isEmpty && description.bucketIdExpression.isEmpty) {
+      if (!iterator.hasNext) {
+        new EmptyDirectoryWriteTask
+      } else if (description.partitionColumns.isEmpty && description.bucketIdExpression.isEmpty) {
         new SingleDirectoryWriteTask(description, taskAttemptContext, committer)
       } else {
         new DynamicPartitionWriteTask(description, taskAttemptContext, committer)
@@ -301,6 +303,20 @@ object FileFormatWriter extends Logging {
     }
   }
 
+  /** ExecuteWriteTask for empty partitions */
+  private class EmptyDirectoryWriteTask extends ExecuteWriteTask {
+
+    override def execute(iter: Iterator[InternalRow]): ExecutedWriteSummary = {
+      ExecutedWriteSummary(
+        updatedPartitions = Set.empty,
+        numOutputFile = 0,
+        numOutputBytes = 0,
+        numOutputRows = 0)
+    }
+
+    override def releaseResources(): Unit = {}
+  }
+
   /** Writes data to a single directory (used for non-dynamic-partition writes). */
   private class SingleDirectoryWriteTask(
       description: WriteJobDescription,
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileFormatWriterSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileFormatWriterSuite.scala
new file mode 100644
index 0000000000000..acf47bcd94ccb
--- /dev/null
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileFormatWriterSuite.scala
@@ -0,0 +1,45 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution.datasources
+
+import java.io.{File, FilenameFilter}
+
+import org.apache.spark.sql.QueryTest
+import org.apache.spark.sql.test.SharedSQLContext
+
+class FileFormatWriterSuite extends QueryTest with SharedSQLContext {
+
+  test("empty file should be skipped while writing to file") {
+    withTempDir { dir =>
+      dir.delete()
+      spark.range(1000).repartition(2).write.parquet(dir.toString)
+      val df = spark.read.parquet(dir.toString)
+
+      withTempDir { dst_dir =>
+        dst_dir.delete()
+        df.where("id = 50").write.parquet(dst_dir.toString)
+        val allFiles = dst_dir.listFiles(new FilenameFilter {
+          override def accept(dir: File, name: String): Boolean = {
+            !name.startsWith(".") && !name.startsWith("_")
+          }
+        })
+        assert(allFiles.length == 1)
+      }
+    }
+  }
+}

From e08fb1939d5267a38f3318af7506b6ed8628ebbf Mon Sep 17 00:00:00 2001
From: xuanyuanking
Date: Mon, 17 Jul 2017 18:20:34 +0800
Subject: [PATCH 2/2] Handle the empty result of Parquet

---
 .../spark/sql/execution/datasources/FileFormatWriter.scala | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormatWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormatWriter.scala
index 3a8505fb68384..2654704c1da05 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormatWriter.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormatWriter.scala
@@ -236,7 +236,7 @@ object FileFormatWriter extends Logging {
     committer.setupTask(taskAttemptContext)
 
     val writeTask =
-      if (!iterator.hasNext) {
+      if (sparkPartitionId != 0 && !iterator.hasNext) {
         new EmptyDirectoryWriteTask
       } else if (description.partitionColumns.isEmpty && description.bucketIdExpression.isEmpty) {
         new SingleDirectoryWriteTask(description, taskAttemptContext, committer)
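
A rough usage sketch of the behaviour the two commits aim for (illustrative only, not part of the patches): the object name, temp directories, and local SparkSession setup below are assumptions. With both patches applied, empty partitions other than partition 0 should emit no part files, while partition 0 always writes one file so a completely empty result still leaves readable Parquet metadata behind.

// Illustrative sketch only (not part of the patches); assumes a build with
// both commits applied and a local SparkSession.
import org.apache.spark.sql.SparkSession

object EmptyPartitionWriteExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[2]")
      .appName("empty-partition-write")
      .getOrCreate()

    // Count data files, ignoring hidden and metadata files such as _SUCCESS.
    def countDataFiles(dir: String): Int =
      new java.io.File(dir).listFiles()
        .count(f => f.isFile && !f.getName.startsWith(".") && !f.getName.startsWith("_"))

    // Case 1: after the filter, only one of the two partitions holds a row.
    // Empty partitions other than partition 0 should no longer emit part files.
    val sparse = spark.range(1000).repartition(2).where("id = 50")
    val sparseDir = java.nio.file.Files.createTempDirectory("sparse-write").toString
    sparse.write.mode("overwrite").parquet(sparseDir)
    println(s"files for single-row result: ${countDataFiles(sparseDir)}")

    // Case 2: the result is completely empty. The second commit keeps partition 0
    // writing a schema-only file, so the directory stays readable as Parquet.
    val empty = spark.range(1000).where("id < 0")
    val emptyDir = java.nio.file.Files.createTempDirectory("empty-write").toString
    empty.write.mode("overwrite").parquet(emptyDir)
    println(s"files for empty result: ${countDataFiles(emptyDir)}")
    spark.read.parquet(emptyDir).printSchema()

    spark.stop()
  }
}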